Advanced Search
Last updated date: Nov 20, 2022 Views: 955 Forks: 0
For V72*01 (VH186.2) analysis, we used the NEBNext Immune sequencing kit for mouse according to the manufacturer's instructions, and then Miseq V3 reagent chemistry with 300 bp paired end reads. The code used to perform analysis of the resultant data is shown below and can further be obtained as a .rmd file from the authors.
---
title: "Mouse IgG sequence analysis"
output: html_document
---
#load packages and functions
```{r}
# load the packages you'll need to execute the code.
#Note, you will need to install on your local machine if you havent done that yet.
library(ShortRead)
library(tidyverse)
library(RGenetics)
library(Biostrings)
library(data.table)
library(ggbeeswarm)
#other bits needed for post IMGT analysis
# VH186.2 (V72*01) reference sequence
Acc <- "caggtccaactgcagcagcctggggctgagcttgtgaagcctggggcttcagtgaagctgtcctgcaaggcttctggctacaccttcaccagctactggatgcactgggtgaagcagaggcctggacgaggccttgagtggattggaaggattgatcctaatagtggtggtactaagtacaatgagaagttcaagagcaaggccacactgactgtagacaaaccctccagcacagcctacatgcagctcagcagcctgacatctgaggactctgcggtctattattgtgcaaga"
# function to tidy sample names from .fasq files
string.diff.ex<-function(a="ATTCGAN",exclude=c("n","N","?"),ignore.case=TRUE)
{
if(nchar(a)!=nchar(Acc)) stop("Lengths of input strings differ. Please check your input.")
if(ignore.case==TRUE)
{
a<-toupper(a)
Acc<-toupper(Acc)
}
diff.a<-unlist(strsplit(a,split=""))
diff.Acc<-unlist(strsplit(Acc,split=""))
diff.d<-rbind(diff.a,diff.Acc)
for(ex.loop in 1:length(exclude))
{
diff.d<-diff.d[,!(diff.d[1,]==exclude[ex.loop]|diff.d[2,]==exclude[ex.loop])]
}
differences<-sum(diff.d[1,]!=diff.d[2,])
return(differences)
}
```
#Sample 1:
```{r}
# read in fastq
fq<- readFastq("....fastq")
head(fq)
#Extract reads
reads <- sread(fq)
#Extract Isotype call and make ID:
id <- as.character(fq@id)
CRegion <- str_extract(id, "Ig.+")
Cregion <- gsub("\\|.+", "",CRegion)
#make a matrix to output important sample ID information from the Cregion object generated above and generate a # for each sequence (its row # essentially), which when combined with "S1" becomes the unique sequence ID.
#This makes it possible to trace IMGT output back to raw sequence.
read_ID <- matrix(nrow = length(reads), ncol = 4)
read_ID[,1] <- "S1"
read_ID[,2] <- Cregion
read_ID[,3] <- seq(1, length(reads))
read_ID[,4] <- paste(read_ID[,1], read_ID[,2], read_ID[,3],sep = "_")
#make new object thats a list of sequences then name each item of that list. Export as fasta file
All_seqs <- (reads)
names(All_seqs) <- read_ID[,4]
writeXStringSet(All_seqs,
filepath = ".....fasta",
append=FALSE,
compress=FALSE, compression_level=NA, format="fasta")
```
#Next steps:
upload fasta sequences to IMGT High V Quest and extract default outputs
https://www.imgt.org/HighV-QUEST/home.action
load in outputs below and continue
#Post IMGT!
# Sample1
```{r}
nts<- read.delim(".../3_Nt-sequences.txt", stringsAsFactors = F)
SeqSummary<- read.delim(".../1_Summary.txt", stringsAsFactors = F)
Mutations<- read.delim(".../8_V-REGION-nt-mutation-statistics.txt", stringsAsFactors = F)
#remove light chains
nts_clean<- nts %>% filter(!grepl('IgK|IgL', Sequence.ID))
#remove unproductive and missing seqs
nts_clean <- nts_clean[!nts_clean$V.DOMAIN.Functionality %in% c( "unproductive" , "No rearrangement found"),]
nts_clean <- nts_clean[!nts_clean$V.D.J.REGION %in% c( ""),]
nts_clean <- nts_clean[!nts_clean$V.GENE.and.allele %in% c(""),]
# count duplicates
count.dups <- function(nts_clean){
DT <- data.table(nts_clean)
DT[,.N, by = nts_clean$V.D.J.REGION]
}
duplicated <- count.dups(nts_clean)
names(duplicated) <- c("V.D.J.REGION", "seqcounts")
hist(duplicated[duplicated$seqcounts>1,]$seqcounts, breaks = 100 ,main="",
xlab="# instances of sequence")
#remove duplicates
nts_clean <- nts_clean[!duplicated(nts_clean$V.D.J.REGION),]
nts_clean <- left_join(nts_clean, duplicated, by = "V.D.J.REGION")
#extract out each isotype - all V genes.
#Note, these objects are written over when analysing IGHV1-72 below.
IgM<- nts_clean %>% filter(grepl('IgM', Sequence.ID))
IgA<- nts_clean %>% filter(grepl('IgA', Sequence.ID))
IgGb<- nts_clean %>% filter(grepl('IgGb', Sequence.ID))
IgGa<- nts_clean %>% filter(grepl('IgGa', Sequence.ID))
IgGs <- rbind(IgGa, IgGb)
###### Join nucleotide sequence information with mutation info
nts_clean <- left_join(nts_clean, Mutations[, c("Sequence.ID", "V.REGION.Nb.of.mutations" ,"V.REGION.Nb.of.silent.mutations","V.REGION.Nb.of.nonsilent.mutations","FR1.IMGT.Nb.of.mutations" ,"CDR1.IMGT.Nb.of.mutations" ,"FR2.IMGT.Nb.of.mutations","CDR2.IMGT.Nb.of.mutations" ,"FR3.IMGT.Nb.of.mutations","CDR3.IMGT.Nb.of.mutations")], by = "Sequence.ID")
#annoyingly, IMGT has included mutation count including numbers, as well as sometimes numbers within brackets (which represent the # mutations including any nts added during N-diversity).
#The following code removes the numbers within brackets and converts back to numeric variables.
mutationVars <- c("V.REGION.Nb.of.mutations" ,"V.REGION.Nb.of.silent.mutations","V.REGION.Nb.of.nonsilent.mutations","FR1.IMGT.Nb.of.mutations" ,"CDR1.IMGT.Nb.of.mutations" ,"FR2.IMGT.Nb.of.mutations","CDR2.IMGT.Nb.of.mutations" ,"FR3.IMGT.Nb.of.mutations","CDR3.IMGT.Nb.of.mutations")
cleanMuts<-function(x){as.numeric(gsub("\\(.*?)","",x))}
nts_clean[, mutationVars[1]] <- cleanMuts(nts_clean[, mutationVars[1]])
nts_clean[, mutationVars[2]] <- cleanMuts(nts_clean[, mutationVars[2]])
nts_clean[, mutationVars[3]] <- cleanMuts(nts_clean[, mutationVars[3]])
nts_clean[, mutationVars[4]] <- cleanMuts(nts_clean[, mutationVars[4]])
nts_clean[, mutationVars[5]] <- cleanMuts(nts_clean[, mutationVars[5]])
nts_clean[, mutationVars[6]] <- cleanMuts(nts_clean[, mutationVars[6]])
nts_clean[, mutationVars[7]] <- cleanMuts(nts_clean[, mutationVars[7]])
nts_clean[, mutationVars[8]] <- cleanMuts(nts_clean[, mutationVars[8]])
nts_clean[, mutationVars[9]] <- cleanMuts(nts_clean[, mutationVars[9]])
S1 <- nts_clean
```
# Focus on just "Musmus IGHV1-72*01 F" sequences
```{r}
#find amino acid at pos 99
IGHV1_7201 <- nts_clean[which(nts_clean$V.GENE.and.allele == "Musmus IGHV1-72*01 F"),c(1:16,120:129)]
codons <- IGHV1_7201
codons$pos99nts <-substr(codons$V.D.J.REGION, 295, 297)
codons <- codons[!codons$pos99nts == "",]
codons <- codons[!grepl('n', codons$pos99nts),]
codons <- codons[!nchar(codons$pos99nts)<3,]
codons$pos99aa <-unlist(lapply(codons$pos99nts, codonToAAthree))
IGHV1_7201 <- left_join(IGHV1_7201,codons[,c("Sequence.ID","pos99nts", "pos99aa" )], by = "Sequence.ID" )
#find amino acid at pos 33
codons <- IGHV1_7201
codons$pos33nts <-substr(codons$V.D.J.REGION, 97, 99)
codons <- codons[!codons$pos33nts == "",]
codons <- codons[!grepl('n', codons$pos33nts),]
codons <- codons[!nchar(codons$pos33nts)<3,]
codons$pos33aa <- unlist(lapply(codons$pos33nts, codonToAAthree))
IGHV1_7201 <- left_join(IGHV1_7201,codons[,c("Sequence.ID","pos33nts", "pos33aa" )], by = "Sequence.ID" )
#find amino acid at pos 59
codons <- IGHV1_7201
codons$pos59nts <-substr(codons$V.D.J.REGION, 175, 177)
codons <- codons[!codons$pos59nts == "",]
codons <- codons[!grepl('n', codons$pos59nts),]
codons <- codons[!nchar(codons$pos59nts)<3,]
codons$pos59aa <- unlist(lapply(codons$pos59nts, codonToAAthree))
IGHV1_7201 <- left_join(IGHV1_7201,codons[,c("Sequence.ID","pos59nts", "pos59aa" )], by = "Sequence.ID" )
#split by isotypes
IgM<- IGHV1_7201 %>% filter(grepl('IgM', Sequence.ID))
IgA<- IGHV1_7201 %>% filter(grepl('IgA', Sequence.ID))
IgGb<- IGHV1_7201 %>% filter(grepl('IgGb', Sequence.ID))
IgGa<- IGHV1_7201 %>% filter(grepl('IgGa', Sequence.ID))
#combine IgGs
IgGs <- rbind(IgGa, IgGb)
S1_IgG_7201 <- IgGs
write.csv(IgM, file ="S1_VH186.2_IgM_Analsis.csv")
write.csv(IgA, file ="S1_VH186.2_IgA_Analsis.csv")
write.csv(IgGs, file ="S1_VH186.2_IgG_Analsis.csv")
```
Do you have any questions about this protocol?
Post your question to gather feedback from the community. We will also invite the authors of this article to respond.
Share
Bluesky
X
Copy link