Advanced Search
Last updated date: Aug 6, 2021 Views: 877 Forks: 0
### Single-cell bioinformatics analysis for thyroid cancer study
#--------------------------------------------------------------------------------------------------------
### Integration of scRNAs-seq data
#--------------------------------------------------------------------------------------------------------
# dependency package install
devtools::install_github('satijalab/seurat-wrappers',force = T)
need_pkgs <- c('Seurat','cowplot', 'slingshot',
'data.table', 'reticulate','dplyr',
'RColorBrewer','SingleCellExperiment')
for(i in need_pkgs){
require(i, quietly = T, character.only = T)
}
# setting the file pathway
setwd('/home/gyeongdaekim/')
file_list <- list.files()
print(file_list)
sampleNames = mapply(function(x) x[length(x)], strsplit(file_list, split = '_'))
sampleNames <- gsub('-','_',sampleNames)
meta.info = data.table(file_Path = file_list, sampleID = sampleNames)
meta.info[, SubType:=mapply(function(x) x[1], strsplit(sampleID, '_'))]
DT::datatable(meta.info)
# setting the integration and QC option
norm_method = 'standard'
treat = FALSE
show = FALSE
MT_cut = 15
batch_list = list()
for(i in 1:nrow(meta.info)){
dir_of_10X = meta.info$file_Path[i]
sampleID = meta.info$sampleID[i]
subtype = meta.info$SubType[i]
seq_batch = meta.info$Batch[i]
print(paste("Starting processing", sampleID, 'at', Sys.time()))
sc_mat = Read10X(dir_of_10X)
sc_tumor <- CreateSeuratObject(counts = sc_mat, project = sampleID,
min.cells = 3, min.features = 200)
sc_tumor[["percent.mt"]] <- PercentageFeatureSet(sc_tumor, pattern = "^MT-")
qc = VlnPlot(sc_tumor, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(sc_tumor, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sc_tumor, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
sc_tumor <- subset(sc_tumor, subset = nFeature_RNA > 200 & percent.mt <= MT_cut)
if(norm_method == 'SCT'){
sc_tumor <- SCTransform(sc_tumor, vars.to.regress = 'percent.mt')
} else{
sc_tumor <- NormalizeData(sc_tumor, verbose = F)
sc_tumor <- FindVariableFeatures(sc_tumor, selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
sc_tumor <- ScaleData(sc_tumoer, vrbose = F)
}
sc_tumor <- RenameCells(object = sc_tumor, add.cell.id = sampleID) # add sample name as prefix
if(treat == TRUE){
sc_tumor <- RunPCA(sc_tumor, verbose = T)
sc_tumor <- RunUMAP(sc_tumor, dims = 1:30, verbose = FALSE)
sc_tumor <- RunTSNE(sc_tumor, dims = 1:30, verbose = FALSE)
sc_tumor <- FindNeighbors(sc_tumor, dims = 1:30, verbose = FALSE)
sc_tumor <- FindClusters(sc_tumor, verbose = FALSE)
umap = DimPlot(sc_tumor, label = TRUE) + NoLegend()
tsne = TSNEPlot(sc_tumor, label = TRUE) + NoLegend()
if(show == TRUE){
print(qc)
plot_grid(plot1,plot2)
#dev.off()
print(umap)
print(tsne)
}
}
# add information
sc_tumor@meta.data[,'SubType'] = subtype
sc_tumor@meta.data[,'SampleID'] = sampleID
sc_tumor@meta.data[,'Batch'] = seq_batch
### add batch data into a list to merge
#assign(paste0(sampleID,'.Seurat'),sc_tumor)
batch_list <- append(batch_list, sc_tumor)
print(paste("Finishing processing", sampleID, 'at', Sys.time()))
}
data.combined <- Reduce(function(x,y){merge(x,y,merge.data = TRUE)},batch_list)
# QC check
qc = VlnPlot(data.combined, group.by = 'SampleID', features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,pt.size = 0)
qc
#--------------------------------------------------------------------------------------------------------
### scRNA-seq data analysis using the Seurat
#--------------------------------------------------------------------------------------------------------
library(Seurat)
data.combined <- FindVariableFeatures(data.combined, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
data.combined <- ScaleData(data.combined, vars.to.regress = c('nCount_RNA', 'percent.mt'),verbose = F)
data.combined <- RunPCA(data.combined, npcs=30, verbose=F)
# selection of the principle component number
data.combined <- JackStraw(data.combined, num.replicate = 100)
data.combined <- ScoreJackStraw(data.combined, dims = 1:20)
JackStrawPlot(data.combined, dims = 1:20)
ElbowPlot(object = data.combined)
num_PC = 30
data.combined <- RunUMAP(data.combined, reduction = 'pca', dims=1:num_PC)
data.combined <- FindNeighbors(data.combined, reduction = 'pca', dims=1:num_PC)
data.combined <- FindClusters(data.combined, resolution = 1)
DimPlot(data.combined, label = TRUE) + NoLegend()
DimPlot(data.combined, label = TRUE,split.by = "orig.ident",ncol = 4) + NoLegend()
#--------------------------------------------------------------------------------------------------------
### batch correction using the fastMNN
#--------------------------------------------------------------------------------------------------------
library(SeuratWrappers)
library(SeuratData)
data.combined5 <- RunFastMNN(object.list = SplitObject(data.combined, split.by = "SampleID"))
data.combined5 <- RunUMAP(data.combined5, reduction = "mnn", dims = 1:30)
data.combined5 <- FindNeighbors(data.combined5, reduction = "mnn", dims = 1:30)
data.combined5 <- FindClusters(data.combined5, resolution = 0.7)
DimPlot(data.combined5, label = TRUE) + NoLegend()
#--------------------------------------------------------------------------------------------------------
### calculation of the number of differentially expressed genes(DEGs) between clusters and merge
#--------------------------------------------------------------------------------------------------------
for(j in 0:length(names(data.combined5@active.ident))){
n=table(data.combined5@active.ident)
ident=j
n[names(n) %in% c(0:ident)]=0
n=n[-1]
o=n[n>0]
ident2=names(o[1])
bim <- FindMarkers(data.combined5, ident.1 = ident, ident.2 =ident2, only.pos = F, test.use = "bimod")
deg=bim[bim$p_val_adj < 0.01,]
deg1=deg[abs(deg$avg_logFC) >= 1,]
degs=dim(deg)
degs1=dim(deg1)
for(i in c(names(o[-1]))){
bim <- FindMarkers(data.combined5, ident.1 = ident, ident.2 = i, only.pos = F, test.use = "bimod")
deg=bim[bim$p_val_adj < 0.01,]
deg1=deg[abs(deg$avg_logFC) >= 1,]
degs=cbind(degs,dim(deg))
degs1=cbind(degs1,dim(deg1))
}
print(degs1)
}
# merge the similar clusters having the difference of DEGs less than 10
data.combined5 <- RenameIdents(object = data.combined5,"0"="0", "1"="0", "2"="2", "3"="0", "4"="4", "5"="5", "6"="0", "7"="7", "8"="8", "9"="9", "10"="10", "11"="11", "12"="12", "13"="13", "14"="14", "15"="15", "16"="16", "17"="17")
# again, calculation of the number of DEGs between clusters
data.combined5 <- RenameIdents(object = data.combined5,"0"="0", "2"="1", "4"="2", "5"="3", "7"="4", "8"="5", "9"="6", "10"="7", "11"="8", "12"="9", "13"="10", "14"="11", "15"="12", "16"="13", "17"="14", "18"="15")
# marker of each cluster
data.combined5.marker <- FindAllMarkers(data.combined5, max.cells.per.ident = 100, min.diff.pct = 0.3, only.pos = TRUE)
write.csv(data.combined5.marker,file = "marker.csv")
FeaturePlot(data.combined5, features = c("SLC34A2"), cols = c("yellow2","DARKGREEN","BLUE4"))
# annotation each cluster
new.cluster.ids <- c("PTC", "NK cell", "Macrophage", "Follicular", "T cell", "Endothelium", "Fibroblast1", "Macrophage2", "PTC2", "Fibroblast2", "B cell", "Mast", "ATC", "Activated T cell", "TAMC", "Parafollicular")
names(new.cluster.ids) <- levels(data.combined5)
data.combined5 <- RenameIdents(data.combined5, new.cluster.ids)
# sort cell types
ident <- c("Follicular", "PTC", "PTC2", "ATC", "Parafollicular", "Fibroblast1", "Fibroblast2", "Endothelium", "NK cell", "T cell", "B cell", "Mast", "Activated T cell", "Macrophage", "Macrophage2", "TAMC")
data.combined5@active.ident<- factor(x = data.combined5@active.ident, levels = ident)
DimPlot(data.combined5, label = TRUE ) + NoLegend()
DimPlot(data.combined5, label = TRUE, split.by = "SubType" , ncol = 4, label.size = 0) + NoLegend()
#--------------------------------------------------------------------------------------------------------
### stacked vlnplot
#--------------------------------------------------------------------------------------------------------
modify_vlnplot<- function(obj,
feature,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...) {
p<- VlnPlot(obj, features = feature, pt.size = pt.size, ... ) +
xlab("") + ylab(feature) + ggtitle("") +
theme(legend.position = "none",
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = rel(1), angle = 0),
axis.text.y = element_text(size = rel(1)),
plot.margin = plot.margin )
return(p)
}
## extract the max value of the y axis
extract_max<- function(p){
ymax<- max(ggplot_build(p)$layout$panel_scales_y[[1]]$range$range)
return(ceiling(ymax))
}
## main function
StackedVlnPlot<- function(obj, features,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...) {
plot_list<- purrr::map(features, function(x) modify_vlnplot(obj = obj,feature = x, ...))
# Add back x-axis title to bottom plot. patchwork is going to support this?
plot_list[[length(plot_list)]]<- plot_list[[length(plot_list)]] +
theme(axis.text.x=element_text(), axis.ticks.x = element_line())
# change the y-axis tick to only max value
ymaxs<- purrr::map_dbl(plot_list, extract_max)
plot_list<- purrr::map2(plot_list, ymaxs, function(x,y) x +
scale_y_continuous(breaks = c(y)) +
expand_limits(y = y))
p<- patchwork::wrap_plots(plotlist = plot_list, ncol = 1)
return(p)
}
features <- cluster marker
StackedVlnPlot(obj = data.combined5, features = features)
save(data.combined5, file="/home/gyeongdaekim/thyroid/object/data.combined.RData")
#--------------------------------------------------------------------------------------------------------
### cancer progression trajectory from Seurat object
#--------------------------------------------------------------------------------------------------------
library(monocle)
# isolation of specific cell types
Fol<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5@active.ident %in% c("Follicular")]))
Fol<-subset(Fol, cells=names(Fol@active.ident[Fol$SubType%in% c("NOM","PTC","ATC")]))
ATC<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5@active.ident %in% c("ATC")]))
ATC<-subset(ATC, cells=names(ATC@active.ident[ATC$SubType %in% c("ATC","PTC")]))
PTC<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5$orig.ident %in% c("PTC_XHY1_190702", "PTC_XHY2_190702")]))
PTC<-subset(PTC, cells=names(PTC@active.ident[PTC@active.ident %in% c("PTC")]))
aPTC<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5@active.ident %in% c("PTC")]))
aPTC<-subset(aPTC, cells=names(aPTC@active.ident[aPTC$SubType %in% c("ATC")]))
PTC2<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5@active.ident %in% c("PTC2")]))
PTC2<-subset(PTC2, cells=names(PTC2@active.ident[PTC2$SubType %in% c("PTC")]))
a<-subset(Fol, cells=names(Fol@active.ident[Fol$SubType %in% c("NOM")]))
b<-subset(Fol, cells=names(Fol@active.ident[Fol$SubType %in% c("PTC")]))
c<-subset(Fol, cells=names(Fol@active.ident[Fol$SubType %in% c("ATC")]))
e<-subset(ATC, cells=names(ATC@active.ident[ATC$SubType %in% c("PTC")]))
f<-subset(ATC, cells=names(ATC@active.ident[ATC$SubType %in% c("ATC")]))
Idents(object =Fol, cells =colnames(a))="NOM_follicular"
Idents(object =Fol, cells =colnames(b))="PTC_follicular"
Idents(object =Fol, cells =colnames(c))="ATC_follicular"
Idents(object =ATC, cells =colnames(e))="pATC"
Idents(object =ATC, cells =colnames(f))="aATC"
Idents(object =PTC, cells =colnames(PTC))="pPTC"
Idents(object =aPTC, cells =colnames(aPTC))="aPTC"
table(Fol@active.ident)
table(ATC@active.ident)
thyroid <- merge(Fol, y = c(ATC,PTC,PTC2,aPTC), project = "thyroid")
# monocle2 workflow
table(thyroid@active.ident)
thyroid$nCount_RNA=NULL
thyroid$nFeature_RNA=NULL
thyroid$percent.mt=NULL
thyroid$orig.ident<-thyroid@active.ident
thyroid@meta.data=thyroid@meta.data[,1:3]
x=GetAssayData(object = thyroid, slot = "counts")[rownames(GetAssayData(object = thyroid, slot = "counts")) %in% rownames(GetAssayData(object = thyroid)), colnames(GetAssayData(object = thyroid, slot = "counts")) %in% colnames(GetAssayData(object = thyroid))]
target=x[,colnames(x) %in% rownames(thyroid@meta.data)]
target<-as.matrix(target)
target=as.matrix(t(target))
name=merge(thyroid@meta.data, target, by="row.names")
rownames(name)=name$Row.names
name=name[,-1]
name[1:5,1:4]
thyroid@meta.data=name[,1:4]
name=name[,-1:-3]
target=t(name)
target[1:5,1:4]
g=as.data.frame(rownames(x))
rownames(g)=rownames(x)
colnames(g)=c("gene_short_name")
pd <- new("AnnotatedDataFrame", data = thyroid@meta.data)
fd <- new("AnnotatedDataFrame", data = g)
HSMM <- newCellDataSet(as.matrix(target), phenoData = pd, featureData = fd, expressionFamily=negbinomial.size())
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)
HSMM <- detectGenes(HSMM, min_expr = 0.1)
print(head(fData(HSMM)))
expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 10))
disp_table <- dispersionTable(HSMM)
ordering_genes <- subset(disp_table, mean_expression >= 0.05 & dispersion_empirical >= 2 * dispersion_fit)$gene_id
HSMM <- setOrderingFilter(HSMM, ordering_genes)
plot_ordering_genes(HSMM)
HSMM <- reduceDimension(HSMM, max_components=2)
HSMM <- orderCells(HSMM, reverse=FALSE)
coordinate<-t(HSMM@reducedDimS)
head(coordinate)
colnames(coordinate)<-c("x","y")
coordinate<-as.data.frame(coordinate)
coordinate<-cbind(-coordinate$y,coordinate$x)
coordinate<-t(coordinate)
HSMM@reducedDimS<-coordinate
colnames(HSMM@reducedDimS)<-colnames(HSMM)
plot_cell_trajectory(HSMM, show_tree = FALSE ,show_backbone = FALSE, color_by="orig.ident", show_branch_points = FALSE, cell_size=1)
plot_cell_trajectory(HSMM, show_tree = FALSE ,show_backbone = FALSE, color_by="orig.ident", show_branch_points = FALSE, cell_size=1) + scale_color_manual(breaks = c("aPTC", "pATC","ATC" ,"PTC", "PTC2", "NOM_follicular", "PTC_follicular", "ATC_follicular"), values=c("black", "darkorchid","darkseagreen2", "dodgerblue3", "indianred3","yellow2","turquoise3","RED1")) + theme(legend.position = "right")
#----------------------------------------------------------------------------------------------------
### gene expression on the trajectory
#----------------------------------------------------------------------------------------------------
pseudo=t(HSMM@reducedDimS)
colnames(pseudo)=c("component1","component2")
expression=target["COL1A1",]
expression[expression>5]=5
ggplot() + geom_point(data=as.data.frame(pseudo), aes(component1, component2, color=expression), size=1.5) + scale_colour_gradient(low = "grey",high = "red")
#----------------------------------------------------------------------------------------------------
### expression change along the pseudotime
#----------------------------------------------------------------------------------------------------
HSMM_subset <- HSMM2[,colnames(HSMM2)%in% rownames(subset(pData(HSMM2), State %in% c("1","2")))]
HSMM_subset@reducedDimS<-HSMM_subset@reducedDimS[,colnames(HSMM_subset@reducedDimS)%in%colnames(HSMM_subset)]
HSMM_expressed_genes <- row.names(subset(fData(HSMM_subset), num_cells_expressed >= 10))
HSMM_expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 10))
HSMM_filtered <- HSMM[HSMM_expressed_genes,]
HSMM_filtered <- HSMM_subset[HSMM_expressed_genes,]
diff_test_res <- differentialGeneTest(HSMM_filtered, fullModelFormulaStr="~sm.ns(Pseudotime)")
diff=diff_test_res[,c("gene_short_name", "pval", "qval","use_for_ordering")]
diff=diff[diff$qval < 0.01, ]
diff=diff[order(diff$qval),]
genelist <- row.names(diff_test_res[order(diff_test_res$qval)[1:800],])
genelist<-as.matrix(genelist)
plot_pseudotime_heatmap(HSMM[genelist,],num_clusters = 2, cores =1, show_rownames = T)
#----------------------------------------------------------------------------------------------------
### extraction gene list
#----------------------------------------------------------------------------------------------------
pseudotime_list<-plot_pseudotime_heatmap(HSMM[genelist,],num_clusters = 2, cores = 1,show_rownames = T, return_heatmap = T)
pseudotime_list <- as.data.frame(cutree(pseudotime_list $tree_row, k=2))
colnames(pseudotime_list) <- "Cluster"
pseudotime_list $Gene <- rownames(pseudotime_list)
head(pseudotime_list)
write.csv(pseudotime_list,file ="coordinate.csv", sep = "," )
Do you have any questions about this protocol?
Post your question to gather feedback from the community. We will also invite the authors of this article to respond.
Share
Bluesky
X
Copy link