TCGA數據庫LUSC亞型批量差異分析
- 2020 年 3 月 11 日
- 筆記
前些天我布置了一個學徒作業:這一個圖背後是12個差異分析的綜合
作業參考的文獻:Integrated analysis reveals five potential ceRNA biomarkers in human lung adenocarcinoma
所以我設置的學徒作業是:下載TCGA數據庫中LUSC的轉錄組信號值矩陣,LUSC病人分成了4類T1-4亞型分別與Normal組做差異分析,就是3*4=12個表達矩陣,12次差異分析,畫PCA圖,熱圖,火山圖,以及用於差異分析結果比較的Venn圖。
下面讓我們一起看看一個優秀學徒的表演,該學徒很久以前在我們這裡分享過他跨專業進入生信學習圈子的感悟:在華大工作五年還不如生信技能樹3天?
下載數據
緊跟群主的TCGA視頻課程,從UCSC的XENA下載LUSC表達矩陣,臨床信息,探針注釋GMT文件!
視頻課程見:4個小時TCGA腫瘤數據庫知識圖譜視頻教程又有學習筆記啦
rm(list = ls()) pd=read.table("TCGA-LUSC.GDC_phenotype.tsv.gz",header=T,sep = "t", quote = """,dec = ".", fill = TRUE, comment.char = "",row.names = 1) gset_mRNA=read.table("TCGA-LUSC.htseq_counts.tsv.gz",header=T,sep = "t", quote = """,dec = ".", fill = TRUE, comment.char = "",row.names = 1) gset_miRNA=read.table("TCGA-LUSC.mirna.tsv.gz",header=T,sep = "t",row.names = 1) #由於導入數據列名自動把「-」變為了「.」,可用gsub替換回來。 colnames(gset_miRNA)=gsub("\.","-", colnames(gset_miRNA)) colnames(gset_mRNA)=gsub("\.","-", colnames(gset_mRNA)) # 去掉mRNA和miRNA表達矩陣不一致的樣本 table(colnames(gset_mRNA) %in% colnames(gset_miRNA)) gset_mRNA=gset_mRNA[,colnames(gset_mRNA) %in% colnames(gset_miRNA)] gset_miRNA=gset_miRNA[,colnames(gset_miRNA) %in% colnames(gset_mRNA)] table(substring(colnames(gset_mRNA),14,16)) table(pd$pathologic_T) # TCGA數據庫的樣本分組規律(感謝Jimmy大神提供):Tumor types range from 01 - 09, normal types from 10 - 19 and control samples from 20 - 29. # 分期里T1分為了T1a,T1b,T1;T2分為了T2a,T2b,T2;這裡就不細分了,分別合在一起組成T1和T2期亞群 group_list_mRNA=ifelse(substring(colnames(gset_mRNA),14,15) == "11","Normal",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T2",]),"T2",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T3",]),"T3",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T4",]),"T4","q")))))) group_list_miRNA=ifelse(substring(colnames(gset_miRNA),14,15) == "11","Normal",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T2",]),"T2",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T3",]),"T3",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T4",]),"T4","q")))))) table(group_list_mRNA) table(group_list_miRNA) # 查下TCGA官網數據的說明,mRNA表達矩陣取了log2(fpkm+1),miRNA表達矩陣取了log2(RPM+1),所以這裡要還原回去 head(gset_miRNA) head(gset_mRNA) gset_miRNA=(2^gset_miRNA -1)*1000 #差異分析用的RPKM,所以要乘以1000 gset_mRNA=2^gset_mRNA -1 # mRNA表達矩陣包括codingRNA和lncRNA,需要拆分下 library(rtracklayer) library(SummarizedExperiment) gtf1 <- rtracklayer::import('gencode.v22.annotation.gtf/gencode.v22.annotation.gtf') gtf_df <- as.data.frame(gtf1) head(gtf_df) ensem2symbol <- gtf_df[gtf_df$type == 'gene',c('gene_id', 'gene_type', 'gene_name', 'source')] rownames(ensem2symbol) <- ensem2symbol$gene_id head(sort(table(ensem2symbol$gene_type),decreasing = T)) gset_cdRNA=gset_mRNA[rownames(gset_mRNA) %in% rownames(ensem2symbol[ensem2symbol$gene_type == "protein_coding",]),] gset_lncRNA=gset_mRNA[rownames(gset_mRNA) %in% rownames(ensem2symbol[ensem2symbol$gene_type == "lincRNA",]),] #保存表達矩陣和分組信息 save(pd,ensem2symbol,gset_miRNA,gset_cdRNA,gset_lncRNA,group_list_mRNA,group_list_miRNA,file = "step1_output.Rdata")
- gsub批量整體替換字符很方便
- ifelse函數篩選T1-T4的樣本ID,得到表達矩陣及分組信息
- 用基因探針GMT文件注釋拆分mRNA表達矩陣成cdRNA(編碼蛋白的基因)和lncRNA表達矩陣
- 注意TCGA上對表達矩陣的格式說明,DESeq2差異分析是對count值表達矩陣,必要需要還原!

DESeq2差異分析


1.比較LUSC患者T1-4分型與正常樣本的差異基因或miRNA RNA表達矩陣
1.1 檢查數據
## 全部腫瘤樣本及正常樣本表達矩陣的PCA圖,熱圖 rm(list = ls()) load(file = "step1_output.Rdata") dat=gset_cdRNA group_list=ifelse(substring(colnames(gset_cdRNA),14,15) == "11","Normal","Tumor") ### 去掉低質量的探針行 dat=dat[apply(dat,1, function(x) sum(x>1) > 50),] ### 檢查數據 table(group_list) source("run_check_h_pca.R") pro = "cdRNA_Tumor_vs_Normal" run_check_h_pca(pro)

- 樣本分組 GroupNormalT1T2T3T4樣本個數381062796921
- 全部Tumor樣本和Normal組的熱圖和PCA圖可以看出,Tumor組樣本大都與Normal組有顯著差異,從而可進行下一步差異分析。
1.2 差異分析
## T1-T4亞型組與正常組表達矩陣分別差異分析 ### 去掉低質量的探針行 gset=gset_cdRNA gset=gset[apply(gset,1, function(x) sum(x>1) > 50),] ### for循環批量差異分析 source("function.R") for (Typel in c("T1","T2","T3","T4")) { pro=paste0("cdRNA_",Typel,"_vs_Normal_2") run_filter_check_DESeq2(Typel,pro) }
1.3 比較差異分析結果
## 比較T1-4分別與正常樣本差異基因情況 rm(list=ls()) load("cdRNA_T1_vs_Normal_deseq2_DEG.Rdata") p0=p1 # 一個賦值小錯誤,由於包裝函數里是p1,所以導致賦值時p1和p4一樣,所以暫且改成p0賦值了 DEG_T1=DEG load("cdRNA_T2_vs_Normal_deseq2_DEG.Rdata") p2=p1 DEG_T2=DEG load("cdRNA_T3_vs_Normal_deseq2_DEG.Rdata") p3=p1 DEG_T3=DEG load("cdRNA_T4_vs_Normal_deseq2_DEG.Rdata") p4=p1 DEG_T4=DEG library(ggplot2) library(gridExtra) plots <- list(p0,p2,p3,p4) p= grid.arrange(grobs = plots, ncol = 2, top = "compare") ggsave(p,filename = "cdRNA_compare_volcano.png",width = 20,height = 15) venn <- function(x,y,z,a,name){ if(!require(VennDiagram))install.packages('VennDiagram') library (VennDiagram) venn.diagram(x= list(T1_vs_Normal = x,T2_vs_Normal = y,T3_vs_Normal = z,T4_vs_Normal = a),filename = "cdRNA_DEGgene.png", height = 1000, width = 1000,resolution =300, imagetype="tiff", col="transparent",fill=c("cornflowerblue","green","yellow","darkorchid1"),alpha = 0.5, cex=0.45, cat.cex=0.45) } DEG_T1$change!="NOT" DEGs=function(df){ rownames(df)[df$change!="NOT"] } library(VennDiagram) venn(DEGs(DEG_T1),DEGs(DEG_T2),DEGs(DEG_T3),DEGs(DEG_T4), "DEGgene" ) DEGsl = intersect(intersect(intersect(DEGs(DEG_T1),DEGs(DEG_T2)),DEGs(DEG_T3)),DEGs(DEG_T4)) length(DEGsl)

2. lncRNA和miRNA表達矩陣一樣批量分析
- 這裡就直接上文獻中類似的venn圖結果

- T1-4期患者樣本分別與正常樣本差異分析的閾值:log2FC=1,FDR=0.01
- T1-4期患者樣本分別與正常樣本差異分析結果
- cdRNA:19814個基因里有5573個共同差異基因
- lncRNA:7656個基因里有1571個共同的差異lncRNA基因
- miRNA:1881個miRNA里有164個共同的差異miRNA

總結


- 首先特別感謝Jimmy 大神,以上函數代碼均引自Jimmy大神及生信技能樹。
- TCGA數據庫的樣本編碼規律:Tumor types range from 01 – 09, normal types from 10 – 19 and control samples from 20 – 29,方法對樣本進行分組。
- 基因注釋GMT文件把mRNA矩陣注釋拆分成了coding RNA和lncRNA表達矩陣。
- DESeq2包分析的表達矩陣必須是count矩陣,TCGA等數據庫下載的表達矩陣需查看格式說明,如是否取log,如有需要轉換回來。
- 包裝函數for循環方便根據臨床表型信息批量篩選表達矩陣,繪製熱圖、PCA圖、差異分析並得到火山圖、venn圖。
- 模仿文獻分析方法挖掘數據需要仔細閱讀文獻,查看錶達矩陣的過濾條件和差異分析閾值(FC和log2FC有區別)。

函數代碼


檢查數據函數
run_check_h_pca <- function(pro = "T1_vs_Normal"){ rm(list = ls()) ## 魔幻操作,一鍵清空~ options(stringsAsFactors = F) table(group_list) # 每次都要檢測數據 dat[1:4,1:4] ## 下面是畫PCA的必須操作,需要看說明書。 dat=t(dat)#畫PCA圖時要求是行名時樣本名,列名時探針名,因此此時需要轉換 dat=as.data.frame(dat)#將matrix轉換為data.frame dat=cbind(dat,group_list) #cbind橫向追加,即將分組信息追加到最後一列 library("FactoMineR")#畫主成分分析圖需要加載這兩個包 library("factoextra") # The variable group_list (index = 54676) is removed # before PCA analysis dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#現在dat最後一列是group_list,需要重新賦值給一個dat.pca,這個矩陣是不含有分組信息的 fviz_pca_ind(dat.pca, geom.ind = "point", # show points only (nbut not "text") col.ind = dat$group_list, # color by groups # palette = c("#00AFBB", "#E7B800"), addEllipses = TRUE, # Concentration ellipses legend.title = "Groups" ) ggsave(filename = paste0(pro,'_all_samples_PCA.png')) rm(list = ls()) ## 魔幻操作,一鍵清空~ load(file = 'step1_output.Rdata') dat[1:4,1:4] cg=names(tail(sort(apply(dat,1,sd)),1000))#apply按行('1'是按行取,'2'是按列取)取每一行的方差,從小到大排序,取最大的1000個 library(pheatmap) #pheatmap(dat[cg,],show_colnames =F,show_rownames = F) #對那些提取出來的1000個基因所在的每一行取出,組合起來為一個新的表達矩陣 n=t(scale(t(dat[cg,]))) # 'scale'可以對log-ratio數值進行歸一化 n[n>2]=2 n[n< -2]= -2 n[1:4,1:4] pheatmap(n,show_colnames =F,show_rownames = F) ac=data.frame(g=group_list) rownames(ac)=colnames(n) #把ac的行名給到n的列名,即對每一個探針標記上分組信息(是'noTNBC'還是'TNBC') ## 可以看到TNBC具有一定的異質性,拿它來區分乳腺癌亞型指導臨床治療還是略顯粗糙。 pheatmap(n,show_colnames =F,show_rownames = F, annotation_col=ac,filename = paste0(pro,"_heatmap_top1000_sd.png")) rm(list = ls()) load(file = 'step1_output.Rdata') dat[1:4,1:4] exprSet=dat pheatmap::pheatmap(cor(exprSet)) # 組內的樣本的相似性應該是要高於組間的! colD=data.frame(group_list=group_list) rownames(colD)=colnames(exprSet) pheatmap::pheatmap(cor(exprSet), annotation_col = colD, show_rownames = F, filename = paste0(pro,"_cor_all.png")) dim(exprSet) exprSet=exprSet[apply(exprSet,1, function(x) sum(x>1) > 5),] dim(exprSet) exprSet=log(edgeR::cpm(exprSet)+1) dim(exprSet) exprSet=exprSet[names(sort(apply(exprSet, 1,mad),decreasing = T)[1:500]),] dim(exprSet) M=cor(log2(exprSet+1)) pheatmap::pheatmap(M, show_rownames = F, annotation_col = colD, filename = paste0(pro,"_cor_top500.png")) }
DESeq2差異分析函數
run_filter_check_DESeq2 <- function(Typel,pro){ #按照T1-T4分類肺癌組織樣本的得到四個表達矩陣 RNA_Normal=gset[,substring(colnames(gset),14,15) == "11"] RNA_LUSC=gset[,substring(colnames(gset),14,15) == "01"] colnames(RNA_Normal) colnames(RNA_LUSC) # T1-4期亞型的表達矩陣 RNA_T=RNA_LUSC[,colnames(RNA_LUSC) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == Typel,])] dat=cbind(RNA_Normal,RNA_T) colnames(dat) group_list=c(rep("Normal",38),rep(Typel,ncol(RNA_T))) table(group_list) ## 差異分析 DESeq2 if(!require(DESeq2))BiocManager::install("DESeq2") library(DESeq2) #需修改results()的contrast參數 #輸入:表達矩陣和分組信息 #輸出:差異分析結果、火山圖 #構建colData (condition存在於colData中,是表示分組的因子型變量) countData <- floor(dat) colData <- data.frame(row.names =colnames(countData), condition=group_list) dds <- DESeqDataSetFromMatrix( countData = countData, colData = colData, design = ~ condition) dds <- DESeq(dds) # 兩兩比較 res <- results(dds, contrast = c("condition",Typel,"Normal")) resOrdered <- res[order(res$padj),] # 按照P值排序 DEG <- as.data.frame(resOrdered) DEG <- na.omit(DEG) #添加change列標記基因上調下調,在DESeq里FDR就是pdaj,所以要把pvalue改成pdaj #logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) ) logFC_cutoff <- 1 DEG$change = as.factor( ifelse(DEG$padj < 0.01 & abs(DEG$log2FoldChange) > logFC_cutoff, ifelse(DEG$log2FoldChange >= logFC_cutoff ,'UP','DOWN'),'NOT') ) head(DEG) boxplot(as.numeric(countData[rownames(DEG)[1],])~group_list) # Heatmap熱圖 choose_gene=head(rownames(DEG),100) ## 選定部分差異基因 choose_matrix=countData[choose_gene,] pheatmap::pheatmap(choose_matrix) annotation_col = data.frame( group = factor(group_list)) rownames(annotation_col) = colnames(countData) pheatmap::pheatmap(choose_matrix, scale = "row", annotation_col = annotation_col) # 火山圖 library(ggplot2) this_tile <- paste0('Cutoff for log2FoldChange is ',round(logFC_cutoff,3), 'nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) , 'nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',]) ) p1 = ggplot(data=DEG, aes(x=log2FoldChange, y=-log10(padj),color=change)) + geom_point(alpha=0.4, size=3.5) + theme_set(theme_set(theme_bw(base_size=20)))+ xlab("log2 fold change") + ylab("-log10 padj") + geom_vline(xintercept=c(-logFC_cutoff,logFC_cutoff),lty=4,col="black",lwd=0.8) + geom_hline(yintercept = -log10(0.05),lty=4,col="black",lwd=0.8) + ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+ scale_colour_manual(values = c('blue','grey','red')) ## corresponding to the levels(res$change) p1 ggsave(p1, filename = paste0(pro,"deseq2_vocalno.png"), width = 10, height = 7 ) save(DEG,p1,file = paste0(pro,"_deseq2_DEG.Rdata")) }文末友情宣傳 強烈建議你推薦給身邊的博士後以及年輕生物學PI,多一點數據認知,讓他們的科研上一個台階: