# Merge a TCGA expression matrix with a GEO expression matrix, remove the batch
# effect between the two cohorts with ComBat, and export both the corrected
# matrices and the subset of genes listed in a module gene file.
#
# Inputs (read from the working directory set below):
#   tcgaExpFile  tab-delimited table, first column = gene ID, other columns = samples
#   geoExpFile   tab-delimited table, same layout
#   geneFile     tab-delimited table without header, first column = gene ID
# Outputs (written to the working directory):
#   TCGA.normalize.txt, GEO.normalize.txt  batch-corrected matrices
#   TCGA.share.txt, GEO.share.txt          batch-corrected module genes only

# One-time installation of the required Bioconductor packages:
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("limma")

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("sva")

# Load packages
library(limma)
library(sva)

tcgaExpFile <- "TCGA.TPM.txt"      # TCGA expression data file
geoExpFile <- "geoMatrix.txt"      # GEO expression data file
geneFile <- "module_brown.txt"     # gene list file
# Set the working directory.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
setwd("C:\\biowolf\\immWGCNA\\15.intersect")


# Read a tab-delimited expression table (first column = gene ID, remaining
# columns = samples) and return a numeric genes x samples matrix in which rows
# sharing the same gene ID have been averaged with avereps().
readExpMatrix <- function(file) {
  rt <- read.table(file, header = TRUE, sep = "\t", check.names = FALSE)
  if (ncol(rt) < 2) {
    stop("Expected a gene ID column plus at least one sample column in ",
         file, call. = FALSE)
  }
  geneIds <- as.character(rt[[1]])
  sampleIds <- names(rt)[-1]
  # Convert each sample column to numeric directly instead of routing the
  # whole table through a character matrix first.
  toNumeric <- function(col) {
    if (is.numeric(col)) as.numeric(col) else as.numeric(as.character(col))
  }
  values <- unlist(lapply(unname(as.list(rt))[-1], toNumeric), use.names = FALSE)
  expr <- matrix(values, nrow = nrow(rt), dimnames = list(geneIds, sampleIds))
  avereps(expr)
}

# Write a matrix as a tab-delimited table whose first row is "ID" followed by
# the column names and whose first column holds the row names.
writeExpTable <- function(mat, file) {
  tab <- rbind(ID = colnames(mat), mat)
  write.table(tab, file = file, sep = "\t", quote = FALSE, col.names = FALSE)
}


# Read the TCGA expression file and preprocess it
tcga <- readExpMatrix(tcgaExpFile)
tcga <- log2(tcga + 1)

# Remove normal samples: keep only columns whose 4th barcode field starts
# with "0". `%in%` treats barcodes that have no 4th field as "not tumour"
# instead of producing NA column indices.
sampleField <- vapply(strsplit(colnames(tcga), "\\-"),
                      function(parts) parts[4], character(1))
isTumor <- substr(sampleField, 1, 1) %in% "0"
if (!any(isTumor)) {
  stop("No tumour samples found in ", tcgaExpFile, call. = FALSE)
}
tcga <- tcga[, isTumor, drop = FALSE]

# Shorten each barcode to its first three fields and average the columns that
# end up with the same shortened ID.
colnames(tcga) <- gsub("(.*?)\\-(.*?)\\-(.*?)\\-.*", "\\1\\-\\2\\-\\3", colnames(tcga))
tcga <- t(avereps(t(tcga)))

# Read the GEO expression file and preprocess it
geo <- readExpMatrix(geoExpFile)

# If the GEO data do not look log2-transformed (judged from their quantiles),
# clamp negative values to 0 and apply log2(x + 1).
qx <- as.numeric(quantile(geo, c(0, 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = TRUE))
needsLog <- (qx[5] > 100) || ((qx[6] - qx[1]) > 50 && qx[2] > 0)
if (needsLog) {
  geo[geo < 0] <- 0
  geo <- log2(geo + 1)
}
geo <- normalizeBetweenArrays(geo)

# Intersect the genes and extract their expression from both matrices
sameGene <- intersect(rownames(tcga), rownames(geo))
if (length(sameGene) == 0) {
  stop("TCGA and GEO matrices share no gene IDs; nothing to batch-correct",
       call. = FALSE)
}
tcgaOut <- tcga[sameGene, , drop = FALSE]
geoOut <- geo[sameGene, , drop = FALSE]

# Batch correction (batch 1 = TCGA, batch 2 = GEO)
merged <- cbind(tcgaOut, geoOut)
tcgaCols <- seq_len(ncol(tcgaOut))
geoCols <- ncol(tcgaOut) + seq_len(ncol(geoOut))
batchType <- c(rep(1, length(tcgaCols)), rep(2, length(geoCols)))
outTab <- ComBat(merged, batchType, par.prior = TRUE)

# Split the corrected matrix back by column position, so a sample ID that
# occurs in both cohorts (or twice in one) cannot select the wrong column.
# Negative corrected values are clamped to 0.
tcgaOut <- outTab[, tcgaCols, drop = FALSE]
tcgaOut[tcgaOut < 0] <- 0
geoOut <- outTab[, geoCols, drop = FALSE]
geoOut[geoOut < 0] <- 0

# Write the corrected data
writeExpTable(tcgaOut, "TCGA.normalize.txt")
writeExpTable(geoOut, "GEO.normalize.txt")

# Extract the expression of the module genes
gene <- read.table(geneFile, header = FALSE, sep = "\t", check.names = FALSE)
moduleGene <- intersect(as.vector(gene[, 1]), rownames(tcgaOut))
if (length(moduleGene) == 0) {
  warning("None of the genes in ", geneFile,
          " are present in the corrected matrices", call. = FALSE)
}
# drop = FALSE keeps a one-gene result as a matrix so the ID header row is
# still written.
tcgaShareExp <- tcgaOut[moduleGene, , drop = FALSE]
geoShareExp <- geoOut[moduleGene, , drop = FALSE]

# Write the expression of the module genes
writeExpTable(tcgaShareExp, "TCGA.share.txt")
writeExpTable(geoShareExp, "GEO.share.txt")