# Steve Horvath: Estimating DNAm age. # This file assumes a data frame exists called dat1 whose rows correspond to CpGs # and whose first column reports the CpG identifier # and whose remaining columns corresponds to samples (e.g. Illumina arrays). fastImputation=FALSE #STEP 1: DEFINE QUALITY METRICS meanMethBySample =as.numeric(apply(as.matrix(dat1[,-1]),2,mean,na.rm=TRUE)) minMethBySample =as.numeric(apply(as.matrix(dat1[,-1]),2,min,na.rm=TRUE)) maxMethBySample =as.numeric(apply(as.matrix(dat1[,-1]),2,max,na.rm=TRUE)) datMethUsed= t(dat1[,-1]) colnames(datMethUsed)=as.character(dat1[,1]) noMissingPerSample=apply(as.matrix(is.na(datMethUsed)),1,sum) table(noMissingPerSample) #STEP 2: Imputing if (! fastImputation & nSamples>1 & max(noMissingPerSample,na.rm=TRUE)<3000 ){ # run the following code if there is at least one missing if ( max(noMissingPerSample,na.rm=TRUE)>0 ){ dimnames1=dimnames(datMethUsed) datMethUsed= data.frame(t(impute.knn(t(datMethUsed))$data)) dimnames(datMethUsed)=dimnames1 } # end of if } # end of if (! fastImputation ) if ( max(noMissingPerSample,na.rm=TRUE)>=3000 ) fastImputation=TRUE if ( fastImputation | nSamples==1 ){ noMissingPerSample=apply(as.matrix(is.na(datMethUsed)),1,sum) table(noMissingPerSample) if ( max(noMissingPerSample,na.rm=TRUE)>0 & max(noMissingPerSample,na.rm=TRUE) >= 3000 ) {normalizeData=FALSE} # run the following code if there is at least one missing if ( max(noMissingPerSample,na.rm=TRUE)>0 & max(noMissingPerSample,na.rm=TRUE) < 3000 ){ dimnames1=dimnames(datMethUsed) for (i in which(noMissingPerSample>0) ){ selectMissing1=is.na(datMethUsed[i,]) datMethUsed[i,selectMissing1] = as.numeric(probeAnnotation21kdatMethUsed$goldstandard2[selectMissing1]) } # end of for loop dimnames(datMethUsed)=dimnames1 } # end of if } # end of if (! fastImputation ) # STEP 3: Data normalization (each sample requires about 8 seconds). It would be straightforward to parallelize this operation. if (normalizeData ){ datMethUsedNormalized=BMIQcalibration(datM=datMethUsed,goldstandard.beta= probeAnnotation21kdatMethUsed$goldstandard2,plots=FALSE) } if (!normalizeData ){ datMethUsedNormalized=datMethUsed } rm(datMethUsed); gc() #STEP 4: Predict age and create a data frame for the output (referred to as datout) selectCpGsClock=is.element(dimnames(datMethUsedNormalized)[[2]], as.character(datClock$CpGmarker[-1])) if ( sum( selectCpGsClock) < dim(datClock)[[1]]-1 ) {stop("The CpGs listed in column 1 of the input data did not contain the CpGs needed for calculating DNAm age. Make sure to input cg numbers such as cg00075967.")} if ( sum( selectCpGsClock) > dim(datClock)[[1]]-1 ) {stop("ERROR: The CpGs listed in column 1 of the input data contain duplicate CpGs. Each row should report only one unique CpG marker (cg number).")} if (nSamples>1 ) { datMethClock0=data.frame(datMethUsedNormalized[,selectCpGsClock]) datMethClock= data.frame(datMethClock0[ as.character(datClock$CpGmarker[-1])]) dim(datMethClock) predictedAge=as.numeric(anti.trafo(datClock$CoefficientTraining[1]+as.matrix(datMethClock)%*% as.numeric(datClock$CoefficientTraining[-1]))) } # end of if if (nSamples==1 ) { datMethUsedNormalized2=data.frame(rbind(datMethUsedNormalized,datMethUsedNormalized)) datMethClock0=data.frame(datMethUsedNormalized2[,selectCpGsClock]) datMethClock= data.frame(datMethClock0[ as.character(datClock$CpGmarker[-1])]) dim(datMethClock) predictedAge=as.numeric(anti.trafo(datClock$CoefficientTraining[1]+as.matrix(datMethClock)%*% as.numeric(datClock$CoefficientTraining[-1]))) predictedAge=predictedAge[1] } # end of if # Let's add comments to the age prediction Comment=ifelse ( predictedAge <0, "Negative DNAm age.", ifelse ( predictedAge >100, "Old DNAm age.", rep("",length(predictedAge)))) Comment[is.na(predictedAge)]="Age prediction was not possible. " if ( sum( selectCpGsClock) < dim(datClock)[[1]]-1 ) { Comment=rep("ERROR: The CpGs listed in column 1 of the input data did not contain the CpGs needed for calculating DNAm age. Make sure to input cg numbers such as cg00075967.",length(predictedAge) )} if ( sum( selectCpGsClock) > dim(datClock)[[1]]-1 ) { Comment=rep("ERROR: The CpGs listed in column 1 of the input data contain duplicate CpGs. Each row should report only one unique CpG marker (cg number).",length(predictedAge) )} restSamples=-minMethBySample>0.05 | maxMethBySample>1.05; restSamples[is.na(restSamples)]=FALSE lab1="MAJOR WARNING: Probably you did not input beta values since either minMethBySample<-0.05 or maxMethBySample>1.05.";Comment[restSamples]= paste(Comment[restSamples],lab1) restSamples= noMissingPerSample >0 & noMissingPerSample <=100;lab1="WARNING: Some beta values were missing, see noMissingPerSample."; Comment[restSamples]= paste(Comment[restSamples],lab1) restSamples= noMissingPerSample >3000;lab1="MAJOR WARNING: More than 3k missing values!!"; Comment[restSamples]= paste(Comment[restSamples],lab1) restSamples= noMissingPerSample >100 & noMissingPerSample <=3000 ;lab1="MAJOR WARNING: noMissingPerSample>100" Comment[restSamples]= paste(Comment[restSamples],lab1) restSamples=meanMethBySample>.35; restSamples[is.na(restSamples)]=FALSE lab1="Warning: meanMethBySample is >0.35";Comment[restSamples]= paste(Comment[restSamples],lab1) restSamples=meanMethBySample<.25; restSamples[is.na(restSamples)]=FALSE; lab1="Warning: meanMethBySample is <0.25" Comment[restSamples]= paste(Comment[restSamples],lab1) datout=data.frame(SampleID=colnames(dat1)[-1], DNAmAge=predictedAge, Comment, noMissingPerSample,meanMethBySample, minMethBySample, maxMethBySample) if ( !is.null( meanXchromosome) ){ if ( length( meanXchromosome)==dim(datout)[[1]] ){ predictedGender=ifelse(meanXchromosome>.4,"female", ifelse(meanXchromosome<.38,"male","Unsure")) datout=data.frame(datout,predictedGender=predictedGender,meanXchromosome=meanXchromosome) } # end of if } # end of if