#----Preliminaries---- # READ ME #Format your data in excel similar to the supplied Sample Data file (.xlsx) with samples as rows and measured variables as columns #Column A: "Sample" in A1; the rest of your A cells should have your sample labels; label background samples "Background" in the end rows #Column B: your variable of interest to be regressed against, the response variable; B1 should have the name of the response variable (i.e. Iba1); # the rest of the B cells have measured values of the variable #Column C and onward: the first row should have the label, the following rows should have measured values of predictor variables (i.e. cytokines or phospho-proteins) rm(list=ls()) #Remove all variables from current environment #User Input (Change this so it fits for your data) dataFileName="MyData.xlsx" #the name of your data file, which should be saved in the same folder as this R file SwapAxes=FALSE #If you want to swap LV1 and LV2, make this TRUE, otherwise FALSE; must capitalize #Set working directory and load needed libraries setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #Set working directory to the folder where the code file resides if(!"pacman" %in% installed.packages()){install.packages("pacman")} pacman::p_load(tidyverse,rio,gplots,matrixStats,readxl,heatmap3,ggplot2,plsRglm,RColorBrewer,stats) #loads and installs all needed packages #Set heatmap3 color bar parameters breakBarColors=c(-200,seq(-1.5, 1.5, 0.01),200) #Outside numbers clip outliers. This is for zscoring. barColors = colorpanel(length(breakBarColors)-1, "blue", "white", "red2") #Define function for making color bars (taken from http://www.colbyimaging.com/wiki/statistics/color-bars) color.bar = function(lut, min, max=-min, nticks=11, ticks=seq(min, max, len=nticks), title='') { scale = (length(lut)-1)/(max-min) plot(c(0,10), c(min,max), type='n', bty='n', xaxt='n', xlab='', yaxt='n', ylab='', main=title) axis(2, ticks, las=1) for (i in 1:(length(lut)-1)) { y = (i-1)/scale + min rect(0,y,10,y+1/scale, col=lut[i], border=NA) } } #----Read Data and Subtract Background---- df <- import(dataFileName) dataBackground <- df %>% filter(Sample=="Background") %>% select(3:ncol(df)) %>% as.matrix() dataPredictorVariablesIn <- df %>% filter(Sample!="Background") %>% select(3:ncol(df)) %>% as.matrix() dataResponseVariable <- df %>% filter(Sample!="Background") %>% select(2) %>% as.matrix() #subtract background from predictor variables dataPredictorVariables=dataPredictorVariablesIn for (i in 1:ncol(dataPredictorVariables)) { dataPredictorVariables[,i]=dataPredictorVariables[,i]-mean((dataBackground[,i])) #subtract each column of data (each cytokine) by the mean of that cytokine's background values } dataPredictorVariables[dataPredictorVariables<0]=0; #if value is below background, set to zero rownames(dataPredictorVariables)=df$Sample[which(df$Sample!="Background")] dataPredictorVariablesZ=apply(dataPredictorVariables,2,scale); #z-score the data dataResponseVariableZ=apply(dataResponseVariable,2,scale); #z-score response variable (for use in heatmap) rownames(dataPredictorVariablesZ)=rownames(dataPredictorVariables) #----PLS---- #Remove constant columns and then run the PLS indConstantColumns=colSds(dataPredictorVariablesZ)==0 plsOut=plsRglm(dataY=dataResponseVariable,dataX=dataPredictorVariablesZ[,!indConstantColumns],2,modele="pls", scaleX = TRUE) P=varimax(plsOut$pp) #Use varimax to rotate the LVs #Return the loadings with missing values set to 0 P1=matrix(0,ncol = 1, nrow=dim(dataPredictorVariablesZ)[2]) P2=matrix(0,ncol = 1, nrow=dim(dataPredictorVariablesZ)[2]) P1[!indConstantColumns]=P$loadings[,1] P2[!indConstantColumns]=P$loadings[,2] rownames(P1)=colnames(dataPredictorVariablesZ) rownames(P2)=colnames(dataPredictorVariablesZ) rotSave=P$rotmat scores = plsOut$tt %*% rotSave T1=scores[,1] T2=scores[,2] if(SwapAxes==TRUE){ P1[!indConstantColumns]=P$loadings[,2] P2[!indConstantColumns]=P$loadings[,1] T1=scores[,2] T2=scores[,1] } #Scores plot myPalette = colorRampPalette(rev(brewer.pal(11, "Spectral"))) sc = scale_colour_gradientn(colours = barColors, limits=c(min(dataResponseVariable), max(dataResponseVariable))) ScoresPlot=data.frame(T1,T2,(dataResponseVariable)) colnames(ScoresPlot)=c("T1","T2","ResponseVariable") pdf("ScoresPlot.pdf", width=5,height=4,useDingbats=FALSE, pointsize = 18) ggplot(ScoresPlot, aes(x=T1, y=T2, color=ResponseVariable)) + geom_vline(xintercept=0)+ geom_hline(yintercept=0)+ geom_point( size=6)+ sc+ xlab("Scores on LV1")+ ylab("Scores on LV2")+ ggtitle("PLSR Scores")+ xlim(-1.1*max(abs(T1)),1.1*max(abs(T1)))+ ylim(-1.1*max(abs(T2)),1.1*max(abs(T2)))+ theme(panel.background = element_rect(fill = 'white',colour='black'), text = element_text(size=20), panel.border=element_blank(), plot.title = element_text(hjust = 0.5), axis.text = element_text(color = "black"), axis.ticks=element_line(color="black") )+ labs(color = "") dev.off() #LV1 versus Response Variable graph to determine acceptability of regression LV1Regression=data.frame(T1,(dataResponseVariable)) colnames(LV1Regression)=c("T1","ResponseVariable") linmodel=lm(ResponseVariable~T1,LV1Regression) RSQ=summary(linmodel)$r.squared #Bar plot the loadings P1Sort=sort(t(P1))/max(abs(P1)) indP1=sort(P1, index.return=TRUE)$ix pdf("LV1vsResponseVariable.pdf", width=4,height=4,useDingbats=FALSE, pointsize = 18) ggplot(LV1Regression, aes(x=T1, y=ResponseVariable)) + geom_smooth(method='lm',se=FALSE,color='red',linetype="longdash")+ geom_point(size=4)+ sc+ xlab("Scores on LV1")+ ylab("ResponseVariable")+ ggtitle("PLSR")+ xlim(1.03*min(T1),1.03*max(T1))+ ylim(0.97*min(dataResponseVariable),1.03*max(dataResponseVariable))+ annotate("text",size=5, x=0.8*max(T1)+0.2*min(T1), y=min(dataResponseVariable), label=bquote({R^2}[PLS]==.(round(RSQ,digits=2))))+ theme(panel.background = element_rect(fill = 'white'), plot.title=element_text(hjust=0.5), text = element_text(size=20), panel.border=element_blank(), axis.line=element_line(color='black'), axis.text.x = element_text(face = "bold", color = "black", size = 18), axis.text.y = element_text(face = "bold", color = "black", size = 16))+ labs(color = "") dev.off() pdf("LV1_Loadings.pdf", width=10,height=5,pointsize = 18) barplot(P1Sort, main = "Signals in LV1", xlab = "", ylab = "", names.arg = rownames(P1)[indP1], col = "lightblue", horiz = FALSE, las=2, ylim=c(-1,1)) dev.off() P2Sort=sort(t(P2))/max(abs(P2)) indP2=sort(P2, index.return=TRUE)$ix pdf("LV2_Loadings.pdf", width=10,height=5,pointsize = 18) barplot(P2Sort, main = "Signals in LV2", xlab = "", ylab = "", names.arg = rownames(P2)[indP2], col = "lightblue", horiz = FALSE, las=2, ylim=c(-1,1)) dev.off() #Sort data based on Response Variable values (used in displaying heatmap) ResponseVariableSort=sort(dataResponseVariableZ, decreasing=FALSE, index.return=TRUE) indSort=ResponseVariableSort$ix dataPredictorVariablesZ_sort=dataPredictorVariablesZ[indSort,] dataResponseVariableZ_sort=dataResponseVariableZ[indSort] #Heatmap with rows sorted by response variable and columns sorted by LV1 dataHeatmap_P1sort=data.frame(dataPredictorVariablesZ_sort[,indP1],dataResponseVariableZ_sort) ColLabels=colnames(dataHeatmap_P1sort) pdf("Heatmap_LV1sort.pdf", width=7,height=5,pointsize = 10) heatmap3(dataHeatmap_P1sort, col=barColors, breaks=breakBarColors,legendfun=function()showLegend(legend=c(NA),col=c(NA),cex=2.5), Rowv=NA, Colv=NA, scale="none", cexCol=1,cexRow=1.1, margins=c(20,2), highlightCell=data.frame(rep(1:dim(dataPredictorVariables)[1],each=(dim(dataPredictorVariables)[2]+1)),rep(1:(dim(dataPredictorVariables)[2]+1),times=dim(dataPredictorVariables)[1]),'black'), labCol=ColLabels ) dev.off() #----LOOCV---- LOOCV_runs=200 randomSample=sample(1:dim(dataPredictorVariables)[1],LOOCV_runs,replace=TRUE) #creates randomized array of sample indices #Create blank matrices to record scores/loadings from each iteration Load_LV1_LOOCV=matrix(0, nrow=LOOCV_runs, ncol=dim(dataPredictorVariables)[2]) Load_LV2_LOOCV=matrix(0, nrow=LOOCV_runs, ncol=dim(dataPredictorVariables)[2]) #iterations of PLSR for (i in 1:LOOCV_runs) { leftOut=randomSample[i] #index of sample to leave out dataPredictorVariablesLOOCVZ=dataPredictorVariablesZ[-leftOut,] dataResponseVariableLOOCV=dataResponseVariable[-leftOut] #Conduct PLSR, store loadings indConstantColumns=colSds(dataPredictorVariablesLOOCVZ)<1e-10 #Remove constant columns and then run the PLS plsOut_LOOCV=plsRglm(dataY=dataResponseVariableLOOCV,dataX=dataPredictorVariablesLOOCVZ[,!indConstantColumns],nt=2,modele="pls", scaleX = TRUE,EstimXNA=TRUE) P$loadings=plsOut_LOOCV$pp%*% rotSave #Return the loadings with missing values set to 0 Load_LV1_LOOCV[i,!indConstantColumns]=P$loadings[,1] Load_LV2_LOOCV[i,!indConstantColumns]=P$loadings[,2] if(SwapAxes==TRUE){ Load_LV1_LOOCV[i,!indConstantColumns]=P$loadings[,2] Load_LV2_LOOCV[i,!indConstantColumns]=P$loadings[,1] } } #compute mean and standard deviation for stored loadings over all iterations P1_LOOCV=colMeans(Load_LV1_LOOCV) P2_LOOCV=colMeans(Load_LV2_LOOCV) stdevLV1_load=colSds(Load_LV1_LOOCV)/max(abs(P1_LOOCV)) stdevLV2_load=colSds(Load_LV2_LOOCV)/max(abs(P2_LOOCV)) #Bar plot the loadings with LOOCV error bars P1Sort_LOOCV=sort(t(P1_LOOCV))/max(abs(P1_LOOCV)) indP1_LOOCV=sort(P1_LOOCV, index.return=TRUE)$ix ColLabels=colnames(dataPredictorVariables)[indP1_LOOCV] pdf("LV1_Loadings_SD.pdf", width=10,height=5,pointsize = 18) barCenters <- barplot(P1Sort_LOOCV, main = "Signals in LV1", xlab = "", ylab = "", names.arg = ColLabels, col = "lightblue", horiz = FALSE, las=2, ylim=c(-1.49,1.49)) segments(barCenters,P1Sort_LOOCV-stdevLV1_load[indP1_LOOCV],barCenters,P1Sort_LOOCV+stdevLV1_load[indP1_LOOCV]) arrows(barCenters,P1Sort_LOOCV-stdevLV1_load[indP1_LOOCV],barCenters,P1Sort_LOOCV+stdevLV1_load[indP1_LOOCV],code=3,angle=90,length=0.05) dev.off() P2Sort_LOOCV=sort(t(P2_LOOCV))/max(abs(P2_LOOCV)) indP2_LOOCV=sort(P2_LOOCV, index.return=TRUE)$ix ColLabels=colnames(dataPredictorVariables)[indP2_LOOCV] pdf("LV2_Loadings_SD.pdf", width=10,height=5,pointsize = 18) barCenters <- barplot(P2Sort_LOOCV, main = "Signals in LV2", xlab = "", ylab = "", names.arg = ColLabels, col = "lightblue", horiz = FALSE, las=2, ylim=c(-1.49,1.49)) segments(barCenters,P2Sort_LOOCV-stdevLV2_load[indP2_LOOCV],barCenters,P2Sort_LOOCV+stdevLV2_load[indP2_LOOCV]) arrows(barCenters,P2Sort_LOOCV-stdevLV2_load[indP2_LOOCV],barCenters,P2Sort_LOOCV+stdevLV2_load[indP2_LOOCV],code=3,angle=90,length=0.05) dev.off() #Heatmap with rows sorted by response variable and columns sorted by LV1 dataHeatmap_P1sort_LOOCV=data.frame(dataPredictorVariablesZ_sort[,indP1_LOOCV],dataResponseVariableZ_sort) ColLabels=colnames(dataHeatmap_P1sort_LOOCV) pdf("Heatmap_LV1sort_LOOCV.pdf", width=7,height=5,pointsize = 10) heatmap3(dataHeatmap_P1sort_LOOCV, col=barColors, breaks=breakBarColors,legendfun=function()showLegend(legend=c(NA),col=c(NA),cex=2.5), Rowv=NA, Colv=NA, scale="none", cexCol=1,cexRow=1.1, margins=c(20,2), highlightCell=data.frame(rep(1:dim(dataPredictorVariables)[1],each=(dim(dataPredictorVariables)[2]+1)),rep(1:(dim(dataPredictorVariables)[2]+1),times=dim(dataPredictorVariables)[1]),'black'), labCol=ColLabels ) dev.off() #Make the color-axis scale bar pdfPath="Scalebar_cytokines.pdf" pdf(pdfPath, width=2,height=8,pointsize = 14) color.bar(barColors,-1.5) dev.off()