\documentclass[a4paper,10pt]{article} \usepackage{longtable} \usepackage[noams]{SIunits} \setlongtables \newcommand{\code}[1]{{\tt #1}} \newcommand{\Rfunc}[1]{{\tt #1}} \newcommand{\myincfig}[3]{% \begin{figure}[htbp] \begin{center} \includegraphics[width=#2]{#1} \caption{\label{#1}#3} \end{center} \end{figure} } \title{Microarray segmentation methods significantly influence data precision} \author{Ahmed Ahmed \and James D. Brenton} \begin{document} \maketitle \section{Data description} Expression microarrays containing 6528 pairs of duplicate cDNA spots were used (Cancer Research UK DNA Microarray Facility at the Institute of Cancer Research; CR-UK DMF Human 6.5k genome-wide array). All microarrays used were from the same printing batch. Total RNA was obtained from the cell line HCT116 and an isogenic daughter line with a targeted disruption of the \emph{EP300} gene derived by homologous recombination~(Iyer~\emph{et al}, submitted). Total RNA was used for reverse transcription and indirect labeling with Cy3 and Cy5 dyes (Amersham) using random hexamers. Measurements of the amount of purified cDNA and Cy3/Cy5 incorporation were made before hybridization using the Nanodrop ND-1000 spectrophotometer (Nanodrop Technologies, Inc.). Two sets of experiments (\emph{A} and \emph{B}) were carried out, using 6 slides in each with a balanced dye-swap design (three slides for each dye). Experiment~\emph{A} and \emph{B} were identical but used \unit{10}{\micro\gram} and \unit{15}{\micro\gram} total RNA for labelling for each hybridization. 
The names of the slides and the dyes used for each slide are as follows:
\begin{longtable}{rrrr}
Experiment & Slide ID & p300 knock-out & HCT116\\
\hline \\
A& 3084 & Cy3 & Cy5\\
& 3089 & Cy3 & Cy5\\
& 3097 & Cy3 & Cy5\\
& 3092 & Cy5 & Cy3\\
& 3093 & Cy5 & Cy3\\
& 3069 & Cy5 & Cy3\\
\\
B& 3025 & Cy3 & Cy5\\
& 3033 & Cy3 & Cy5\\
& 3054 & Cy3 & Cy5\\
& 3034 & Cy5 & Cy3\\
& 3036 & Cy5 & Cy3\\
& 3037 & Cy5 & Cy3\\
\end{longtable}
Segmentation was performed using QuantArray (Perkin Elmer) and GenePix
Pro 4.1 (Axon Instruments, Inc.) software. All three methods of
segmentation available within the QuantArray package were evaluated.
When using the segmentation software, channel 1 was always assigned to
cy3 regardless of the sample, knock-out or wild type, used. The suffixes
A, F, H and G were used, following the slide ID, to name the raw data
files generated by the adaptive, fixed circle, histogram and genepix
segmentation methods respectively. All statistical analysis was
conducted using the R environment~\cite{Ihaka1996} and the R package
``Statistics for Microarray Analysis''~\cite{Dudoit2002}. We start the
analysis by loading the sma library and then reading the data files into
R.
<<>>=
library(sma)
for(i in 1:length(dir(pattern="....[AFHG]"))){
data <- read.table(dir(pattern="....[AFHG]")[i], sep='\t', header= TRUE)
assign(dir(pattern="....[AFHG]")[i], data)
}
@
We generate objects according to the experiment used, A or B, and the
segmentation method.
For experiment A we build \Rfunc{expA.A}, \Rfunc{expA.F}, \Rfunc{expA.H}, \Rfunc{expA.G} for adaptive, fixedcircle, histogram and genepix respectively: <<>>= expA.A <- c( "3084A", "3089A", "3097A", "3092A", "3093A", "3069A") expA.F <- c( "3084F", "3089F", "3097F", "3092F", "3093F", "3069F") expA.H <- c( "3084H", "3089H", "3097H", "3092H", "3093H", "3069H") expA.G <- c( "3084G" , "3089G", "3097G", "3092G", "3093G", "3069G" ) @ We do the same for experiment B: <<>>= expB.A <- c( "3025A", "3033A", "3054A", "3034A", "3036A", "3037A") expB.F <- c( "3025F", "3033F", "3054F", "3034F", "3036F", "3037F") expB.H <- c( "3025H", "3033H", "3054H", "3034H", "3036H", "3037H") expB.G <- c( "3025G", "3033G", "3054G", "3034G", "3036G", "3037G") @ Note that the first half of each of the vector of names represents slides where the test in the experiment, in this case the knock-out clone, was labeled by cy3 and the the control, in this case the wild type clone, labeled by cy5. \section{Statistical methods} Log intensity ratios for each spot were obtained, as a measure of the differential expression between the samples, with and without background subtraction. All spots from each microarray were included in the analysis. Data precision was initially assessed by using correlation coefficients. \subsection{The \Rfunc{get.cor} function} To allow for iterating over the entire data set and to investigate the effect of various normalization methods and background subtraction we use a function \Rfunc{get.cor}. \subsubsection{Description} This function extracts the log2 ratios for all the slides used as well as correlations between replicates within each slide and between different slides. \subsubsection{Usage} get.cor(x, BG.subtract=TRUE, normalization) \subsubsection{Arguments} \begin{longtable}{rp{8cm}} x: & a vector the names of the raw data objects.\\\\ BG.subtract: & a logical argument meaning; `should the back ground values be subtracted?'. 
\\\\
normalization: & the method of normalization to be used; `n' for no
normalization, `s' for scaled normalization, `m' for median
normalization, `p' for print-tip group lowess normalization and `l' for
global `lowess' normalization.\\
\end{longtable}
\subsubsection{Details}
In the function we build a list containing the raw data R, red=Channel
2, Rb, background for channel 2, G, green= Channel 1, and Gb, background
for channel 1. Note that, for consistency, we have always put cy3, G, in
Channel 1 regardless of the sample used. Also note that by not
specifying the background data, Rb and Gb, no background subtraction is
performed. We then specify the grid information for the images as
follows; nspot.r= the number of rows in a block, nspot.c= the number of
columns within a block, ngrid.r= the number of rows of blocks and
ngrid.c= the number of columns of blocks. We use the normalization
function in the sma package. We then correct the sign for the ratios for
the first half of the elements in (x) to account for the dye-swapping.
We assign the ratios of all the slides to an object called M. Note that
the number of columns in M represents the number of elements in (x). In
other words, the number of slides tested. We define index objects rep.a
and rep.b for the first and second replicates in each slide. To obtain
the within-slide correlations we compute the correlation coefficients
using the \Rfunc{cor} function in R. For between-slide correlations we
obtain the correlations for M using the \Rfunc{cor} function.
\subsubsection{Value}
\begin{longtable}{rp{8cm}}
within: & a vector containing the within-slide correlations.\\\\
between: & a vector containing the between-slide correlations.\\\\
ratios: & a data frame containing the log2 ratios of knock-out over wild-type for each spot in a slide.
The number of columns corresponds to the number of slides and the number
of rows to the total number of spots in a slide.\\
\end{longtable}
\subsubsection{Script}
<<>>=
## For the slides named in x, assemble the raw channel intensities,
## compute normalized log2 ratios with sma's stat.ma, and return the
## within-slide and between-slide correlation coefficients together
## with the ratio matrix.
get.cor <- function(x, BG.subtract=TRUE, normalization){
    correlations <- list (within=NULL, between= NULL, ratios= NULL)
    res <- list(R = NULL, G = NULL, Rb = NULL, Gb = NULL)
    ## Channel 2 is red (Cy5), channel 1 is green (Cy3).  Leaving Rb and
    ## Gb unset makes stat.ma skip background subtraction.
    if (BG.subtract==TRUE) {
        for(i in (1:length(x))){
            y<-get(x[i])
            res$R <- cbind(res$R,y$ch2.Intensity)
            res$G <- cbind(res$G,y$ch1.Intensity)
            res$Rb <- cbind(res$Rb,y$ch2.Background)
            res$Gb <- cbind(res$Gb,y$ch1.Background)
        }
    } else {
        for(i in (1:length(x))){
            y<-get(x[i])
            res$R <- cbind(res$R,y$ch2.Intensity)
            res$G <- cbind(res$G,y$ch1.Intensity)
        }
    }
    ## Grid layout of the CR-UK 6.5k arrays: 12x2 blocks of 16x34 spots.
    data.grid<-list(nspot.r=16,nspot.c=34,ngrid.r=12,ngrid.c=2)
    data.ma<-stat.ma(res,data.grid,norm=normalization)
    ## The first half of x holds the dye-swapped slides: flip the sign of
    ## their log2 ratios so every column measures knock-out over wild type.
    halfn= length(x)/2
    data.ma$M[,c(1:halfn)]<- (data.ma$M[,c(1:halfn)])*-1
    M<-data.ma$M
    ## Duplicate spots lie in alternating runs of 544 rows (one 16x34
    ## block), repeated 12 times down the slide.
    rep.a <- rep(c(TRUE,FALSE),12,each=544)
    rep.b <- rep(c(FALSE,TRUE),12,each=544)
    cors<-NULL
    for(i in 1:ncol(M)){
        y<-cor(M[rep.a,i],M[rep.b,i],use='pairwise.complete.obs')
        cors<-c(cors,y)
    }
    cors.between <- cor(M,use='pairwise.complete.obs')
    ## Keep each pair-wise slide comparison once (strict upper triangle).
    upper.diagonal <- col(cors.between)>row(cors.between)
    cors.between <- cors.between[upper.diagonal]
    correlations$within <- c( correlations$within, cors)
    correlations$between <- c( correlations$between, cors.between)
    ## was `correlations$ratio' -- worked only through partial matching
    ## of the `$' operator; use the exact component name.
    correlations$ratios <- cbind( correlations$ratios, M)
    correlations
}
@
\subsection{Obtaining the data}
We then obtain the data for the different methods of segmentation for
experiment A: First, with background subtraction and no normalization:
<<>>=
dataA.A.BGS.n <- get.cor ( expA.A, BG.subtract=TRUE,
normalization='n')
dataA.F.BGS.n <- get.cor ( expA.F, BG.subtract=TRUE,
normalization='n')
dataA.H.BGS.n <- get.cor ( expA.H, BG.subtract=TRUE,
normalization='n')
dataB.A.BGS.n <- get.cor ( expB.A, BG.subtract=TRUE,
normalization='n')
dataB.F.BGS.n <- get.cor ( expB.F, BG.subtract=TRUE,
normalization='n')
dataB.H.BGS.n <- get.cor ( expB.H, BG.subtract=TRUE,
normalization='n') @ Then without background subtraction <<>>= dataA.A.NoBGS.n <- get.cor ( expA.A, BG.subtract=FALSE, normalization='n') dataA.F.NoBGS.n <- get.cor ( expA.F, BG.subtract=FALSE, normalization='n') dataA.H.NoBGS.n <- get.cor ( expA.H, BG.subtract=FALSE, normalization='n') dataB.A.NoBGS.n <- get.cor ( expB.A, BG.subtract=FALSE, normalization='n') dataB.F.NoBGS.n <- get.cor ( expB.F, BG.subtract=FALSE, normalization='n') dataB.H.NoBGS.n <- get.cor ( expB.H, BG.subtract=FALSE, normalization='n') @ Then we repeat all the process with scaled normalization: <<>>= dataA.A.BGS.s <- get.cor ( expA.A, BG.subtract=TRUE, normalization='s') dataA.F.BGS.s <- get.cor ( expA.F, BG.subtract=TRUE, normalization='s') dataA.H.BGS.s <- get.cor ( expA.H, BG.subtract=TRUE, normalization='s') dataB.A.BGS.s <- get.cor ( expB.A, BG.subtract=TRUE, normalization='s') dataB.F.BGS.s <- get.cor ( expB.F, BG.subtract=TRUE, normalization='s') dataB.H.BGS.s <- get.cor ( expB.H, BG.subtract=TRUE, normalization='s') dataA.A.NoBGS.s <- get.cor ( expA.A, BG.subtract=FALSE, normalization='s') dataA.F.NoBGS.s <- get.cor ( expA.F, BG.subtract=FALSE, normalization='s') dataA.H.NoBGS.s <- get.cor ( expA.H, BG.subtract=FALSE, normalization='s') dataB.A.NoBGS.s <- get.cor ( expB.A, BG.subtract=FALSE, normalization='s') dataB.F.NoBGS.s <- get.cor ( expB.F, BG.subtract=FALSE, normalization='s') dataB.H.NoBGS.s <- get.cor ( expB.H, BG.subtract=FALSE, normalization='s') @ \subsection{Extracting the correlations to data frames} We then build a data frame of the results of within-slide correlations and the identifiers as follows; the first column contains the correlation values, the second contains the segmentation method and the third contains the DNA content whether Low, in experiment A, or High, in experiment B: <<>>= Results.within.BGS <- data.frame( c(dataA.A.BGS.n$within, dataA.F.BGS.n$within, dataA.H.BGS.n$within, dataB.A.BGS.n$within, dataB.F.BGS.n$within, dataB.H.BGS.n$within), 
factor(rep(c('A','F','H'), each=6, 2)),
factor(rep(c('Low', 'High'), each=18) ))
names(Results.within.BGS) <- c('Correlations', 'Method', 'DNA')
@
And with no background subtraction:
<<>>=
Results.within.NoBGS <- data.frame(
c(dataA.A.NoBGS.n$within, dataA.F.NoBGS.n$within,
dataA.H.NoBGS.n$within, dataB.A.NoBGS.n$within,
dataB.F.NoBGS.n$within, dataB.H.NoBGS.n$within),
factor(rep(c('A','F','H'), each=6, 2)),
factor(rep(c('Low', 'High'), each=18) ))
names(Results.within.NoBGS) <- c('Correlations', 'Method', 'DNA')
@
We do the same for between-slide correlations and we add a fourth column
to identify whether the dyes used for the two slides were the same, S,
or different, D. Note that for between-slide comparisons we used
normalized data:
<<>>=
Results.between.BGS <- data.frame(
c(dataA.A.BGS.s$between, dataA.F.BGS.s$between,
dataA.H.BGS.s$between, dataB.A.BGS.s$between,
dataB.F.BGS.s$between, dataB.H.BGS.s$between),
factor(rep(c('A','F','H'), each=15, 2)),
factor(rep(c('Low', 'High'), each=45)),
factor(rep(c('S','D','S','D','S'), c(3,6,1,3,2))) )
names(Results.between.BGS) <- c('Correlations', 'Method', 'DNA', 'Dye')
Results.between.NoBGS <- data.frame(
c(dataA.A.NoBGS.s$between, dataA.F.NoBGS.s$between,
dataA.H.NoBGS.s$between, dataB.A.NoBGS.s$between,
dataB.F.NoBGS.s$between, dataB.H.NoBGS.s$between),
factor(rep(c('A','F','H'), each=15, 2)),
factor(rep(c('Low', 'High'), each=45)),
factor(rep(c('S','D','S','D','S'), c(3,6,1,3,2))) )
names(Results.between.NoBGS) <- c('Correlations', 'Method', 'DNA', 'Dye')
@
\section{Segmentation method significantly influences within-slide
correlations}
To investigate whether the segmentation method was an important
determinant of within-slide correlations, we first plot the within-slide
correlations data categorized by the segmentation method. We use a
function to plot the medians of the different categories on the dot
plots as follows.
<<>>= stripchart.plot.median.by.factor <- # plot median bars with "jitter" function(x, y, z){ for (i in 1:length(x)) { lines(c(x[i] - z, x[i] + z), c(y[i], y[i])) } } @ \begin{center} <>= op<-par(mfrow=c(1,2), las=1) stripchart(Results.within.BGS$Correlations[1:18] ~ Results.within.BGS$Method[1:18], ylab="Correlation coefficient", ylim=c(0,0.8), xlim=c(0.5,3.5), pch=1, vertical=T, method="stack", offset=1, main="With background subtracted") stripchart.plot.median.by.factor(1:3, as.vector(by(Results.within.BGS$Correlations[1:18], Results.within.BGS$Method[1:18], median)), 0.25) stripchart(Results.within.NoBGS$Correlations[1:18] ~ Results.within.NoBGS$Method[1:18], ylab="Correlation coefficient", ylim=c(0,0.8), xlim=c(0.5,3.5), pch=1, vertical=T, method="stack", offset=1, main="With no background subtracted") stripchart.plot.median.by.factor(1:3, as.vector(by(Results.within.NoBGS$Correlations[1:18], Results.within.NoBGS$Method[1:18], median, na.rm=F)), 0.25) @ \end{center} We then turn the graphics window off. <<>>= dev.off() @ The dot plot shows differences in the correlations between the three methods of segmentation, A, F and H. To investigate whether these differences were significant we used a one-way ANOVA which is suitable for investigating data grouped into more than two categories. Applying a simple t test would not account for the problem of multiple testing of differences between the means of categories (A$-$F, A$-$H and F$-$H). First, one-way ANOVA analysis was performed on the background subtracted within-slide correlations for experiment A (first 18 rows): <<>>= ANOVA.within.one.BGS <- lm (Correlations[1:18] ~ Method[1:18], Results.within.BGS) summary(ANOVA.within.one.BGS) @ We found a significant difference ($p<0.001$) between the three methods of segmentation. Note that the R-squared test equals $0.84$ indicating a good fit. We also perform a diagnostic QQ plot to examine the fitness of the model to the data. We find a reasonable fit of the quantiles. 
\begin{center} <>= qqnorm(ANOVA.within.one.BGS$res) @ \end{center} We then use the Tukey HSD test to investigate the significance level at each level of comparison. <<>>= TukeyHSD(aov(Correlations[1:18] ~ Method[1:18], Results.within.BGS)) @ We found that the confidence intervals at each level of comparison do not include zero indicating that the Null hypothesis of equal category means is rejected. We then do the same analysis on the data with no background subtraction and do the diagnostic plot. We find a reasonable fit for the quantiles. <<>>= ANOVA.within.one.NoBGS <- lm (Correlations[1:18] ~ Method[1:18], Results.within.NoBGS) summary(ANOVA.within.one.NoBGS) @ \begin{center} <>= qqnorm(ANOVA.within.one.NoBGS$res) @ \end{center} <<>>= TukeyHSD(aov(Correlations[1:18] ~ Method[1:18], Results.within.NoBGS)) @ The overall difference between the methods remained ($p<0.001$) although the difference between the histogram and fixed circle method was no longer significant. \section{Histogram segmentation gives lower pixel-to-pixel variability} We hypothesized that the better precision for the histogram method was because of less variability in pixel intensity, as fluctuations in pixel values have been shown to increase noise~\cite{Brown2001}. The histogram method summarizes centiles of pixel intensities obtained from a square centered around the true spot. From this it follows that a narrow window of centiles will reduce the within-spot variability as compared to the other methods of segmentation used here. We therefore calculated the coefficient of variability (CV) for foreground and background pixels for each feature in experiments~\emph{A} and \emph{B} in both Cy3 and Cy5 channels. 
\subsection{Categorizing the data} In order to obtain the CV values we created a data object containing all the names of the raw data objects created by the three different methods of quantarray segmentation: <<>>= All.slides.methods <- c( expA.A, expA.F, expA.H, expB.A, expB.F, expB.H) @ We build the identifier for the interaction between the method of segmentation and the DNA content: <<>>= Method.content <- factor(rep(c('LowA', 'LowF', 'LowH', 'HighA', 'HighF', 'HighH'), each=6)) @ \subsection{The \Rfunc{get.cv} function} \subsubsection{Description} This is a function to compute the spot CV values. \subsubsection{Usage} get.cv(x) \subsubsection{Arguments} \begin{longtable}{rp{8cm}} x: & an object that has the names of the raw data objects.\\ \end{longtable} \subsubsection{Details} We first get the median coefficient of variability (CV) values for each slide. First the median CV for cy3, channel 1, by dividing the standard deviation (Std.Dev) over the mean intensity derived from the total number of pixels per spot. We then get the median of the CVs for all the spots on the slide. We do the same for the background and for cy5. 
\subsubsection{Value}
A list containing the following:
\begin{longtable}{rp{8cm}}
Cy3.foreground: & the medians for all the CVs of cy3 foreground of all spots in a slide.\\\\
Cy3.background: & the medians for all the CVs of cy3 background of all spots in a slide.\\\\
Cy5.foreground: & the medians for all the CVs of cy5 foreground of all spots in a slide.\\\\
Cy5.background: & the medians for all the CVs of cy5 background of all spots in a slide.\\\\
\end{longtable}
\subsubsection{Script}
<<>>=
## For each slide named in x, compute the median spot CV
## (Std.Dev / mean intensity) for foreground and background pixels in
## both channels, and collect the per-slide medians in a list.
get.cv <- function(x){
    ## NOTE: component name fixed from `Cy5.Background' to
    ## `Cy5.background' so it matches the assignments below.
    Median.CV.Values <- list ( Cy3.foreground= NULL, Cy3.background= NULL,
        Cy5.foreground=NULL, Cy5.background=NULL)
    for (i in 1:length(x)){
        data <- get(x[i])
        CV.cy3.fore <- data$ch1.Intensity.Std.Dev/data$ch1.Intensity
        median.cvs.cy3.fore <- median(CV.cy3.fore, na.rm=TRUE)
        CV.cy3.back <- data$ch1.Background.Std.Dev/data$ch1.Background
        median.cvs.cy3.back <- median(CV.cy3.back, na.rm=TRUE)
        CV.cy5.fore <- data$ch2.Intensity.Std.Dev/data$ch2.Intensity
        median.cvs.cy5.fore <- median(CV.cy5.fore, na.rm=TRUE)
        CV.cy5.back <- data$ch2.Background.Std.Dev/data$ch2.Background
        median.cvs.cy5.back <- median(CV.cy5.back, na.rm=TRUE)
        Median.CV.Values$Cy3.foreground <- c(Median.CV.Values$Cy3.foreground,
            median.cvs.cy3.fore)
        Median.CV.Values$Cy3.background <- c(Median.CV.Values$Cy3.background,
            median.cvs.cy3.back)
        Median.CV.Values$Cy5.foreground <- c(Median.CV.Values$Cy5.foreground,
            median.cvs.cy5.fore)
        Median.CV.Values$Cy5.background <- c(Median.CV.Values$Cy5.background,
            median.cvs.cy5.back)
    }
    ## Return the list.  This must be the last expression of the function
    ## body, not of the loop: a `for' loop evaluates to NULL in R, so the
    ## original placement inside the loop made get.cv return NULL.
    Median.CV.Values
}
@
\section{Results of analysis of CV values}
We get the median CV values for all the data set and plot the results as
follows:
<<>>=
Median.CVs <- get.cv( All.slides.methods)
attach(Median.CVs)
@
\begin{center}
<>=
op<-par(mfrow=c(2,2), las=1)#, cex=0.7)
stripchart(Cy3.foreground~Method.content , xlab="", ylab="Median CV",
xlim=c(0.5,6.5), ylim=c(0,1), vertical=T, pch=1, method="stack",
offset=1, main="Cy3 foreground")
stripchart.plot.median.by.factor(1:6,
as.vector(by(Cy3.foreground, Method.content , median)), 0.25) stripchart(Cy3.background~Method.content , xlab="", ylab="Median CV", xlim=c(0.5,6.5), ylim=c(0,1), vertical=T, pch=1,method="stack", offset=1, main="Cy3 background") stripchart.plot.median.by.factor(1:6, as.vector(by(Cy3.background, Method.content , median)), 0.25) stripchart(Cy5.foreground~Method.content , xlab="", ylab="Median CV", xlim=c(0.5,6.5), ylim=c(0,1), vertical=T, pch=1,method="stack", offset=1, main="Cy5 foreground") stripchart.plot.median.by.factor(1:6, as.vector(by(Cy5.foreground, Method.content , median)), 0.25) stripchart(Cy5.background~Method.content , xlab="", ylab="Median CV", xlim=c(0.5,6.5), ylim=c(0,1), vertical=T, pch=1,method="stack", offset=1, main="Cy5 background") stripchart.plot.median.by.factor(1:6, as.vector(by(Cy5.background, Method.content , median)), 0.25) @ \end{center} <<>>= detach(Median.CVs) @ The histogram method had the lowest CV values in both foreground and background. \section{Dye-swapping confounds the precision of between-slide comparisons} We next studied the effect of segmentation on between-slide variability by deriving a matrix of correlations for all possible pair-wise comparisons between the slides for each method (fifteen comparisons for each of three segmentation methods). A one-way ANOVA was conducted as above. <<>>= ANOVA.between.one.BGS <- lm (Correlations[1:45] ~ Method[1:45], Results.between.BGS) summary(ANOVA.between.one.BGS) @ In contrast to our results from within-slide comparisons, no significant differences in the correlations were found. The correlation coefficients between slides with dye swapping were mostly negative, indicating low overall repeatability of the data. As dye swapping would be expected to alter correlations between slides, we reanalyzed the data by restricting the comparisons to those between replicates in which cDNA probes had been labelled with the same fluors. 
We plot the effect of dye swapping on correlations using the interaction plot: <<>>= low <- Results.between.BGS[1:45,] @ \begin{center} <>= op <- par(mfrow=c(1,2), las=1) stripchart(low$Correlations~ low$Dye, xlim=c(0.5,2.5), ylim=c(-0.5, 0.5), ylab="Correlation coefficient", xlab="", pch=1, vertical=TRUE, method="stack", offset=1) tmpFD <- low$Correlations[low$Method=="F" & low$Dye=="D"] tmpHD <- low$Correlations[low$Method=="H" & low$Dye=="D"] tmpFS <- low$Correlations[low$Method=="F" & low$Dye=="S"] tmpHS <- low$Correlations[low$Method=="H" & low$Dye=="S"] points(x=rep(c(1),9), y=tmpFD, col="magenta") points(x=rep(c(1),9), y=tmpHD, col="green") points(x=rep(c(2),6), y=tmpFS, col="magenta") points(x=rep(c(2),6), y=tmpHS, col="green") mtext("a", at=0, cex=2, font=2) interaction.plot(low$Dye,low$Method,low$Correlations, xlab="", ylab="Mean correlation", lty=1, col=c("black","magenta", "green"),legend=FALSE, ylim=c(-0.5, 0.5), xtick=TRUE) mtext("b", at=0.5, cex=2, font=2) @ \end{center} <<>>= dev.off() @ As expected, comparisons between replicates with the same dyes had higher correlations than between slides with swapped dyes. However, the beneficial effect of histogram segmentation on correlation was observed for slides with the same dye. \section{Precision is determined by choice of segmentation method and amount of labelled probe} \subsection{The effect of segmentation method and amount of labelled probe on within-slide correlations} The impact of the quantity of RNA used on the overall precision of microarray data has been previously reported~\cite{Yue2001}. In experiment~\emph{A}, we labelled \unit{10}{\micro\gram} of total RNA for each slide which yielded a median of \unit{2.1}{\micro\gram} (IQR 1.1--3.1) of cDNA probe after purification, and incorporated a median of \unit{151}{\pico\mole} (IQR 104--206) of each fluor. 
In order to identify whether the low between-slide correlations were caused by inadequate specific activity of our samples, we repeated the experiment using starting material of \unit{15}{\micro\gram} of total RNA for each sample (experiment~\emph{B}; median labelled cDNA \unit{5.5}{\micro\gram}, IQR 4.6--6.6, median incorporation of each fluor \unit{463}{\pico\mole}, IQR 384--534). First, we look at the effect of the two variables on the within-slide correlations using an interaction plot. <<>>= attach(Results.within.BGS) @ \begin{center} <>= #op <- par(mfrow=c(1,2), las=1) #interaction.plot(DNA,Method,Correlations, ylab="Mean correlation", #xlab="",lty=1, col=c("black","magenta", "green"), main="Within-slide #comparison", xtick=TRUE, legend=FALSE) interaction.plot(DNA,Method,Correlations) @ \end{center} <<>>= detach(Results.within.BGS) @ The figure indicates a lack of interaction between the two variables and suggests that differences exist between the categories within each variable, low and high for the DNA content and A, F, H for the segmentation method. In order to investigate whether these differences were significant, we performed a two-way ANOVA for within-slide correlations to examine the effect of DNA content and the method of segmentation. Note that we fit the interaction term in the model to examine for interactions. <<>>= ANOVA.within.two.BGS <- lm (Correlations ~ Method * DNA, Results.within.BGS) anova( ANOVA.within.two.BGS) @ The amount of labelled sample and method of segmentation independently and significantly ($p<0.001$) influenced correlations. There was no significant interaction between the two variables. We then perform a diagnostic QQ plot to examine how the models fits the data and find a good fit. 
\begin{center} <>= qqnorm(ANOVA.within.two.BGS$res) @ \end{center} We repeat the same process with no background subtraction <<>>= ANOVA.within.two.NoBGS <- lm (Correlations ~ Method * DNA, Results.within.NoBGS) anova( ANOVA.within.two.NoBGS) @ \begin{center} <>= qqnorm(ANOVA.within.two.NoBGS$res) @ \end{center} \subsection{The effect of segmentation method and amount of labelled probe on between-slide correlations} We then plot the effect of the two variables on between-slide correlations <<>>= attach(Results.between.BGS) @ \begin{center} <>= #interaction.plot(DNA,Method,Correlations, ylab="Mean correlation", #xlab="",lty=1, col=c("black","magenta", "green"), main="Between-slide #comparison", xtick=TRUE, legend=FALSE) #legend(x=1.75, y=0.5, c("H","F","A"), lty=1, bty="n", pch="-", col=c("green", #"magenta", "black")) interaction.plot(DNA,Method,Correlations) @ \end{center} <<>>= detach(Results.between.BGS) dev.off() @ The plot shows obvious interaction between the two variables at the low DNA concentration level. In order to formally assess the apparent differences between categories within each variable we perform a two-way ANOVA. <<>>= ANOVA.between.two.BGS <- lm (Correlations ~ Method * DNA, Results.between.BGS) anova( ANOVA.between.two.BGS) @ Again we perform a diagnostic QQ plot and find a good quantile fit. \begin{center} <>= qqnorm(ANOVA.between.two.BGS$res) @ \end{center} For between-slide comparisons, using a larger amount of labelled sample significantly ($p<0.001$) improved the correlations independently of the segmentation method used. However significant interaction was observed between the amount of labelled sample and the segmentation method. Therefore, while there was no advantage for any segmentation method when low amounts of labelled sample were used, there were marked differences for the methods when using higher amounts. 
We examine the effects with no background subtraction as follows: <<>>= ANOVA.between.two.NoBGS <- lm (Correlations ~ Method * DNA, Results.between.NoBGS) anova( ANOVA.between.two.NoBGS) @ \begin{center} <>= qqnorm(ANOVA.between.two.NoBGS$res) @ \end{center} \section{Coefficient of repeatability confirms higher precision} A low value for the correlation coefficient does not necessarily mean low repeatability as the correlation coefficient is not a measure of sameness~\cite{Bland1986}. Previous reports have shown discrepancies between correlation coefficients and repeatability coefficients~\cite{Bland1986, Jenssen2002}. In order to confirm our findings, we repeated the analysis using the coefficient of repeatability (CR) values to compare between the three different methods of segmentation and included a fourth proprietary method encoded within the GenePix software package. \subsection{Data processing for the Genepix segmentation} We focus the attention on the data from experiment B to avoid the effect of DNA content. We only examine the data with no background subtraction to examine the effect independent of background estimation. \subsection{The \Rfunc{Get.ratios.genepix} function} \subsubsection{Description} A function to automate the process of obtaining the normalized log2 ratios for the genepix method. \subsubsection{Usage} Get.ratios.genepix(x) \subsubsection{Arguments} \begin{longtable}{rp{8cm}} x: & an object containing the names of the genepix raw data objects.\\ \end{longtable} \subsubsection{Details} For each raw data object we get the intensity values for cy5, Red= F635.Mean, and for cy3, Green= F532.Mean. We apply the grid information. We use scaled normalization as above. \subsubsection{Value} A data frame containing the normalized log2 ratios of each spot. The number of columns corresponds to the number of elements in x. 
\subsubsection{Script}
<<>>=
## Assemble the GenePix mean foreground intensities for the slides named
## in x, normalize with stat.ma (scaled normalization), correct the sign
## of the dye-swapped first half, and return the log2 ratio matrix.
Get.ratios.genepix <- function(x){
    res <- list(R = NULL, G = NULL, Rb = NULL, Gb = NULL)
    for(i in (1:length(x))){
        y<-get(x[i])
        res$R <- cbind(res$R,y$F635.Mean)
        res$G <- cbind(res$G,y$F532.Mean)
    }
    ## Same 12x2 grid of 16x34-spot blocks as for the QuantArray data.
    data.grid<-list(nspot.r=16,nspot.c=34,ngrid.r=12,ngrid.c=2)
    data.ma<-stat.ma(res,data.grid,norm="s")
    ## Flip the sign for the dye-swapped first half of the slides.
    halfn= length(x)/2
    data.ma$M[,c(1:halfn)]<- (data.ma$M[,c(1:halfn)])*-1
    data.ma$M
}
@
\subsubsection{Obtaining the ratios}
<<>>=
Res.genepix <- Get.ratios.genepix(expB.G)
@
\subsection{The get.sigma function}
\subsubsection{Description}
A function that calculates the coefficient of repeatability (CR) values
as described by~\cite{Jenssen2002}.
\subsubsection{Usage}
get.sigma(x,arrays)
\newpage
\subsubsection{Arguments}
\begin{longtable}{rp{8cm}}
x: & a three-dimensional array having 2 columns, by 6528 rows, by the number of slides in an experiment as a third dimension.\\\\
arrays: & the number of slides in an experiment\\
\end{longtable}
\subsubsection{Details}
The function computes the CR value for each spot by getting the square
root of the mean of sum of squares for replicates across the slides.
First the mean of the ratios of any two replicates on a slide:
<<>>=
## Mean of a duplicate spot pair, ignoring missing values.  Returned
## directly (the original assigned it to a local, which made the value
## invisible when the function was called at top level).
spot.mean <- function(x){
    mean(x,na.rm=T)
}
@
Where x is a vector containing two elements representing the log2 ratios
of a duplicate spot on a slide. Then the sum of squares for a duplicate
spot pair on a slide:
<<>>=
## Sum of squared deviations of each replicate from the pair mean.
sum.sq.spot <- function(x){
    ## Hoist the pair mean out of the loop: it is loop-invariant and was
    ## recomputed on every iteration in the original.
    pair.mean <- spot.mean(x)
    square.difference.spot<-NULL
    for(i in 1:length(x)){
        y<-(x[i]-pair.mean)^2
        square.difference.spot<-c(square.difference.spot,y)
    }
    sum(square.difference.spot)
}
@
where x is a vector containing two elements representing the log2 ratios
of a duplicate spot on a slide. First we get the difference between the
ratio of each spot and the mean of the two spot ratios.
Then we get the sum of the squared differences.
We repeat the same process for each duplicate on a slide:
<<>>=
## Apply sum.sq.spot to every row (duplicate pair) of x and return the
## vector of per-clone sums of squares.
sum.sq.allspots <- function(x){
    ss.allspots<-NULL
    for( i in 1:nrow(x)){
        y<-x[i,]
        y<-sum.sq.spot(y)
        ss.allspots<-c(ss.allspots,y)
    }
    ## Return the accumulated vector.  It must be the last expression of
    ## the function body: a `for' loop evaluates to NULL in R, so the
    ## original placement inside the loop made this function return NULL.
    ss.allspots
}
@
Where x is a matrix of two columns, representing the two replicates in a
slide, and 6528 rows, representing the different clones on the slide.
Finally, we obtain the CR values by computing the square root of the
mean of sum of squares for replicates across the slides.
\subsubsection{Value}
An object containing the CR values for each spot in the object x.
\subsubsection{Script}
<<>>=
## Coefficient of repeatability per clone: the square root of the mean
## (across `arrays' slides) of the within-slide sums of squares.
get.sigma <- function(x,arrays){
    CR.values <- NULL
    for (i in 1:arrays){
        y<-x[,,i]
        y<-sum.sq.allspots(y)
        CR.values<-cbind(CR.values,y)
    }
    res<-apply(CR.values,1,mean,na.rm=T)
    sqrt(res)
}
@
\subsection{Obtaining the CR values}
We convert the data to a three dimensional matrix for the different
segmentation method.
\subsubsection{The adaptive method}
<<>>=
rep.a <- rep(c(TRUE,FALSE),12,each=544)
rep.b <- rep(c(FALSE,TRUE),12,each=544)
Ratio.adaptive <- NULL
for (i in 1:6){
    y <- cbind(dataB.A.NoBGS.s$ratios[rep.a,i],
        dataB.A.NoBGS.s$ratios[rep.b,i])
    Ratio.adaptive <- cbind ( Ratio.adaptive, y)
}
dim(Ratio.adaptive) <- c(6528, 2, 6)
@
\subsubsection{The fixed circle method}
<<>>=
Ratio.fixed <- NULL
for (i in 1:6){
    y <- cbind(dataB.F.NoBGS.s$ratios[rep.a,i],
        dataB.F.NoBGS.s$ratios[rep.b,i])
    Ratio.fixed <- cbind ( Ratio.fixed, y)
}
dim(Ratio.fixed) <- c(6528, 2, 6)
@
\subsubsection{The histogram method}
<<>>=
Ratio.histogram <- NULL
for (i in 1:6){
    y <- cbind(dataB.H.NoBGS.s$ratios[rep.a,i],
        dataB.H.NoBGS.s$ratios[rep.b,i])
    Ratio.histogram <- cbind ( Ratio.histogram, y)
}
dim(Ratio.histogram) <- c(6528, 2, 6)
@
\subsubsection{The genepix method}
<<>>=
Ratio.genepix <- NULL
for (i in 1:6){
    y <- cbind(Res.genepix[rep.a,i], Res.genepix[rep.b,i])
    Ratio.genepix <- cbind ( Ratio.genepix, y)
}
dim(Ratio.genepix) <- c(6528, 2, 6)
@
\subsubsection{We then obtain the CR values for each method of segmentation.}
<<>>=
CR.adaptive <- get.sigma( Ratio.adaptive, 6)
CR.fixed <- get.sigma( Ratio.fixed, 6)
CR.histogram <- get.sigma( Ratio.histogram, 6)
CR.genepix <- get.sigma( Ratio.genepix, 6)
@
\subsubsection{We then calculate the median values for each method and plot the data.}
<<>>=
apply(cbind(CR.adaptive, CR.fixed, CR.histogram, CR.genepix),
      2, median, na.rm=T)
@
\begin{center}
<<fig=TRUE>>=
boxplot(data.frame(cbind(CR.adaptive, CR.fixed, CR.histogram, CR.genepix)),
        ylab="Coefficient of repeatability", xlab= "")
@
\end{center}
The box plots for the sigma factors obtained for each feature from the
slides from experiment~\emph{B} (no background subtraction) showed that the
histogram method had the lowest median CR value.
\section{Estimating the differentially expressed genes}
The probability of a gene being differentially expressed is dependent on
the variability of the data for that gene across the replicates of an
experiment~\cite{Lee2000}. It follows from our findings that the
segmentation method could have a direct effect on the number of
differentially expressed genes identified. In order to test this assumption
we used a Bayesian method to estimate the number of differentially
expressed genes at a $p$ value of $0.01$ from the data set of
experiment~\emph{B}~\cite{Lonnstedt2002}. We take account of each of the
duplicate clones on a slide in estimating significance. To achieve this we
reorder the clones so that duplicates are sequentially in pairs as this is
required by the stat.bayesian function in sma.
\subsection{The order.clones function}
\subsubsection{Description}
A function to reorder the clones within a slide so that duplicates are
listed sequentially.
\subsubsection{Usage}
order.clones(x, y)
\subsubsection{Arguments}
\begin{longtable}{rp{8cm}}
x: & a matrix of 13056 rows corresponding to all the clones on a slide.
The number of columns corresponds to the number of slides in an experiment.
The data in x are the log2 ratios. \\
y: & the number of clones on each subgrid of a microarray.\\
\end{longtable}
\subsubsection{Details}
We exclude any clones with a replicate that has a missing value in the
experiment and reorder the matrix so that duplicate clones in a slide are
sequential:
\subsubsection{Script}
<<>>=
## Reorder clones so the two replicates of each clone sit on adjacent rows,
## as required by stat.bayesian; rows with missing values are dropped first.
## x: 13056-row matrix of log2 ratios (one column per slide);
## y: number of clones on each subgrid (544 for these arrays).
order.clones <- function(x, y){
  rep.a <- rep(c(TRUE,FALSE), each=y)
  rep.b <- rep(c(FALSE,TRUE), each=y)
  replicates <- cbind(x[rep.a,], x[rep.b,])
  replicates <- na.omit(replicates)
  reordered.clones <- NULL
  for(i in 1:ncol(x)){
    ## Interleave replicate 1 and replicate 2 of slide i.
    slide <- c(rbind(replicates[,i], replicates[,i+ncol(x)]))
    reordered.clones <- cbind(reordered.clones, slide)
  }
  ## BUG FIX: return the reordered matrix.  The original function ended on
  ## the for loop, whose value in R is NULL, so it always returned NULL.
  reordered.clones
}
@
\subsection{Identification of differential expression}
We then compute the probabilities for differential expression:
<<>>=
adaptive.bayesian <- stat.bayesian(
  M= order.clones(dataB.A.NoBGS.s$ratios, y= 544), nb=6, nw=2)
fixed.bayesian <- stat.bayesian(
  M= order.clones(dataB.F.NoBGS.s$ratios, y= 544), nb=6, nw=2)
histogram.bayesian <- stat.bayesian(
  M= order.clones(dataB.H.NoBGS.s$ratios, y= 544), nb=6, nw=2)
genepix.bayesian <- stat.bayesian(
  M= order.clones(Res.genepix, y= 544), nb=6, nw=2)
@
The number of genes with a log odds ratio of more than zero for each method
was as follows:
<<>>=
length(which(adaptive.bayesian$lods>0))
length(which(fixed.bayesian$lods>0))
length(which(histogram.bayesian$lods>0))
length(which(genepix.bayesian$lods>0))
@
We plot the volcano plot for each method of segmentation with the log2
ratios (M) on the x axis and the lods, probabilities of differential
expression, on the y axis.
\begin{center}
<<fig=TRUE>>=
par(mfrow=c(1,4), las=1)
## BUG FIX: "moi" is not a graphical parameter and would be rejected by
## par(); "omi" (outer margins in inches) appears intended -- NOTE(review):
## confirm against the original figure ("mai" is the other candidate).
par(omi=c(0.5,0,0,0))
plot(adaptive.bayesian$Xprep$Mbar,adaptive.bayesian$lods,
     xlim=c(-1,1.5), ylim=c(min(adaptive.bayesian$lods),18.5),
     ylab="Lod", xlab="M", main="A")
text("n=345",x=-0.95, y=18.5, adj=c(0,0))
abline(h=0)
plot(fixed.bayesian$Xprep$Mbar,fixed.bayesian$lods,
     xlim=c(-1,1.5), ylim=c(min(fixed.bayesian$lods),18.5),
     ylab="", xlab="M", main="F")
text("n=967",x=-0.95, y=18.5, adj=c(0,0))
abline(h=0)
plot(histogram.bayesian$Xprep$Mbar,histogram.bayesian$lods,
     xlim=c(-1,1.5), ylim=c(min(histogram.bayesian$lods),18.5),
     ylab="", xlab="M", main="H")
text("n=832",x=-0.95, y=18.5, adj=c(0,0))
abline(h=0)
plot(genepix.bayesian$Xprep$Mbar,genepix.bayesian$lods,
     xlim=c(-1,1.5), ylim=c(min(genepix.bayesian$lods),18.5),
     ylab="", xlab="M", main="G")
text("n=944",x=-0.95, y=18.5, adj=c(0,0))
abline(h=0)
@
\end{center}
\begin{thebibliography}{20}
\bibitem{Ihaka1996} Ihaka, R. and Gentleman, R. (1996) {R}: {A} language
for data analysis and graphics. \newblock \emph{J. Comp. Graph. Stat.},
\textbf{5}, 299--314.
\bibitem{Dudoit2002} Dudoit, S., Yang, Y.~H., Callow, M.~J. and Speed,
T.~P. (2002) Statistical methods for identifying differentially expressed
genes in replicated {cDNA} microarray experiments. \newblock
\emph{Stat. Sin.}, \textbf{12}, 111--139.
\bibitem{Brown2001} Brown, C.~S., Goodwin, P.~C. and Sorger, P.~K. (2001)
Image metrics in the statistical analysis of {DNA} microarray data.
\newblock \emph{Proc. Natl. Acad. Sci. U.S.A.}, \textbf{98}, 8944--9.
\bibitem{Yue2001} Yue, H., Eastman, P.~S., Wang, B.~B., Minor, J.,
Doctolero, M.~H., Nuttall, R.~L., Stack, R., Becker, J.~W., Montgomery,
J.~R., Vainer, M. and Johnston, R. (2001) An evaluation of the performance
of {cDNA} microarrays for detecting changes in global {mRNA} expression.
\newblock \emph{Nucleic Acids Res.}, \textbf{29}, E41--1.
\bibitem{Bland1986} Bland, J.~M. and Altman, D.~G.
(1986) Statistical methods for assessing agreement between two methods of clinical measurement. \newblock \emph{Lancet}, \textbf{1}, 307--10. \bibitem{Jenssen2002} Jenssen, T.~K., Langaas, M., Kuo, W.~P., Smith-Sorensen, B., Myklebost, O. and Hovig, E. (2002) Analysis of repeatability in spotted {cDNA} microarrays. \newblock \emph{Nucleic Acids Res.}, \textbf{30}, 3235--44. \bibitem{Lee2000} Lee, M.~L., Kuo, F.~C., Whitmore, G.~A. and Sklar, J. (2000) Importance of replication in microarray gene expression studies: statistical methods and evidence from repetitive {cDNA} hybridizations. \newblock \emph{Proc. Natl. Acad. Sci. U.S.A.}, \textbf{97}, 9834--9. \bibitem{Lonnstedt2002} L{\"o}nnstedt, I. and Speed, T. (2002) Replicated microarray data. \newblock \emph{Stat. Sin.}, \textbf{12}, 31--46. \end{thebibliography} \end{document}