The following analysis workflow is a companion to the manuscript “Copy number signatures predict chromothripsis and clinical outcomes in newly diagnosed multiple myeloma” by Maclachlan et al.
Chromothripsis is a complex chromosomal shattering event associated with random rejoining, and is emerging as a strong, independent adverse prognostic factor across multiple malignancies. Reliable detection of chromothripsis requires whole genome sequencing (WGS) and the integration of both structural variants (SVs) and copy number (CN) data.
CN signature analysis (analytical code in Supplementary Data 1) and SV signature analysis (code in Supplementary Data 2) take whole genome sequencing (WGS) data (or whole exome sequencing for CN signatures) and produce a dataframe detailing the relative proportional contribution from each signature.
Here we demonstrate how genomic signature analysis can be used to predict the presence of chromothripsis by estimating the average area-under-the-curve (AUC) from receiver operating characteristic (ROC) curves using 10-fold cross-validation.
In Supplementary Data 4 we demonstrate how CN signatures are more accurate than an alternate CN tool for the prediction of chromothripsis.
library(plyr)
library(dplyr)
library('pROC')
# Load the per-sample signature table: one row per sample, holding the CN and
# SV signature contributions together with the 0/1 chromothripsis label.
WGS_ALL_SIGS <- read.delim(file = "WGS_ALL_SIGS.txt")
# Preview the first six samples.
head(x = WGS_ALL_SIGS, n = 6L)
## sample CN_SIG1 CN_SIG2 CN_SIG3 CN_SIG4 CN_SIG5 chromothripsis_code
## 1 MMRF_1016_1_BM 0.37861 0.08882 0.09925 0.42988 0.001611 1
## 2 MMRF_1021_1_BM 0.47677 0.09290 0.12820 0.29430 0.004890 0
## 3 MMRF_1029_1_BM 0.69528 0.12583 0.12488 0.04900 0.001713 0
## 4 MMRF_1030_1_BM 0.54355 0.29376 0.14255 0.01705 0.001210 0
## 5 MMRF_1031_1_BM 0.08782 0.60671 0.05130 0.24602 0.001655 0
## 6 MMRF_1032_1_BM 0.06403 0.32360 0.05881 0.53281 0.006058 1
## SV_SIG1 SV_SIG2 SV_SIG3 SV_SIG4 SV_SIG5 SV_SIG6 SV_SIG7 SV_SIG8
## 1 0.26567 0.06450 0.31108 0.0783333 0.22058 0.0015000 0.001833 0.013750
## 2 0.17114 0.12795 0.16159 0.0004545 0.37091 0.0009091 0.005455 0.145227
## 3 0.52286 0.25071 0.10071 0.0007143 0.06357 0.0035714 0.015714 0.006429
## 4 0.56750 0.19600 0.05100 0.0005000 0.04200 0.0005000 0.001500 0.045000
## 5 0.07365 0.02750 0.05058 0.0003846 0.80635 0.0005769 0.001154 0.032692
## 6 0.18340 0.04263 0.16231 0.0939744 0.02891 0.3190385 0.001538 0.009231
## SV_SIG9 SV_SIG10
## 1 0.010667 0.0005833
## 2 0.012273 0.0000000
## 3 0.027857 0.0000000
## 4 0.020000 0.0000000
## 5 0.004808 0.0000000
## 6 0.016154 0.0015385
# ---- 10-fold cross-validated prediction of chromothripsis -------------------
# For each fold, a logistic regression of chromothripsis status on all SV and
# CN signature contributions is fitted on the other nine folds and scored on
# the held-out fold. The per-fold AUC and ROC coordinates are collected in
# AUCV, ROCVSENS, ROCVSPENS and RUNV (fold index of each ROC point).

n_folds <- 10L
signature_cols <- c(paste0("SV_SIG", 1:10), paste0("CN_SIG", 1:5))

# Keep only the outcome and the predictors so that `~ .` uses exactly the
# 15 signature columns (and never the sample identifier).
model_data <- WGS_ALL_SIGS[, c("chromothripsis_code", signature_cols)]
Len <- nrow(model_data)
stopifnot(Len >= n_folds)

# Contiguous folds of near-equal size derived from the number of samples, so
# every sample is held out exactly once. A fixed fold size of 75 never tests
# the last 2 of 752 samples and is wrong for any other sample count.
fold_id <- ceiling(seq_len(Len) * n_folds / Len)

# Preallocate the per-fold results instead of growing vectors inside the loop.
AUCV <- numeric(n_folds)
sens_by_fold <- vector("list", n_folds)
spec_by_fold <- vector("list", n_folds)

for (i in seq_len(n_folds)) {
  held_out <- fold_id == i
  train_set <- model_data[!held_out, , drop = FALSE]
  test_set <- model_data[held_out, , drop = FALSE]

  fold_model <- glm(chromothripsis_code ~ ., data = train_set,
                    family = "binomial")
  predpr <- predict(fold_model, newdata = test_set, type = "response")

  # Fix the class order (0 = control, 1 = case) and the direction (cases score
  # higher) so that pROC cannot pick a different orientation in each fold,
  # which would bias the cross-validated AUC upwards.
  roccurve <- roc(test_set$chromothripsis_code, predpr,
                  levels = c(0, 1), direction = "<")

  AUCV[i] <- as.numeric(auc(roccurve))
  sens_by_fold[[i]] <- roccurve$sensitivities
  spec_by_fold[[i]] <- roccurve$specificities
}

# Flatten the per-fold ROC coordinates; RUNV records the fold of each point.
ROCVSENS <- unlist(sens_by_fold)
ROCVSPENS <- unlist(spec_by_fold)
RUNV <- rep(seq_len(n_folds), times = lengths(sens_by_fold))

# Mean AUC across the folds.
aa <- mean(AUCV)
aa
# Output recorded for the published 752-sample run (fixed 75-sample folds);
# expect a marginally different value now that every sample is held out once.
## [1] 0.9601
# ---- ROC curves for every fold plus a smoothed summary curve ----------------
# Pooled ROC coordinates from all folds: false positive rate on the x axis,
# true positive rate on the y axis.
Y <- ROCVSENS
X1 <- 1 - ROCVSPENS

# Distinct false positive rates at which the summary curve is evaluated.
X11 <- sort(unique(X1))

par(mar = c(5, 5, 3, 2) + 0.01)

# Smoothing spline through the pooled points; evaluated through the exported
# predict() generic rather than the private stats:::predict.smooth.spline.
fit <- smooth.spline(X1, Y, nknots = 10)
pred <- stats::predict(fit, X11)$y

# Set up the axes without drawing, then add each fold's curve exactly once.
plot(X1, Y, type = "n",
     main = paste0("Chromothripsis from genomic sigs ", "AUC=",
                   format(aa, digits = 4)),
     ylab = "True Positive", xlab = "False Positive",
     cex.lab = 1.5, cex.axis = 1.3, mgp = c(3, 1, 0))
for (run_id in unique(RUNV)) {
  in_run <- RUNV == run_id
  lines(X1[in_run], Y[in_run], col = "blue", lwd = 1.7)
}

# Smoothed summary curve (red) and the chance diagonal.
lines(X11, pred, lwd = 2, col = 2)
abline(a = 0, b = 1)