########################################
#  DATA SEARCH FROM ALL THREE DATABASES:
#  DRUG RESPONSE DATA FOR EXON 19 DEL LUNG CANCER CELL LINES VS. WT CELL LINES
########################################
#Set working directory
# NOTE(review): hard-coded absolute path. Every relative file name that is
# read or written below is resolved against this directory, so adjust it
# before running the script on another machine.
setwd("C:/")

#####################
# LIBRARIES
#####################
library(reshape2)
library(dplyr)
library(ggplot2)
library(tidyr)          
library(magrittr)       
library(rcompanion)
library(grid)
library(ggthemes)

#####################
#CCLE DATABASE SEARCH
#####################

######################
#  DATA INPUT
######################
# Central files collected from the CCLE website (semicolon-separated exports).
CCLEdrugdata <- read.table(
  file = "CCLE_NP24.2009_Drug_Data_2015.02.24_fixed_16022018.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)

# Mutation data gathered from the CCLE Mutations website (.csv files) and from
# GDSC, collected into files containing the exon 19 del mutations and all other
# ERBB mutations. The databases are kept in separate files because their
# naming systems differ.
mutationdata <- read.table(
  file = "CCLE mutation list.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)

# List of selected drugs (erlotinib + lapatinib).
ERBB_drugs_CCLE <- read.csv(file = "ERBB drugs CCLE.csv", sep = ";")


##########################
#  COLLECTION OF DRUG DATA
##########################

# For each drug: take its rows from the CCLE drug table, then keep the rows
# whose cell line also appears in the mutation list (inner join on the
# cell line name).

# Erlotinib
SelectedDrug1 <- CCLEdrugdata[which(CCLEdrugdata$Compound == "Erlotinib"), ]
SelectedMutations1 <- merge(
  x = mutationdata,
  y = SelectedDrug1,
  by = "CCLE.Cell.Line.Name"
)

# Lapatinib
SelectedDrug2 <- CCLEdrugdata[which(CCLEdrugdata$Compound == "Lapatinib"), ]
SelectedMutations2 <- merge(
  x = mutationdata,
  y = SelectedDrug2,
  by = "CCLE.Cell.Line.Name"
)

# Stack both drugs, restrict to lung cell lines and export.
CCLE_ERBB <- rbind(SelectedMutations1, SelectedMutations2)
CCLE_ERBB <- CCLE_ERBB[which(CCLE_ERBB$CancerType == "Lung"), ]
write.csv(CCLE_ERBB, file = "CCLE_MUT_Cell_Lines.csv")


# WT cell lines: drug-response rows whose cell line is NOT among the lines
# matched to the mutation list for the same drug.

# Erlotinib
ccle_whole1 <- SelectedDrug1
CellLines1 <- SelectedMutations1$CCLE.Cell.Line.Name
OtherLines1 <- ccle_whole1[!(ccle_whole1$CCLE.Cell.Line.Name %in% CellLines1), ]

# Lapatinib
ccle_whole2 <- SelectedDrug2
CellLines2 <- SelectedMutations2$CCLE.Cell.Line.Name
OtherLines2 <- ccle_whole2[!(ccle_whole2$CCLE.Cell.Line.Name %in% CellLines2), ]

# Stack both drugs, restrict to lung cell lines and export.
merged_CCLE <- rbind(OtherLines1, OtherLines2)
merged_CCLE <- merged_CCLE[which(merged_CCLE$CancerType == "Lung"), ]
write.csv(merged_CCLE, file = "CCLE_WT_Cell_Lines.csv")

# Mean activity area (ActArea) of the WT lung cell lines, per drug.
erlo <- merged_CCLE[which(merged_CCLE$Compound == "Erlotinib"), ]
mean(erlo$ActArea)
# Recorded mean of negative ctrl cell lines = 0.6118893

lapa <- merged_CCLE[which(merged_CCLE$Compound == "Lapatinib"), ]
mean(lapa$ActArea)
# Recorded mean of negative ctrl cell lines = 0.7242165


#Bring the files to Excel, add a 0 µM point set to 100 % and calculate the response values for the remaining points (value = 100+x).




#####################
#CTRP DATABASE SEARCH
#####################

######################
#  DATA INPUT
######################

# Central files collected from the CTRP website.

# Per-experiment metadata: information about experimental growth conditions,
# media, and SNP fingerprinting.
CTRP_ExperimentData <- read.csv(
  file = "v20.meta.per_experiment.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

# Per-cell-line metadata: contextual cancer cell line information and
# annotation.
CTRP_CellLineData <- read.csv(
  file = "v20.meta.per_cell_line.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

# Average measured and predicted viability values following quality control
# and curve-fitting, for each cancer cell line treated with a compound, at
# each concentration point tested.
# Columns used below: experiment_id, cpd_avg_pv, cpd_conc_umol, master_cpd_id
CTRP_AvrgViab <- read.table(
  file = "v20.data.per_cpd_post_qc.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Fitted curves: AUC and EC50 values keyed by experiment_id and master_cpd_id.
CTRP_AUC_data <- read.table(
  file = "v20.data.curves_post_qc.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Mutation list prepared by checking the CTRP database numbering of the
# cell lines.
CTRP_mutationdata <- read.table(
  file = "CTRP mutation list.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

# List of selected drugs.
CTRP_ERBB_drugs <- read.csv(file = "ERBB drugs CTRP.csv", sep = ";")



########################
#  COLLECTING DRUG DATA
########################

# Collect the per-concentration viability rows of every selected drug and
# annotate each row with its cell line (experiment -> cell line -> name/site).

# The two lookup tables do not depend on the drug, so build them once.
ExpID <- CTRP_ExperimentData[, c("experiment_id", "master_ccl_id")]
CellLineID <- CTRP_CellLineData[, c("ccl_name", "master_ccl_id", "ccle_primary_site")]

# One data frame per drug. The IDs are de-duplicated and sorted ascending so
# the stacked rows come out in the same order as before, when each result was
# stored at list position `master_cpd_id`. Building a plain list with lapply()
# avoids allocating a list as long as the largest compound ID and avoids the
# NA placeholder row that previously had to be removed after rbind().
DrugIDs <- sort(unique(CTRP_ERBB_drugs$master_cpd_id))

FilteredDrugDatas <- lapply(DrugIDs, function(DrugOfIteration) {
  # Viability rows of this drug only
  drugdata.Filtered1 <- CTRP_AvrgViab[CTRP_AvrgViab$master_cpd_id == DrugOfIteration, ]
  # Attach the cell line ID of each experiment
  drugdata.Filtered2 <- merge(drugdata.Filtered1, ExpID, by = "experiment_id")
  # Attach the cell line name and primary site
  merge(drugdata.Filtered2, CellLineID, by = "master_ccl_id")
})

# Stack the per-drug tables into one data frame
FilteredDrugDatas <- do.call(rbind, FilteredDrugDatas)

# Restrict the annotated viability table to lung cell lines.
FilteredDrugDatas <- FilteredDrugDatas[
  which(FilteredDrugDatas$ccle_primary_site == "lung"),
]

# Names of the cell lines listed in the mutation file ...
CTRP_cells <- CTRP_mutationdata$Cell_Line_Name

# ... and the viability rows that belong to them.
sub.table <- FilteredDrugDatas[which(FilteredDrugDatas$ccl_name %in% CTRP_cells), ]


#################
#Re-organize data
#################
# Long -> wide: one row per cell line / experiment / compound, one column per
# tested concentration. Missing combinations are first filled with Inf and
# then blanked out, so they show up as empty cells in the exported file.
dataset <- sub.table
dataset2 <- dcast(
  data = dataset,
  formula = master_ccl_id + experiment_id + ccl_name + master_cpd_id ~ cpd_conc_umol,
  fun.aggregate = sum,
  fill = Inf,
  value.var = "cpd_avg_pv"
)
dataset2[dataset2 == Inf] <- ""

# Mutation-list rows of the cell lines present in the reshaped table
# (not used further in the visible part of this script).
sub.table2 <- CTRP_mutationdata[
  which(CTRP_mutationdata$master_ccl_id %in% dataset2$master_ccl_id),
]

# Attach the mutation annotation to every reshaped row (left join) and export.
CTRPSelectedMutations <- merge(
  x = dataset2,
  y = CTRP_mutationdata,
  by = "master_ccl_id",
  all.x = TRUE
)
write.csv(CTRPSelectedMutations, file = "CTRP organized MUT cells.csv")


##########################
#COLLECT AUC AND EC50 INFO
##########################
# Collect AUC and EC50 values for the mutant (MUT) cell lines.
# The reshaped mutant table supplies the experiment / compound pairs of interest.
CTRP_drug_data <- CTRPSelectedMutations

# Keep only the fitted curves of the same compounds ...
filtered.data1 <- CTRP_AUC_data[
  CTRP_AUC_data$master_cpd_id %in% unique(CTRP_drug_data$master_cpd_id),
]
# ... and of the same experiments.
filtered.data2 <- filtered.data1[
  filtered.data1$experiment_id %in% CTRP_drug_data$experiment_id,
]
# Pair each reshaped row with its fitted curve.
Combined.data1 <- merge(
  CTRP_drug_data, filtered.data2,
  by = c("experiment_id", "master_cpd_id")
)

# Keep only the columns needed downstream.
Combined.data2 <- Combined.data1[
  , c("ccl_name", "master_cpd_id", "apparent_ec50_umol", "area_under_curve", "experiment_id")
]

# col.names = NA writes an empty header cell above the row-name column, so the
# header has as many fields as the data rows. With col.names = TRUE the header
# was one field short and the columns were shifted when the file was opened in
# a spreadsheet.
write.table(
  Combined.data2,
  file = "CTRP MUT EC50 and AUC.csv",
  sep = ",",
  row.names = TRUE,
  col.names = NA
)

########################################
#COLLECT NEGATIVE CTRL AUC AND EC50 INFO
########################################
# Negative controls (WT): lung viability rows whose cell line is NOT listed in
# the mutation file.
neg_CTRP_cells <- unique(CTRP_mutationdata$Cell_Line_Name)
neg.sub.table <- FilteredDrugDatas[!(FilteredDrugDatas$ccl_name %in% neg_CTRP_cells), ]

# Long -> wide, one column per tested concentration; missing combinations are
# filled with Inf and then blanked out.
neg.dataset <- neg.sub.table
neg.dataset2 <- dcast(
  data = neg.dataset,
  formula = master_ccl_id + experiment_id + ccl_name + master_cpd_id ~ cpd_conc_umol,
  fun.aggregate = sum,
  fill = Inf,
  value.var = "cpd_avg_pv"
)
neg.dataset2[neg.dataset2 == Inf] <- ""

write.csv(neg.dataset2, file = "CTRP organized WT cells.csv")



# Collect AUC and EC50 values for the WT (negative control) cell lines.
# Only the identifying columns of the reshaped WT table are needed here.
CTRP_neg_drug_data <- neg.dataset2
CTRP_neg_drug_data <- CTRP_neg_drug_data[, c("ccl_name", "master_cpd_id", "experiment_id")]
#Select unique column combinations
#CTRP_neg_drug_data = unique(CTRP_neg_drug_data)

# Keep only the fitted curves of the same compounds ...
Neg.filtered.data1 <- CTRP_AUC_data[
  CTRP_AUC_data$master_cpd_id %in% unique(CTRP_neg_drug_data$master_cpd_id),
]
# ... and of the same experiments.
Neg.filtered.data2 <- Neg.filtered.data1[
  Neg.filtered.data1$experiment_id %in% CTRP_neg_drug_data$experiment_id,
]
# Pair each WT row with its fitted curve.
Neg.Combined.data1 <- merge(
  CTRP_neg_drug_data, Neg.filtered.data2,
  by = c("experiment_id", "master_cpd_id")
)

# Keep only the columns needed downstream (the stray trailing comma of the
# original subscript is removed; the selected columns are the same).
Neg.Combined.data2 <- Neg.Combined.data1[
  , c("ccl_name", "master_cpd_id", "apparent_ec50_umol", "area_under_curve", "experiment_id")
]

# col.names = NA writes an empty header cell above the row-name column, so the
# header has as many fields as the data rows. With col.names = TRUE the header
# was one field short and the columns were shifted when the file was opened in
# a spreadsheet.
write.table(
  Neg.Combined.data2,
  file = "CTRP WT EC50 and AUC.csv",
  sep = ",",
  row.names = TRUE,
  col.names = NA
)



# Negative control (WT) AUC per drug: number of rows and mean AUC.
# master_cpd_id: 52926 = gefitinib, 52928 = erlotinib, 606135 = afatinib,
# 634309 = lapatinib. The numbers in the comments are the recorded results.

gefitinib_ctrl <- Neg.Combined.data2[Neg.Combined.data2$master_cpd_id == "52926", ]
length(gefitinib_ctrl$area_under_curve)
# recorded: mean from 86 cell lines
mean(gefitinib_ctrl$area_under_curve)
# recorded: AUC neg ctrl 12.4321

erlotinib_ctrl <- Neg.Combined.data2[Neg.Combined.data2$master_cpd_id == "52928", ]
length(erlotinib_ctrl$area_under_curve)
# recorded: mean from 90 cell lines
mean(erlotinib_ctrl$area_under_curve)
# recorded: AUC neg ctrl 13.44199

afatinib_ctrl <- Neg.Combined.data2[Neg.Combined.data2$master_cpd_id == "606135", ]
length(afatinib_ctrl$area_under_curve)
# recorded: mean from 88 cell lines
mean(afatinib_ctrl$area_under_curve)
# recorded: AUC neg ctrl 10.36891

lapatinib_ctrl <- Neg.Combined.data2[Neg.Combined.data2$master_cpd_id == "634309", ]
length(lapatinib_ctrl$area_under_curve)
# recorded: mean from 86 cell lines
mean(lapatinib_ctrl$area_under_curve)
# recorded: AUC neg ctrl 12.86479


######################
# Fit the response values by adding a 0 µM concentration point at 100 and calculating the other values as x*100
######################







#####################
#GDSC DATABASE SEARCH
#####################

######################
#  DATA INPUT
######################
#Central files collected from website:
#I have gone through "Screened_Compounds.xlsx" to search for RTK drugs. The drug data is available in the file below:
# Raw plate intensities (comma-separated).
GDSC_data <- read.table(
  file = "v17.3_public_raw_data.csv",
  header = TRUE, sep = ",", stringsAsFactors = FALSE
)

# Fitted dose-response values, including AUC (semicolon-separated).
GDSC_AUC <- read.table(
  file = "v17.3_fitted_dose_response.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

# Drugs selected for the search.
GDSC_drugs <- read.table(
  file = "GDSC_search_drugs.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

# Cell line annotation.
GDSC_cells <- read.table(
  file = "GDSC_Cell_Line_Info.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

# Mutation list.
GDSC_mutations <- read.table(
  file = "GDSC mutations.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)


# Raw intensity rows of each drug of interest, selected by GDSC DRUG_ID.
SelectedDrug1 <- GDSC_data[which(GDSC_data$DRUG_ID == 1), ]     # 1    = Erlotinib
SelectedDrug2 <- GDSC_data[which(GDSC_data$DRUG_ID == 119), ]   # 119  = Lapatinib
SelectedDrug3 <- GDSC_data[which(GDSC_data$DRUG_ID == 1032), ]  # 1032 = Afatinib
SelectedDrug4 <- GDSC_data[which(GDSC_data$DRUG_ID == 1377), ]  # 1377 = Afatinib
SelectedDrug5 <- GDSC_data[which(GDSC_data$DRUG_ID == 1010), ]  # 1010 = Gefitinib

# Stack the five selections.
GDSC_data2 <- rbind(
  SelectedDrug1, SelectedDrug2, SelectedDrug3, SelectedDrug4, SelectedDrug5
)

# Attach the cell line annotation (left join on COSMIC_ID).
GDSC_data_info <- merge(
  x = GDSC_data2,
  y = GDSC_cells,
  by = "COSMIC_ID",
  all.x = TRUE
)

# Mutant lung cell lines: name listed in the mutation file, site == "lung".
filtered.data1 <- GDSC_data_info[
  GDSC_data_info$CELL_LINE_NAME %in% unique(GDSC_mutations$CELL_LINE_NAME),
]
filtered.data1 <- filtered.data1[which(filtered.data1$Site == "lung"), ]

# WT lung cell lines: name NOT listed in the mutation file, site == "lung".
filtered.data.wt <- GDSC_data_info[
  !(GDSC_data_info$CELL_LINE_NAME %in% unique(GDSC_mutations$CELL_LINE_NAME)),
]
filtered.data.wt <- filtered.data.wt[which(filtered.data.wt$Site == "lung"), ]


# Negative-control wells (TAG == "NC-0") of the raw data. The same selection
# serves both the MUT and the WT pass below.
NC <- GDSC_data[which(GDSC_data$TAG == "NC-0"), ]

# MUT: control wells sharing a COSMIC_ID and a SCAN_ID with the mutant rows.
GDSC_ID <- unique(filtered.data1$COSMIC_ID)
NC_GDSC <- NC[NC$COSMIC_ID %in% GDSC_ID, ]
SCAN_ID <- unique(filtered.data1$SCAN_ID)
NC_GDSC_TOT_MUT <- NC_GDSC[NC_GDSC$SCAN_ID %in% SCAN_ID, ]

# WT: same selection against the WT rows (the helper objects are overwritten).
GDSC_ID <- unique(filtered.data.wt$COSMIC_ID)
NC_GDSC <- NC[NC$COSMIC_ID %in% GDSC_ID, ]
SCAN_ID <- unique(filtered.data.wt$SCAN_ID)
NC_GDSC_TOT_WT <- NC_GDSC[NC_GDSC$SCAN_ID %in% SCAN_ID, ]



# Mean negative-control intensity per scan and cell line.
# MUT cell lines
meantable <- summarise(
  group_by(NC_GDSC_TOT_MUT, SCAN_ID, CELL_LINE_NAME),
  mean = mean(INTENSITY)
)

# WT cell lines
meantable2 <- summarise(
  group_by(NC_GDSC_TOT_WT, SCAN_ID, CELL_LINE_NAME),
  mean = mean(INTENSITY)
)

#write.csv(meantable, file="GDSC NC means for MUT.csv")
#write.csv(meantable2, file="GDSC NC means for WT.csv")


# Long -> wide: one row per cell line / scan / drug, one column per tested
# concentration. Missing combinations are filled with Inf and then blanked out.

# MUT cell lines
GDSC_search <- dcast(
  data = filtered.data1,
  formula = CELL_LINE_NAME + SCAN_ID + DRUG_ID ~ CONC,
  fun.aggregate = sum,
  fill = Inf,
  value.var = "INTENSITY"
)
GDSC_search[GDSC_search == Inf] <- ""

#write.csv(GDSC_search,file="All GDSC drugs MUT.csv")

# WT cell lines
GDSC_search2 <- dcast(
  data = filtered.data.wt,
  formula = CELL_LINE_NAME + SCAN_ID + DRUG_ID ~ CONC,
  fun.aggregate = sum,
  fill = Inf,
  value.var = "INTENSITY"
)
GDSC_search2[GDSC_search2 == Inf] <- ""

#write.csv(GDSC_search2,file="All GDSC drugs WT.csv")


# Attach the negative-control mean of the matching scan and cell line to every
# reshaped row (left join, so rows without a control mean are kept).
GDSC_data3 <- merge(
  x = GDSC_search,
  y = meantable,
  by = c("SCAN_ID", "CELL_LINE_NAME"),
  all.x = TRUE
)

GDSC_data4 <- merge(
  x = GDSC_search2,
  y = meantable2,
  by = c("SCAN_ID", "CELL_LINE_NAME"),
  all.x = TRUE
)

#write.csv(GDSC_data3, file="Merged neg control and points MUT.csv")
#write.csv(GDSC_data4, file="Merged neg control and points WT.csv")



# Fitted dose-response rows for the MUT cell lines: same COSMIC_IDs, then
# same DRUG_IDs, as the mutant raw data.
AUC1 <- unique(filtered.data1$COSMIC_ID)
AUC2 <- GDSC_AUC[GDSC_AUC$COSMIC_ID %in% AUC1, ]
AUC3 <- unique(filtered.data1$DRUG_ID)
AUC4 <- AUC2[AUC2$DRUG_ID %in% AUC3, ]

# Attach the AUC data to the reshaped MUT table (left join).
GDSC_data3_AUC4 <- merge(
  x = GDSC_data3,
  y = AUC4,
  by = c("CELL_LINE_NAME", "DRUG_ID"),
  all.x = TRUE
)
#write.csv(GDSC_data3_AUC4, file = "GDSC_MUT_drug_AUC.csv")

# Attach the mutation annotation (left join) and export.
GDSC_data3_AUC4_GDSC_mutations <- merge(
  x = GDSC_data3_AUC4,
  y = GDSC_mutations,
  by = c("CELL_LINE_NAME"),
  all.x = TRUE
)
write.csv(GDSC_data3_AUC4_GDSC_mutations, file = "GDSC_MUT_drug_AUC_mutations.csv")


# Fitted dose-response rows for the WT cell lines: same COSMIC_IDs, then same
# DRUG_IDs, as the WT raw data. AUC1..AUC4 are overwritten with the WT values.
AUC1 <- unique(filtered.data.wt$COSMIC_ID)
AUC2 <- GDSC_AUC[GDSC_AUC$COSMIC_ID %in% AUC1, ]
AUC3 <- unique(filtered.data.wt$DRUG_ID)
AUC4 <- AUC2[AUC2$DRUG_ID %in% AUC3, ]

# Attach the AUC data to the reshaped WT table (left join) and export.
# Note: the object name is reused from the MUT step but now holds WT data.
GDSC_data3_AUC4 <- merge(
  x = GDSC_data4,
  y = AUC4,
  by = c("CELL_LINE_NAME", "DRUG_ID"),
  all.x = TRUE
)
write.csv(GDSC_data3_AUC4, file = "GDSC_WT_drug_AUC.csv")





# Mean AUC and number of rows of the WT fitted curves, per drug.
# The numbers in the comments are the recorded results (mean; row count).

Gefitinib_ctrl <- AUC4[AUC4$DRUG_NAME == "Gefitinib", ]
mean(as.numeric(Gefitinib_ctrl$AUC), na.rm = TRUE)
length(Gefitinib_ctrl$AUC)
#hist(as.numeric(Gefitinib_ctrl$AUC))
# recorded: AUC neg ctrl 0.9572212; 94

# Afatinib, DRUG_ID 1032
Afatinib_ctrl1 <- AUC4[AUC4$DRUG_ID == 1032, ]
mean(as.numeric(Afatinib_ctrl1$AUC), na.rm = TRUE)
length(Afatinib_ctrl1$AUC)
# recorded: AUC neg ctrl 0.9385724, max conc 0.5; 94

# Afatinib, DRUG_ID 1377
Afatinib_ctrl2 <- AUC4[AUC4$DRUG_ID == 1377, ]
mean(as.numeric(Afatinib_ctrl2$AUC), na.rm = TRUE)
length(Afatinib_ctrl2$AUC)
# recorded: AUC neg ctrl 0.9025666, max conc 10; 105

# Both afatinib series together
Afatinib_ctrl <- AUC4[AUC4$DRUG_NAME == "Afatinib", ]
mean(as.numeric(Afatinib_ctrl$AUC), na.rm = TRUE)
length(Afatinib_ctrl$AUC)
# recorded: AUC neg ctrl 0.9195744; 199

Lapatinib_ctrl <- AUC4[AUC4$DRUG_NAME == "Lapatinib", ]
mean(as.numeric(Lapatinib_ctrl$AUC), na.rm = TRUE)
length(Lapatinib_ctrl$AUC)
# recorded: AUC neg ctrl 0.9509353; 45

Erlotinib_ctrl <- AUC4[AUC4$DRUG_NAME == "Erlotinib", ]
mean(as.numeric(Erlotinib_ctrl$AUC), na.rm = TRUE)
length(Erlotinib_ctrl$AUC)
# recorded: AUC neg ctrl 0.9505726; 43


#Calculate the normalized and fitted drug response values.




#######################
#FINALIZING THE DATASET
#######################

#Combine all three dataset files into one file that has all the fitted drug values in one concentration series with cell line names unified.
# Check CNV status from Harmonizome (http://amp.pharm.mssm.edu/Harmonizome/) and RAS status from mutation files.







#####################
# p value calculation
#####################

#Two-sample t-test (independent)
# compare the differences between the averages
#Note that, unpaired two-samples t-test can be used only under certain conditions:
#when the two groups of samples (A and B), being compared, are normally distributed. This can be checked using Shapiro-Wilk test.
#and when the variances of the two groups are equal. This can be checked using F-test.

#############################################
# In our situation the F-test results favor the Welch t-test (the variances of the two groups being compared differ, i.e. heteroscedasticity), because of the very different group sizes. With this correct method the p-values are not significant, but if we assumed equal variances we could use the two-sample t-test, where the difference is significant in all cases except lapatinib. The Welch test becomes significant if we pool erlotinib, gefitinib and afatinib together and compare WT vs. exon 19 del in the pooled data (p-value = 0.002963), unlike for lapatinib.
#############################################



# Curated AUC table used for the tests. This re-uses the name CTRP_AUC_data,
# so any object of that name loaded earlier in the session is replaced.
CTRP_AUC_data <- read.csv(
  file = "CTRP AUC values.csv",
  header = TRUE, sep = ";", stringsAsFactors = FALSE
)

####################
# TEST FOR ERLOTINIB
CTRP_AUC_data1 <- CTRP_AUC_data[CTRP_AUC_data$Compound == "Erlotinib", ]

# a = AUC of the WT cell lines, b = AUC of the exon 19 del cell lines;
# each is inspected with a histogram plus normal curve.
a <- CTRP_AUC_data1$AUC[CTRP_AUC_data1$Protein.change == "WT"]
plotNormalHistogram(a)

b <- CTRP_AUC_data1$AUC[CTRP_AUC_data1$Protein.change == "p.ELREA746del"]
plotNormalHistogram(b)

# F-test for equality of the two variances
var.test(x = a, y = b)
#F test to compare two variances
#   data:  a and b
#   F = 0.10057, num df = 89, denom df = 3, p-value =
#   2.035e-05
#   alternative hypothesis: true ratio of variances is not equal to 1
#   95 percent confidence interval:
#   0.007202728 0.328526133
#   sample estimates:
#   ratio of variances 
#   0.1005713

# Welch t-test (unequal variances)
t.test(x = a, y = b, paired = FALSE, var.equal = FALSE)
#Welch Two Sample t-test
#   data:  a and b
#   t = 1.852, df = 3.0269, p-value = 0.1603
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     -3.71757 14.19329
#   sample estimates:
#     mean of x mean of y 
#   13.441987  8.204125

# Student t-test (equal variances assumed)
t.test(x = a, y = b, paired = FALSE, var.equal = TRUE)
#   data:  a and b
#   t = 5.0393, df = 92, p-value = 2.325e-06
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     3.173518 7.302205
#   sample estimates:
#     mean of x mean of y 
#   13.441987  8.204125



####################
# TEST FOR GEFITINIB
CTRP_AUC_data2 <- CTRP_AUC_data[CTRP_AUC_data$Compound == "Gefitinib", ]

# a = AUC of the WT cell lines, b = AUC of the exon 19 del cell lines;
# each is inspected with a histogram plus normal curve.
a <- CTRP_AUC_data2$AUC[CTRP_AUC_data2$Protein.change == "WT"]
plotNormalHistogram(a)

b <- CTRP_AUC_data2$AUC[CTRP_AUC_data2$Protein.change == "p.ELREA746del"]
plotNormalHistogram(b)

# F-test for equality of the two variances
var.test(x = a, y = b)
#   data:  a and b
#   F = 0.093771, num df = 85, denom df = 3, p-value =
#     1.014e-05
#   alternative hypothesis: true ratio of variances is not equal to 1
#   95 percent confidence interval:
#     0.006714337 0.306996758
#   sample estimates:
#     ratio of variances 
#   0.09377099

# Welch t-test (unequal variances)
t.test(x = a, y = b, paired = FALSE, var.equal = FALSE)
#Welch Two Sample t-test
#   data:  a and b
#   t = 2.0305, df = 3.0262, p-value = 0.1345
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     -3.054038 13.967743
#   sample estimates:
#     mean of x mean of y 
#   12.43210   6.97525  

# Student t-test (equal variances assumed)
t.test(x = a, y = b, paired = FALSE, var.equal = TRUE)
#Two Sample t-test
#   data:  a and b
#   t = 5.6338, df = 88, p-value = 2.085e-07
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     3.531983 7.381721
#   sample estimates:
#     mean of x mean of y 
#   12.43210   6.97525




####################
# TEST FOR AFATINIB
CTRP_AUC_data3 <- CTRP_AUC_data[CTRP_AUC_data$Compound == "Afatinib", ]

# a = AUC of the WT cell lines, b = AUC of the exon 19 del cell lines;
# each is inspected with a histogram plus normal curve.
a <- CTRP_AUC_data3$AUC[CTRP_AUC_data3$Protein.change == "WT"]
plotNormalHistogram(a)

b <- CTRP_AUC_data3$AUC[CTRP_AUC_data3$Protein.change == "p.ELREA746del"]
plotNormalHistogram(b)

# F-test for equality of the two variances
var.test(x = a, y = b)
#F test to compare two variances
#   data:  a and b
#   F = 0.17995, num df = 87, denom df = 3, p-value =
#     0.003098
#   alternative hypothesis: true ratio of variances is not equal to 1
#   95 percent confidence interval:
#     0.0128866 0.5884749
#   sample estimates:
#     ratio of variances 
#   0.1799528 


# Welch t-test (unequal variances)
t.test(x = a, y = b, paired = FALSE, var.equal = FALSE)
#Welch Two Sample t-test
#   data:  a and b
#   t = 2.36, df = 3.0493, p-value = 0.098
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     -1.708909 11.872982
#   sample estimates:
#     mean of x mean of y 
#   10.368911  5.286875


# Student t-test (equal variances assumed)
t.test(x = a, y = b, paired = FALSE, var.equal = TRUE)
#Two Sample t-test
#   data:  a and b
#   t = 5.0902, df = 90, p-value = 1.946e-06
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     3.098558 7.065514
#   sample estimates:
#     mean of x mean of y 
#   10.368911  5.286875 




####################
# TEST FOR LAPATINIB
CTRP_AUC_data4 <- CTRP_AUC_data[CTRP_AUC_data$Compound == "Lapatinib", ]

# a = AUC of the WT cell lines, b = AUC of the exon 19 del cell lines;
# each is inspected with a histogram plus normal curve.
a <- CTRP_AUC_data4$AUC[CTRP_AUC_data4$Protein.change == "WT"]
plotNormalHistogram(a)

b <- CTRP_AUC_data4$AUC[CTRP_AUC_data4$Protein.change == "p.ELREA746del"]
plotNormalHistogram(b)

# F-test for equality of the two variances
var.test(x = a, y = b)
#F test to compare two variances
#   data:  a and b
#   F = 0.35828, num df = 85, denom df = 2, p-value =
#     0.134
#   alternative hypothesis: true ratio of variances is not equal to 1
#   95 percent confidence interval:
#     0.009073606 1.380711738
#   sample estimates:
#     ratio of variances 
#   0.3582816


# Welch t-test (unequal variances)
t.test(x = a, y = b, paired = FALSE, var.equal = FALSE)
#Welch Two Sample t-test
#   data:  a and b
#   t = 1.0569, df = 2.0503, p-value = 0.399
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     -4.721640  7.893812
#   sample estimates:
#     mean of x mean of y 
#   12.86479  11.27870


# Student t-test (equal variances assumed)
t.test(x = a, y = b, paired = FALSE, var.equal = TRUE)
#Two Sample t-test
#   data:  a and b
#   t = 1.7116, df = 87, p-value = 0.09054
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     -0.2557874  3.4279594
#   sample estimates:
#   mean of x mean of y 
#   12.86479  11.27870 


################################
# TEST FOR POOLED ERLO+GEFI+AFA
# Pool the erlotinib, gefitinib and afatinib rows into one table.
CTRP_AUC_data5 <- rbind(CTRP_AUC_data1, CTRP_AUC_data2, CTRP_AUC_data3)

# a = AUC of the WT cell lines, b = AUC of the exon 19 del cell lines;
# each is inspected with a histogram plus normal curve.
a <- CTRP_AUC_data5$AUC[CTRP_AUC_data5$Protein.change == "WT"]
plotNormalHistogram(a)

b <- CTRP_AUC_data5$AUC[CTRP_AUC_data5$Protein.change == "p.ELREA746del"]
plotNormalHistogram(b)

# F-test for equality of the two variances
var.test(x = a, y = b)
#F test to compare two variances
#   data:  a and b
#   F = 0.20363, num df = 263, denom df = 11, p-value =
#     1.276e-06
#   alternative hypothesis: true ratio of variances is not equal to 1
#   95 percent confidence interval:
#     0.06995375 0.41584486
#   sample estimates:
#     ratio of variances 
#   0.2036349


# Welch t-test (unequal variances)
t.test(x = a, y = b, paired = FALSE, var.equal = FALSE)
#Welch Two Sample t-test
#   data:  a and b
#   t = 3.7777, df = 11.205, p-value = 0.002963
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     2.204915 8.328219
#   sample estimates:
#     mean of x mean of y 
#   12.088651  6.822083 

### -> BECOMES SIGNIFICANT WHEN SAMPLE SIZE IS INCREASED


# Student t-test (equal variances assumed)
t.test(x = a, y = b, paired = FALSE, var.equal = TRUE)
#Two Sample t-test
#   data:  a and b
#   t = 7.6468, df = 274, p-value = 3.501e-13
#   alternative hypothesis: true difference in means is not equal to 0
#   95 percent confidence interval:
#     3.910689 6.622445
#   sample estimates:
#     mean of x mean of y 
#   12.088651  6.822083








#####################
# Figure
#####################

#publication theme modified from:
# https://rpubs.com/Koundy/71792

# List the font mappings of the Windows graphics device. windowsFonts() exists
# only on Windows builds of R, so the call is skipped elsewhere instead of
# stopping the script with "could not find function".
if (.Platform$OS.type == "windows") windowsFonts()

# Font family used by the publication theme and the figure below.
FontName <- "sans"

# Publication-style ggplot2 theme (modified from https://rpubs.com/Koundy/71792).
#
# Arguments:
#   base_size   - base font size handed to ggthemes::theme_foundation().
#   base_family - font family applied to the title, axis, legend and strip
#                 text. Defaults to the global FontName. Previously this
#                 argument was accepted but ignored (the global was always
#                 used); it is now honoured, and the default gives the same
#                 result as before.
# Returns:
#   A ggplot2 theme object to be added to a plot with `+`.
theme_Publication <- function(base_size=14,base_family=FontName) {
  # Kept so the function still attaches the packages it relies on when called.
  library(grid)
  library(ggthemes)
  (theme_foundation(base_size=base_size)
    + theme(plot.title = element_text(face = "bold",
                                      size = rel(1.2), hjust = 0.5,family=base_family),
            text = element_text(),
            panel.background = element_rect(colour = NA),
            plot.background = element_rect(colour = NA),
            panel.border = element_rect(colour = NA),
            axis.title = element_text(face = "bold",size = rel(1),family=base_family),
            axis.title.y = element_text(angle=90,vjust =2),
            axis.title.x = element_text(vjust = -0.2),
            axis.text = element_text(family=base_family), 
            axis.line = element_line(colour="black"),
            axis.ticks = element_line(),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            legend.key = element_rect(colour = NA),
            legend.position = "bottom",
            legend.direction = "horizontal",
            legend.key.size= unit(0.2, "cm"),
            # A plain unit for `legend.margin` is the pre-2.2.0 ggplot2 form;
            # newer versions warn and translate it into `legend.spacing` plus
            # an empty `legend.margin`. Spell that translation out directly.
            legend.spacing = unit(0, "cm"),
            legend.margin = margin(0, 0, 0, 0),
            legend.text = element_text(family=base_family),
            legend.title = element_text(face="italic",family=base_family),
            plot.margin=unit(c(10,5,5,5),"mm"),
            strip.background=element_rect(colour="#f0f0f0",fill="#f0f0f0"),
            strip.text = element_text(face="bold",family=base_family)
    ))
  
}


#####################################
# Combined, fitted drug-response table prepared for the figure.
cells <- read.table(
  file = "Search results for exon 19 del mutation data Figure.csv",
  header = TRUE,
  sep = ";"
)

# One ID per input row, so each row is drawn as its own line.
cells$uniqID <- paste0("ID", 1:nrow(cells))

# Wide -> long over the concentration columns.
# NOTE(review): `然` in the column names and in the pattern below looks like a
# mis-encoded "µM" - confirm against the column names R produces for the CSV.
Cells_Long <- gather(
  data = cells,
  key = concentration,
  value = SurvCellsPer,
  X0.然:X66.然,
  factor_key = TRUE
)

# Strip the "X" prefix and the unit suffix to get a numeric concentration.
conc_label <- gsub("X", "", Cells_Long$concentration)
conc_label <- gsub(".然", "", conc_label)
Cells_Long$concentrationNum <- as.numeric(conc_label)
rm(conc_label)

# Drop rows without a usable concentration or response value.
Cells_Long2 <- Cells_Long[!is.na(Cells_Long$concentrationNum), ]
Cells_Long2 <- Cells_Long2[!is.na(Cells_Long2$SurvCellsPer), ]

# Figure subset: GDSC database, gefitinib.
Cells_Long3 <- Cells_Long2[Cells_Long2$Database == "GDSC", ]
Cells_Long4 <- Cells_Long3[Cells_Long3$Compound == "Gefitinib", ]

#black for WT, red for exon19
# p.E746_A750del, WT
cbPalette <- c("red", "black")

# One line plus points per row ID, coloured by protein change.
plottoshow <- ggplot(
  data = Cells_Long4,
  aes(
    x = factor(concentrationNum),
    y = SurvCellsPer,
    color = Protein.change,
    group = uniqID
  ),
  na.rm = TRUE
) +
  geom_line() +
  geom_point() +
  scale_colour_manual(values = cbPalette)
plottoshow <- plottoshow + theme_Publication()
plot(plottoshow)