############################################
# Drugbank XML file extraction (R script)
# Date: 20/01/2016
# Contact: ashenafi.legehar@helsinki.fi, leo.ghemtio@helsinki.fi
############################################
#
# Parses drugbank.xml, splits the records into small molecules and biotech
# drugs, builds one data frame per group and writes the small-molecule
# table to drugbank.csv.

# Change the working directory; input and output paths below are relative.
setwd('/home/ashenafi/Documents/project/drugdata/DRUGBANK')

# Install the XML package only when it is missing, then load it.
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
library(XML)

# Parse the whole document into an R-level tree.
xml_data1 <- xmlTreeParse('drugbank.xml', options = HUGE)

#### Data understanding before data mining and extraction ####
class(xml_data1)
# Root of the XML file
xmltop <- xmlRoot(xml_data1)
# Node name
xmlName(xmltop)
# Number of children
xmlSize(xmltop)

# Nested-list view of the document, one element per child of the root.
# NOTE(review): the conversion step is assumed to be XML::xmlToList(); the
# '.attrs' / 'text' element names used below match its output -- confirm.
xml_list <- xmlToList(xmltop)

# ---- Helpers ----------------------------------------------------------

# Return the element called `name` from a list or named vector, or NULL
# when `x` is NULL, unnamed, or has no such element (never errors).
child <- function(x, name) {
  if (is.null(x) || is.null(names(x)) || !(name %in% names(x))) {
    return(NULL)
  }
  x[[name]]
}

# Collapse an arbitrary value to a single string; `default` is returned
# for NULL, empty or NA input.
as_string <- function(value, default) {
  if (is.null(value) || length(value) == 0L) {
    return(default)
  }
  text <- as.character(unlist(value, use.names = FALSE))
  if (length(text) == 0L || is.na(text[1L])) default else text[1L]
}

# Identifier of a record: the 'text' part of its first child, or the first
# child itself when it carries no attributes.
drug_id <- function(drug) {
  first <- if (length(drug) > 0L) drug[[1L]] else NULL
  id <- child(first, "text")
  if (is.null(id)) {
    id <- first
  }
  as_string(id, NA_character_)
}

# Free-text field with markup tags stripped; 'NULL' when the field is
# absent (same sentinel the CSV has always used).
text_field <- function(drug, name) {
  raw <- child(drug, name)
  if (is.null(raw)) {
    return('NULL')
  }
  gsub("<.*?>", "", as_string(raw, ""))
}

# Split a property container into one entry per property.  The container
# may be a matrix with one column per property or a plain list of entries.
property_entries <- function(props) {
  if (is.null(props) || length(props) == 0L) {
    return(list())
  }
  if (is.matrix(props)) {
    return(lapply(seq_len(ncol(props)), function(j) props[, j]))
  }
  if (is.list(props)) {
    return(unname(props))
  }
  list()
}

# Read one field of a property entry by name.  Unnamed entries fall back
# to position: 2 = value and 3 = source; position 1 is presumably the
# kind -- TODO confirm.
entry_field <- function(entry, name, position) {
  keys <- names(entry)
  value <- NULL
  if (!is.null(keys)) {
    if (name %in% keys) {
      value <- entry[[name]]
    }
  } else if (length(entry) >= position) {
    value <- entry[[position]]
  }
  trimws(as_string(value, NA_character_))
}

# First property whose kind (and, when `from` is given, source) matches.
# Returns list(value, source) or NULL when nothing matches.
find_property <- function(props, kind, from = NULL) {
  for (entry in property_entries(props)) {
    entry_source <- entry_field(entry, "source", 3L)
    if (!identical(entry_field(entry, "kind", 1L), kind)) {
      next
    }
    if (!is.null(from) && !identical(entry_source, from)) {
      next
    }
    return(list(value = entry_field(entry, "value", 2L),
                source = entry_source))
  }
  NULL
}

# Value and source vectors (one element per drug) for a single property
# kind; missing values/sources are replaced by the given defaults.
property_columns <- function(drugs, container, kind, from,
                             value_default, source_default) {
  hits <- lapply(drugs, function(drug) {
    find_property(child(drug, container), kind, from)
  })
  pick <- function(field, default) {
    vapply(hits, function(hit) {
      if (is.null(hit) || is.na(hit[[field]])) default else hit[[field]]
    }, character(1), USE.NAMES = FALSE)
  }
  list(value = pick("value", value_default),
       source = pick("source", source_default))
}

# ---- Classify the records ---------------------------------------------

# 'type' attribute of every top-level entry; NA for entries without one.
drug_types <- vapply(xml_list, function(entry) {
  as_string(child(child(entry, '.attrs'), 'type'), NA_character_)
}, character(1), USE.NAMES = FALSE)

small_index <- which(drug_types == 'small molecule')
biotech_index <- which(drug_types == 'biotech')

# All small-molecule records and all biotech records.
small_molecule <- unname(xml_list[small_index])
biotech_m <- unname(xml_list[biotech_index])

# ---- Biotech drugs: selected experimental properties ------------------

# NOTE(review): kind labels are taken from the comments of the original
# script -- confirm they match the <kind> values in your drugbank.xml.
experimental_kinds <- c('Melting Point', 'Hydrophobicity',
                        'Isoelectric Point', 'Molecular Weight',
                        'Molecular Formula')

bio_columns <- c(
  list(
    vapply(biotech_m, drug_id, character(1), USE.NAMES = FALSE),
    vapply(biotech_m, function(drug) {
      as_string(child(drug, 'name'), NA_character_)
    }, character(1), USE.NAMES = FALSE),
    vapply(biotech_m, function(drug) {
      as_string(child(child(drug, 'groups'), 'group'), NA_character_)
    }, character(1), USE.NAMES = FALSE)
  ),
  lapply(experimental_kinds, function(kind) {
    property_columns(biotech_m, 'experimental-properties', kind, NULL,
                     NA_character_, NA_character_)$value
  })
)
# Temporary syntactic names; the real headers are assigned right after.
names(bio_columns) <- paste0("V", seq_along(bio_columns))
data_bio <- as.data.frame(bio_columns, stringsAsFactors = FALSE)
colnames(data_bio) <- c('id', 'drug_name', 'group', 'Melting Point',
                        'Hydrophobicity', 'Isoelectric Point',
                        'Molecular Weight', 'Molecular Formula')
# The data viewer only exists in interactive sessions.
if (interactive()) {
  View(data_bio)
}

# ---- Small molecules: selected calculated properties ------------------

# One row per exported value/source column pair, in output order.
# `source` = NA means "first entry of that kind, whatever its source".
# NOTE(review): kind/source labels follow the comments of the original
# script -- confirm they match the <kind>/<source> values in your file.
calculated_spec <- data.frame(
  kind = c('logP', 'logS', 'logP', 'Molecular Weight',
           'Molecular Formula', 'Polar Surface Area (PSA)',
           'Refractivity', 'Polarizability', 'Rotatable Bond Count',
           'H Bond Acceptor Count', 'H Bond Donor Count',
           'pKa (strongest acidic)', 'pKa (strongest basic)',
           'Physiological Charge'),
  source = c('ALOGPS', 'ALOGPS', 'ChemAxon', rep(NA_character_, 11)),
  numeric = c(TRUE, TRUE, TRUE, TRUE, FALSE, rep(TRUE, 9)),
  stringsAsFactors = FALSE
)

calculated_cols <- vector("list", 2L * nrow(calculated_spec))
for (k in seq_len(nrow(calculated_spec))) {
  source_filter <- calculated_spec$source[k]
  if (is.na(source_filter)) {
    source_filter <- NULL
  }
  # Defaults '0' (value) and 'Null' (source) are the sentinels the CSV has
  # always used, including '0' for a missing molecular formula.
  cols <- property_columns(small_molecule, 'calculated-properties',
                           calculated_spec$kind[k], source_filter,
                           '0', 'Null')
  calculated_cols[[2L * k - 1L]] <-
    if (calculated_spec$numeric[k]) as.numeric(cols$value) else cols$value
  calculated_cols[[2L * k]] <- cols$source
}

# Free-text pharmacology fields, in output order.
text_fields <- c('absorption', 'volume-of-distribution', 'metabolism',
                 'route-of-elimination', 'toxicity')

columns <- c(
  list(
    vapply(small_molecule, drug_id, character(1), USE.NAMES = FALSE),
    vapply(small_molecule, function(drug) {
      as_string(child(drug, 'name'), NA_character_)
    }, character(1), USE.NAMES = FALSE),
    vapply(small_molecule, function(drug) {
      as_string(child(child(drug, 'groups'), 'group'), NA_character_)
    }, character(1), USE.NAMES = FALSE)
  ),
  calculated_cols,
  lapply(text_fields, function(field) {
    vapply(small_molecule, text_field, character(1), name = field,
           USE.NAMES = FALSE)
  })
)

# Store the extracted data in a data frame.
names(columns) <- paste0("V", seq_along(columns))
data <- as.data.frame(columns, stringsAsFactors = FALSE)

# Rename the extracted data columns.
colnames(data) <- c('id', 'drug_name', 'group', 'logP', 'source_logP',
                    'logS', 'source_logS', 'logP ChemAxon', 'source_logP ',
                    'Molecular_weight', 'Mweight_source',
                    'Molecular_Formula', 'M_source', 'Polar_Surface_Area',
                    'PSA_source', 'Refractivity', 'Refractivity_source',
                    'Polarizability', 'Polarizability_source',
                    'Rotatable_bond_Count', 'RBC_source',
                    'Hydrogen_bond_Acceptor', 'HBAC_source',
                    'Hydrogen_bond_donor', 'BDC_source', 'Pka_acidic',
                    'Pka_source', 'Pka_basic', 'Pka_basic_source',
                    'Pysiological_charge', 'PC_source', 'Absorption',
                    'Volume-of-distribution', 'Metabolism',
                    'route-of-elimination', 'Toxicity')

# Export the extracted table as CSV.
write.csv(data, "drugbank.csv", row.names = FALSE)