# Run from the project directory so that the output files written at the end
# of the script land there. The path is machine-specific, so only switch to it
# when it exists instead of aborting the whole run on other machines.
workDir <- "C:/Users/anjoroge/Desktop/edwin/publication bias"
if (dir.exists(workDir)) {
  setwd(workDir)
} else {
  warning("Working directory not found, staying in ", getwd(), ": ", workDir,
          call. = FALSE)
}

library(RCurl)
library(XML)
library(tm)

# A function to get all abstracts and pubmed ids for papers from the journal
# "journaltitle" in the year "year" by scraping the Pubmed API.
#
# Arguments:
#   journaltitle  journal title abbreviation; spaces are replaced by "+" and
#                 the result is searched with the [ta] field tag.
#   year          publication year, searched with the [dp] field tag.
#
# Returns a list with three parallel vectors, one element per record in the
# efetch response: abstracts, pmid and titles. An entry is whatever xmlValue()
# yields for that node (presumably NA when the node is absent -- confirm
# against the XML package documentation).
getAbstractsPmids <- function(journaltitle, year) {
  # esearch: run the query and keep the result set on the history server.
  # NOTE(review): the E-utilities are documented under https:// URLs; confirm
  # that plain http is still accepted from this machine.
  url <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
  q <- paste0("db=pubmed&term=", gsub(" ", "+", journaltitle),
              "[ta]+AND+", year, "[dp]&usehistory=y")
  esearch <- xmlTreeParse(getURL(paste0(url, q)), useInternalNodes = TRUE)
  webenv <- xmlValue(getNodeSet(esearch, "//WebEnv")[[1]])
  key <- xmlValue(getNodeSet(esearch, "//QueryKey")[[1]])

  # efetch: download the stored result set as XML.
  url <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
  q <- "db=pubmed&retmode=xml&rettype=abstract"
  efetch <- xmlTreeParse(
    getURL(paste0(url, q, "&WebEnv=", webenv, "&query_key=", key)),
    useInternalNodes = TRUE
  )

  r <- xmlRoot(efetch)
  n <- xmlSize(r)
  abstracts <- pmid <- titles <- rep(NA_character_, n)
  # seq_len() keeps the loop from running at all when no records came back
  # (1:n would iterate over c(1, 0) for n == 0).
  for (i in seq_len(n)) {
    record <- r[[i]][[1]]
    abstracts[i] <- xmlValue(record[["Article"]][["Abstract"]])
    pmid[i] <- xmlValue(record[["PMID"]])
    titles[i] <- xmlValue(record[["Article"]][["ArticleTitle"]])
  }
  list(abstracts = abstracts, pmid = pmid, titles = titles)
}

# A function to remove trailing non-digit characters (punctuation, spaces,
# letters, ...) from the P-value strings, so that the string ends with its
# last digit. A string that contains no digit at all becomes "".
removeTrailing <- function(string) {
  # One anchored substitution instead of chopping one character per loop pass;
  # unlike the loop it also terminates when there is no digit in the string.
  sub("[^0-9]+$", "", string)
}

# A function to convert the scientific notation used by journals into
# numeric values that can be analyzed.
# convertScientific(string): turn one P-value string into a number.
#   Plain decimals ("0.05") go straight through as.numeric(). Strings that
#   contain a multiplication sign or an "x" are read as "<mantissa> x 10-<exp>"
#   and returned as mantissa * 10^(-exp). Anything that cannot be parsed
#   yields NA.
convertScientific <- function(string) {
  punctThenSpace <- "[[:punct:]][[:space:]]"
  if (length(grep(punctThenSpace, string)) > 0) {
    # Keep only the text in front of the first "punctuation + whitespace".
    string <- strsplit(string, punctThenSpace)[[1]][1]
    # Nothing numeric is left: no value to report, and nothing for
    # removeTrailing() to anchor on.
    if (length(grep("[0-9]", string)) == 0) {
      return(NA_real_)
    }
    string <- removeTrailing(string)
  }

  # Multiplication sign (U+00D7) or a plain "x"; written as an escape so the
  # pattern does not depend on the encoding of this source file.
  timesSign <- "[\u00d7x]"
  if (length(grep(timesSign, string)) > 0) {
    # "[[:space:]]" is the whitespace class; a bare "[:space:]" would instead
    # delete the letters ":", "s", "p", "a", "c" and "e".
    string <- gsub("[[:space:]]", "", string)
    parts <- strsplit(string, timesSign)[[1]]
    mantissa <- as.numeric(parts[1])
    # The exponent follows a hyphen, an en dash (U+2013) or a minus sign
    # (U+2212), e.g. "10-4".
    exponent <- as.numeric(strsplit(parts[2], "[-\u2013\u2212]")[[1]][2])
    return(mantissa * 10^(-exponent))
  }
  as.numeric(string)
}

# Private helper for getPvalues: collect every P-value that follows
# "P <operator>" in the abstracts. "operator" is a regular-expression fragment
# for the comparison sign; "truncFlag" is the value stored in "trunc" for each
# hit. Returns list(pvalues, ids, trunc) with one entry per P-value found.
.scrapePvalues <- function(abstract, pmid, operator, truncFlag) {
  pvalues <- numeric(0)
  ids <- numeric(0)
  hits <- grep(paste0("[Pp][[:space:]]?", operator), abstract)
  splitter <- paste0("[[:space:](][Pp][[:space:]]?", operator)
  # Looping over the hits themselves is a no-op when nothing matched.
  for (k in hits) {
    pieces <- strsplit(abstract[k], splitter)[[1]]
    # pieces[1] is the text in front of the first "P <operator>", so it can
    # never be a P-value; only the pieces that follow a match are candidates.
    for (piece in pieces[-1]) {
      # A candidate must show a digit or a decimal point within its first two
      # characters ...
      if (length(grep("[.0123456789]", substr(piece, 1, 2))) == 0) next
      # ... and must not start with a capital letter.
      if (length(grep("[A-Z]", substr(piece, 1, 1))) > 0) next
      # Leading run of punctuation, digits, "x" and whitespace.
      lead <- strsplit(piece, "[^[:punct:][:digit:]x[:space:]]")[[1]][1]
      # No digit at all (e.g. a lone "."): skip rather than hand
      # removeTrailing() a string it has nothing to anchor on.
      if (length(grep("[0-9]", lead)) == 0) next
      lead <- removeTrailing(lead)
      lead <- gsub(" ", "", lead)
      pvalues <- c(pvalues, as.numeric(convertScientific(lead)))
      ids <- c(ids, pmid[k])
    }
  }
  list(pvalues = pvalues, ids = ids, trunc = rep(truncFlag, length(pvalues)))
}

# A function to scrape the P-values from a vector of abstracts with
# corresponding pubmed ids.
#
# Returns a list of three parallel vectors:
#   pvalues  the numeric P-values (NA where the text could not be converted)
#   ids      the pubmed id of the abstract each value came from
#   trunc    1 for truncated values ("P < x"), 0 for exact ones ("P = x")
# Truncated values come first, then the exact ones.
getPvalues <- function(abstract, pmid) {
  # Truncated P-values: "<" or the less-than-or-equal sign (U+2264). "=" must
  # not be in this class, otherwise every exact value is reported twice, once
  # of them flagged as truncated.
  truncated <- .scrapePvalues(abstract, pmid, "[<\u2264]", 1)
  # Exact (non-truncated) P-values.
  exact <- .scrapePvalues(abstract, pmid, "=", 0)
  list(
    pvalues = c(truncated$pvalues, exact$pvalues),
    ids = c(truncated$ids, exact$ids),
    trunc = c(truncated$trunc, exact$trunc)
  )
}

# Journals to scrape (PubMed title abbreviations).
# NOTE(review): "Angle Orthod " carries a trailing space, which ends up in the
# query and in the row names -- confirm whether that is intended.
journals <- c(
  "J Contemp Dent Pract", "Br J Oral Maxillofac Surg",
  "Int J Oral Maxillofac Surg", "J Clin Dent",
  "Int J Dent Hyg", "BMC Oral Health", "Oral Health Prev Dent",
  "Community Dent Oral Epidemiol", "J Oral Sci", "Braz Oral Res",
  "J Adhes Dent",
  "J Clin Pediatr Dent", "J Craniofac Surg", "Am J Dent",
  "Community Dent Health", "Gerodontology",
  "J Oral Maxillofac Surg", "Int Endod J", "Eur J Orthod",
  "J Oral Implantol", "Gen Dent", "J Endod",
  "J Clin Periodontol", "J Dent", "J Periodontol", "Caries Res",
  "J Periodontal Res", "Arch Oral Biol",
  "J Prosthet Dent", "Int Dent J", "Br Dent J", "Angle Orthod ",
  "Clin Implant Dent Relat Res"
)
# "World J Orthod",

years <- 2004:2014

columnNames <- c("pvalue", "pvalueTruncated", "pubmedID", "year",
                 "abstract", "title")

# Number of records returned per journal (rows) and year (columns).
npapers <- matrix(NA, nrow = length(journals), ncol = length(years))

# One slot per journal/year; filled with that query's rows and bound together
# once at the end instead of growing a matrix inside the loop.
chunks <- vector("list", length(journals) * length(years))

# A query that comes back as a single record without an abstract is treated
# as a failed download and repeated, but only this many times in total so a
# genuine one-paper/no-abstract year cannot stall the run forever.
maxAttempts <- 5

for (i in seq_along(journals)) {
  for (j in seq_along(years)) {
    cat(journals[i], " ", years[j], " ", sep = "")

    tmpData <- getAbstractsPmids(journals[i], years[j])
    attempts <- 1
    while (length(tmpData$abstracts) == 1 && is.na(tmpData$abstracts[1]) &&
           attempts < maxAttempts) {
      Sys.sleep(1)
      tmpData <- getAbstractsPmids(journals[i], years[j])
      attempts <- attempts + 1
    }
    cat("Downloaded ")

    npapers[i, j] <- length(tmpData$abstracts)
    tmpOut <- getPvalues(tmpData$abstracts, tmpData$pmid)
    nPvalues <- length(tmpOut$pvalues)
    if (nPvalues > 0) {
      # Position of each P-value's abstract within this download.
      aa <- match(tmpOut$ids, tmpData$pmid)
      tmpMatrix <- cbind(tmpOut$pvalues, tmpOut$trunc, as.numeric(tmpOut$ids),
                         rep(years[j], nPvalues), tmpData$abstracts[aa],
                         tmpData$titles[aa])
      rownames(tmpMatrix) <- rep(journals[i], nPvalues)
      chunks[[(i - 1) * length(years) + j]] <- tmpMatrix
    }
    cat("Done\n")
  }
}

# rbind() ignores the slots that stayed NULL; if every slot is NULL the result
# is NULL, so fall back to an empty matrix with the right number of columns.
pvalueData <- do.call(rbind, chunks)
if (is.null(pvalueData)) {
  pvalueData <- matrix(NA_character_, nrow = 0, ncol = length(columnNames))
}
colnames(pvalueData) <- columnNames

# Drop the rows whose P-value could not be converted; drop = FALSE keeps the
# result a matrix even when a single row is left.
pvalueData <- pvalueData[!is.na(pvalueData[, 1]), , drop = FALSE]

save(pvalueData, npapers, file = "overall3.rda")
write.csv(pvalueData, file = "overall3.csv")