# NOTE(review): machine-specific absolute path. The script writes its results
# with relative file names ("overall3.rda", "overall3.csv"), so they end up in
# this directory; the call fails on any machine where the folder is missing.
setwd("C:/Users/anjoroge/Desktop/edwin/publication bias")


library(RCurl)
library(XML)
library(tm)


# Download the PubMed records of one journal for one publication year through
# the NCBI E-utilities (an esearch call followed by an efetch call).
#
# Arguments:
#   journaltitle  journal title abbreviation, searched with the [ta] field tag;
#                 spaces are replaced by "+" to build the query string
#   year          publication year, searched with the [dp] field tag
#
# Returns a list of three parallel vectors with one element per child node of
# the efetch XML root:
#   abstracts  text of the Abstract node
#   pmid       text of the PMID node
#   titles     text of the ArticleTitle node
# Presumably an element is NA when a record lacks the corresponding node
# (the driver loop tests the first abstract with is.na()).

getAbstractsPmids <- function(journaltitle, year) {
  # NCBI requires HTTPS for the E-utilities, and getURL() does not follow
  # redirects by default, so the scheme has to be right in the URL itself.
  base <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

  # esearch: run the query and keep the hits on the history server
  term <- paste0(gsub(" ", "+", journaltitle), "[ta]+AND+", year, "[dp]")
  searchUrl <- paste0(base, "esearch.fcgi?", "db=pubmed&term=", term,
                      "&usehistory=y")
  esearch <- xmlTreeParse(getURL(searchUrl), useInternalNodes = TRUE)
  webenv <- xmlValue(getNodeSet(esearch, "//WebEnv")[[1]])
  key <- xmlValue(getNodeSet(esearch, "//QueryKey")[[1]])

  # efetch: pull the stored hits back as XML
  fetchUrl <- paste0(base, "efetch.fcgi?",
                     "db=pubmed&retmode=xml&rettype=abstract",
                     "&WebEnv=", webenv, "&query_key=", key)
  efetch <- xmlTreeParse(getURL(fetchUrl), useInternalNodes = TRUE)
  root <- xmlRoot(efetch)

  n <- xmlSize(root)
  abstracts <- pmid <- titles <- rep(NA, n)
  # NOTE(review): `1:n` is kept on purpose. With n == 0 it iterates over
  # c(1, 0), which appears to be what produces the single-NA result that the
  # driver loop treats as "download failed, retry" -- confirm before replacing
  # it with seq_len(n).
  for (i in 1:n) {
    record <- root[[i]][[1]]
    article <- record[["Article"]]
    abstracts[i] <- xmlValue(article[["Abstract"]])
    pmid[i] <- xmlValue(record[["PMID"]])
    titles[i] <- xmlValue(article[["ArticleTitle"]])
  }
  list(abstracts = abstracts, pmid = pmid, titles = titles)
}


# Strip everything that follows the last digit of a P-value string (closing
# brackets, full stops, spaces, ...), so that the result ends in a digit.
#
# Arguments:
#   string  character vector; every element is trimmed independently
#
# Returns the trimmed string(s). An element without any digit becomes "" and
# NA stays NA.
removeTrailing <- function(string) {
  # A single anchored substitution replaces the former character-by-character
  # loop, which never terminated when the input held no digit ("" or ".").
  sub("[^0-9]+$", "", string)
}


# Convert a P-value string as printed by journals into a number. Plain
# decimals ("0.05") and scientific notation written with a multiplication
# sign or the letter x ("5x10-8") are supported.
#
# Arguments:
#   string  a single P-value string
#
# Returns a numeric value; NA (with the usual coercion warning) when the text
# cannot be converted.
convertScientific <- function(string) {
  # Keep only the text in front of the first "punctuation + whitespace" pair.
  if (any(grepl("[[:punct:]][[:space:]]", string))) {
    string <- strsplit(string, "[[:punct:]][[:space:]]")[[1]][1]
    string <- removeTrailing(string)
  }

  # Written with \u escapes so the patterns survive a change of file encoding:
  # U+00D7 is the multiplication sign, U+2212 the typographic minus sign.
  timesSign <- "[\u00d7x]"
  minusSign <- "[\u2212-]"

  if (any(grepl(timesSign, string))) {
    # "[[:space:]]" is the whitespace class; a bare "[:space:]" would instead
    # delete the characters ':', 's', 'p', 'a', 'c' and 'e'.
    string <- gsub("[[:space:]]", "", string)
    parts <- strsplit(string, timesSign)[[1]]
    mantissa <- as.numeric(parts[1])
    # parts[2] looks like "10-8"; the exponent is whatever follows the minus
    exponent <- as.numeric(strsplit(parts[2], minusSign)[[1]][2])
    mantissa * 10^(-exponent)
  } else {
    as.numeric(string)
  }
}


# Scrape the reported P-values from a vector of abstracts.
#
# Two passes are made over the abstracts: the first collects truncated values
# written as "P <" or "P" followed by the less-than-or-equal sign (U+2264),
# the second collects exact values written as "P =".
#
# Arguments:
#   abstract  character vector of abstract texts
#   pmid      vector of pubmed ids, parallel to `abstract`
#
# Returns a list of three parallel vectors with one entry per P-value found,
# truncated values first:
#   pvalues  the numeric value (NA when the text could not be converted)
#   ids      pubmed id of the abstract the value was found in
#   trunc    1 for a truncated value, 0 for an exact one
# All three are empty when nothing is found.

getPvalues <- function(abstract, pmid) {
  # Turn the text following one "P <sign>" marker into a number; returns
  # numeric(0) when that text does not start like a number.
  parsePiece <- function(piece) {
    if (!grepl("[.0123456789]", substr(piece, 1, 2))) {
      return(numeric(0))
    }
    if (grepl("[A-Z]", substr(piece, 1, 1))) {
      return(numeric(0))
    }
    # keep the leading run of digits, punctuation, "x" and whitespace
    candidate <- strsplit(piece, "[^[:punct:][:digit:]x[:space:]]")[[1]][1]
    # nothing to convert without a digit (e.g. the piece was ". Then ...")
    if (!grepl("[0-9]", candidate)) {
      return(numeric(0))
    }
    candidate <- removeTrailing(candidate)
    candidate <- gsub(" ", "", candidate)
    as.numeric(convertScientific(candidate))
  }

  # One pass over all abstracts for the comparison sign(s) matched by the
  # regular expression `signPattern`; `truncFlag` is stored with every hit.
  collect <- function(signPattern, truncFlag) {
    hits <- grep(paste0("[Pp][[:space:]]?", signPattern), abstract)
    if (length(hits) == 0) {
      return(list(pvalues = numeric(0), ids = numeric(0), trunc = numeric(0)))
    }
    splitPattern <- paste0("[[:space:](][Pp][[:space:]]?", signPattern)
    perAbstract <- lapply(abstract[hits], function(text) {
      # The first piece is the text in front of the first marker, so it can
      # never be a reported value and is dropped.
      pieces <- strsplit(text, splitPattern)[[1]][-1]
      as.numeric(unlist(lapply(pieces, parsePiece)))
    })
    counts <- lengths(perAbstract)
    list(
      pvalues = as.numeric(unlist(perAbstract)),
      ids = rep(pmid[hits], counts),
      trunc = rep(truncFlag, sum(counts))
    )
  }

  # Truncated values. The class must not contain "=", otherwise every exact
  # value would be picked up here as well and counted twice.
  truncated <- collect("[<\u2264]", 1)
  # Exact values.
  exact <- collect("=", 0)

  list(
    pvalues = c(truncated$pvalues, exact$pvalues),
    ids = c(truncated$ids, exact$ids),
    trunc = c(truncated$trunc, exact$trunc)
  )
}

# Driver: download every journal/year combination, extract the P-values and
# write the combined table to "overall3.rda" and "overall3.csv".

# NOTE(review): "Angle Orthod " carries a trailing space, which ends up in the
# query string and in the row names of the output -- confirm whether that is
# intended before trimming it.
journals <- c("J Contemp Dent Pract","Br J Oral Maxillofac Surg","Int J Oral Maxillofac Surg","J Clin Dent",
              "Int J Dent Hyg","BMC Oral Health","Oral Health Prev Dent","Community Dent Oral Epidemiol","J Oral Sci","Braz Oral Res","J Adhes Dent",
              "J Clin Pediatr Dent","J Craniofac Surg","Am J Dent","Community Dent Health","Gerodontology",
              "J Oral Maxillofac Surg","Int Endod J","Eur J Orthod","J Oral Implantol","Gen Dent","J Endod",
              "J Clin Periodontol","J Dent","J Periodontol","Caries Res","J Periodontal Res","Arch Oral Biol",
              "J Prosthet Dent","Int Dent J","Br Dent J","Angle Orthod ","Clin Implant Dent Relat Res")
#"World J Orthod",
years <- 2004:2014

# Number of records downloaded per journal (rows) and year (columns); a cell
# stays NA when the download was given up.
npapers <- matrix(NA, nrow = length(journals), ncol = length(years))

# Download one journal/year, retrying a bounded number of times with a pause
# in between. An attempt counts as failed when getAbstractsPmids() raises an
# error or returns a single NA abstract. Returns NULL when every attempt
# failed.
downloadWithRetry <- function(journal, year, maxAttempts = 10, pauseSeconds = 2) {
  for (attempt in seq_len(maxAttempts)) {
    result <- tryCatch(
      getAbstractsPmids(journal, year),
      error = function(e) {
        message("download attempt ", attempt, " failed: ", conditionMessage(e))
        NULL
      }
    )
    failed <- is.null(result) ||
      (length(result$abstracts) == 1 && is.na(result$abstracts[1]))
    if (!failed) {
      return(result)
    }
    # give the server a moment instead of re-requesting in a tight loop
    Sys.sleep(pauseSeconds)
  }
  NULL
}

# One matrix per journal/year is stored here and bound together once at the
# end, instead of growing the result with rbind() on every iteration.
chunks <- vector("list", length(journals) * length(years))

for (i in seq_along(journals)) {
  for (j in seq_along(years)) {
    cat(journals[i], " ", years[j], " ", sep = "")
    tmpData <- downloadWithRetry(journals[i], years[j])
    if (is.null(tmpData)) {
      warning("No usable download for ", journals[i], " ", years[j],
              "; skipped", call. = FALSE)
      cat("Skipped\n")
      next
    }
    cat("Downloaded ")
    npapers[i, j] <- length(tmpData$abstracts)
    tmpOut <- getPvalues(tmpData$abstracts, tmpData$pmid)

    nPvalues <- length(tmpOut$pvalues)
    # position of each P-value's paper, used to attach abstract and title
    aa <- match(tmpOut$ids, tmpData$pmid)

    tmpMatrix <- cbind(tmpOut$pvalues, tmpOut$trunc, as.numeric(tmpOut$ids),
                       rep(years[j], nPvalues), tmpData$abstracts[aa],
                       tmpData$titles[aa])
    rownames(tmpMatrix) <- rep(journals[i], nPvalues)
    chunks[[(i - 1) * length(years) + j]] <- tmpMatrix
    cat("Done\n")
  }
}

# The all-NA first row only carries the column names; it is removed by the
# filter below together with every P-value that could not be converted.
header <- matrix(NA, nrow = 1, ncol = 6)
colnames(header) <- c("pvalue","pvalueTruncated","pubmedID","year","abstract","title")
pvalueData <- do.call(rbind, c(list(header), chunks))

# drop = FALSE keeps the result a matrix even when a single row is left
pvalueData <- pvalueData[!is.na(pvalueData[, 1]), , drop = FALSE]
save(pvalueData,npapers,file="overall3.rda")
write.csv(pvalueData, file = "overall3.csv")