# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - MAY 19, 2016
library(lattice)
library(dplyr)
setwd("/Users/Violeta/Desktop/R_files/Outcomes")

# 1. Read all the files and put them all in the same format (from .sas7bdat to .csv)
library(sas7bdat)
Demographics <- read.sas7bdat("/BioLINCC/ACCORD/3_Data_Sets_Anal_Data_Sets/accord_key.sas7bdat")
write.csv(Demographics, file="Demographics.csv")

# 2. Focus on files that contain the variable 'Visit' (BP, HbA1c, Lipids, and Other Labs)
# Change variable for numeric instead of character and put all levels in the same numeric format
BP <- read.csv("BP.csv")
BP$Visit <- as.character(BP$Visit)
BP$Visit[BP$Visit == "BLR"] <- 00
BP$Visit <- sub("F","",BP$Visit)      # In order to replace the "F" for nothing (instead of visit F01, just 01)
BP$Visit[BP$Visit == "EXIT"] <- 86    # Final visit was set to 86 months (>7yr)
BP$Visit <- as.numeric(BP$Visit)
write.csv(BP,file="/R_files/BP.csv", row.names=FALSE)
save(BP,file="/R_files/BP.RData")

# 3. Join all the files with the variable 'Visit'
test1 <- merge(
              merge(BP, HbA1c, by=c("MaskID", "Visit"), all=TRUE), 
        Lipids, by=c("MaskID", "Visit"), all=TRUE)
newtest1 <- test1[c(-3,-7,-9)]  #Since it creates new rows, we have to exclude variables 3 (X.x), 7(X.y), and 9(X)
finaltest <- merge(newtest1, Labs, by=c("MaskID", "Visit"), all=TRUE) #To put everything together
newfinaltest<-finaltest[c(-12)] #To eliminate the extra row

# 4. Put all the files together (including those without 'visits')
Demographics <- read.csv("A-Demo.csv")
Demographics <- Demographics[c(-1)]   # Eliminate the extra row (X)
# Read all the files [...]
JoinedTest1<-Reduce(dplyr::full_join, 
                    list(Demographics, newfinaltest, HypoEv, Hypo1st, SAE, CVDo, mvo))
write.csv(JoinedTest1,file="/R_files/JoinedTest1.csv", row.names=FALSE)
save(JoinedTest1,file="/R_files/JoinedTest1.RData")
# 'JoinedTest1' includes 212,781 observations (repeated ob per patient) with 122 variables (complete dataset)

# 5. Cleaning the dataset previously generated 
names(JoinedTest1)
Nephro<-select(JoinedTest1, -c(5,6,8,29,31,33:40,42:80,96:122))

NephroNA<-sapply(Nephro,as.character)   #For WEKA conveniences, convert NA's to blank spaces
NephroNA[is.na(Nephro)]<-""
NephroNA<-as.data.frame(NephroNA)   #To convert the matrix R generated to data frame
write.csv(NephroNA,file="/R_files/Cleaning/NephroNA.csv", row.names=FALSE)

# 6.Change names to make the analysis easier
Dataset <- NephroNA
names(Dataset) <- c("ID", "Gender", "Base_age", "Arm", "CVD_History", "Race",
                    "Visit", "sBP", "dBP", "HR",
                    "HbA1c",
                    "Chol", "Trig", "vLDL", "LDL", "HDL",
                    "FPG", "ALT", "CPK", "K", "SCr", "GFR", "UAlb", "UCr", "ACR",
                    "Glc50", "time_hypo", 
                    "time_SAE",
                    "Decline", "D_time", "fu_D",
                    "Macro", "M_time", "fu_M",
                    "RF", "RF_time", "fu_RF",
                    "Any", "Any_t", "fu_Any",
                    "Micro", "mtime", "fum")
save(Dataset,file="/Users/Violeta/Desktop/R_files/Outcomes/Dataset.RData")

# 8. Create a new column that tells you whether or not the patient developed any kind of nephropathy
OutcomeSum <- Dataset %>% 
              select(Decline, Macro, RF, Micro) %>%  #Select only the rows that will be added  
              distinct(ID)
# Convert factor to numeric classes in order to allow the addition of the values
OutcomeSum$Decline<-as.numeric(levels(OutcomeSum$Decline))[OutcomeSum$Decline]    
sum <- rowSums (OutcomeSum, na.rm=TRUE)   #Gives you back the sum of the rows ignoring NAs (just the value)
sum <- as.data.frame(sum)                 #Turns it into a data frame (1 single column)
sum$out <- ""                    #Adds a new empty column (outcome)
sum$out <- ifelse(sum>=1,1,0)   #if the sum is >1, then outcome=1 (YES)
NephropathyNA <- cbind(OutcomeSum,sum)  #Joining the outcomes with their "score"

# 9. Join the new outcome with the rest of the dataset
Data2 <- Dataset %>%             
  select (1:28,30,31,33,34,36:40, 42,43)
FinalData <- merge(Data2, FinalOut, by="ID", all=TRUE)   #FinalOut is NephropathyNA+ID
FinalData$out[FinalData$out==0] <- "No"
FinalData$out[FinalData$out==1] <- "Yes"
names(FinalData)[45] <- "Nephropathy"     #This column tells us wether or not the patient developed any type of Nephropathy
names(FinalData)[44] <- "Number_Nephro"   #Tells us how many Nephropathies were developed

write.csv(FinalData,file="/R_files/Outcomes/FinalData.csv", row.names=FALSE)
save(FinalData,file="/R_files/Outcomes/FinalData.RData")

# 10. Create a final dataset by individual: 10,251 observations of 45 variables
FinalData_ID <- FinalData %>% distinct(ID) 
FinalData_ID$Decline <- as.factor(FinalData_ID$Decline)
FinalData_ID$Macro <- as.factor(FinalData_ID$Macro)
FinalData_ID$RF <- as.factor(FinalData_ID$RF)
FinalData_ID$Micro <- as.factor(FinalData_ID$Micro)
FinalData_ID$Number_Nephro <- as.factor(FinalData_ID$Number_Nephro)

save(FinalData_ID,file="R_files/Outcomes/FinalData_ID.RData")
write.csv(FinalData_ID,file="/R_files/Outcomes/FinalData_ID.csv")

# 11. Change names per visit
Baseline <- Baseline %>% filter(Visit==0)  # =Baseline <- Baseline[which(Baseline$Visit==0),]
names(Baseline)
colnames(Baseline) <- c("ID","Visit","sBP00","dBP00","HR00","HbA1c00","Chol00",
                        "Trig00","vLDL00","LDL00","HDL00","FPG00","ALT00",
                        "CPK00","K00","SCr00","GFR00","UAlb00", "UCr00","UACR00")
Baseline <- select(Baseline, -Visit)
names(Baseline)

# Join all the variables for the second window of time 6 - 11.9 months, which contains all the values till Visit<12 months
# Split the datasets per visit.
load("~/Year1.RData")
a <- full_join(Baseline, Visit02, by="ID")
a <- a %>% distinct(ID)
a <- full_join(a,Visit04, by="ID")
a <- a %>% distinct(ID)
a <- full_join(a,Visit06, by="ID")
a <- a %>% distinct(ID)
a <- full_join(a,Visit08, by="ID")
a <- a %>% distinct(ID)
a <- full_join(a,Visit10, by="ID")
a <- a %>% distinct(ID)


# Calculate the slopes per variable:
DATA$sBP.s1 <- round((DATA$sBP.12-DATA$sBP.0)/1, digits=2)
DATA$sBP.s2 <- round((DATA$sBP.24-DATA$sBP.0)/2, digits=2)
DATA$sBP.s3 <- round((DATA$sBP.36-DATA$sBP.0)/3, digits=2)
DATA$sBP.s4 <- round((DATA$sBP.48-DATA$sBP.0)/4, digits=2)
DATA$sBP.s5 <- round((DATA$sBP.60-DATA$sBP.0)/5, digits=2)
DATA$sBP.fs <- round((DATA$sBP.86-DATA$sBP.0)/7, digits=2)

# Split in training (66%) and testing (34%) datasets
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary)

trainIndex <- createDataPartition(DATA_6mo$Nephro, p = .66,
                                  list = FALSE,
                                  times = 1
)
smotetrain_6mo<- DATA_6mo[trainIndex,]
smotetest_6mo <- DATA_6mo[-trainIndex,]
write.csv(smotetrain_6mo,file="/R_files/Splitting/smotetrain_6mo.csv", row.names=FALSE)
write.csv(smotetest_6mo,file="/R_files/Splitting/smotetest_6mo.csv", row.names=FALSE)


# GeneratE a BALANCED DATASET from the previous splits:
train_bal_6mo <- SMOTE(Nephropathy ~ ., data = smotetrain_6mo)
table(train_bal_6mo$Nephropathy)


# For WEKA conveniences:
names(train_bal_6mo)
train_bal_6mo[ ,c(7:57)] = apply(train_bal_6mo[ ,c(7:57)], 2, function(x) as.numeric(as.character(x)))
train_bal_6mo<-sapply(train_bal_6mo,as.character) 
train_bal_6mo[is.na(train_bal_6mo)]<-""
train_bal_6mo<-as.data.frame(train_bal_6mo)
write.csv(train_bal_6mo,file="/R_files/Balanced/train_bal_6mo.csv", row.names=FALSE)