# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - MAY 19, 2016 library(lattice) library(dplyr) setwd("/Users/Violeta/Desktop/R_files/Outcomes") # 1. Read all the files and put them all in the same format (from .sas7bdat to .csv) library(sas7bdat) Demographics <- read.sas7bdat("/BioLINCC/ACCORD/3_Data_Sets_Anal_Data_Sets/accord_key.sas7bdat") write.csv(Demographics, file="Demographics.csv") # 2. Focus on files that contain the variable 'Visit' (BP, HbA1c, Lipids, and Other Labs) # Change variable for numeric instead of character and put all levels in the same numeric format BP <- read.csv("BP.csv") BP$Visit <- as.character(BP$Visit) BP$Visit[BP$Visit == "BLR"] <- 00 BP$Visit <- sub("F","",BP$Visit) # In order to replace the "F" for nothing (instead of visit F01, just 01) BP$Visit[BP$Visit == "EXIT"] <- 86 # Final visit was set to 86 months (>7yr) BP$Visit <- as.numeric(BP$Visit) write.csv(BP,file="/R_files/BP.csv", row.names=FALSE) save(BP,file="/R_files/BP.RData") # 3. Join all the files with the variable 'Visit' test1 <- merge( merge(BP, HbA1c, by=c("MaskID", "Visit"), all=TRUE), Lipids, by=c("MaskID", "Visit"), all=TRUE) newtest1 <- test1[c(-3,-7,-9)] #Since it creates new rows, we have to exclude variables 3 (X.x), 7(X.y), and 9(X) finaltest <- merge(newtest1, Labs, by=c("MaskID", "Visit"), all=TRUE) #To put everything together newfinaltest<-finaltest[c(-12)] #To eliminate the extra row # 4. Put all the files together (including those without 'visits') Demographics <- read.csv("A-Demo.csv") Demographics <- Demographics[c(-1)] # Eliminate the extra row (X) # Read all the files [...] JoinedTest1<-Reduce(dplyr::full_join, list(Demographics, newfinaltest, HypoEv, Hypo1st, SAE, CVDo, mvo)) write.csv(JoinedTest1,file="/R_files/JoinedTest1.csv", row.names=FALSE) save(JoinedTest1,file="/R_files/JoinedTest1.RData") # 'JoinedTest1' includes 212,781 observations (repeated ob per patient) with 122 variables (complete dataset) # 5. Cleaning the dataset previously generated names(JoinedTest1) Nephro<-select(JoinedTest1, -c(5,6,8,29,31,33:40,42:80,96:122)) NephroNA<-sapply(Nephro,as.character) #For WEKA conveniences, convert NA's to blank spaces NephroNA[is.na(Nephro)]<-"" NephroNA<-as.data.frame(NephroNA) #To convert the matrix R generated to data frame write.csv(NephroNA,file="/R_files/Cleaning/NephroNA.csv", row.names=FALSE) # 6.Change names to make the analysis easier Dataset <- NephroNA names(Dataset) <- c("ID", "Gender", "Base_age", "Arm", "CVD_History", "Race", "Visit", "sBP", "dBP", "HR", "HbA1c", "Chol", "Trig", "vLDL", "LDL", "HDL", "FPG", "ALT", "CPK", "K", "SCr", "GFR", "UAlb", "UCr", "ACR", "Glc50", "time_hypo", "time_SAE", "Decline", "D_time", "fu_D", "Macro", "M_time", "fu_M", "RF", "RF_time", "fu_RF", "Any", "Any_t", "fu_Any", "Micro", "mtime", "fum") save(Dataset,file="/Users/Violeta/Desktop/R_files/Outcomes/Dataset.RData") # 8. Create a new column that tells you whether or not the patient developed any kind of nephropathy OutcomeSum <- Dataset %>% select(Decline, Macro, RF, Micro) %>% #Select only the rows that will be added distinct(ID) # Convert factor to numeric classes in order to allow the addition of the values OutcomeSum$Decline<-as.numeric(levels(OutcomeSum$Decline))[OutcomeSum$Decline] sum <- rowSums (OutcomeSum, na.rm=TRUE) #Gives you back the sum of the rows ignoring NAs (just the value) sum <- as.data.frame(sum) #Turns it into a data frame (1 single column) sum$out <- "" #Adds a new empty column (outcome) sum$out <- ifelse(sum>=1,1,0) #if the sum is >1, then outcome=1 (YES) NephropathyNA <- cbind(OutcomeSum,sum) #Joining the outcomes with their "score" # 9. Join the new outcome with the rest of the dataset Data2 <- Dataset %>% select (1:28,30,31,33,34,36:40, 42,43) FinalData <- merge(Data2, FinalOut, by="ID", all=TRUE) #FinalOut is NephropathyNA+ID FinalData$out[FinalData$out==0] <- "No" FinalData$out[FinalData$out==1] <- "Yes" names(FinalData)[45] <- "Nephropathy" #This column tells us wether or not the patient developed any type of Nephropathy names(FinalData)[44] <- "Number_Nephro" #Tells us how many Nephropathies were developed write.csv(FinalData,file="/R_files/Outcomes/FinalData.csv", row.names=FALSE) save(FinalData,file="/R_files/Outcomes/FinalData.RData") # 10. Create a final dataset by individual: 10,251 observations of 45 variables FinalData_ID <- FinalData %>% distinct(ID) FinalData_ID$Decline <- as.factor(FinalData_ID$Decline) FinalData_ID$Macro <- as.factor(FinalData_ID$Macro) FinalData_ID$RF <- as.factor(FinalData_ID$RF) FinalData_ID$Micro <- as.factor(FinalData_ID$Micro) FinalData_ID$Number_Nephro <- as.factor(FinalData_ID$Number_Nephro) save(FinalData_ID,file="R_files/Outcomes/FinalData_ID.RData") write.csv(FinalData_ID,file="/R_files/Outcomes/FinalData_ID.csv") # 11. Change names per visit Baseline <- Baseline %>% filter(Visit==0) # =Baseline <- Baseline[which(Baseline$Visit==0),] names(Baseline) colnames(Baseline) <- c("ID","Visit","sBP00","dBP00","HR00","HbA1c00","Chol00", "Trig00","vLDL00","LDL00","HDL00","FPG00","ALT00", "CPK00","K00","SCr00","GFR00","UAlb00", "UCr00","UACR00") Baseline <- select(Baseline, -Visit) names(Baseline) # Join all the variables for the second window of time 6 - 11.9 months, which contains all the values till Visit<12 months # Split the datasets per visit. load("~/Year1.RData") a <- full_join(Baseline, Visit02, by="ID") a <- a %>% distinct(ID) a <- full_join(a,Visit04, by="ID") a <- a %>% distinct(ID) a <- full_join(a,Visit06, by="ID") a <- a %>% distinct(ID) a <- full_join(a,Visit08, by="ID") a <- a %>% distinct(ID) a <- full_join(a,Visit10, by="ID") a <- a %>% distinct(ID) # Calculate the slopes per variable: DATA$sBP.s1 <- round((DATA$sBP.12-DATA$sBP.0)/1, digits=2) DATA$sBP.s2 <- round((DATA$sBP.24-DATA$sBP.0)/2, digits=2) DATA$sBP.s3 <- round((DATA$sBP.36-DATA$sBP.0)/3, digits=2) DATA$sBP.s4 <- round((DATA$sBP.48-DATA$sBP.0)/4, digits=2) DATA$sBP.s5 <- round((DATA$sBP.60-DATA$sBP.0)/5, digits=2) DATA$sBP.fs <- round((DATA$sBP.86-DATA$sBP.0)/7, digits=2) # Split in training (66%) and testing (34%) datasets ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE, summaryFunction = twoClassSummary) trainIndex <- createDataPartition(DATA_6mo$Nephro, p = .66, list = FALSE, times = 1 ) smotetrain_6mo<- DATA_6mo[trainIndex,] smotetest_6mo <- DATA_6mo[-trainIndex,] write.csv(smotetrain_6mo,file="/R_files/Splitting/smotetrain_6mo.csv", row.names=FALSE) write.csv(smotetest_6mo,file="/R_files/Splitting/smotetest_6mo.csv", row.names=FALSE) # GeneratE a BALANCED DATASET from the previous splits: train_bal_6mo <- SMOTE(Nephropathy ~ ., data = smotetrain_6mo) table(train_bal_6mo$Nephropathy) # For WEKA conveniences: names(train_bal_6mo) train_bal_6mo[ ,c(7:57)] = apply(train_bal_6mo[ ,c(7:57)], 2, function(x) as.numeric(as.character(x))) train_bal_6mo<-sapply(train_bal_6mo,as.character) train_bal_6mo[is.na(train_bal_6mo)]<-"" train_bal_6mo<-as.data.frame(train_bal_6mo) write.csv(train_bal_6mo,file="/R_files/Balanced/train_bal_6mo.csv", row.names=FALSE)