rm(list = ls()) setwd("/home/REDMOND.CORP.MICROSOFT.COM/avbijral/Nature/") require(data.table) ## Read Cluster Estimates Mortdat <- fread("Data/Crude_EB_AF estimates_RK_07102018.csv") names(Mortdat) <- tolower(names(Mortdat)) dim(Mortdat) names(Mortdat)[13:15] <- c("Stunting", "Underweight", "Wasting") ## Village-Cluster Linking File Linkdat <- fread("Data/Linking_ClustersVillages_05122018.csv") Linkdat <- Linkdat[URBAN_RURA=="R"] names(Linkdat) <- tolower(names(Linkdat)) Linkdat$cluster <- Linkdat$dhsclust print("Unique Villages Mapped") length(unique(Linkdat[,unqv_id])) Linkdat$unqv_id <- gsub('_', '',Linkdat$unqv_id) ## Remove Clusters that don't exist in the Linking file common <- intersect(Mortdat$cluster,Linkdat$cluster) ind <- which(Mortdat$cluster %in% common) Mortdat1 <- Mortdat[ind,] Mortdat <- Mortdat1 #clusts <- read.csv("Data/TrainingClusters.csv", header = T, stringsAsFactors = F) #label_vills <- unique(clusts$Cluster_Village[which(clusts$Cluster_Village != "")]) ## Demographic/Amenities Features Alldat_temp <- read.csv("Data/MLinfomap_rural_clean_census2011_fixed_amenities_merged_for Avleen_09162019_final.csv", sep = ",", stringsAsFactors = FALSE, header = T) names(Alldat_temp) <- tolower(names(Alldat_temp)) Alldat_temp <- Alldat_temp[,-c(c(1:7),c(10:18),c(21:25),275)] # remove id columns #codebook <- read.csv("Data/MLinfomap_rural_clean_census2011_fixed_amenities_merged_for Avleen_codebook.csv") require("fastDummies") Alldat <- dummy_cols(Alldat_temp, select_columns = "state_name", remove_first_dummy=F) Alldat <- Alldat[,-2] Alldat[,2:ncol(Alldat)] <- apply(Alldat[,2:ncol(Alldat)], 2, as.numeric) ## REMOVED #demodat_temp <- fread("DemoAmen_Village_match.csv") #colnames(demodat_temp) <- gsub('\\s+', '', colnames(demodat_temp)) #length(unique(demodat_temp[,unqv_id])) #keep <- c("State", "unqv_id", "DIST_ID", "POINT_X", "POINT_Y") #demodat <- merge(demodat_temp, Censdat[, keep, with = FALSE], by = "unqv_id") #demodat <- demodat[, !c("Village Code", "DIST_ID")] #dim(demodat) #require("fastDummies") #demodat <- dummy_cols(demodat, select_columns = c("State"), remove_first_dummy=T) #demodat <- demodat[, !c("State")] #dim(demodat) ## Return One-to-One Mapped Villages and Clusters #d <- function(x) duplicated(x) | duplicated(x, fromLast=TRUE) # Merge with Mortdat with Linkdat #keep <- c("cluster", "unqv_id", "Stunting", "Underweight", "Wasting") #df <- merge(Mortdat, Linkdat, by="cluster", all.x=T, all.y=F) #df <- df[, keep, with=FALSE] #df1 <- df[unqv_id %in% label_vills] #df1 <- aggregate(.~unqv_id, data = df1[,-1], FUN = mean) #df1 <- df1[, lapply(.SD,mean), by = unqv_id, .SDcols = c("Stunting", "Underweight", "Wasting")] #head(df1) ## Link (Un)Labeled Villages With Features ## Link (Un)Labeled Villages With Features #names(demodat) #df_all <- merge(demodat, df1, by = "unqv_id", all.x = T) #df_all <- df_all[-which(rowMeans(df_all[,!names(df_all) %in% "EB4L_f_literacy"]==0, na.rm = T) > 0.9),] # Remove rows with many 0's #df <- df_all #dim(df) # Create the Initial Label Set vills_rands <- data.frame(matrix(rep(NA, 5*nrow(Mortdat)), ncol = 5, nrow = nrow(Mortdat))) names(vills_rands) <- c("cluster", "unqv_id", "Stunting", "Underweight", "Wasting") vills_len <- c() for(i in 1:length(Mortdat$cluster)) { vills <- unique(Linkdat$unqv_id[Linkdat$cluster == Mortdat$cluster[i]] ) vills_len <- c(vills_len, length(vills)) # if(length(vills)>0) { ## Assign Randomly # vills_rands[i,] <- c(Mortdat$cluster[i], sample(vills, 1), Mortdat[i,13], Mortdat[i,14], Mortdat[i,15]) # } if(length(vills)==1) { ## Assign to clusters with only one village vills_rands[i,] <- c(Mortdat$cluster[i], vills, Mortdat[i,13], Mortdat[i,14], Mortdat[i,15]) } } df_vills_clusts <- na.omit(vills_rands) df_vills <- aggregate(. ~ unqv_id, data = df_vills_clusts[,-1], FUN = mean) ## Merge with Features Data frame to create the input feature file df_all <- merge(Alldat, df_vills[,!names(df_vills) %in% "cluster"], by = "unqv_id", all.x = T) df <- data.table(df_all) dim(df) # Remove Correlated Columns df[, c("tot_w", "f_illt", "m_illt", "tot_lit", "tot_pop", "tot_l6", "tot_sc", "tot_illt", "tot_mnw", "tot_cult", "tot_aglb", "tot_oth_w", "tot_mrw", "tot_nnw"):=NULL] # Create final feature Matrix #ind_L <- which(!is.na(df[,Stunting])) #ind_U <- which(is.na(df[,Stunting])==T) #length(ind_L) #length(ind_U) normalit <- function(m) { (m - min(m)) / (max(m) - min(m)) } X <- df[,!c("unqv_id", "Stunting", "Underweight", "Wasting") ] require("Hmisc") X <- apply(X, 2, function(x) impute(x, median)) X <- scale(X) print("Done Imputing.") dim(X) require("SemiSupervised") # Expand Label Function ExpandLabels <- function(Y, df, Mortdat, Linkdat, all_clusts, metric, iter = i) { all_clusts <- Mortdat[,cluster] lab_vills <- c() mapped_vills <- data.frame(Cluster=rep(NA,length(all_clusts)), Village=rep(NA,length(all_clusts))) for(i in 1:length(all_clusts)){ mapVill <- Linkdat[,unqv_id][Linkdat[,cluster] %in% all_clusts[i]] vills <- intersect(df[,unqv_id], mapVill) if(length(vills)>0) { Yvill <- as.numeric(Y[df[,unqv_id] %in% vills]) Ymort <- as.numeric(Mortdat[(Mortdat[,cluster] %in% all_clusts[i]),..metric]) #print(Ymort) #print(Ymort-Yvill) ind <- which.min((Ymort-Yvill)^2) Y[df[,unqv_id] %in% vills[ind]] <- Ymort lab_vills <- c(lab_vills, vills[ind]) mapped_vills[i,1] <- all_clusts[i] mapped_vills[i,2] <- vills[ind] } # stop("here") } write.csv(mapped_vills, paste("MappedVillages_", "12-18-2019", "_", toString(iter), ".csv", sep="")) return(list(Y = Y, Vills = lab_vills)) } # Iterative Semi- Supervised Model fitting ModelfitExpand <- function(y, X, metric, metric_name,iters=10) { agraph.fit <- agraph(x = X, y = y, metric = "euclidean") preds <- fitted(agraph.fit) preds[which(preds > 1)] <- max(preds[which(preds < 1)]) preds[which(preds < 0)] <- min(preds[which(preds > 0)]) print(paste(metric_name,"Iteration 0",sep=" ")) save(preds, file=paste("Predictions/Predictions_",metric_name,"_","0",".RData", sep="")) for( i in 1:iters) { res <- ExpandLabels(preds, df, Mortdat, Linkdat, all_clusts, metric=metric, iter = i) y_new <- res$Y lab_vills <- res$Vills y_new[!is.na(y)] <- y[!is.na(y)] ## Clamp to the Initial Label Set y_new[df[,unqv_id] %in% setdiff(df[,unqv_id], lab_vills)] <- NA agraph.fit <- agraph(x = X, y = y_new, metric = "euclidean") preds <- fitted(agraph.fit) preds[which(preds > 1)] <- max(preds[which(preds < 1)]) preds[which(preds < 0)] <- min(preds[which(preds > 0)]) save(preds, file=paste("Predictions/Predictions_",metric_name,"_",toString(i),".RData", sep="")) } return(list(model=agraph.fit, predicted = preds)) } all_clusts <- Mortdat$cluster stunting.res <- ModelfitExpand(df$Stunting, X = X, metric = 13, metric_name = "Stunting", iters=10) underweight.res <- ModelfitExpand(df$Underweight, X = X, metric = 14, metric_name = "Underweight",iters=,10) wasting.res <- ModelfitExpand(df$Wasting, X = X, metric = 15, metric_name = "Wasting",iters=10)