rm(list = ls())
setwd("/home/REDMOND.CORP.MICROSOFT.COM/avbijral/Nature/")
require(data.table)

## Read Cluster Estimates
Mortdat <- fread("Data/Crude_EB_AF estimates_RK_07102018.csv")
names(Mortdat) <- tolower(names(Mortdat))
dim(Mortdat)
names(Mortdat)[13:15] <- c("Stunting", "Underweight", "Wasting")

## Village-Cluster Linking File
Linkdat <- fread("Data/Linking_ClustersVillages_05122018.csv")
Linkdat <- Linkdat[URBAN_RURA=="R"]
names(Linkdat) <- tolower(names(Linkdat))
Linkdat$cluster <- Linkdat$dhsclust
print("Unique Villages Mapped")
length(unique(Linkdat[,unqv_id])) 
Linkdat$unqv_id <- gsub('_', '',Linkdat$unqv_id)

## Remove Clusters that don't exist in the Linking file
common <- intersect(Mortdat$cluster,Linkdat$cluster)
ind <- which(Mortdat$cluster %in% common)
Mortdat1 <- Mortdat[ind,]
Mortdat <- Mortdat1

#clusts       <- read.csv("Data/TrainingClusters.csv", header = T, stringsAsFactors = F)
#label_vills  <- unique(clusts$Cluster_Village[which(clusts$Cluster_Village != "")])


## Demographic/Amenities Features
Alldat_temp <- read.csv("Data/MLinfomap_rural_clean_census2011_fixed_amenities_merged_for Avleen_09162019_final.csv", sep = ",",  stringsAsFactors = FALSE, header = T)
names(Alldat_temp) <- tolower(names(Alldat_temp))
Alldat_temp <- Alldat_temp[,-c(c(1:7),c(10:18),c(21:25),275)] # remove id columns
#codebook <- read.csv("Data/MLinfomap_rural_clean_census2011_fixed_amenities_merged_for Avleen_codebook.csv")
require("fastDummies")
Alldat <- dummy_cols(Alldat_temp, select_columns = "state_name", remove_first_dummy=F)
Alldat <- Alldat[,-2]
Alldat[,2:ncol(Alldat)] <- apply(Alldat[,2:ncol(Alldat)], 2, as.numeric)

## REMOVED

#demodat_temp <- fread("DemoAmen_Village_match.csv")
#colnames(demodat_temp) <- gsub('\\s+', '', colnames(demodat_temp))
#length(unique(demodat_temp[,unqv_id])) 
#keep    <- c("State", "unqv_id", "DIST_ID", "POINT_X", "POINT_Y")
#demodat <- merge(demodat_temp, Censdat[, keep, with = FALSE], by = "unqv_id")
#demodat <- demodat[, !c("Village Code", "DIST_ID")]
#dim(demodat)
#require("fastDummies")
#demodat <- dummy_cols(demodat, select_columns = c("State"), remove_first_dummy=T)
#demodat <- demodat[, !c("State")]
#dim(demodat)
## Return One-to-One Mapped Villages and Clusters

#d <- function(x) duplicated(x) | duplicated(x, fromLast=TRUE)

# Merge with Mortdat with Linkdat
#keep <- c("cluster", "unqv_id", "Stunting", "Underweight", "Wasting")
#df   <- merge(Mortdat, Linkdat, by="cluster", all.x=T, all.y=F)
#df   <- df[, keep, with=FALSE]
#df1  <- df[unqv_id %in% label_vills]
#df1  <- aggregate(.~unqv_id, data = df1[,-1], FUN = mean)
#df1 <- df1[, lapply(.SD,mean), by = unqv_id, .SDcols = c("Stunting", "Underweight", "Wasting")]
#head(df1)
## Link (Un)Labeled Villages With Features
## Link (Un)Labeled Villages With Features
#names(demodat)
#df_all <- merge(demodat, df1, by = "unqv_id", all.x = T)
#df_all <- df_all[-which(rowMeans(df_all[,!names(df_all) %in% "EB4L_f_literacy"]==0, na.rm = T) > 0.9),] # Remove rows with many 0's
#df <- df_all
#dim(df)

# Create the Initial Label Set 
vills_rands        <- data.frame(matrix(rep(NA, 5*nrow(Mortdat)), ncol = 5, nrow = nrow(Mortdat)))
names(vills_rands) <- c("cluster", "unqv_id", "Stunting", "Underweight", "Wasting")
vills_len <- c()
for(i in 1:length(Mortdat$cluster)) {
    
    vills <- unique(Linkdat$unqv_id[Linkdat$cluster == Mortdat$cluster[i]] )
    vills_len <- c(vills_len, length(vills))
   # if(length(vills)>0) {   ## Assign Randomly
   #    vills_rands[i,] <- c(Mortdat$cluster[i], sample(vills, 1),  Mortdat[i,13], Mortdat[i,14], Mortdat[i,15])
   # }
    if(length(vills)==1) {   ## Assign to clusters with only one village
       vills_rands[i,] <- c(Mortdat$cluster[i], vills, Mortdat[i,13], Mortdat[i,14], Mortdat[i,15])
    }
}
df_vills_clusts     <- na.omit(vills_rands)
df_vills <- aggregate(. ~ unqv_id, data = df_vills_clusts[,-1], FUN = mean)

## Merge with Features Data frame to create the input feature file
df_all <- merge(Alldat, df_vills[,!names(df_vills) %in% "cluster"], by = "unqv_id", all.x = T)
df <- data.table(df_all)
dim(df)

# Remove Correlated Columns
df[, c("tot_w",  "f_illt", "m_illt", "tot_lit", "tot_pop", "tot_l6", "tot_sc", "tot_illt", "tot_mnw", "tot_cult", "tot_aglb", "tot_oth_w", "tot_mrw", "tot_nnw"):=NULL]

# Create final feature Matrix
#ind_L    <- which(!is.na(df[,Stunting]))
#ind_U    <- which(is.na(df[,Stunting])==T)
#length(ind_L)
#length(ind_U)
normalit <- function(m) {
  (m - min(m)) / (max(m) - min(m))
}
X <- df[,!c("unqv_id", "Stunting", "Underweight", "Wasting") ]
require("Hmisc")
X <- apply(X, 2, function(x) impute(x, median))
X <- scale(X)
print("Done Imputing.")
dim(X)
require("SemiSupervised")
           
# Expand Label Function
ExpandLabels <- function(Y, df, Mortdat, Linkdat, all_clusts, metric, iter = i) {
    all_clusts <- Mortdat[,cluster]
    lab_vills <- c()
    mapped_vills <- data.frame(Cluster=rep(NA,length(all_clusts)), Village=rep(NA,length(all_clusts)))
    for(i in 1:length(all_clusts)){
         mapVill <- Linkdat[,unqv_id][Linkdat[,cluster] %in% all_clusts[i]]
         vills   <- intersect(df[,unqv_id], mapVill)
         if(length(vills)>0) {
             Yvill <- as.numeric(Y[df[,unqv_id] %in% vills])
             Ymort <- as.numeric(Mortdat[(Mortdat[,cluster] %in% all_clusts[i]),..metric])
             #print(Ymort)
             #print(Ymort-Yvill)
             ind <- which.min((Ymort-Yvill)^2)
             Y[df[,unqv_id] %in% vills[ind]] <- Ymort 
             lab_vills <- c(lab_vills, vills[ind])
             mapped_vills[i,1] <- all_clusts[i]
             mapped_vills[i,2] <- vills[ind]
         }
        # stop("here")
     }
     write.csv(mapped_vills, paste("MappedVillages_", "12-18-2019", "_", toString(iter), ".csv", sep=""))
     return(list(Y = Y, Vills = lab_vills))
}

# Iterative Semi- Supervised Model fitting
ModelfitExpand <- function(y, X, metric, metric_name,iters=10) {
  agraph.fit <- agraph(x = X, y = y, metric = "euclidean")
  preds <- fitted(agraph.fit)
  preds[which(preds > 1)] <- max(preds[which(preds < 1)])
  preds[which(preds < 0)] <- min(preds[which(preds > 0)])
  print(paste(metric_name,"Iteration 0",sep=" "))
  save(preds, file=paste("Predictions/Predictions_",metric_name,"_","0",".RData", sep=""))
  for( i in 1:iters) {
      res       <- ExpandLabels(preds, df, Mortdat, Linkdat, all_clusts, metric=metric, iter = i)
      y_new     <- res$Y
      lab_vills <- res$Vills
      y_new[!is.na(y)] <- y[!is.na(y)]   ## Clamp to the Initial Label Set
      y_new[df[,unqv_id] %in% setdiff(df[,unqv_id], lab_vills)] <- NA
      agraph.fit <- agraph(x = X, y = y_new, metric = "euclidean")
      preds <- fitted(agraph.fit)
      preds[which(preds > 1)] <- max(preds[which(preds < 1)])
      preds[which(preds < 0)] <- min(preds[which(preds > 0)])
      save(preds, file=paste("Predictions/Predictions_",metric_name,"_",toString(i),".RData", sep=""))
  }
  return(list(model=agraph.fit, predicted = preds))
 }
           
all_clusts <- Mortdat$cluster

stunting.res <- ModelfitExpand(df$Stunting, X = X, metric = 13, metric_name = "Stunting", iters=10)

underweight.res  <- ModelfitExpand(df$Underweight, X = X, metric = 14, metric_name = "Underweight",iters=,10)

wasting.res   <- ModelfitExpand(df$Wasting, X = X, metric = 15, metric_name = "Wasting",iters=10)