read.table.x = function (filename, ...) {
    lines <- readLines(filename)
    n <- grep("^#", lines)
    if (length(n) > 0) 
	start <- n[length(n)]
    else start <- 1
    end <- length(lines)
    x <- read.table(text = lines[start:end], header = T, sep = "\t", 
					comment.char = "", check.names = F, ...)
}

library(optparse)
source('util.R')

opt <- interface()

if (opt$debug) save.image('debug.Rdata')

if (opt$verbose) {
    cat("Running command with args:\n",
        paste(commandArgs(), collapse = " "),
        '\n')
}


if (opt$split <= 0 & opt$split > 1) {
    stop("The split arg should be greater than 0 and not greater than 1.")
}

if(is.null(opt$models)) {
    models <- regression
} else {
    models <- strsplit(opt$models, ',')[[1]]
}

library(caret)
library(doMC)
registerDoMC(opt$n.cores)

meta <- read.table.x(opt$metadata)


meta.col <- colnames(meta)


if (is.null(opt$fields)) {
    ## stop("No field is provided to do regression on.")
    outcome.col <- names(meta)
    boring <- c("#SampleID",
                "BarcodeSequence",
                "LinkerPrimerSequence",
                "TARGET_SUBFRAGMENT",
                "ASSIGNED_FROM_GEO",
                "EXPERIMENT_CENTER",
                "RUN_PREFIX",
                "TAXON_ID",
                "ILLUMINA_TECHNOLOGY",
                "COMMON_NAME",
                "EXTRACTED_DNA_AVAIL_NOW",
                "SAMPLE_CENTER",
                "STUDY_CENTER",
                "Description")
    outcome.col <- outcome.col[! outcome.col %in% boring]
} else {
    outcome.col <- strsplit(opt$fields, ',')[[1]]
    x <- which(! outcome.col %in% meta.col)
    if (length(x) > 0) {
        stop("Field(s) ", paste(outcome.col[x], collapse=','), " do not exist in meta data")
    }
}

## extract part of the samples by their meta data
if (! is.null(opt$category)) {
    ## e.g. "SITE::nostril,skin:_:SEX::male"
    extract <- strsplit(opt$category, ':_:')[[1]]
    extract <- strsplit(extract, '::')
    for (x in extract) {
        if (! x[1] %in% meta.col)
            stop("The field ", x[1], " does not exist in meta data")
        i <- meta[[ x[1] ]]
        j <- strsplit(x[2], ',')[[1]]
        if (! all(j %in% i)) {
            ## insanity check to avoid typos
            stop("You specified non-existing values for field ", x[1], " in meta data")
        }
        meta <- meta[ i %in% j, ]
    }
}


if (! is.null(opt$numeric)) {
    ## e.g. "PH::6,12:_:TEMP::,32"
    extract <- strsplit(opt$numeric, ':_:')[[1]]
    extract <- strsplit(extract, '::')
    for (x in extract) {
        if (! x[1] %in% meta.col)
            stop("The field ", x[1], " does not exist in meta data")
        ## in case there is None, NA, etc in the column (R will read it into character
        ## instead of numerical)
        i <- as.numeric(as.character(meta[[ x[1] ]]))
        j <- as.numeric(strsplit(x[2], ',')[[1]])
        ## NA & TRUE -> NA
        ## NA & FALSE -> FALSE
        n <- rep(T, nrow(meta))
        if (! is.na(j[1]))
            n <- n & i > j[1]
        if (! is.na(j[2]))
            n <- n & i < j[2]
        meta <- meta[n, ]
    }
}


otus <- read.table.x(opt$input_otu_table)


tax.16s <- otus[, length(otus)]
tax.16s <- gsub("^Root; ", "", tax.16s)
## insert a newline for every three levels of taxonomy
tax.16s <- gsub("([^;]*); ([^;]*); ([^;]*); ", '\\1; \\2; \\3\n', tax.16s)
names(tax.16s) <- otus[[1]]


## remove the 6-digit suffix of the sample IDs in the mapping file.
## meta.sid <- gsub(".[0-9]{6}$", "", as.character(meta[[1]]))
meta.sid <- as.character(meta[[1]])
rownames(meta) <- meta.sid
sample.ids <- intersect(meta.sid, colnames(otus))
meta <- meta[sample.ids, ]


rownames(otus) <- otus[[1]]
otus <- data.frame(t(otus[, sample.ids]), check.names=FALSE)


## add the categorical fields in the meta data as predictors
if (! is.null(opt$add_numeric)) {
    add.pred <- strsplit(opt$add_numeric, ',', fixed=TRUE)[[1]]
    not.in <- which(! add.pred %in% colnames(meta))
    if (length(not.in) > 0) {
        stop(paste(c(add.pred[not.in], "not in the meta data!!!"), collapse=' '))
    }
    otus <- cbind(as.numeric(meta[, add.pred]), otus)
    colnames(otus)[1:length(add.pred)] <- add.pred
}
## add numeric fields as predictors
if (! is.null(opt$add_category)) {
    add.pred <- strsplit(opt$add_category, ',', fixed=TRUE)[[1]]
    not.in <- which(! add.pred %in% colnames(meta))
    if (length(not.in) > 0) {
        stop(paste(c(add.pred[not.in], "not in the meta data!!!"), collapse=' '))
    }
    if (opt$debug) save.image('debug.Rdata')
    x <- meta[, add.pred, drop=FALSE]
    dummy <- dummyVars(~., data=x)
    otus <- cbind(predict(dummy, x), otus)
}

pdf(sprintf("%s.pdf", opt$output))

min.sample.size <- 12
accuracies <- data.frame()
big.tuned.list <- list()

for(label in outcome.col) {
    if(opt$verbose)
        cat("=========", label, ":\n")


    outcome <- as.character(meta[[label]])
    ## the space in the class names cause problem for some models
    outcome <- as.factor(gsub("[[:space:]]+", "_", outcome))

    if(opt$verbose) {
        cat("---- a glimpse of outcome:\n")
        print(table(outcome))
    }

    ## remove NA values
    outcome.na <- is.na(outcome)
    ## if more than half of the samples are not numeric
    if(sum(outcome.na) > 0.5 * length(outcome)) {
        if(opt$verbose)
            cat("outcome has less than half of numeric values. skip it.\n")
        next
    }

    outcome <- outcome[! outcome.na]
    ## if there less than 2 classes in this category
    if(nlevels(outcome) < 2) {
        if(opt$verbose)
            cat("outcome has less than 2 classes. skip it.\n")
        next
    }

    train.set <- otus[! outcome.na, ]


    if (opt$split < 1) {
        set.seed(1)
        training.rows <- createDataPartition(outcome, p=opt$split, list=F)
    } else {
        training.rows <- 1:length(outcome)
    }

    train.full <- train.set[training.rows, ]
    test.full <- train.set[-training.rows, ]
    train.outcome <- outcome[training.rows]
    test.outcome <- outcome[-training.rows]

    nzv <- nearZeroVar(train.full)
    if (length(nzv) > 0) {
        train.full <- train.full[, -nzv]
        test.full <- test.full[, -nzv]
    }
    tooHigh <- findCorrelation(cor(train.full), .9)
    if (length(tooHigh) > 0) {
        train.full <- train.full[, -tooHigh]
        test.full <- test.full[, -tooHigh]
    }

    ## do feature selection
    if (! is.null(opt$features)) {
        train.full
    }
    ## save the test set results in a data.frame
    if (length(test.outcome) > 0)
        testResults <- data.frame(obs=test.outcome)

    if (opt$debug) save.image('debug.Rdata')

    ## benchmark the specified models
    tuned.list <- list()

    accuracies <- data.frame()
    for (model in models) {
        tuned <- classification.tune(train.full, train.outcome, model)
        ## if(is.na(tuned) | is.null(tuned)) next
        if (class(tuned) != 'train') {
            cat("Warning message:\nModel ", model, " failed.\n")
            next
        }
        tuned.list[[model]] <- tuned

        if (opt$verbose) {
            print(tuned)
        }
        ## add a new column - Model
        tuned$resample$Model <- model
        accuracies <- rbind(accuracies, tuned$resample, c('Kappa', 'Accuracy'))

        if (length(test.outcome) > 0)
            testResults[model] <- predict(tuned, test.full)
        imp <- varImp(tuned)
        pimp <- plot.imp(imp, tax.16s, main=model)
        print(pimp, position=c(0, 0, 0.56, 1))

        save.image(sprintf("%s.Rdata", opt$output))
    }

    ## if (opt$verbose) print(accuracies)

    if (length(tuned.list) > 1) {
        ## compare the model performances
        resamp <- resamples(tuned.list)
        m.diff <- diff(resamp)
        if (opt$verbose) print(summary(m.diff))
        print(dotplot(m.diff))
    }
    big.tuned.list[[label]] <- tuned.list
    ## plot yhat vs. obs
    if (length(test.outcome) > 0) {
        method.names <- names(testResults)
        obs <- testResults[,1]
        for(i in 2:length(testResults)) {
            pred <- testResults[,i]
            plot(pred ~ obs,
                 xlab=method.names[1], ylab=method.names[i])
            abline(0, 1, col="red")
            ## mtext(paste(c("RMSE=", "R^2="),
            ##             c(RMSE())))
            rmse <- format(round(caret::RMSE(pred, obs), 2), nsmall=2)
            rsq <- format(round(caret::R2(pred, obs), 2), nsmall=2)
            legend("topleft", text.col="blue", "ab",
                   paste(c("RMSE","R^2 "), c(rmse, rsq), sep='=', collapse='\n'))
        }
    }
}

dev.off()

save.image(sprintf("%s.Rdata", opt$output))