## Re-mapping of the Illumina BovineSNP50 oligos onto the bovine genome.
##
## Inputs (paths relative to the working directory):
##   ../Material/BovineSNP50_Final_SNPs_54001.csv  - SNP positions as
##       determined by Illumina (columns: ID, BUILD, CHROM, POS)
##   ../Material/OligosVsBovineGenome.megablast    - tabular Megablast output
##       (12 columns, no header)
## Output:
##   Mapping-Illumina-Genome-25-June-2009.txt - tab-separated table holding,
##       per SNP, the Illumina position and the position derived from the
##       Megablast hits (NA where no unambiguous position was found).
## Summary counts are printed to the console.

# Chromosome names that are accepted as hit targets (1..29, X and Y).
CowChrom <- c(as.character(1:29), "X", "Y")

# Hits with a shorter alignment than this are discarded.
min.align.len <- 49

## Preparation of the Megablast output table (D) and of the position table
## (POSILMN).
## The Megablast output table is processed such that its fields are atomic.
## The position table is filtered (the build information is discarded).

# Reading in the file containing the chromosomal positions as determined by
# Illumina
POSORIG <- read.table(file = "../Material/BovineSNP50_Final_SNPs_54001.csv",
                      header = TRUE, sep = ",")
names(POSORIG) <- c("ID", "BUILD", "CHROM", "POS")

# Reading in the file containing the Megablast output
D <- read.table(file = "../Material/OligosVsBovineGenome.megablast",
                header = FALSE)
names(D) <- c("QUERY", "SUBJECT", "IDENT", "ALIGN.LEN", "MISMATCHES", "GAPS",
              "Q.START", "Q.END", "S.START", "S.END", "E.VALUE", "BIT.SCORE")
cat("Number of Hits:\t", nrow(D), "\n")

D <- subset(D, ALIGN.LEN >= min.align.len)
n <- nrow(D)
cat("Number of hits with an alignment length of ", min.align.len, " or greater:\t", n, "\n")

# Returns the k-th field of every element of a strsplit() result; records
# with fewer than k fields yield NA. Vectorised, so it is also safe when the
# hit table is empty (a `1 : n` loop would run with i = 1 and i = 0 there).
nth.field <- function(parts, k) {
  vapply(parts, function(p) p[k], character(1))
}

# Pulling out the chromosome from the column SUBJECT (i.e. the hit sequence or
# genome sequence): it is the third ":"-separated field.
subject.parts <- strsplit(as.character(D$SUBJECT), ":", fixed = TRUE)

# Pulling out the SNP ID and the strand information from the column QUERY
# (i.e. the oligo sequence or search sequence): "ID|ILMNSTRAND|SOURCESTRAND".
query.parts <- strsplit(as.character(D$QUERY), "|", fixed = TRUE)

# Direct column assignment keeps the new columns as character vectors on
# every R version (cbind() would turn them into factors before R 4.0).
D$SNPID <- nth.field(query.parts, 1)
D$CHROM <- nth.field(subject.parts, 3)
D$ILMNSTRAND <- nth.field(query.parts, 2)
D$SOURCESTRAND <- nth.field(query.parts, 3)

# Keep only hits on a chromosome listed in CowChrom. Note that CowChrom also
# contains "Y", although the message below only mentions autosomes and X.
D <- subset(D, CHROM %in% CowChrom)
n <- nrow(D)
cat("Number of hits on autosome or X chromosome:\t", n, "\n")

POSILMN <- subset(POSORIG, select = c("CHROM", "ID", "POS"))

# Position assigned to a SNP from a single hit: one base beyond S.END in the
# direction of the alignment (S.END + 1 for hits with S.START < S.END,
# S.END - 1 otherwise).
# NOTE(review): presumably the SNP lies directly after the last aligned base
# of the oligo - confirm against the oligo design.
snp.position <- function(s.start, s.end) {
  if (s.start < s.end) s.end + 1 else s.end - 1
}

l <- nrow(POSILMN)

n.multiplehit <- 0
n.nohit <- 0
n.uniquehit <- 0
n.method1 <- 0
n.method2 <- 0
n.method3 <- 0

# Results default to NA; they are only filled in where a hit resolves them.
CHROM <- rep(NA_character_, l)
POS <- rep(NA_real_, l)

# Group the row numbers of D by SNP ID once, instead of scanning the whole
# hit table again for every SNP. Rows with an NA SNPID are dropped by split(),
# which matches the behaviour of subset(D, SNPID == id).
hit.rows <- split(seq_len(nrow(D)), D$SNPID)
snp.index <- match(as.character(POSILMN$ID), names(hit.rows))

for (i in seq_len(l)) {
  k <- snp.index[i]
  D.aux <- if (is.na(k)) {
    D[0, , drop = FALSE]
  } else {
    D[hit.rows[[k]], , drop = FALSE]
  }
  n.hits <- nrow(D.aux)

  ### No hit: chromosome and position stay NA
  if (n.hits == 0) {
    n.nohit <- n.nohit + 1
    next
  }

  ### Just one hit: take it as it is
  if (n.hits == 1) {
    n.uniquehit <- n.uniquehit + 1
    CHROM[i] <- as.character(D.aux$CHROM[1])
    POS[i] <- snp.position(D.aux$S.START[1], D.aux$S.END[1])
    next
  }

  ### More than one hit
  n.multiplehit <- n.multiplehit + 1

  ###########################################################
  ##### D I S A M B I G U A T I O N ########################
  ###########################################################

  ### Method 1
  ### Pull out the hit(s) with the best bit score.
  ### If there is just one hit with the best bit score we are done.
  max.bitscore <- max(D.aux$BIT.SCORE)
  D.aux <- subset(D.aux, BIT.SCORE == max.bitscore)

  if (nrow(D.aux) == 1) {
    CHROM[i] <- as.character(D.aux$CHROM[1])
    POS[i] <- snp.position(D.aux$S.START[1], D.aux$S.END[1])
    n.method1 <- n.method1 + 1
    next
  }

  ### Two or more equally good hits
  chroms <- unique(D.aux$CHROM)
  if (length(chroms) == 1 && length(unique(D.aux$S.START)) > 1) {
    ### Method 2
    ### One chromosome but several different positions: keep the
    ### chromosome, leave the position NA.
    CHROM[i] <- as.character(chroms)
    n.method2 <- n.method2 + 1
  } else {
    ### Method 3
    ### Several chromosomes (or any other unresolved case): chromosome and
    ### position both stay NA.
    n.method3 <- n.method3 + 1
  }
}

CHROM.MAP <- CHROM
POS.MAP <- POS

# Append the mapped chromosome/position and put the columns in output order.
POSILMN <- cbind(POSILMN, CHROM.MAP, POS.MAP)
names(POSILMN) <- c("CHROM.ILMN", "ID", "POS.ILMN", "CHROM.MAP", "POS.MAP")
POSILMN <- subset(POSILMN,
                  select = c("ID", "CHROM.ILMN", "POS.ILMN", "CHROM.MAP", "POS.MAP"))

write.table(POSILMN, file = "Mapping-Illumina-Genome-25-June-2009.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")

cat("Number of multiple hits:\t", n.multiplehit, "\n")
cat("Number of oligos without hit:\t", n.nohit, "\n")
cat("Number of oligos with unique hit:\t", n.uniquehit, "\n")
cat("Disambiguation\n")
cat("\tMethod 1:\t", n.method1, "\n")
cat("\tMethod 2:\t", n.method2, "\n")
cat("\tMethod 3:\t", n.method3, "\n")