library(seqinr)
library(ape)

#D <- read.dna("../Material/OligoSequences.fasta", format = "fasta", as.character = TRUE)


# ---- Load the input tables -------------------------------------------------
# Probe table (comma separated, with a header row). Judging by the file name
# this is presumably the BovineSNP50 chip manifest -- confirm. Only the
# columns `Name` and `AlleleA_ProbeSeq` are used further down.
D <- read.table(
	file = "../Material/BovineSNP50_B.csv",
	header = TRUE,
	sep = ","
)

#stop()

cat("Sequence data set read in.\n")

# Position table (whitespace separated, with a header row). Later code in this
# script refers to its columns ID, CHROM.ILMN, POS.ILMN, CHROM.MAP and POS.MAP.
P <- read.table(
	file = "../Material/Mapping-Illumina-Genome-25-June-2009.txt",
	header = TRUE
)

cat("Position data read in.\n")

# Fail early with a readable message if the probe table lacks a required
# column, instead of a cryptic "object not found" error from subset().
required_cols <- c("Name", "AlleleA_ProbeSeq")
missing_cols <- setdiff(required_cols, names(D))
if (length(missing_cols) > 0) {
	stop(
		"Probe table is missing required column(s): ",
		paste(missing_cols, collapse = ", "),
		call. = FALSE
	)
}

# Keep only the SNP name and its allele-A probe sequence.
D <- subset(D, select = c(Name, AlleleA_ProbeSeq))

n <- nrow(D)

cat("Number of lines:\t", n, "\n")

# Probe sequences that occur in more than one row of D. table() returns the
# distinct sequences in sorted order and ignores missing values.
MultipleSeqs <- names(which(table(D$AlleleA_ProbeSeq) > 1))

n.ms <- length(MultipleSeqs)

cat("Number of oligo seqences occurring more than once:", n.ms, "\n")

# ---- Tabulate SNPs that share an identical probe sequence ------------------
# For every probe sequence in MultipleSeqs, look up the SNPs carrying it (in
# D) and keep those that also appear in the position table P. Each pair of
# such SNPs becomes one row of M, which is written to "DuplicateOligos.txt".
#
# Output columns:
#   SNP1, SNP2   the two SNP IDs, in the order in which they appear in P
#   SEQUENCE     the shared probe sequence
#   CHROM        CHROM.ILMN of SNP1
#   POS.ILMN1    POS.ILMN of SNP1
#   POS.ILMN.2   POS.ILMN of SNP2
#   POS.MAP      POS.MAP of SNP2
#
# The columns are assembled explicitly by name. Assigning names by position
# after two merge() calls is fragile: merge() moves its key column to the
# front, which put the second ID under the "SNP1" label while POS.ILMN1 still
# held the position of the first ID.

# One slot per duplicated sequence; filled in the loop and bound once at the
# end rather than growing a matrix with rbind() on every iteration.
pair_rows <- vector("list", length(MultipleSeqs))

# seq_along() yields an empty loop when there are no duplicated sequences
# (1 : 0 would iterate over 1 and 0).
for (i in seq_along(MultipleSeqs)) {
	D.aux <- subset(D, AlleleA_ProbeSeq == MultipleSeqs[i])
	print(D.aux)

	P.aux <- subset(P, ID %in% D.aux$Name)
	print(P.aux)

	ids <- unique(as.character(P.aux$ID))

	# A pair needs at least two SNPs present in the position table.
	if (length(ids) < 2) {
		warning(
			"Fewer than two SNPs with probe sequence ", MultipleSeqs[i],
			" found in the position table; sequence skipped.",
			call. = FALSE
		)
		next
	}

	# Exactly two IDs give a single row; three or more give every pairwise
	# combination, so the result always has two ID columns.
	id_pairs <- t(combn(ids, 2))
	pair_rows[[i]] <- data.frame(
		ID1 = id_pairs[, 1],
		ID2 = id_pairs[, 2],
		SEQUENCE = MultipleSeqs[i],
		stringsAsFactors = FALSE
	)
}

# Drop the slots of skipped sequences before binding.
pair_rows <- Filter(Negate(is.null), pair_rows)

if (length(pair_rows) > 0) {
	M <- do.call(rbind, pair_rows)
} else {
	# No duplicates at all: keep the column layout so that the output file
	# still gets its header line.
	M <- data.frame(
		ID1 = character(0),
		ID2 = character(0),
		SEQUENCE = character(0),
		stringsAsFactors = FALSE
	)
}

# Row of P belonging to each member of a pair.
position_ids <- as.character(P$ID)
row_first <- match(M$ID1, position_ids)
row_second <- match(M$ID2, position_ids)

M <- data.frame(
	SNP1 = M$ID1,
	SNP2 = M$ID2,
	SEQUENCE = M$SEQUENCE,
	CHROM = P$CHROM.ILMN[row_first],
	POS.ILMN1 = P$POS.ILMN[row_first],
	POS.ILMN.2 = P$POS.ILMN[row_second],
	POS.MAP = P$POS.MAP[row_second],
	check.names = FALSE,
	stringsAsFactors = FALSE
)

# Deterministic row order in the output file.
M <- M[order(M$SNP1, M$SNP2), , drop = FALSE]

write.table(M, file = "DuplicateOligos.txt", sep = "\t", row.names = FALSE, quote = FALSE)

#stop()	

cat("Producing reverse complementary sequences\n")

# Reverse-complement a vector of DNA sequences in one vectorised pass.
#
# seqs:    character vector (or factor) of nucleotide sequences.
# returns: character vector of the same length holding the upper-case reverse
#          complement of each element; missing input stays NA.
#
# Input is upper-cased first. A, C, G, T and the IUPAC ambiguity codes are
# complemented; any other character is passed through unchanged. The whole
# vector is processed at once, so no progress counter is printed.
reverse_complement <- function(seqs) {
	seqs <- toupper(as.character(seqs))
	complemented <- chartr("ACGTMRWSYKVHDBN", "TGCAKYWSRMBDHVN", seqs)
	reversed <- vapply(
		strsplit(complemented, "", fixed = TRUE),
		function(bases) paste(rev(bases), collapse = ""),
		character(1),
		USE.NAMES = FALSE
	)
	# paste() would turn a missing sequence into the literal text "NA".
	reversed[is.na(seqs)] <- NA_character_
	reversed
}

REVCOMP <- reverse_complement(D$AlleleA_ProbeSeq)

# Append the result to D as a new column named REVCOMP.
D <- cbind(D, REVCOMP)