#The purpose of this script is to create manifest files for the demultiplexed sequence data from UMinnesota. 
#It is assumed that the working directory is the jobs folder.
import pandas as pd
import os as os

#Get the names of all sequence files in the raw_seq directory.
#os.listdir returns entries in arbitrary order, so sort them to make the
#row order of the generated manifests reproducible between runs.
file_list = sorted(os.listdir("../raw_seq/"))

#Lists that will be used as column data for constructing the dataframe.
#They are only ever appended to together, so they always stay the same length.
sample_id_amplicon = []
direction = []
path = []

#The manifest file requires absolute paths.
abs_path = os.path.abspath("../raw_seq/")

#Map the read tag found in a file name to the manifest "direction" value.
read_directions = {"R1": "forward", "R2": "reverse"}

#Append file specific data to the lists.
#File names are split on "_": fields 0 and 1 together form the sample id
#(the 16S/ITS split below relies on "_16S"/"_ITS" appearing in that id) and
#field 3 is the read tag (R1 or R2).
for entry in file_list:
    holder = entry.split("_")
    #Too few fields, or an unknown read tag, means this is not a sequence file
    #we understand. Report it by name and skip it entirely; recording only part
    #of its data would leave the lists with different lengths and break the
    #dataframe construction below.
    read_tag = holder[3] if len(holder) > 3 else None
    if read_tag not in read_directions:
        print("Something went wrong! Skipping unrecognised file: %s" % entry)
        continue
    sample_id_amplicon.append(holder[0] + "_" + holder[1])
    path.append(os.path.join(abs_path, entry))
    direction.append(read_directions[read_tag])

#Construct dataframe in paired-end 33 manifest format.
#The explicit column list fixes the column order regardless of dict ordering.
df = pd.DataFrame(
    {
        "sample-id": sample_id_amplicon,
        "absolute-filepath": path,
        "direction": direction,
    },
    columns=["sample-id", "absolute-filepath", "direction"],
)
df = df.set_index("sample-id")

#Split manifest files into 16S and ITS datasets based on the sample id.
df_16S = df[df.index.str.contains("_16S")]
df_ITS = df[df.index.str.contains("_ITS")]

#Save manifest files (combined, 16S only, ITS only) as CSV.
df.to_csv("../metadata/pe-33-manifest")
df_16S.to_csv("../metadata/pe-33-manifest_16S")
df_ITS.to_csv("../metadata/pe-33-manifest_ITS")