# The purpose of this script is to create manifest files for the demultiplexed
# sequence data from UMinnesota.
# It is assumed that the working directory is the jobs folder.
import os
import sys

import pandas as pd

# Directory holding the raw sequence files, relative to the jobs folder.
RAW_SEQ_DIR = "../raw_seq/"
# Directory the manifest files are written to, relative to the jobs folder.
METADATA_DIR = "../metadata/"

# Column layout of the paired-end 33 manifest.
MANIFEST_COLUMNS = ["sample-id", "absolute-filepath", "direction"]

# Maps the read token found in a file name to the manifest direction value.
_DIRECTIONS = {"R1": "forward", "R2": "reverse"}


def _parse_entry(entry):
    """Return ``(sample_id, direction)`` for a file name, or ``None``.

    The file name is split on underscores. The sample id is the first two
    fields joined by an underscore, and the fourth field must be ``R1`` or
    ``R2``. ``None`` is returned when the name has fewer than four fields or
    the fourth field is not a recognized read token.
    """
    fields = entry.split("_")
    if len(fields) < 4:
        return None
    direction = _DIRECTIONS.get(fields[3])
    if direction is None:
        return None
    return fields[0] + "_" + fields[1], direction


def build_manifest(file_names, seq_dir):
    """Build the manifest dataframe for the given sequence file names.

    Args:
        file_names: Iterable of file names (not paths) found in ``seq_dir``.
        seq_dir: Absolute path of the directory containing the files; it is
            joined with each file name to form the ``absolute-filepath``.

    Returns:
        A ``pandas.DataFrame`` indexed by ``sample-id`` with the columns
        ``absolute-filepath`` and ``direction``. Rows follow the sorted order
        of ``file_names`` so the output is the same on every run.

    File names that cannot be parsed are skipped with a warning on stderr.
    Validation happens before anything is recorded, so every row always has
    all three values.
    """
    rows = []
    for entry in sorted(file_names):
        parsed = _parse_entry(entry)
        if parsed is None:
            sys.stderr.write(
                "Something went wrong! Skipping unrecognized file name: %r\n"
                % entry
            )
            continue
        sample_id, direction = parsed
        rows.append((sample_id, os.path.join(seq_dir, entry), direction))

    manifest = pd.DataFrame(rows, columns=MANIFEST_COLUMNS)
    return manifest.set_index("sample-id")


def main():
    """Read the raw sequence directory and write the three manifest files."""
    # The manifest file requires absolute paths.
    seq_dir = os.path.abspath(RAW_SEQ_DIR)
    manifest = build_manifest(os.listdir(RAW_SEQ_DIR), seq_dir)

    # Split manifest files into 16S and ITS datasets based on the sample id.
    manifest_16s = manifest[manifest.index.str.contains("_16S")]
    manifest_its = manifest[manifest.index.str.contains("_ITS")]

    # Save manifest files.
    manifest.to_csv(os.path.join(METADATA_DIR, "pe-33-manifest"))
    manifest_16s.to_csv(os.path.join(METADATA_DIR, "pe-33-manifest_16S"))
    manifest_its.to_csv(os.path.join(METADATA_DIR, "pe-33-manifest_ITS"))


if __name__ == "__main__":
    main()