#This is a summary of the commands used to execute transcriptome analyses published in Babonis et al XXXX.
#Start
#Sample identifiers mesenteries: Mes, M3, M4, nematosomes: Nem, N1, N2, tentacles: Ten, T3, T4
#Format for left (R1) and right (R2) raw reads: M3.R1.gz, M3.R2.gz

1. Adapter trimming (Trimmomatic)
java -jar /usr/local/Trimmomatic-0.32/trimmomatic-0.32.jar PE -threads 12 -phred33 \
/home/babonis/development/01-Nematosomes/20-PostReview/00-DATA/M3.R1.gz \
/home/babonis/development/01-Nematosomes/20-PostReview/00-DATA/M3.R2.gz \
M3.OUT.trim.R1.fq M3.OUT.trim.unp1.fq M3.OUT.trim.R2.fq M3.OUT.trim.unp2.fq \
ILLUMINACLIP:/usr/local/Trimmomatic-0.32/adapters/TruSeq3-PE.fa:2:30:12:1:true MINLEN:36 > trimmo.PE.log 2>&1
cat M3.OUT.trim.unp1.fq M3.OUT.trim.unp2.fq > M3.OUT.trim.unp_all.fq

2. Error Correction (Allpaths)
perl /usr/local/allpathslg-44837/src/ErrorCorrectReads.pl \
PAIRED_READS_A_IN=M3.OUT.trim.R1.fq \
PAIRED_READS_B_IN=M3.OUT.trim.R2.fq \
UNPAIRED_READS_IN=M3.OUT.trim.unp_all.fq \
PAIRED_SEP=100 PHRED_ENCODING=33 THREADS=30 READS_OUT=M3ECR_RNA > M3.ecr.log 2>&1 &

3. de novo Assembly (Trinity)
#full transcriptome (N = 333942 transcripts)
cat *ECR_RNA.paired.R1.fastq *ECR_RNA.unpaired.fastq > allRNA.R1.unp.fq
cat *ECR_RNA.paired.R2.fastq > allRNA.R2.fq
/usr/local/trinityrnaseq-2.0.6/Trinity/Trinity.pl \
--seqType fq --max_memory 750G --CPU 5 --left allRNA.R1.unp.fq --right allRNA.R2.fq --full_cleanup > trin.log 2>&1 &
#working transcriptome (N = 32706 transcripts)
cat MesECR_RNA.paired.R1.fastq NemECR_RNA.paired.R1.fastq TenECR_RNA.paired.R1.fastq MesECR_RNA.unpaired.fastq NemECR_RNA.unpaired.fastq TenECR_RNA.unpaired.fastq > combined.R1.unp.fq
cat MesECR_RNA.paired.R2.fastq NemECR_RNA.paired.R2.fastq TenECR_RNA.paired.R2.fastq > combined.R2.fq
/usr/local/trinity/Trinity.pl \
--seqType fq --JM 750G --CPU 5 --left combined.R1.unp.fq --right combined.R2.fq --full_cleanup > trin.log 2>&1 &

4.
Mapping and abundance estimation (Bowtie2, BedTools)
bowtie2-build ./NvecRef.fa Nvec_ref
#sample M3 example
bowtie2 -x ../Nvec_ref -1 M3ECR_RNA.paired.R1.fastq -2 M3ECR_RNA.paired.R2.fastq -U M3ECR_RNA.unpaired.fastq -S M3ECR.sam 2> M3ECR.err
samtools view -bS -o M3ECR.bam M3ECR.sam 2> M3ECR.bam.err &
samtools sort M3ECR.bam M3ECR.sorted &
samtools index M3ECR.sorted.bam 2> M3ECR.index.err &
make_bed_from_fasta.py NvecRef.fa > ./NvecRef.bed
bedtools multicov -q 30 -p -bams \
MesECR.sorted.bam M3ECR.sorted.bam M4ECR.sorted.bam \
NemECR.sorted.bam N1ECR.sorted.bam N2ECR.sorted.bam \
TenECR.sorted.bam T3ECR.sorted.bam T4ECR.sorted.bam \
-bed NvecRef.bed > mbc.out 2> mbc.err &
perl remove_extra_bed_cols.pl mbc.out > count.table

5. Differential expression analysis (DESeq2)
#implemented in R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
#count_table_17346 = count.table limited to only transcripts that also had predicted proteins from transdecoder (below)
source("http://bioconductor.org/biocLite.R")
biocLite("DESeq2")
biocLite("RColorBrewer")
biocLite("pheatmap")
library("DESeq2")
library("RColorBrewer")
library("pheatmap")
CountTable <- read.table("C:/count_table_17346.txt", header=TRUE, row.names=1)
CT <- CountTable[,c("Mes","M3","M4","Nem","N1","N2","Ten","T3","T4")]
samples <- data.frame(row.names=c("Mes","M3","M4","Nem","N1","N2","Ten","T3","T4"), condition=as.factor(c(rep("mesenteries",3),rep("nematosomes",3),rep("tentacles",3))))
myCDS <- DESeqDataSetFromMatrix(countData = CT, colData=samples, design=~condition)
myCDS_1 <- DESeq(myCDS)
mesnem_results <- results(myCDS_1,contrast=c("condition","mesenteries","nematosomes"))
mesten_results <- results(myCDS_1,contrast=c("condition","mesenteries","tentacles"))
nemten_results <- results(myCDS_1,contrast=c("condition","nematosomes","tentacles"))
#heatmap
vsd <- varianceStabilizingTransformation(myCDS_1)
select <- order(rowSums(counts(myCDS_1,normalized=TRUE)),decreasing=TRUE)[1:1000]
color <- colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100)
pheatmap(assay(vsd)[select,], color=color, cutree_col = 3,cluster_rows=TRUE, clustering_distance_rows="maximum", clustering_method = "ward.D2", show_rownames=FALSE, cluster_cols=TRUE)
#PCA plot
plotPCA(vsd, intgroup="condition",ntop=1000)

6. Protein prediction (TransDecoder)
#Edwardsiella lineata example
curl -O http://cnidarians.bu.edu/EdwardBase/cgi-bin/blast/ElT1.fasta
/usr/local/TransDecoder-2.0.1/TransDecoder.LongOrfs -t ElT1.fasta

7. Orthology Analysis (OrthoMCL)
#Edwardsiella lineata example
#prepare fasta files for blast
mkdir compliantFasta
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclAdjustFasta Elin ElT1.fasta.transdecoder_dir/longest_orfs.pep 2
#all adjusted fasta files in compliantFasta
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclFilterFasta ./compliantFasta 10 20
#output in goodProteins.fasta
makeblastdb -in goodProteins.fasta -dbtype prot
blastp -db goodProteins.fasta -query goodProteins.fasta -out all.x.all -outfmt 6 -num_threads 20 &
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclBlastParser all.x.all compliantFasta >> similarSequences.txt &
#prepare mysql database
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclLoadBlast config_file similarSequences.txt &
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclPairs config_file orthomclPairs.out cleanup=no &
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclDumpPairsFiles config_file &
/usr/local/bin/mcl mclInput --abc -I 1.5 -o mclOutput &
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclMclToGroups orthomcl_group 1000 < mclOutput > groups.txt &