#This is a summary of the commands used to execute transcriptome analyses published in Babonis et al XXXX.

#Start
#Sample identifiers
        mesenteries: Mes, M3, M4
        nematosomes: Nem, N1, N2
        tentacles:   Ten, T3, T4

#Format for left (R1) and right (R2) raw reads:  M3.R1.gz, M3.R2.gz

1. Adapter trimming (Trimmomatic)
# Adapter-trim the paired-end M3 reads with Trimmomatic 0.32 (PE mode, 12 threads, phred33).
# Arguments after the two inputs are the four outputs, in order:
#   paired R1, unpaired R1, paired R2, unpaired R2.
# ILLUMINACLIP uses the TruSeq3-PE adapter file; MINLEN:36 sets the minimum read length kept.
# stdout and stderr are both captured in trimmo.PE.log.
java -jar /usr/local/Trimmomatic-0.32/trimmomatic-0.32.jar PE \
    -threads 12 \
    -phred33 \
    /home/babonis/development/01-Nematosomes/20-PostReview/00-DATA/M3.R1.gz \
    /home/babonis/development/01-Nematosomes/20-PostReview/00-DATA/M3.R2.gz \
    M3.OUT.trim.R1.fq \
    M3.OUT.trim.unp1.fq \
    M3.OUT.trim.R2.fq \
    M3.OUT.trim.unp2.fq \
    ILLUMINACLIP:/usr/local/Trimmomatic-0.32/adapters/TruSeq3-PE.fa:2:30:12:1:true \
    MINLEN:36 \
    > trimmo.PE.log 2>&1

# Pool the two unpaired-read files into one file for the error-correction step.
cat \
    M3.OUT.trim.unp1.fq \
    M3.OUT.trim.unp2.fq \
    > M3.OUT.trim.unp_all.fq

2. Error Correction (Allpaths)
# Error-correct the trimmed M3 reads with the ALLPATHS-LG ErrorCorrectReads.pl script.
#   PAIRED_READS_A_IN / PAIRED_READS_B_IN : trimmed paired R1 / R2 reads
#   UNPAIRED_READS_IN                     : pooled unpaired reads from trimming
#   READS_OUT                             : output prefix (M3ECR_RNA)
# stdout and stderr are captured in M3.ecr.log; the trailing & runs the job in the background.
perl /usr/local/allpathslg-44837/src/ErrorCorrectReads.pl \
    PAIRED_READS_A_IN=M3.OUT.trim.R1.fq \
    PAIRED_READS_B_IN=M3.OUT.trim.R2.fq \
    UNPAIRED_READS_IN=M3.OUT.trim.unp_all.fq \
    PAIRED_SEP=100 \
    PHRED_ENCODING=33 \
    THREADS=30 \
    READS_OUT=M3ECR_RNA \
    > M3.ecr.log 2>&1 &

3. de novo Assembly (Trinity)
#full transcriptome (N = 333942 transcripts)
# Pool the error-corrected reads of every *ECR_RNA sample in the current directory.
# "Left" input = paired R1 reads plus the unpaired reads; "right" input = paired R2 reads.
cat \
    *ECR_RNA.paired.R1.fastq \
    *ECR_RNA.unpaired.fastq \
    > allRNA.R1.unp.fq
cat \
    *ECR_RNA.paired.R2.fastq \
    > allRNA.R2.fq

# De novo assembly with Trinity 2.0.6 (750G memory cap, 5 CPUs).
# stdout and stderr are captured in trin.log; the job runs in the background.
/usr/local/trinityrnaseq-2.0.6/Trinity/Trinity.pl \
    --seqType fq \
    --max_memory 750G \
    --CPU 5 \
    --left allRNA.R1.unp.fq \
    --right allRNA.R2.fq \
    --full_cleanup \
    > trin.log 2>&1 &

#working transcriptome (N = 32706 transcripts)
# Pool the error-corrected reads of the Mes, Nem and Ten samples only.
# "Left" input = paired R1 reads plus the unpaired reads; "right" input = paired R2 reads.
cat \
    MesECR_RNA.paired.R1.fastq \
    NemECR_RNA.paired.R1.fastq \
    TenECR_RNA.paired.R1.fastq \
    MesECR_RNA.unpaired.fastq \
    NemECR_RNA.unpaired.fastq \
    TenECR_RNA.unpaired.fastq \
    > combined.R1.unp.fq
cat \
    MesECR_RNA.paired.R2.fastq \
    NemECR_RNA.paired.R2.fastq \
    TenECR_RNA.paired.R2.fastq \
    > combined.R2.fq

# De novo assembly with the Trinity install at /usr/local/trinity (--JM sets the memory cap here).
# stdout and stderr are captured in trin.log; the job runs in the background.
# NOTE(review): trin.log is the same log name used by the full-transcriptome run;
# presumably the two assemblies were run in separate directories -- confirm.
/usr/local/trinity/Trinity.pl \
    --seqType fq \
    --JM 750G \
    --CPU 5 \
    --left combined.R1.unp.fq \
    --right combined.R2.fq \
    --full_cleanup \
    > trin.log 2>&1 &

4. Mapping and abundance estimation (Bowtie2, BedTools) 
# Build the Bowtie2 index (basename Nvec_ref) from the reference transcriptome.
bowtie2-build ./NvecRef.fa Nvec_ref

#sample M3 example
# Align paired (-1/-2) and unpaired (-U) error-corrected reads; SAM output to M3ECR.sam,
# stderr to M3ECR.err.
# NOTE(review): the index is referenced as ../Nvec_ref, so this is presumably run from a
# per-sample subdirectory of the directory where the index was built -- confirm.
bowtie2 \
    -x ../Nvec_ref \
    -1 M3ECR_RNA.paired.R1.fastq \
    -2 M3ECR_RNA.paired.R2.fastq \
    -U M3ECR_RNA.unpaired.fastq \
    -S M3ECR.sam \
    2> M3ECR.err

# SAM -> BAM -> sorted BAM -> index. Each step reads the file written by the previous one,
# so the steps run in the foreground, in order (backgrounding them would let a step start
# before its input is complete).
# "samtools sort IN.bam PREFIX" writes PREFIX.bam, i.e. M3ECR.sorted.bam, which is indexed next.
samtools view -bS -o M3ECR.bam M3ECR.sam 2> M3ECR.bam.err
samtools sort M3ECR.bam M3ECR.sorted
samtools index M3ECR.sorted.bam 2> M3ECR.index.err

# Write a BED file derived from the reference FASTA for use by bedtools multicov.
make_bed_from_fasta.py NvecRef.fa > ./NvecRef.bed

# Count alignments per reference interval across all nine sorted BAM files.
#   -q 30 : minimum mapping quality
#   -p    : count proper pairs only
# The BAM files are listed in the order Mes, M3, M4, Nem, N1, N2, Ten, T3, T4.
# Counts go to mbc.out, stderr to mbc.err. The command runs in the foreground because
# the next step reads mbc.out and must not start before it is complete.
bedtools multicov -q 30 -p -bams \
    MesECR.sorted.bam M3ECR.sorted.bam M4ECR.sorted.bam \
    NemECR.sorted.bam N1ECR.sorted.bam N2ECR.sorted.bam \
    TenECR.sorted.bam T3ECR.sorted.bam T4ECR.sorted.bam \
    -bed NvecRef.bed > mbc.out 2> mbc.err

# Reduce the multicov output to the count table used for differential expression.
perl remove_extra_bed_cols.pl mbc.out > count.table

5. Differential expression analysis (DESeq2)
#implemented in R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
#count_table_17346 = count.table limited to only transcripts that also had predicted proteins from transdecoder (below)

# Install (once) and load the packages used below.
# biocLite is the Bioconductor installer matching the R 3.2.3 session used here.
# NOTE(review): newer R releases install Bioconductor packages with
# BiocManager::install() instead -- confirm before re-running on a current R.
source("http://bioconductor.org/biocLite.R")
biocLite("DESeq2")
biocLite("RColorBrewer")
biocLite("pheatmap")
library("DESeq2")
library("RColorBrewer")
library("pheatmap")

# Read the count table (first column = transcript IDs used as row names).
CountTable <- read.table(
  "C:/count_table_17346.txt",
  header = TRUE,
  row.names = 1
)

# Sample columns in tissue order: three mesentery, three nematosome and
# three tentacle libraries.
sample_ids <- c("Mes", "M3", "M4", "Nem", "N1", "N2", "Ten", "T3", "T4")
CT <- CountTable[, sample_ids]

# One row per sample; `condition` is the tissue each library came from.
samples <- data.frame(
  row.names = sample_ids,
  condition = as.factor(
    rep(c("mesenteries", "nematosomes", "tentacles"), each = 3)
  )
)

# Fit the DESeq2 model with tissue as the only design factor.
myCDS <- DESeqDataSetFromMatrix(
  countData = CT,
  colData = samples,
  design = ~condition
)
myCDS_1 <- DESeq(myCDS)

# Pairwise tissue comparisons; each contrast is c(factor, first level,
# second level).
mesnem_results <- results(
  myCDS_1,
  contrast = c("condition", "mesenteries", "nematosomes")
)
mesten_results <- results(
  myCDS_1,
  contrast = c("condition", "mesenteries", "tentacles")
)
nemten_results <- results(
  myCDS_1,
  contrast = c("condition", "nematosomes", "tentacles")
)

#heatmap
# Variance-stabilized values for the 1000 transcripts with the largest summed
# normalized counts; columns are clustered and cut into 3 groups.
vsd <- varianceStabilizingTransformation(myCDS_1)
select <- order(
  rowSums(counts(myCDS_1, normalized = TRUE)),
  decreasing = TRUE
)[1:1000]
color <- colorRampPalette(rev(brewer.pal(n = 9, name = "RdYlBu")))(100)
pheatmap(
  assay(vsd)[select, ],
  color = color,
  # Full argument name; the abbreviated `cutree_col` relies on partial matching.
  cutree_cols = 3,
  cluster_rows = TRUE,
  clustering_distance_rows = "maximum",
  clustering_method = "ward.D2",
  show_rownames = FALSE,
  cluster_cols = TRUE
)

#PCA plot
# PCA of the variance-stabilized data, grouped by tissue, with ntop = 1000.
plotPCA(vsd, intgroup = "condition", ntop = 1000)

6. Protein prediction (TransDecoder)
#Edwardsiella lineata example
# Download the Edwardsiella lineata transcriptome (saved under its remote name, ElT1.fasta).
curl --remote-name http://cnidarians.bu.edu/EdwardBase/cgi-bin/blast/ElT1.fasta

# Extract long open reading frames with TransDecoder 2.0.1.
# The peptide output used downstream is ElT1.fasta.transdecoder_dir/longest_orfs.pep.
/usr/local/TransDecoder-2.0.1/TransDecoder.LongOrfs \
    -t ElT1.fasta

7. Orthology Analysis (OrthoMCL)
#Edwardsiella lineata example
#prepare fasta files for blast

# Directory that holds the OrthoMCL-compliant FASTA files, one per species.
mkdir compliantFasta

# Rewrite the E. lineata peptide headers into OrthoMCL format with taxon code "Elin";
# the trailing 2 is the header field number passed as the ID field.
# NOTE(review): orthomclAdjustFasta presumably writes Elin.fasta to the current directory;
# the adjusted files must end up in compliantFasta before the filter step -- confirm.
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclAdjustFasta Elin ElT1.fasta.transdecoder_dir/longest_orfs.pep 2
#all adjusted fasta files in compliantFasta

# Filter the adjusted proteins (arguments: input directory, 10, 20).
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclFilterFasta ./compliantFasta 10 20
#output in goodProteins.fasta

# All-versus-all BLASTP of the filtered proteins (tabular output, 20 threads).
makeblastdb -in goodProteins.fasta -dbtype prot
blastp -db goodProteins.fasta -query goodProteins.fasta -out all.x.all -outfmt 6 -num_threads 20

# From here on every step reads the output of the step before it, so each command runs
# in the foreground, in order (backgrounding them would let a step start before its
# input is complete).
# NOTE(review): >> appends, so an existing similarSequences.txt from an earlier run
# would be extended rather than replaced -- confirm this is intended.
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclBlastParser all.x.all compliantFasta >> similarSequences.txt
#prepare mysql database
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclLoadBlast config_file similarSequences.txt
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclPairs config_file orthomclPairs.out cleanup=no
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclDumpPairsFiles config_file

# Cluster the pairs with MCL (inflation 1.5) and convert the clusters to named groups
# (prefix orthomcl_group, numbering starting at 1000).
/usr/local/bin/mcl mclInput --abc -I 1.5 -o mclOutput
/usr/local/orthomclSoftware-v2.0.9/bin/orthomclMclToGroups orthomcl_group 1000 < mclOutput > groups.txt