######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################
############### SOURCE CODE:  Host dependent in planta activation of a putative mating response between two different Rhizophagus irregularis isolates.
#05.11.2019
# Ivan Mateus
#V1

# Code should be used as a guideline to reproduce the data analysis. It is not intended to be used as a copy/paste solution.  
# Paths, input files, etc. must be modified before these commands can be executed.
######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################


######################################################################################################################################################
                # Bioinformatic pipeline
######################################################################################################################################################

####################################
#1. Download data from bioproject PRJNA494798 on cluster
####################################
# Download accession SRX4991141 into the raw-data directory.
prefetch -O /scratch/wally/FAC/FBM/DEE/isanders/popgen_to_var/IM/01_Co-inoculation_raw/ SRX4991141

#2. Change Names
# NOTE(review): this rename targets a .fastq file, which only exists after the
# extraction in step 3 has run; the same rename is repeated in step 4a.

mv SRR7416443_1.fastq SPECIES_ISOLATE_NUCLEI_1.fastq

######################################
#3. extract SRA
######################################

# Convert every .sra archive in the current directory into split FASTQ files.
# A glob is used instead of parsing the output of `ls`, and "$i" is quoted, so
# file names containing spaces or glob characters are passed through intact.
for i in *.sra; do
    [ -e "$i" ] || continue   # nothing matched: skip the unexpanded pattern
    echo "$i"
    fastq-dump --split-files -O /scratch/wally/FAC/FBM/DEE/isanders/popgen_to_var/IM/01_Co-inoculation_raw/ "$i"
done

######################################
#4. Trim reads with Trim Galore
######################################
#a. Change name of files based on samples name table

mv SRR7416443_1.fastq SPECIES_ISOLATE_NUCLEI_1.fastq
mv SRR7416443_2.fastq SPECIES_ISOLATE_NUCLEI_2.fastq

#b. Trim and quality filter with TrimGalore https://github.com/FelixKrueger/TrimGalore

#Default parameters
# For every forward-read file, take the sample prefix (the first three
# "_"-separated fields of the file name), print it, and trim the file together
# with its mate <prefix>_2.fastq. The prefix is computed once per file, and the
# loop iterates over a glob rather than over the output of `ls`.
for i in *_1.fastq; do
    [ -e "$i" ] || continue   # nothing matched: skip the unexpanded pattern
    prefix=$(echo "$i" | cut -d'_' -f1,2,3)
    echo "$prefix"
    ~/Software/TrimGalore-0.6.0/trim_galore --paired --fastqc --gzip \
        --output_dir /mnt/d/XX_UNIL_DEE/06_SingleNucleiCorradi/02_TrimmedFiles/ \
        "$i" "${prefix}_2.fastq"
done

######################################
#5. Map reads to host genome.
######################################
# STAR - 2-pass mapping with re-generated genome.

#a. Index the ref genome with STAR for 1st-pass mapping

mkdir Genome_STAR_1st-pass #(do not create this folder in the folder "RNAseq")
cd Genome_STAR_1st-pass


# The index is written into this directory.
MyGenomeDir1=$(pwd)
echo "$MyGenomeDir1"

# Copy the annotation and the genome FASTA next to the index. These have to be
# two separate commands: written on one line, cp treats every word except the
# last as a source file and the copy fails.
cp ../mesculenta_305_v6.1.gene_exons.gff3 .
cp ../Mesculenta_305_v6.fa .

# Build the index from the Mesculenta_305_v6 files copied above. CDS features
# are used as exons and grouped by their Parent/ID attributes.
~/Software/STAR/bin/Linux_x86_64_static/STAR --runMode genomeGenerate \
    --genomeDir "$MyGenomeDir1" \
    --genomeFastaFiles "$MyGenomeDir1/Mesculenta_305_v6.fa" \
    --runThreadN 8 \
    --sjdbGTFfile "$MyGenomeDir1/mesculenta_305_v6.1.gene_exons.gff3" \
    --sjdbGTFfeatureExon CDS --sjdbGTFtagExonParentTranscript Parent --sjdbGTFtagExonParentGene ID \
    --sjdbOverhang 99

# Same indexing command for the Lj3.0 files.
# NOTE(review): both commands write to the same --genomeDir, so running both
# leaves only the second index; presumably only the one matching the host of
# interest is meant to be run.
~/Software/STAR/bin/Linux_x86_64_static/STAR --runMode genomeGenerate \
    --genomeDir "$MyGenomeDir1" \
    --genomeFastaFiles /mnt/d/XX_UNIL_DEE/10_FungalMycToolKit/03_MapLotusJaponica/Genome_STAR_1st-pass/Lj3.0_pseudomol.fna \
    --runThreadN 8 \
    --sjdbGTFfile /mnt/d/XX_UNIL_DEE/10_FungalMycToolKit/03_MapLotusJaponica/Genome_STAR_1st-pass/Lj3.0_gene_models.gff3 \
    --sjdbGTFfeatureExon CDS --sjdbGTFtagExonParentTranscript Parent --sjdbGTFtagExonParentGene ID \
    --sjdbOverhang 99

#b. 1st-pass mapping ( to generate SJ.out.tab file for each sample)

# NOTE(review): step a creates the index in "Genome_STAR_1st-pass"; make sure
# the directory entered here is the one that actually holds the index.
cd Genome_file_indexed_with_STAR_1st-pass
MyGenomeDir1=$(pwd)
echo "$MyGenomeDir1"

# Map every read pair once, keep only the splice-junction table (renamed with
# the sample prefix) and delete the alignments produced by this pass.
# The loop uses a glob instead of parsing `ls`, and all expansions are quoted.
for i in *_1.fq; do
    [ -e "$i" ] || continue   # nothing matched: skip the unexpanded pattern
    echo "$i"
    prefix=$(echo "$i" | cut -d'_' -f1,2,3)   # first three "_"-separated fields
    ~/Software/STAR/bin/Linux_x86_64_static/STAR --runThreadN 8 \
        --genomeDir "$MyGenomeDir1" \
        --readFilesIn "$i" "${prefix}_2_val_2.fq" \
        --alignIntronMin 20 --alignIntronMax 5000 \
        --outFilterMismatchNoverLmax 0.4 --outFilterMismatchNmax 15 \
        --sjdbOverhang 99 \
        --outFilterIntronMotifs RemoveNoncanonical \
        --alignEndsType EndToEnd \
        --outSAMtype BAM SortedByCoordinate \
        --limitGenomeGenerateRAM 14500000000 --limitBAMsortRAM 11500000000
    mv SJ.out.tab "${prefix}.SJ.out.tab"
    rm *.bam
done

#c. Build the junction list (Global.SJ.out.tab.Pass2.sjdb) used to re-generate the genome index

mkdir Genome_file_indexed_with_STAR_2nd-pass_2samples #(do not create this folder in the folder "RNAseq")
cd Genome_file_indexed_with_STAR_2nd-pass_2samples
MyGenomeDir2=$(pwd)


# The file starts with one empty line. Every input row whose 5th column is
# greater than 0 is then appended as "col1 col2 col3 strand" (tab-separated),
# where the numeric code in column 4 (0/1/2) is translated to "." / "+" / "-".
# NOTE(review): only one SJ.out.tab file is read here; add the other samples'
# files to the awk call if junctions from all runs should be merged.
printf '\n' > Global.SJ.out.tab.Pass2.sjdb
awk '
    BEGIN {
        OFS = "\t"
        strChar[0] = "."
        strChar[1] = "+"
        strChar[2] = "-"
    }
    $5 > 0 { print $1, $2, $3, strChar[$4] }
' ../Unmapped_ERM_rep1_DRS092924.SJ.out.tab >> Global.SJ.out.tab.Pass2.sjdb

#d. Index the genome with STAR for 2nd-pass mapping

# Copy the annotation and the genome FASTA into the 2nd-pass index directory.
cp ../mesculenta_305_v6.1.gene_exons.gff3 .
cp ../Mesculenta_305_v6.fa .


# Re-generate the index from the files copied above, this time also inserting
# the junctions collected in Global.SJ.out.tab.Pass2.sjdb.
# The command previously pointed at Lotusjaponicus_MG20_v3.0 files, which are
# never copied into $MyGenomeDir2; it now uses the files that are present.
# NOTE(review): if the Lotus genome is the intended host, copy those files
# instead and restore their names below.
~/Software/STAR/bin/Linux_x86_64_static/STAR --runMode genomeGenerate \
    --genomeDir "$MyGenomeDir2" \
    --genomeFastaFiles "$MyGenomeDir2/Mesculenta_305_v6.fa" \
    --runThreadN 8 \
    --sjdbGTFfile "$MyGenomeDir2/mesculenta_305_v6.1.gene_exons.gff3" \
    --sjdbGTFfeatureExon CDS --sjdbGTFtagExonParentTranscript Parent --sjdbGTFtagExonParentGene ID \
    --sjdbOverhang 99 \
    --sjdbFileChrStartEnd "$MyGenomeDir2/Global.SJ.out.tab.Pass2.sjdb"

#e. 2nd-pass mapping and produce unmapped reads file

# The directory name previously contained a stray space ("..._STAR_ 2nd-pass"),
# which hands cd two arguments and makes it fail.
# NOTE(review): step c creates "Genome_file_indexed_with_STAR_2nd-pass_2samples";
# make sure the directory entered here is the one that holds the 2nd-pass index.
cd Genome_file_indexed_with_STAR_2nd-pass
MyGenomeDir2=$(pwd)
echo "$MyGenomeDir2"



# Map every read pair against the re-generated index. The sorted BAM and the
# two files of unmapped mates (--outReadsUnmapped Fastx) are renamed with the
# sample prefix so the next sample does not overwrite them.
# The loop uses a glob instead of parsing `ls`, and all expansions are quoted.
for i in *_1.fq; do
    [ -e "$i" ] || continue   # nothing matched: skip the unexpanded pattern
    echo "$i"
    prefix=$(echo "$i" | cut -d'_' -f1,2,3)   # first three "_"-separated fields
    ~/Software/STAR/bin/Linux_x86_64_static/STAR --runThreadN 8 \
        --genomeDir "$MyGenomeDir2" \
        --readFilesIn "$i" "${prefix}_2_val_2.fq" \
        --alignIntronMin 20 --alignIntronMax 5000 \
        --outFilterMismatchNoverLmax 0.4 --outFilterMismatchNmax 15 \
        --sjdbOverhang 99 \
        --outFilterIntronMotifs RemoveNoncanonical \
        --alignEndsType EndToEnd \
        --outSAMtype BAM SortedByCoordinate \
        --limitGenomeGenerateRAM 14500000000 --limitBAMsortRAM 11500000000 \
        --outReadsUnmapped Fastx
    mv Aligned.sortedByCoord.out.bam "RNAseq-aligned_${prefix}.bam"
    mv Unmapped.out.mate1 "Unmapped_${prefix}.1.fq"
    mv Unmapped.out.mate2 "Unmapped_${prefix}.2.fq"
done

# f. compress unmapped reads

# For every sub-directory, enter its T5_Mapping_pass2* folder and submit two
# bsub jobs (queue "dee") that gzip the *1.fq and the *2.fq files.
# Each folder is handled in a subshell, so the working directory is restored
# automatically and a failed cd skips that folder instead of submitting jobs
# from the wrong place. The unused counter and the `ls -d` parsing were removed.
for folder in */; do
    [ -d "$folder" ] || continue   # nothing matched: skip the unexpanded pattern
    echo "-> $folder"
    (
        cd "$folder" && cd T5_Mapping_pass2* || exit
        bsub -q dee -L /bin/bash -J "${folder}R1" -u ivandario.mateusgonzalez@unil.ch -N "gzip *1.fq"
        bsub -q dee -L /bin/bash -J "${folder}R2" -u ivandario.mateusgonzalez@unil.ch -N "gzip *2.fq"
    )
done

######################################
# 6. Take unmapped reads and map them to second genome
######################################

#g. Index the second ref genome with STAR for 1st-pass mapping

mkdir Genome_file_indexed_with_STAR_1st-pass #(do not create this folder in the folder "RNAseq")
cd Genome_file_indexed_with_STAR_1st-pass

# The index is written into this directory.
MyGenomeDir1=$(pwd)
echo "$MyGenomeDir1"

# Copy the annotation and the genome FASTA next to the index. These have to be
# two separate commands: written on one line, cp treats every word except the
# last as a source file and the copy fails.
cp ../DAOM197198_PacBio.gff3 .
cp ../DAOM197198_PacBio.fa .

# Build the index. --sjdbGTFfile now names the .gff3 file copied above (it
# previously referenced DAOM197198_PacBio.gff, which is not the copied name).
~/Software/STAR/bin/Linux_x86_64_static/STAR --runMode genomeGenerate \
    --genomeDir "$MyGenomeDir1" \
    --genomeFastaFiles "$MyGenomeDir1/DAOM197198_PacBio.fa" \
    --runThreadN 8 \
    --sjdbGTFfile "$MyGenomeDir1/DAOM197198_PacBio.gff3" \
    --sjdbGTFfeatureExon CDS --sjdbGTFtagExonParentTranscript Parent --sjdbGTFtagExonParentGene ID \
    --sjdbOverhang 99

#h. 1st-pass mapping on second genome ( to generate SJ.out.tab file for each sample)

# NOTE(review): step g already changes into this directory; run this cd only
# when starting from its parent.
cd Genome_file_indexed_with_STAR_1st-pass
MyGenomeDir1=$(pwd)
echo "$MyGenomeDir1"

# Map every pair of unmapped-read files (<prefix>.1.fq / <prefix>.2.fq) once,
# keep only the splice-junction table (renamed with the sample prefix) and
# delete the alignments produced by this pass.
# The loop uses a glob instead of parsing `ls`, and all expansions are quoted.
for i in *.1.fq; do
    [ -e "$i" ] || continue   # nothing matched: skip the unexpanded pattern
    echo "$i"
    prefix=$(echo "$i" | cut -d'.' -f1)   # file name up to the first "."
    ~/Software/STAR/bin/Linux_x86_64_static/STAR --runThreadN 8 \
        --genomeDir "$MyGenomeDir1" \
        --readFilesIn "$i" "${prefix}.2.fq" \
        --alignIntronMin 20 --alignIntronMax 5000 \
        --outFilterMismatchNoverLmax 0.4 --outFilterMismatchNmax 15 \
        --sjdbOverhang 99 \
        --outFilterIntronMotifs RemoveNoncanonical \
        --alignEndsType EndToEnd \
        --outSAMtype BAM SortedByCoordinate \
        --limitGenomeGenerateRAM 14500000000 --limitBAMsortRAM 11500000000
    mv SJ.out.tab "${prefix}.SJ.out.tab"
    rm *.bam
done

#i. Build the junction list (Global.SJ.out.tab.Pass2.sjdb) used to re-generate the second genome index

mkdir Genome_file_indexed_with_STAR_2nd-pass_2samples #(do not create this folder in the folder "RNAseq")
cd Genome_file_indexed_with_STAR_2nd-pass_2samples
MyGenomeDir2=$(pwd)

# The file starts with one empty line. Every input row whose 5th column is
# greater than 0 is then appended as "col1 col2 col3 strand" (tab-separated),
# where the numeric code in column 4 (0/1/2) is translated to "." / "+" / "-".
# NOTE(review): only one SJ.out.tab file is read here; add the other samples'
# files to the awk call if junctions from all runs should be merged.
printf '\n' > Global.SJ.out.tab.Pass2.sjdb
awk '
    BEGIN {
        OFS = "\t"
        strChar[0] = "."
        strChar[1] = "+"
        strChar[2] = "-"
    }
    $5 > 0 { print $1, $2, $3, strChar[$4] }
' ../Unmapped_ERM_rep1_DRS092924.SJ.out.tab >> Global.SJ.out.tab.Pass2.sjdb

#j. Index the 2nd genome with STAR for 2nd-pass mapping

# Copy the DAOM197198 annotation and FASTA into the 2nd-pass index directory.
# The original lines copied the mesculenta files here, leaving the
# DAOM197198_PacBio files referenced by the indexing command below absent.
cp ../DAOM197198_PacBio.gff3 .
cp ../DAOM197198_PacBio.fa .



# Re-generate the index, this time also inserting the junctions collected in
# Global.SJ.out.tab.Pass2.sjdb. --sjdbGTFfile names the .gff3 file copied above.
~/Software/STAR/bin/Linux_x86_64_static/STAR --runMode genomeGenerate \
    --genomeDir "$MyGenomeDir2" \
    --genomeFastaFiles "$MyGenomeDir2/DAOM197198_PacBio.fa" \
    --runThreadN 8 \
    --sjdbGTFfile "$MyGenomeDir2/DAOM197198_PacBio.gff3" \
    --sjdbGTFfeatureExon CDS --sjdbGTFtagExonParentTranscript Parent --sjdbGTFtagExonParentGene ID \
    --sjdbOverhang 99 \
    --sjdbFileChrStartEnd "$MyGenomeDir2/Global.SJ.out.tab.Pass2.sjdb"

#k. 2nd-pass mapping on the second genome

# The directory name previously contained a stray space ("..._STAR_ 2nd-pass"),
# which hands cd two arguments and makes it fail.
# NOTE(review): step i creates "Genome_file_indexed_with_STAR_2nd-pass_2samples";
# make sure the directory entered here is the one that holds the 2nd-pass index.
cd Genome_file_indexed_with_STAR_2nd-pass
MyGenomeDir2=$(pwd)
echo "$MyGenomeDir2"


# Map every pair of unmapped-read files against the re-generated index and
# rename the sorted BAM so the next sample does not overwrite it. The BAM name
# is built from "_"-separated fields 2-4 of the input file name, exactly as
# before (so it still carries the ".1.fq" part of that name).
# The loop uses a glob instead of parsing `ls`, and all expansions are quoted.
for i in *.1.fq; do
    [ -e "$i" ] || continue   # nothing matched: skip the unexpanded pattern
    echo "$i"
    ~/Software/STAR/bin/Linux_x86_64_static/STAR --runThreadN 8 \
        --genomeDir "$MyGenomeDir2" \
        --readFilesIn "$i" "$(echo "$i" | cut -d'.' -f1).2.fq" \
        --alignIntronMin 20 --alignIntronMax 5000 \
        --outFilterMismatchNoverLmax 0.4 --outFilterMismatchNmax 15 \
        --sjdbOverhang 99 \
        --outFilterIntronMotifs RemoveNoncanonical \
        --alignEndsType EndToEnd \
        --outSAMtype BAM SortedByCoordinate \
        --limitGenomeGenerateRAM 14500000000 --limitBAMsortRAM 11500000000
    mv Aligned.sortedByCoord.out.bam "MappedRiir_UnMappedLotus_$(echo "$i" | cut -d'_' -f2,3,4).bam"
done

######################################
#7. Count reads on feature
######################################

    # Count reads on CDS features, grouped by their ID attribute.
    # The original line ended in "-o" followed directly by a "#" banner, so the
    # output file and the input BAM files were lost and the command could not run.
    # NOTE(review): the output name is the count table read by the R section
    # below, the inputs are the BAM files named in step 6k, and the annotation
    # uses the .gff3 name copied in step 6g -- confirm all three against the
    # actual run.
    ~/Software/subread-1.6.5-source/bin/featureCounts -t CDS -g ID \
        -a Genome_file_indexed_with_STAR_1st-pass/DAOM197198_PacBio.gff3 \
        -o featureCounts_co-inoculation_Rirregularis_vf.txt \
        MappedRiir_UnMappedLotus_*.bam



######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################

# Differential transcription analysis of RNA counts on different CASSAVA HOST genotypes.
#author: IM

# 08.08.2019
# Version v1
######################################################################################################################################################
######################################################################################################################################################
######################################################################################################################################################


########################################################################
# Libraries IMPORT
########################################################################


library(DESeq2)
library(tidyr)
library(ggplot2)
library(gplots)
library(RColorBrewer)
library(wesanderson)
library(gridExtra)
library(VennDiagram)
library(ggtree)
library(ape)
library(pheatmap)
library(circlize)

########################################################################
# DATA IMPORT
########################################################################

# Load the featureCounts table; the first line of the file holds the headers.
countdata <- read.table(
  file = "C:/Users/comat/Documents/Science/Sanders_lab/Co-inoculation_manuscript/V12_NatMicro_Sanders_comments/02_AMFDATA/featureCounts_co-inoculation_Rirregularis_vf.txt",
  header = TRUE
)

# Replace the headers: six annotation columns followed by one column per
# library, named <plant genotype>_<treatment>_<replicate>.
colnames(countdata) <- c(
  "Geneid", "Chr", "Start", "End", "Strand", "Length",
  # BRA337
  "BRA337_B1_1", "BRA337_B1_2", "BRA337_B1_3",
  "BRA337_Coinoculation_1", "BRA337_Coinoculation_2", "BRA337_Coinoculation_3",
  "BRA337_DAOM197198_1", "BRA337_DAOM197198_2", "BRA337_DAOM197198_3",
  "BRA337_MOCK_1", "BRA337_MOCK_2", "BRA337_MOCK_3",
  # CM4574 (two co-inoculation libraries)
  "CM4574_B1_1", "CM4574_B1_2", "CM4574_B1_3",
  "CM4574_Coinoculation_1", "CM4574_Coinoculation_2",
  "CM4574_DAOM197198_1", "CM4574_DAOM197198_2", "CM4574_DAOM197198_3",
  "CM4574_MOCK_1", "CM4574_MOCK_2", "CM4574_MOCK_3",
  # COL2215 (two mock libraries)
  "COL2215_B1_1", "COL2215_B1_2", "COL2215_B1_3",
  "COL2215_Coinoculation_1", "COL2215_Coinoculation_2", "COL2215_Coinoculation_3",
  "COL2215_DAOM197198_1", "COL2215_DAOM197198_2", "COL2215_DAOM197198_3",
  "COL2215_MOCK_1", "COL2215_MOCK_2"
)

# Row names are the gene identifiers with every "cds-" occurrence removed.
rownames(countdata) <- gsub("cds-", "", as.character(countdata$Geneid))
head(countdata)

########################################################################
# ANALYSIS FOR 3 treatments (DAOM197198, B1 and Co-inoculation)
########################################################################

# Keep the annotation columns and the COL2215 libraries of the three fungal
# treatments: every column whose name mentions another plant genotype
# (BRA337, CM4574) or the MOCK control is dropped.
COL2215 <- countdata[, grep("BRA337|CM4574|MOCK", colnames(countdata), invert = TRUE)]

# Columns 1-6 are annotation; the counts start at column 7. DESeq2 expects a
# matrix of counts.
COL2215 <- as.matrix(COL2215[, 7:ncol(COL2215)])

# Treatment of each library, in column order (three replicates each).
conditionCOL2215 <- factor(rep(c("B1", "Coinoculation", "DAOM197198"), each = 3))

# Sample sheet: one row per library, one column holding the treatment.
coldataCOL2215 <- data.frame(row.names = colnames(COL2215), conditionCOL2215)

# DESeq2 step 1: build the data set
dds <- DESeqDataSetFromMatrix(
  countData = COL2215,
  colData = coldataCOL2215,
  design = ~conditionCOL2215
)
dds


# DESeq2 step 2: run the pipeline
dds <- DESeq(dds)

# Regularized log transformation (used for PCA / heatmaps)
rld <- rlogTransformation(dds)

# Principal components analysis, samples labelled by treatment
plotPCA(rld, intgroup = "conditionCOL2215")

# Differential expression results. No contrast is passed, so this is the
# default comparison DESeq2 reports for the design.
resCOL2215 <- results(dds)

## Sort by adjusted p-value, smallest first
resCOL2215 <- resCOL2215[order(resCOL2215$padj), ]

## Append the normalized counts of every library, matched on gene (row name)
resdataCOL2215 <- merge(
  as.data.frame(resCOL2215),
  as.data.frame(counts(dds, normalized = TRUE)),
  by = "row.names",
  sort = FALSE
)
names(resdataCOL2215)[1] <- "Gene"


########################################################################
# ANALYSIs FOR PAIRWISE COMPARISON (DAOM197198 and Co-inoculation)
########################################################################


# Keep only the co-inoculation and DAOM197198 libraries (drop the B1 columns)
COL2215DAOM <- COL2215[, grep("B1", colnames(COL2215), invert = TRUE)]

# Convert to matrix
COL2215DAOM <- as.matrix(COL2215DAOM)
head(COL2215DAOM)

# Treatment of each remaining library, in column order
condition <- factor(c(rep("Coinoculation", 3), rep("DAOM", 3)))

# Sample sheet: one row per library, one column holding the treatment
coldataDAOM <- data.frame(row.names = colnames(COL2215DAOM), condition)

# Run DESEQ step1
dds <- DESeqDataSetFromMatrix(countData = COL2215DAOM, colData = coldataDAOM, design = ~condition)
dds

# Run the DESeq pipeline step2
dds <- DESeq(dds)

# Regularized log transformation for heatmaps
rld <- rlogTransformation(dds)

# Principal components analysis
plotPCA(rld, intgroup = "condition")

# Get differential expression results
resDAOM <- results(dds)

## Order by adjusted p-value
resDAOM <- resDAOM[order(resDAOM$padj), ]

## Merge with normalized count data
resdataCOL2215DAOM <- merge(
  as.data.frame(resDAOM),
  as.data.frame(counts(dds, normalized = TRUE)),
  by = "row.names",
  sort = FALSE
)
names(resdataCOL2215DAOM)[1] <- "Gene"

# Show the genes with adjusted p-value below 0.1.
# which() drops genes whose padj is NA; a plain logical index would return
# one all-NA row for each of them.
resdataCOL2215DAOM[which(resdataCOL2215DAOM$padj < 0.1), ]


########################################################################
# VENN DIAGRAM for DAOM 197198 vs CO-inoculation and B1 vs. Co-inoculation
########################################################################

# Two-set Venn diagram drawn from fixed counts: each set size is its exclusive
# count plus the 79 genes shared by both comparisons.
draw.pairwise.venn(
  area1 = 1868 + 79,
  area2 = 1699 + 79,
  cross.area = 79,
  category = c("DAOM197198 vs. Co-inoculation", "B1 vs. Co-inoculation"),
  lty = rep("blank", 2),
  fill = c("light blue", "pink"),
  alpha = rep(0.5, 2),
  cat.pos = c(0, 0),
  cat.dist = rep(0.025, 2)
)


########################################################################
# PLOT GENE COUNTS OF A SINGLE GENE
# All treatments gene counts on the same plot
########################################################################


# Gene whose normalized counts are plotted
gene <- "GBC39608.1"

# Row of the three-treatment result table for that gene
Toplot <- resdataCOL2215[resdataCOL2215$Gene == gene, ]

# Long format: one row per library (columns COL2215_B1_1 ... COL2215_DAOM197198_3)
data_long <- gather(
  Toplot, condition, NormCounts,
  COL2215_B1_1:COL2215_DAOM197198_3,
  factor_key = TRUE
)
# Treatment of each library, in the column order gathered above
data_long <- cbind.data.frame(
  data_long,
  Treatment = rep(c("B1", "Co-inoculation", "DAOM197198"), each = 3)
)


# Boxplot per treatment with the individual replicates overlaid as dots
ggplot(data_long, aes(x = Treatment, y = NormCounts, fill = Treatment)) +
  geom_boxplot() +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.6) +
  scale_fill_manual(values = wes_palette(n = 3, name = "Darjeeling1")) +
  scale_x_discrete(limits = c("DAOM197198", "B1", "Co-inoculation")) +
  labs(
    title = paste("Plant genotype COL2215", gene, sep = "\n"),
    x = "",
    y = "Normalized counts"
  ) +
  theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  )

########################################################################
# PLOT PHYLOGENY
########################################################################


# Read the tree file
tree <- read.tree(
  "C:/Users/comat/Documents/Science/Sanders_lab/Co-inoculation_manuscript/V12_NatMicro_Sanders_comments/02_AMFDATA/06_HMG-box_Mucorales/V1_blastHomologs_upregulated_Mucorales.nhx"
)

# Shorten every tip label to the text before its first "_"
tree$tip.label <- vapply(
  strsplit(tree$tip.label, "_"),
  function(parts) parts[[1]],
  character(1)
)


# Draw the tree with tip and node labels
ggtree(tree) +
  geom_tiplab() +
  geom_nodelab()


########################################################################
# Pheatmap genes in sexual reproduction in fungi
########################################################################



# Table of reproductive proteins; one row per protein, used as row names.
tabla <- read.delim(
  file = "C:/Users/comat/Documents/Science/Sanders_lab/Co-inoculation_manuscript/V12_NatMicro_Sanders_comments/02_AMFDATA/01_Differential_transcription/ReproductiveProteins_mundo_ComparisonThisStudy_UP.txt",
  sep = "\t",
  header = TRUE
)
rownames(tabla) <- tabla$Protein

# Two-colour heatmap of columns 5-8 (proteins on the x axis), no clustering
pheatmap(
  t(tabla[, 5:8]),
  cluster_rows = FALSE, cluster_cols = FALSE,
  cellwidth = 12, cellheight = 12,
  color = c("white", "black")
)
# attention gene nomenclature conserved as in mondo et al.,

# One circos sector per protein. The sector count is taken from the number of
# rows: levels(tabla$Protein) is NULL when the column is read as character
# (the default since R 4.0), which made the old 1:length(levels(...)) evaluate
# to c(1, 0). Protein names are unique (they are the row names), so the row
# count equals the number of levels when the column is a factor.
factors <- seq_len(nrow(tabla))



circos.initialize(factors = factors, xlim = c(0, 1))

# Thin outer track that carries the protein labels; y = 10 lies above the
# track's ylim of c(0, 1).
circos.track(ylim = c(0, 1), factors = factors, bg.col = "white", track.height = 0.01)
circos.trackText(
  x = rep(0.5, length(factors)), y = rep(10, length(factors)),
  labels = tabla$Protein,
  cex = 0.8, factors = factors, col = "black", font = 2,
  facing = "clockwise", niceFacing = TRUE
)

# One track per data set; each cell is filled with the colour stored in the
# corresponding column of the table.
circos.track(ylim = c(0, 1), factors = factors,
             bg.col = tabla$Upregulated_COL2215, bg.border = "#EEEEEE", track.height = 0.1)

circos.track(ylim = c(0, 1), factors = factors,
             bg.col = tabla$Upregulated_CM4574.7, bg.border = "#EEEEEE", track.height = 0.1)
circos.track(ylim = c(0, 1), factors = factors,
             bg.col = tabla$Upregulated_BRA337, bg.border = "#EEEEEE", track.height = 0.1)
circos.track(ylim = c(0, 1), factors = factors,
             bg.col = tabla$Upregulated_Mundo2018, bg.border = "#EEEEEE", track.height = 0.1)



########################################################################
# SNP Analysis of RNA-seq 
########################################################################

# SNP ANALYSIS COL 2215


# SNP table without a header line; columns are named V1, V2, ... on import.
SNP_COL2215 <- read.delim("../03_SNPcoinoc/COL2215_SNP_v1.txt", header = FALSE)

head(SNP_COL2215)

# Columns 10-18 hold one entry per library: name them after their sample ...
colnames(SNP_COL2215)[10:18] <- c(
  paste0("DAOM197198_", 1:3),
  paste0("B1_", 1:3),
  paste0("Co-inoculation_", 1:3)
)

# ... and make sure they are character so they can be split with strsplit().
SNP_COL2215[, 10:18] <- lapply(SNP_COL2215[, 10:18], as.character)




# Parse the two counts stored in one sample column into a data frame.
#
# snp_table      data frame whose first five columns describe the site and
#                whose sample columns are character strings of ":"-separated
#                fields
# sample_column  name of the sample column to parse
# treat_label    label written to the `treat` column of the result
#
# Returns the first five columns of snp_table plus RefAF (first count),
# AltFQ (second count) and treat.
# NOTE(review): the third ":"-separated field is presumably the per-allele read
# depth (ref,alt) of a VCF sample column -- confirm against the input file.
extract_allele_counts <- function(snp_table, sample_column, treat_label) {
  # Third ":"-separated field of every entry, itself a ","-separated pair.
  count_field <- unlist(lapply(
    strsplit(snp_table[[sample_column]], split = "\\:"),
    function(x) x[[3]]
  ))
  count_pairs <- strsplit(count_field, split = "\\,")

  # Every "." in a value is replaced by "0" before the numeric conversion.
  to_count <- function(x) as.numeric(gsub("\\.", "0", x))
  ref_count <- to_count(unlist(lapply(count_pairs, function(x) x[[1]])))
  alt_count <- to_count(unlist(lapply(count_pairs, function(x) x[[2]])))

  cbind.data.frame(
    snp_table[1:5],
    RefAF = ref_count,
    AltFQ = alt_count,
    treat = rep(treat_label, length(alt_count))
  )
}

# Sample columns of SNP_COL2215 and the label each one gets in `treat`
# (the co-inoculation labels are written without the hyphen).
sample_columns <- c(
  "DAOM197198_1", "DAOM197198_2", "DAOM197198_3",
  "B1_1", "B1_2", "B1_3",
  "Co-inoculation_1", "Co-inoculation_2", "Co-inoculation_3"
)
treat_labels <- c(
  "DAOM197198_1", "DAOM197198_2", "DAOM197198_3",
  "B1_1", "B1_2", "B1_3",
  "Coinoculation_1", "Coinoculation_2", "Coinoculation_3"
)

# One data frame per sample, e.g. COL2215_alfq[["B1_1"]]. This list replaces
# the nine copy-pasted COL2215_<sample> blocks.
COL2215_alfq <- Map(
  function(sample_column, treat_label) {
    extract_allele_counts(SNP_COL2215, sample_column, treat_label)
  },
  sample_columns,
  treat_labels
)
names(COL2215_alfq) <- treat_labels

# Stack all samples in the order listed above. unname() keeps the automatic
# row numbering rather than prefixing row names with the list names.
All_alfq <- do.call(rbind.data.frame, unname(COL2215_alfq))

# Cond: treatment without the replicate number (text before the first "_").
# FQ:   share of the first count in the sum of both counts (NaN when both are 0).
All_alfq <- cbind.data.frame(
  All_alfq,
  Cond = unlist(lapply(strsplit(split = "_", as.character(All_alfq$treat)), function(x) x[[1]])),
  FQ = All_alfq$RefAF / (All_alfq$RefAF + All_alfq$AltFQ)
)



# Mean FQ per library, computed over the rows without missing values
with(All_alfq[complete.cases(All_alfq), ], tapply(FQ, treat, mean))

# REdo allele fq. per sample


# Histogram of FQ for the libraries whose `treat` label matches treat_pattern,
# one fill colour per replicate. All three panels share the same geometry and
# theme and differ only in subset, title and colours.
plot_replicate_histogram <- function(treat_pattern, plot_title, fill_colours) {
  ggplot(All_alfq[grep(treat_pattern, All_alfq$treat), ], aes(x = FQ, fill = treat)) +
    geom_histogram(bins = 30, alpha = 1, position = "identity") +
    ggtitle(plot_title) +
    scale_fill_manual(values = fill_colours) +
    theme_bw() +
    theme(
      axis.line = element_line(colour = "black"),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank()
    ) +
    theme(legend.position = c(0.2, 0.4))
}

a1 <- plot_replicate_histogram(
  "DAOM",
  "Plant genotype COL2215\n DAOM197198 replicates",
  c(
    wes_palette(n = 4, name = "FantasticFox1")[2],
    wes_palette(n = 2, name = "FantasticFox1")[1],
    wes_palette(n = 4, name = "Moonrise2")[2]
  )
)

a2 <- plot_replicate_histogram(
  "B1",
  "Plant genotype COL2215\n B1 replicates",
  c(
    wes_palette(n = 4, name = "Darjeeling1")[1],
    wes_palette(n = 4, name = "GrandBudapest1")[4],
    wes_palette(n = 5, name = "FantasticFox1")[5]
  )
)

a3 <- plot_replicate_histogram(
  "Coinoculation",
  "Plant genotype COL2215\n Co-inoculation replicates",
  c(
    wes_palette(n = 2, name = "Darjeeling1")[2],
    wes_palette(n = 2, name = "Chevalier1")[1],
    wes_palette(n = 2, name = "Cavalcanti1")[2]
  )
)

# Layout: a1 above a2 in the left column, a3 spanning the right column
grid.arrange(
  a1, a2, a3,
  widths = c(1.5, 2),
  layout_matrix = rbind(
    c(1, 3),
    c(2, 3)
  )
)



##################
# Individual loci

# Long format: one row per site, library and count type (RefAF / AltFQ)
All_alfq2 <- gather(All_alfq, condition, Counts, RefAF, AltFQ)

# Stacked bar plot of both counts per library for a single SNP.
#
# scaffold  scaffold name, compared exactly with column V1
# position  position, compared exactly with column V2
#
# The locus is selected by equality. The previous grep() calls treated the
# scaffold name as a regular expression and matched the position as a
# substring, so any position merely containing the digits was selected too.
plot_locus <- function(scaffold, position) {
  locus <- All_alfq2[which(All_alfq2$V1 == scaffold & All_alfq2$V2 == position), ]
  ggplot(locus, aes(fill = condition, y = Counts, x = treat)) +
    geom_bar(position = "stack", stat = "identity") +
    ggtitle(paste0(
      "Plant genotype COL2215\n SNP Scaffold ", scaffold,
      " Position ", position, " "
    )) +
    scale_fill_manual(values = c(
      wes_palette(n = 3, name = "Darjeeling1")[3],
      wes_palette(n = 3, name = "Darjeeling1")[1]
    )) +
    theme_bw() +
    theme(
      axis.line = element_line(colour = "black"),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank()
    ) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

t1 <- plot_locus("BDIQ01000197.1", 175586L)
t2 <- plot_locus("BDIQ01000001.1", 28234L)
t3 <- plot_locus("BDIQ01000205.1", 69970L)


# The three loci side by side
grid.arrange(t1, t2, t3, nrow = 1)


########################################################################
# Contaminants analysis
########################################################################

#################################################################################################################################



# Load the combined screening table (presumably Mash screen output - confirm).
# The file has no header row, so columns are named V1..Vn.
FC <- read.delim("ALL_Samples_Screen.out", header = FALSE)

# strsplit() below needs character input (the columns may be factors).
FC$V6 <- as.character(FC$V6)
FC$V7 <- as.character(FC$V7)
FC$V2 <- as.character(FC$V2)

# Species label: 3rd to 5th space-separated words of V6.
Species <- unlist(lapply(strsplit(FC$V6, split = " "),
                         function(x) paste(x[[3]], x[[4]], x[[5]])))

# Sample identity: 2nd to 4th "_"-separated fields of V7 with the literal
# ".out" suffix removed. fixed = TRUE keeps "." from acting as a regex
# wildcard (the pattern ".out" would otherwise strip any character + "out").
sample_fields <- lapply(strsplit(FC$V7, split = "\\_"), function(x) {
  gsub(".out", "", c(x[[2]], x[[3]], x[[4]]), fixed = TRUE)
})
Sample <- as.data.frame(matrix(unlist(sample_fields), ncol = 3, byrow = TRUE))
# Same three fields joined by spaces; used as the per-sample column key below.
treat <- unlist(lapply(sample_fields, paste, collapse = " "))

# Shared-hash count: the part of V2 that precedes the first "/".
SharedHashes <- as.numeric(unlist(lapply(strsplit(FC$V2, split = "/"),
                                         function(x) x[[1]])))

# Columns are now: Species, V1, V2, V3 (sample fields), SharedHashes, treat.
FC <- cbind.data.frame(Species, Sample, SharedHashes, treat)

# One column per sample (treat), filled with SharedHashes; absent pairs -> 0.
data_wide <- spread(FC, treat, SharedHashes)
data_wide[is.na(data_wide)] <- 0

# Sum the per-sample columns within each species.
# Columns 5:38 are hard-coded: assumes 34 sample columns follow the four
# identifier columns - TODO confirm against the input file.
data_wide2 <- data.frame(do.call("rbind",
                                 by(data_wide[, c(5:38)], data_wide$Species,
                                    FUN = colSums)))

# NOTE(review): this sets row names on data_wide, not on the data_wide2 table
# that is plotted, and it will fail if a species occupies more than one row of
# data_wide - confirm whether it is still needed.
rownames(data_wide) <- data_wide$Species

# Heatmap of columns 4 to 34 of the per-species sums.
pheatmap(data_wide2[, 4:34])



########################################################
# Colonization analysis. 
########################################################

Colonization <- read.table("colonization_vf.txt", header = TRUE)

# Subset the colonization table to one plant genotype (`var` value) and turn
# the raw treatment codes into an ordered factor of display labels.
# The replacements are order-dependent: "CANB1" must be rewritten before
# "CAN", otherwise the co-inoculation code would become "DAOM197198B1".
# NOTE(review): any label ending in "A" becomes MOCK here, whereas the
# dry-weight file codes the mock as "C" - confirm this file's coding.
relabel_colonization_treat <- function(colonization, genotype) {
  per_genotype <- colonization[colonization$var == genotype, ]
  labels <- gsub("CANB1", "Co-inoculation", per_genotype$treat)
  labels <- gsub("CAN", "DAOM197198", labels)
  labels <- gsub("A$", "MOCK", labels)
  per_genotype$treat <- factor(
    labels,
    levels = c("MOCK", "DAOM197198", "B1", "Co-inoculation")
  )
  per_genotype
}

Col_COL2215 <- relabel_colonization_treat(Colonization, "V4_COL2215")
Col_BRA337 <- relabel_colonization_treat(Colonization, "V5_BRA337")
Col_CM4574 <- relabel_colonization_treat(Colonization, "V6_CM4574-7")

# All three genotypes stacked into one table.
AllCol <- rbind.data.frame(Col_COL2215, Col_BRA337, Col_CM4574)

# Boxplot of `perc` per treatment for host genotype COL2215.
ggplot(Col_COL2215, aes(x = treat, y = perc, fill = treat)) +
  geom_boxplot() +
  scale_fill_manual(values = wes_palette(n = 4, name = "Darjeeling1")) +
  ggtitle("Plant genotype COL2215") +
  xlab("") +
  ylab("Fungal colonization") +
  theme_bw() +
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

# Statistical analysis: linear model of perc on treatment, then one-way ANOVA.
model <- lm(perc ~ treat, data = Col_COL2215)
summary(model)
ANOVA <- aov(model)

# Tukey HSD test comparing every pair of treatments.
TukeyHSD(x = ANOVA, "treat", conf.level = 0.95)



########################################################
# Plant phenotypic response analysis (example: host genotype COL2215)
########################################################



Harvest <- read.table("dry_weights.txt", header = TRUE)

# Subset the dry-weight table to one plant genotype (`var` code) and turn the
# raw treatment codes into an ordered factor of display labels.
# The replacements are order-dependent: "A+B" must be rewritten first so that
# the single-letter patterns "A" and "B" do not split the co-inoculation code.
relabel_harvest_treat <- function(harvest, genotype_code) {
  per_genotype <- harvest[harvest$var == genotype_code, ]
  labels <- gsub("A\\+B", "Co-inoculation", per_genotype$treat)
  labels <- gsub("A", "DAOM197198", labels)
  labels <- gsub("B", "B1", labels)
  labels <- gsub("C$", "MOCK", labels)
  per_genotype$treat <- factor(
    labels,
    levels = c("MOCK", "DAOM197198", "B1", "Co-inoculation")
  )
  per_genotype
}

Har_COL2215 <- relabel_harvest_treat(Harvest, "4")
Har_BRA337 <- relabel_harvest_treat(Harvest, "5")
Har_CM4574 <- relabel_harvest_treat(Harvest, "6")

# All three genotypes stacked, with the numeric genotype code replaced by its
# name. An exact-value lookup is used instead of chained gsub() calls: with
# gsub("4", "COL2215") followed by gsub("5", "BRA337"), the trailing "5" of
# the freshly written "COL2215" was itself rewritten, giving "COL221BRA337".
AllHar <- rbind.data.frame(Har_COL2215, Har_BRA337, Har_CM4574)
genotype_names <- c("4" = "COL2215", "5" = "BRA337", "6" = "CM4574")
AllHar$var <- unname(genotype_names[as.character(AllHar$var)])

###########
#COL2215
# Boxplot of `total` per treatment for host genotype COL2215.
ggplot(Har_COL2215, aes(x = treat, y = total, fill = treat)) +
  geom_boxplot() +
  scale_fill_manual(values = wes_palette(n = 4, name = "Darjeeling1")) +
  ggtitle("Plant genotype COL2215") +
  xlab("") +
  ylab("Total dry weight") +
  theme_bw() +
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

# Statistical analysis: linear model of total on treatment, then one-way ANOVA.
model <- lm(total ~ treat, data = Har_COL2215)
summary(model)
ANOVA <- aov(model)

# Tukey HSD test comparing every pair of treatments.
TukeyHSD(x = ANOVA, "treat", conf.level = 0.95)