## source file for "Chromatin-enriched RNAs mark active and repressive cis-regulation: an analysis of nuclear RNA-seq" ## Author: Xiangying Sun ## Last update: Jun 2019 ########################################################################################### # 1. Genome Alignment using Tophat ########################################################################################### ####### K562_CPE1 tophat --rg-sample SRR3703288 --rg-id SRR3703288 --library-type=fr-firststrand \ --segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \ -o /homeDir/GSE83531_cheRNA/tophat/SRR3703288 \ /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \ /homeDir/GSE83531_cheRNA/FastQ/SRR3703288.fastq.gz ####### K562_CPE2 tophat --rg-sample SRR3703289 --rg-id SRR3703289 --library-type=fr-firststrand \ --segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \ -o /homeDir/GSE83531_cheRNA/tophat/SRR3703289 \ /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \ /homeDir/GSE83531_cheRNA/FastQ/SRR3703289.fastq.gz ####### K562_CPE3 tophat --rg-sample SRR3703290 --rg-id SRR3703290 --library-type=fr-firststrand \ --segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \ -o /homeDir/GSE83531_cheRNA/tophat/SRR3703290 \ /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \ /homeDir/GSE83531_cheRNA/FastQ/SRR3703290.fastq.gz ####### K562_SNE1 tophat --rg-sample SRR3703291 --rg-id SRR3703291 --library-type=fr-firststrand \ --segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \ -o /homeDir/GSE83531_cheRNA/tophat/SRR3703291 \ /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \ /homeDir/GSE83531_cheRNA/FastQ/SRR3703291.fastq.gz ####### K562_SNE2 tophat --rg-sample SRR3703292 --rg-id SRR3703292 --library-type=fr-firststrand \ --segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \ -o /homeDir/GSE83531_cheRNA/tophat/SRR3703292 \ /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \ /homeDir/GSE83531_cheRNA/FastQ/SRR3703292.fastq.gz ####### K562_SNE3 tophat --rg-sample SRR3703293 --rg-id SRR3703293 --library-type=fr-firststrand \ --segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \ -o /homeDir/GSE83531_cheRNA/tophat/SRR3703293 \ /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \ /homeDir/GSE83531_cheRNA/FastQ/SRR3703293.fastq.gz ########################################################################################### # 2. clean bam files using samtools to remove reads mapped mitochondria chromosome ########################################################################################### ####### K562_CPE1 samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703288_accepted_hits.bam \ | cut -f 1 | grep 'chr' | grep -v 'chrM' | \ xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703288_accepted_hits.bam > \ /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703288_clean.bam samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703288_clean.bam ####### K562_CPE2 samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703289_accepted_hits.bam \ | cut -f 1 | grep 'chr' | grep -v 'chrM' | \ xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703289_accepted_hits.bam > \ /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703289_clean.bam samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703289_clean.bam ####### K562_CPE3 samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703290_accepted_hits.bam \ | cut -f 1 | grep 'chr' | grep -v 'chrM' | \ xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703290_accepted_hits.bam > \ /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703290_clean.bam samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703290_clean.bam ####### K562_SNE1 samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703291_accepted_hits.bam \ | cut -f 1 | grep 'chr' | grep -v 'chrM' | \ xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703291_accepted_hits.bam > \ /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703291_clean.bam samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703291_clean.bam ####### K562_SNE2 samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703292_accepted_hits.bam \ | cut -f 1 | grep 'chr' | grep -v 'chrM' | \ xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703292_accepted_hits.bam > \ /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703292_clean.bam samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703292_clean.bam ####### K562_SNE3 samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703293_accepted_hits.bam \ | cut -f 1 | grep 'chr' | grep -v 'chrM' | \ xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703293_accepted_hits.bam > \ /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703293_clean.bam samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703293_clean.bam ########################################################################################### # 3. RABT assembly using Cufflinks ########################################################################################### ####### K562_CPE1 cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703288 \ -p 32 --library-type fr-firststrand \ -g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \ -u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703288_clean.bam ####### K562_CPE2 cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703289 \ -p 32 --library-type fr-firststrand \ -g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \ -u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703280_clean.bam ####### K562_CPE3 cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703290 \ -p 32 --library-type fr-firststrand \ -g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \ -u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703290_clean.bam ####### K562_SNE1 cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703291 \ -p 32 --library-type fr-firststrand \ -g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \ -u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703291_clean.bam ####### K562_SNE2 cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703292 \ -p 32 --library-type fr-firststrand \ -g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \ -u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703292_clean.bam ####### K562_SNE3 cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703293 \ -p 32 --library-type fr-firststrand \ -g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \ -u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703293_clean.bam ########################################################################################### # 4. Build transcriptome using Cuffmerge ########################################################################################### cuffmerge -o /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/ -p 32 \ -g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \ -s /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome.fa \ /homeDir/GSE83531_cheRNA/cufflinks_rabt/che_sne_K562.txt ########################################################################## # 5. Get geneCounts using htseq ########################################################################## ####### K562_CPE1 htseq-count -f bam -s reverse -m intersection-nonempty \ /homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703288_clean.bam \ /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \ /homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703288_geneCounts.out ####### K562_CPE2 htseq-count -f bam -s reverse -m intersection-nonempty \ /homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703289_clean.bam \ /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \ /homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703289_geneCounts.out ####### K562_CPE3 htseq-count -f bam -s reverse -m intersection-nonempty \ /homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703290_clean.bam \ /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \ /homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703290_geneCounts.out ####### K562_SNE1 htseq-count -f bam -s reverse -m intersection-nonempty \ /homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703291_clean.bam \ /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \ /homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703291_geneCounts.out ####### K562_SNE2 htseq-count -f bam -s reverse -m intersection-nonempty \ /homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703292_clean.bam \ /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \ /homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703292_geneCounts.out ####### K562_SNE3 htseq-count -f bam -s reverse -m intersection-nonempty \ /homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703293_clean.bam \ /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \ /homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703293_geneCounts.out ########################################################################## # 6. Differential expression analysis using limma package in R ########################################################################## ###### Reference: https://www.bioconductor.org/help/workflows/RNAseq123/ library(limma) library(edgeR) ###### read htseq geneCounts into a DGE matrix used in edgeR htseqToDGE <- function(files_dir){ library(edgeR) files <- list.files(path=files_dir, pattern="geneCounts") files <- sort(files, decreasing=T) files_fullPath <- sapply(files, function(x) paste0(files_dir,x)) ######### raw_DGE <- readDGE(files_fullPath, header=F) list <- c("no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique") for(i in list){ tmp <- grep(i, rownames(raw_DGE$counts)) if(length(tmp)>0) raw_DGE$counts <- raw_DGE$counts[-tmp,] } colnames(raw_DGE) <- sapply(files, function(x) unlist(strsplit(x, "_", fixed=T))[1]) group <- as.factor(c("SNE", "SNE", "SNE", "CPE", "CPE", "CPE")) raw_DGE$samples$group <- group return(raw_DGE) } ###### filtered raw counts, normalize raw counts and make density plots and barplots filterNormPlot <- function(raw_DGE, method, cpm_cutoff){ library(edgeR) library(RColorBrewer) # plot log-cpm of raw data par(mfrow=c(2,2)) nsamples <- ncol(raw_DGE) col <- brewer.pal(nsamples, "RdYlGn") samplenames <- colnames(tuxedo_DGE) lcpm <- cpm(raw_DGE, log=TRUE) plot(density(lcpm[,1]), col=col[1], lwd=2, las=2, ylim=c(0,0.5), main="", xlab="") title(main=paste0("A. ",method," Raw data"), xlab="Log2-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, col=col[i], lwd=2) } legend("topright", samplenames, text.col=col, bty="n") # filter by cpm cpm <- cpm(raw_DGE) keep.exprs <- rowSums(cpm>cpm_cutoff)>=3 filtered_DGE <- raw_DGE[keep.exprs, keep.lib.sizes=FALSE] dim(filtered_DGE) filtered_lcpm <- cpm(filtered_DGE, log=TRUE) plot(density(filtered_lcpm[,1]), col=col[1], lwd=2, las=2, ylim=c(0,0.5), main="", xlab="") title(main=paste0("B. ", method, " Filtered data"), xlab="Log2-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(filtered_lcpm[,i]) lines(den$x, den$y, col=col[i], lwd=2) } legend("topright", samplenames, text.col=col, bty="n") # plot unnormalized data boxplot(filtered_lcpm, las=2, col=col, main="") title(main=paste0("C. Example: ",method," Unnormalized filtered data"),ylab="Log2-cpm") # normalization norm_DGE <- calcNormFactors(filtered_DGE, method = "TMM") norm_DGE$samples$norm.factors norm_lcpm <- cpm(norm_DGE, log=TRUE) boxplot(norm_lcpm, las=2, col=col, main="") title(main=paste0("D. Example: ",method," Normalised filtered data"),ylab="Log2-cpm") return(norm_DGE) } limmaFitDE <- function(norm_DGE, method){ library(limma) group <- norm_DGE$samples$group design <- model.matrix(~0+group) contr.matrix <- makeContrasts(CPEvsSNE = groupCPE-groupSNE, levels = colnames(design)) par(mfrow=c(1,2)) v <- voom(norm_DGE, design, plot=TRUE) vfit <- lmFit(v, design) vfit <- contrasts.fit(vfit, contrasts=contr.matrix) efit <- eBayes(vfit) plotSA(efit, main=paste0(method, " Final model: Mean???variance trend")) return(efit) } tuxedo_path <- "HTseq_tuxedo/" tuxedo_DGE <- htseqToDGE(tuxedo_path) pdf(file="limma_filter_norm_plot_2.pdf", width=10, height=8) tuxedo_normDGE <- filterNormPlot(tuxedo_DGE, "Tuxedo", 1) dev.off() tuxedo_efit <- limmaFitDE(tuxedo_normDGE, "Tuxedo") summary(decideTests(tuxedo_efit, p.value = 0.05, lfc=log2(1.2))) tuxedo_limmaRES <- topTreat(tuxedo_efit, sort="none",number=Inf) tuxedo_limmaRES$decision <- decideTests(tuxedo_efit, p.value = 0.05, lfc=log2(1.2))