# De Kovel et al, 2018
# Scripts accompanying Data Descriptor:
# Transcriptomic analysis of left-right differences in human embryonic
# forebrain and midbrain
#
# This file documents the full pipeline. The first three steps are shell
# commands (run them in a terminal, not in R); they are kept as comments so
# that this file remains a valid R script. The R section starts further below.

# Shell steps ------------------------------------------------------------------

# fastq quality control, FastQC v0.11.5
#
#   fastqc "${file}"

# Aligning with Hisat2 v2.2.0.4
# NOTE(review): version string kept exactly as given by the authors - confirm.
#
#   indexdir="/dir/to/resources/Homo_sapiens/UCSC/hg38/hg38/"
#   indexbase=genome
#   fqdir="/dir/to/fastq/files/"
#   bamdir="/dir/to/Bams/"
#   tissue=
#   sample=
#   hisat2 -x ${indexdir}${indexbase} \
#     -1 ${fqdir}S${sample}_1.fq.gz -2 ${fqdir}S${sample}_2.fq.gz \
#     -S ${bamdir}${tissue}/${sample}.sam -p 4

# Gene counting with RSEM (version 1.3.0), using bowtie2 v2.2.6
# NOTE(review): the FASTQ names here (${sample}_1.fastq.gz) differ from those
# in the Hisat2 step (S${sample}_1.fq.gz) - verify against the actual files.
#
#   tissue=
#   sample=
#   Fastqdir=/dir/to/fastq/files/
#   Outdir=/output/dir/
#   Outfile=${sample}
#   Ref=/dir/to/resources/Homo_sapiens/UCSC/hg38/hg38/GCF_GRCh38.refseq_rsem
#   ./rsem-calculate-expression --paired-end --bowtie2 -p 4 --append-names \
#     ${Fastqdir}${sample}_1.fastq.gz ${Fastqdir}${sample}_2.fastq.gz \
#     ${Ref} ${Outdir}${Outfile}_ucsc_genecount

#--------------------------------------------------------------------------------
# Processing data with R-package edgeR, limma ----------------------------------
# R

library(edgeR)
library(limma)

# Read in gene annotation and expression counts to include in the dge object.
genes <- read.table("Forebrain_genes_ex.txt", header = TRUE)
counts <- read.delim(
  "Forebrain_expression_counts_ex.txt",
  header = TRUE,
  na.strings = "NA"
)

# Use the gene identifiers as row names, then drop the identifier column so
# that `counts` holds only the per-sample count columns.
rownames(counts) <- counts$gene_id
counts$gene_id <- NULL

# Replace missing values with 0.
counts[is.na(counts)] <- 0

# `side` is the grouping variable passed to DGEList(), but it is not created
# anywhere in this script. Fail early with a clear message instead of an
# "object 'side' not found" error from inside DGEList().
# NOTE(review): presumably the left/right label of each sample, in the same
# order as the columns of `counts` - confirm against the sample sheet.
if (!exists("side")) {
  stop(
    "`side` must be defined (one group label per column of `counts`) ",
    "before building the DGEList.",
    call. = FALSE
  )
}
stopifnot(length(side) == ncol(counts))

# Create dge object.
dge <- DGEList(counts = counts, genes = genes, group = side)
dge <- calcNormFactors(dge)

# Filter out lowly expressed genes: keep genes with cpm > 5 in at least
# 3 samples.
keep <- rowSums(cpm(dge) > 5) >= 3
dge <- dge[keep, , keep.lib.sizes = FALSE]
# Library sizes recomputed from the retained genes (kept from the original
# script; keep.lib.sizes = FALSE above already requests this).
dge$samples$lib.size <- colSums(dge$counts)

# After filtering, calculate the normalisation factors again.
dge <- calcNormFactors(dge)

# Compute expression as log2(cpm) and write to file.
# The original script used `normalized.counts` without ever defining it;
# it is computed here as counts per million from the normalised dge object.
# Note that a zero count yields -Inf under a plain log2(); cpm(dge, log = TRUE)
# would avoid this but produces different values, so it is not used here.
normalized.counts <- cpm(dge)
log2cpm <- log2(normalized.counts)
write.table(
  log2cpm,
  "BGI_2Tissues_expression_log2.txt",
  row.names = TRUE,
  col.names = TRUE,
  sep = "\t"
)