Overview

This code accompanies the paper Kimble et al. (2018) Extraction of total RNA from single-oocytes and full-length single-cell mRNA sequencing of swine oocytes, as Supplementary file 1. Code created by Fernando Biase. Please, direct questions to fbiase at auburn dot edu.
An updated version of this code may be available on www.biaselaboratory.com .

Code

Load the necessary libraries.

library(knitr)
library(Rsamtools)
library(stringr)
library(gplots)
library(ggplot2)
library(ggpubr)
library(scales)
library(kableExtra)
library(biomaRt)
library(RColorBrewer)
#function for collapsing the list of lists into a single list
#as per the Rsamtools vignette
.unlist <- function (x){
   ## do.call(c, ...) coerces factor to integer, which is undesired
   x1 <- x[[1L]]
   if (is.factor(x1)){
      structure(unlist(x), class = "factor", levels = levels(x1))
   } else {
      do.call(c, x)
   }
}

Load the annotations from Ensembl.

mart  = useMart("ensembl", dataset = "sscrofa_gene_ensembl",host="www.ensembl.org")
annotation.genelength.biomart<-getBM(attributes = c("ensembl_gene_id","ensembl_transcript_id", "transcript_length","gene_biotype"), mart = mart)
annotation.genelength.biomart<-annotation.genelength.biomart[annotation.genelength.biomart$gene_biotype =="protein_coding",]
annotation.genelength.biomart<-annotation.genelength.biomart[order(annotation.genelength.biomart$ensembl_gene_id, -annotation.genelength.biomart$transcript_length),]
#head(annotation.genelength.biomart[duplicated(annotation.genelength.biomart$ensembl_gene_id),])
annotation.genelength.biomart<-annotation.genelength.biomart[annotation.genelength.biomart$transcript_length > 400,]
annotation.genelength.biomart.a<-annotation.genelength.biomart[!duplicated(annotation.genelength.biomart$ensembl_gene_id),]
annotation.gene.biomart<-getBM(attributes = c("ensembl_gene_id", "hgnc_symbol","external_gene_name", "description","gene_biotype"), mart = mart)
annotation.gene.biomart<-annotation.gene.biomart[annotation.gene.biomart$gene_biotype =="protein_coding",]

Load bam files and generate the objects with data.

data_fpkm     <- list()
data_coverage <- list()

bamInPath<-"/home/fernando/oocyte_cumulus_pig/alignment_oocyte_cdna"

#read in BAM file
i<-1

for (sample in  c('SL269995','SL269992','SL270021','SL270003','SL269998')){
bam <- scanBam(paste(bamInPath,sample,"Aligned.out.merged.sorted.bam",sep="/"))
#store names of BAM fields
bam_field <- names(bam[[1]])
#go through each BAM field and unlist
list <- lapply(bam_field, function(y) .unlist(lapply(bam, "[[", y)))
#store as data frame
bam_df <- do.call("DataFrame", list)
names(bam_df) <- bam_field
bam_df<-as.data.frame(bam_df[,c(3,5,10)])
bam_df$transcript<-str_split_fixed(bam_df$rname, "\\.",2)[,1]
bam_df<-bam_df[,c(4,2,3)]

total_fragments_aligned<-dim(bam_df)[1]

bam_df_not_duplicated<-bam_df[bam_df$transcript %in% annotation.genelength.biomart.a$ensembl_transcript_id,]
table_transcript_count<-data.frame(table(bam_df_not_duplicated$transcript))
table_transcript_count<-merge(table_transcript_count, annotation.genelength.biomart.a, by.x="Var1", by.y="ensembl_transcript_id", all=FALSE)
table_transcript_count$fpkm<-( ( (table_transcript_count$Freq) / ((table_transcript_count$transcript_length/1000) * (total_fragments_aligned/1000000)) ) )
table_transcript_count<-table_transcript_count[,c(3,6)]
colnames(table_transcript_count)<-c("ensembl_gene_id", paste(sample))

data_fpkm[[i]]<-table_transcript_count


bam_df1<-bam_df[,c(1,2)]
bam_df2<-bam_df[,c(1,3)]
colnames(bam_df1)<-c("transcript","position")
colnames(bam_df2)<-c("transcript","position")
bam_df3<-rbind(bam_df1,bam_df2)
bam_df3<-bam_df3[!is.na(bam_df3$position),]

data_coverage[[i]]<-bam_df3

i<- i+1

rm(bam,bam_df,bam_df1,bam_df2,bam_df3,bam_df_not_duplicated, list)
}

Create the object that contains the expression data (FPKM) for the genes detected in all oocytes. The code used for saving the file is commented (#) and the table is printed below.

data_fpkm_a<-Reduce(function(x, y) merge(x, y, all=TRUE), data_fpkm)
rownames(data_fpkm_a)<-data_fpkm_a$ensembl_gene_id
data_fpkm_a<-data_fpkm_a[, c(2:dim(data_fpkm_a)[2])]
data_fpkm_b<-data_fpkm_a[rowSums(data_fpkm_a > 0.3) >=5,]
data_fpkm_b<-data_fpkm_b[complete.cases(data_fpkm_b),]
data_fpkm_b_annotated<-merge(data_fpkm_b, annotation.gene.biomart, by.x= "row.names", by.y="ensembl_gene_id", all.x=TRUE, all.y=FALSE)
colnames(data_fpkm_b_annotated)<-c("Ensembl ID","oocyte 1","oocyte 2","oocyte 3","oocyte 4","oocyte 5","hgnc_symbol","external_gene_name","description","gene_biotype")

Supplementary file 2.

kable(head(data_fpkm_b_annotated), format = "html") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"),full_width =TRUE,font_size = 10)
Ensembl ID oocyte 1 oocyte 2 oocyte 3 oocyte 4 oocyte 5 hgnc_symbol external_gene_name description gene_biotype
ENSSSCG00000000002 3.054407 1.673888 1.5659515 1.370527 1.782784 GTSE1 GTSE1 G2 and S-phase expressed 1 [Source:HGNC Symbol;Acc:HGNC:13698] protein_coding
ENSSSCG00000000005 3.272916 8.486265 3.1585472 4.837654 2.376952 CDPF1 CDPF1 cysteine rich DPF motif domain containing 1 [Source:HGNC Symbol;Acc:HGNC:33710] protein_coding
ENSSSCG00000000006 17.612150 17.479475 18.5034100 4.667341 18.522566 PPARA Sus scrofa peroxisome proliferator activated receptor alpha (PPARA), mRNA. [Source:RefSeq mRNA;Acc:NM_001044526] protein_coding
ENSSSCG00000000007 8.155091 11.088408 10.7312353 12.258417 8.616283 TRMU TRMU tRNA 5-methylaminomethyl-2-thiouridylate methyltransferase [Source:HGNC Symbol;Acc:HGNC:25481] protein_coding
ENSSSCG00000000010 2.036271 1.823013 3.5314462 6.023551 1.844337 FBLN1 FBLN1 fibulin 1 [Source:HGNC Symbol;Acc:HGNC:3600] protein_coding
ENSSSCG00000000014 1.089129 0.435553 0.7445085 1.954791 1.228299 FAM118A FAM118A family with sequence similarity 118 member A [Source:HGNC Symbol;Acc:HGNC:1313] protein_coding
data_fpkm_a[is.na(data_fpkm_a)]<-0
heatmap.2(log10(as.matrix(data_fpkm_a+1)),
        labRow=FALSE,
        labCol=FALSE,
        scale='none',
        trace='none',
        keysize=1,
        key.title='Log10(FPKM+1)',
        col=colorRampPalette(c("white", "blue4"))(n = 256),
        margins=c(0,0),
        key.par = list(cex=0.55))

data_coverage_a<-do.call("rbind", data_coverage)
data_coverage_a$sample<-c(rep(c("oocyte_1","oocyte_2","oocyte_3","oocyte_4","oocyte_5"), 
                              c(dim(data_coverage[[1]])[1], dim(data_coverage[[2]])[1],dim(data_coverage[[3]])[1],dim(data_coverage[[4]])[1],dim(data_coverage[[5]])[1]  )) )
data_coverage_a<-merge(data_coverage_a, annotation.genelength.biomart.a, by.x="transcript", by.y="ensembl_transcript_id", all=FALSE)
data_coverage_a<-data_coverage_a[data_coverage_a$ensembl_gene_id %in% rownames(data_fpkm_b),]
data_coverage_a$percentage<-data_coverage_a$position/data_coverage_a$transcript_length * 100
data_coverage_a<-data_coverage_a[data_coverage_a$percentage >0 & data_coverage_a$percentage < 100 ,]

Code to generate the individual images observed on Figure 2 on the paper.

font_1<-8
font_2<-6

data_fpkm_c<-merge(data_fpkm_b, annotation.genelength.biomart.a, by.x='row.names', by.y= 'ensembl_gene_id', all.x=TRUE, all.y=FALSE)
data_fpkm_c<-data_fpkm_c[,c(7,8)]
data_fpkm_c$sample<-"oocyte"
data_fpkm_c<-rbind(data_fpkm_c, data.frame('ensembl_transcript_id'=annotation.genelength.biomart.a$ensembl_gene_id, 'transcript_length'=annotation.genelength.biomart.a$transcript_length, sample='ensembl'))

plot1<-ggplot(data_fpkm_c, aes(x = sample, y = transcript_length)) +
       geom_boxplot() +
       scale_y_continuous(name = "transcript length (nt)")+
       scale_x_discrete(name = NULL, labels=c('Ensembl', 'porcine\noocyte')) +
       theme_bw()+
       theme(
         panel.background = element_blank(),
         plot.background = element_blank(), 
         panel.grid =element_blank(),
         axis.text  =element_text(colour="black", size = font_1),
         axis.title.y =element_text(colour="black", size = font_1, vjust=0, hjust=0.5)
            )

plot2<-ggplot(data_coverage_a, aes(x = percentage)) +
       geom_density()+
       theme_bw()+
       theme(
         panel.background = element_blank(),
         plot.background = element_blank(), 
         panel.grid =element_blank(),
         axis.text  =element_text(colour="black", size = font_1),
         axis.title =element_text(colour="black", size = font_1)
            )


data_coverage_b<-data_coverage_a[data_coverage_a$transcript_length >10000 & data_coverage_a$transcript_length<15000,]
data_coverage_c<-data.frame(
  'x_axis' = c( rep(density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$x , 5)),
  'density'= c( density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$y, density(data_coverage_b[data_coverage_b$sample=="oocyte_2",7])$y, 
                density(data_coverage_b[data_coverage_b$sample=="oocyte_3",7])$y, density(data_coverage_b[data_coverage_b$sample=="oocyte_4",7])$y,
                density(data_coverage_b[data_coverage_b$sample=="oocyte_5",7])$y),
    
  'sample' = c(rep(c("oocyte_1","oocyte_2","oocyte_3","oocyte_4","oocyte_5"), 
                   c( 
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$y), length(density(data_coverage_b[data_coverage_b$sample=="oocyte_2",7])$y), 
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_3",7])$y), length(density(data_coverage_b[data_coverage_b$sample=="oocyte_4",7])$y),
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_5",7])$y))
                     ) 
               ))

plot3<-ggplot(data_coverage_c, aes(x_axis, as.factor(sample))) + 
       geom_tile(aes(fill = density),colour = "white")+
       scale_fill_gradient(low = "#F0F0F0",high = "#000000")+
       ggtitle("transcript length 10,000 - 15,000 nt (80 genes)") +
       theme(
        panel.background = element_blank(),
        plot.background = element_blank(), 
        panel.grid =element_blank(),
       plot.title   =element_text(size=font_1, color="black",vjust = 0.4,hjust = 0.5, margin=margin(t =0, r = 0, b = 0, l = 0, unit = "pt")),
       axis.text.y  =element_text(size=font_1, color="black"),
       axis.text.x  =element_blank(),
       axis.title   =element_blank(),
       axis.ticks   =element_blank(),
       axis.line    =element_blank(),
       legend.title =element_text(size=font_2),
       legend.text  =element_text(size=font_2),
       plot.margin  =margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
       legend.key.height=unit(0.2, "cm"),
       legend.key.width=unit(0.2, "cm")
       )


data_coverage_b<-data_coverage_a[data_coverage_a$transcript_length >5000 & data_coverage_a$transcript_length<10000,]
data_coverage_c<-data.frame(
  'x_axis' = c( rep(density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$x , 5)),
  'density'= c( density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$y, density(data_coverage_b[data_coverage_b$sample=="oocyte_2",7])$y, 
                density(data_coverage_b[data_coverage_b$sample=="oocyte_3",7])$y, density(data_coverage_b[data_coverage_b$sample=="oocyte_4",7])$y,
                density(data_coverage_b[data_coverage_b$sample=="oocyte_5",7])$y),
    
  'sample' = c(rep(c("oocyte_1","oocyte_2","oocyte_3","oocyte_4","oocyte_5"), 
                   c( 
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$y), length(density(data_coverage_b[data_coverage_b$sample=="oocyte_2",7])$y), 
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_3",7])$y), length(density(data_coverage_b[data_coverage_b$sample=="oocyte_4",7])$y),
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_5",7])$y))
                     ) 
               ))

plot4<-ggplot(data_coverage_c, aes(x_axis, as.factor(sample))) + 
       geom_tile(aes(fill = density),colour = "white")+
       scale_fill_gradient(low = "#F0F0F0",high = "#000000")+
       ggtitle("transcript length 5,000 - 10,000 nt (2,888 genes)") +
       theme(
       panel.background = element_blank(),
       plot.background = element_blank(), 
       panel.grid   =element_blank(),
       plot.title   =element_text(size=font_1, color="black",vjust = 0.4,hjust = 0.5, margin=margin(t =0, r = 0, b = 0, l = 0, unit = "pt")),
       axis.text.y  =element_text(size=font_1, color="black"),
       axis.text.x  =element_blank(),
       axis.title   =element_blank(),
       axis.ticks   =element_blank(),
       axis.line    =element_blank(),
       legend.title =element_text(size=font_2),
       legend.text  =element_text(size=font_2),
       plot.margin  =margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
       legend.key.height=unit(0.2, "cm"),
       legend.key.width=unit(0.2, "cm")
       )

data_coverage_b<-data_coverage_a[ data_coverage_a$transcript_length<5000,]
data_coverage_c<-data.frame(
  'x_axis' = c( rep(density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$x , 5)),
  'density'= c( density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$y, density(data_coverage_b[data_coverage_b$sample=="oocyte_2",7])$y, 
                density(data_coverage_b[data_coverage_b$sample=="oocyte_3",7])$y, density(data_coverage_b[data_coverage_b$sample=="oocyte_4",7])$y,
                density(data_coverage_b[data_coverage_b$sample=="oocyte_5",7])$y),
    
  'sample' = c(rep(c("oocyte_1","oocyte_2","oocyte_3","oocyte_4","oocyte_5"), 
                   c( 
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_1",7])$y), length(density(data_coverage_b[data_coverage_b$sample=="oocyte_2",7])$y), 
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_3",7])$y), length(density(data_coverage_b[data_coverage_b$sample=="oocyte_4",7])$y),
                     length(density(data_coverage_b[data_coverage_b$sample=="oocyte_5",7])$y))
                     ) 
               ))

plot5<-ggplot(data_coverage_c, aes(x_axis, as.factor(sample))) + 
       geom_tile(aes(fill = density),colour = "white")+
       scale_fill_gradient(low = "#F0F0F0",high = "#000000")+
       ggtitle("transcript length < 5,000 nt (6,604 genes)") +
       scale_x_continuous(breaks = c(0,100), labels=c("0%  5'", "3' 100%"))+
       theme(
       panel.background = element_blank(),
       plot.background  = element_blank(), 
       panel.grid   =element_blank(),
       plot.title   =element_text(size=font_1, color="black",vjust = 0.4,hjust = 0.5, margin=margin(t =0, r = 0, b = 0, l = 0, unit = "pt")),
       axis.text.y  =element_text(size=font_1, color="black"),
       axis.text.x  =element_text(size=font_1, color="black"),
       axis.title   =element_blank(),
       axis.ticks.y =element_blank(),
       axis.line    =element_blank(),
       legend.title =element_text(size=font_2),
       legend.text  =element_text(size=font_2),
       plot.margin  =margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
       legend.key.height=unit(0.2, "cm"),
       legend.key.width=unit(0.2, "cm")
       )

Next, we compiled individual plots to compose the panel observed on the paper. The image may seem distorted on the html file, but is not if saved as a png file.

ggarrange( ggarrange(plot1, plot2,labels = c('a', 'b'), ncol=2, nrow=1,  widths = c(2,4),hjust=c(0,0), font.label=list(size = 9, color = "black", face = "plain")),
           ggarrange(plot3, labels = c('c'), ncol=1, nrow=1,hjust=0, vjust=1,font.label=list(size = 9, color = "black", face = "plain")), 
           ggarrange(plot4,  ncol=1, nrow=1),
           ggarrange(plot5,  ncol=1, nrow=1),
           nrow=4, heights=c(2,1,1,1.15))

sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.10
## 
## Matrix products: default
## BLAS: /usr/lib/atlas-base/atlas/libblas.so.3.0
## LAPACK: /usr/lib/atlas-base/atlas/liblapack.so.3.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2   biomaRt_2.34.2       kableExtra_0.7.0    
##  [4] scales_0.5.0         ggpubr_0.1.6         magrittr_1.5        
##  [7] ggplot2_2.2.1        gplots_3.0.1         stringr_1.2.0       
## [10] Rsamtools_1.30.0     Biostrings_2.46.0    XVector_0.18.0      
## [13] GenomicRanges_1.30.1 GenomeInfoDb_1.14.0  IRanges_2.12.0      
## [16] S4Vectors_0.16.0     BiocGenerics_0.24.0  knitr_1.19          
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.15           prettyunits_1.0.2      gtools_3.5.0          
##  [4] assertthat_0.2.0       rprojroot_1.3-2        digest_0.6.15         
##  [7] R6_2.2.2               plyr_1.8.4             backports_1.1.2       
## [10] RSQLite_2.0            evaluate_0.10.1        highr_0.6             
## [13] httr_1.3.1             pillar_1.1.0           zlibbioc_1.24.0       
## [16] rlang_0.1.6            progress_1.1.2         curl_3.1              
## [19] lazyeval_0.2.1         gdata_2.18.0           blob_1.1.0            
## [22] rmarkdown_1.8          labeling_0.3           BiocParallel_1.12.0   
## [25] readr_1.1.1            RCurl_1.95-4.10        bit_1.1-12            
## [28] munsell_0.4.3          compiler_3.4.3         pkgconfig_2.0.1       
## [31] htmltools_0.3.6        tibble_1.4.2           GenomeInfoDbData_1.0.0
## [34] XML_3.98-1.9           viridisLite_0.3.0      dplyr_0.7.4           
## [37] bitops_1.0-6           grid_3.4.3             gtable_0.2.0          
## [40] DBI_0.7                KernSmooth_2.23-15     stringi_1.1.6         
## [43] bindrcpp_0.2           xml2_1.2.0             cowplot_0.9.2         
## [46] tools_3.4.3            bit64_0.9-7            Biobase_2.38.0        
## [49] glue_1.2.0             purrr_0.2.4            hms_0.4.1             
## [52] yaml_2.1.16            AnnotationDbi_1.40.0   colorspace_1.3-2      
## [55] caTools_1.17.1         rvest_0.3.2            memoise_1.1.0         
## [58] bindr_0.1