#Supplementary Data 2. The R script for analyzing NGS sequencing data from the B2 and mNG oligo libraries #transfected along with ADAR-specific plasmid in ADAR1 KO-HEK293T cell lines # Dissecting the basis for differential substrate specificity of ADAR1 and ADAR2 #Author list #Marlon S. Zambrano-Mila, Monika Witzenberger, Zohar Rosenwasser, Anna Uzonyi, Ronit Nir, Shay Ben-Aroya, Erez Y. Levanon, Schraga Schwartz #Data analysis of NGS data #Fastq files were assessed by a custom R script (Supplementary Data 2). The read-filtering process removed reads #containing wrong start and end, lacking the established barcodes, and misaligning at adenosine positions. #Read 1 and 2 were merged into a single sequence by custom truncation and matching. For each barcode, the editing percentage #was quantified as (G/(A+G))*100 at each adenosine position. Δ editing was calculated as the difference of editing levels #at adenosine positions between each structurally altered sequence and perfect-double stranded construct,respectively # Load necessary library library(ShortRead) library(Biostrings) library(parallel) library(ggplot2) library(Biostrings) library(pheatmap) library(EnvStats) library(purrr) library(RVenn) library(ggplot2) library(gridExtra) library(ggvenn) library(egg) library("GGally") library(stringr) library(tidyverse) library("RColorBrewer") ############################### B2 library ################################### B2_library_A_to_G_analysis<-function(B2_R1_readFastaq,B2_R2_readFastaq,z){ # Process Read1.fastq files B2_R1_sequence<-as.data.frame(sread(B2_R1)) B2_R1_sequence$number_read<-1:nrow(B2_R1_sequence) Total_Number_of_Reads_R1<-nrow(B2_R1_sequence) #Sequences in Read1.fastq are filtered and processed to extract the "B2_sequence," including filtering sequences that start with "GCCGGG" and counting the number of properly beginning reads. 
B2_R1_sequence<-B2_R1_sequence[grepl("GCCGGGC",B2_R1_sequence$x),] B2_R1_sequence$B2_sequence<-gsub(".*AAACTAGT","" ,B2_R1_sequence$x) B2_R1_sequence<-B2_R1_sequence[grepl("^GCCGGG",B2_R1_sequence$B2_sequence),] number_reads_beggining_properly<- nrow(B2_R1_sequence) colnames(B2_R1_sequence)<-c("non_processed_R1","number_read","B2_constructR1") B2_R1_sequence<-B2_R1_sequence[nchar(B2_R1_sequence$B2_constructR1)>=82,] B2_R1_sequence$B2_constructR1<-substr(B2_R1_sequence$B2_constructR1, start = 1,stop = 82) Percentage_reads_beggining_properly<- (number_reads_beggining_properly/ Total_Number_of_Reads_R1)*100 # Process Read2.fastq files B2_R2_sequence <- as.data.frame(sread(B2_R2)) B2_R2_sequence$number_read <- 1:nrow(B2_R2_sequence) Total_Number_of_Reads_R2 <- nrow(B2_R2_sequence) # Reverse complement and filter sequences B2_R2_sequence_reverse_comp <- as.data.frame(sread(reverseComplement(B2_R2))) B2_R2_sequence_reverse_comp$number_read <- 1:nrow(B2_R2_sequence_reverse_comp) # Read and filter Barcodes. This table can be found in Uzonyi et al.(2021): Table S2. Sequences of the B2 and mNG oligo library pools, related to STAR Methods. 
Barcodes_csv <- read.csv("/DATA_HELA_ADAR2_over_ADAR1_knockdown_B2_libr/Table_S2_Twist_library_sequence_plans.csv", sep = ",", header = TRUE) Barcodes_csv <- Barcodes_csv[Barcodes_csv$B2.mNG == "B2", ] Barcodes_csv$site <- as.character(Barcodes_csv$site) # Further filter B2_R2_sequence B2_R2_sequence <- B2_R2_sequence_reverse_comp[grepl("TCCCTC|TTCGAA", B2_R2_sequence_reverse_comp$x), ] B2_R2_sequence$barcode <- gsub(".*TTCGAA|\\TCCC.*", "", B2_R2_sequence$x) B2_R2_sequence<-B2_R2_sequence[grep("CCCCTAA",B2_R2_sequence$x),] B2_R2_sequence$B2_constructR2<-gsub("\\CCCCTAA.*","",B2_R2_sequence$x) # Filter based on sequence and barcode length B2_R2_sequence<-B2_R2_sequence[nchar(B2_R2_sequence$B2_constructR2)>=63,] B2_R2_sequence$B2_constructR2<- substr(B2_R2_sequence$B2_constructR2, start =nchar(B2_R2_sequence$B2_constructR2)-63,stop = nchar(B2_R2_sequence$B2_constructR2)) B2_R2_sequence<-B2_R2_sequence[nchar(B2_R2_sequence$barcode)==8,] # Merge B2_R2_sequence with Barcodes_csv based on the 'barcode' column B2_R2_sequence_Barcode_ID <- merge(B2_R2_sequence, Barcodes_csv, by.x = "barcode", by.y = "barcode") # Rename the columns for clarity colnames(B2_R2_sequence_Barcode_ID) <- c("barcode", "No_processed_Read2", "number_read", "B2_constructR2", "ID", "Buffer", "F", "BstBI", "loop", "seq", "AscI", "R", "desc", "site", "B2.mNG", "sample.ctrl", "Total_length", "Buffer_length") # Calculate the number of unique barcodes Number_of_Barcodes <- length(unique(Barcodes_csv$barcode)) # Calculate the total number of reads with proper barcodes and the percentage Reads_properly_barcodes <- nrow(B2_R2_sequence_Barcode_ID) Reads_properly_barcodes_percent <- (Reads_properly_barcodes / Total_Number_of_Reads_R2) * 100 # Merge B2_R1_sequence with B2_R2_sequence_Barcode_ID based on 'number_read' B2_df_R1_R2 <- merge(B2_R1_sequence, B2_R2_sequence_Barcode_ID, by.x = "number_read", by.y = "number_read", all = FALSE) # Combine B2_constructR1 and B2_constructR2 into Whole_Construct 
B2_df_R1_R2$Whole_Construct <- paste0(B2_df_R1_R2$B2_constructR1, B2_df_R1_R2$B2_constructR2) # Filter based on the length of Whole_Construct B2_df_R1_R2 <- B2_df_R1_R2[nchar(B2_df_R1_R2$Whole_Construct) == 146, ] # Define the perfect DNA strand sequence perfect_ds_fw_strand<-("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA") # Generate row names for B2_df_R1_R2 rownames(B2_df_R1_R2)<-paste0("R",1:nrow(B2_df_R1_R2)) design_seq_const<-perfect_ds_fw_strand # Extract A_G positions from the perfect DNA strand A_G_positions<-unlist(gregexpr('A', design_seq_const)) Nuc_matrix<-do.call(rbind,str_split(B2_df_R1_R2$Whole_Construct,"")) Nuc_matrix<-as.data.frame(Nuc_matrix) rownames(Nuc_matrix)<-rownames(B2_df_R1_R2) AG_pos_analized<-Nuc_matrix[,paste0("V",A_G_positions)] rownames(AG_pos_analized)<-rownames(B2_df_R1_R2) AG_pos_analized$Combined_cols <- do.call(paste, c(AG_pos_analized[1:ncol(AG_pos_analized)], sep = "")) # Filter A ositions that do not contain 'C' or 'T' in Combined_cols AG_pos_analized<-AG_pos_analized[!grepl("C|T",AG_pos_analized$Combined_cols),] B2_df_R1_R2<-B2_df_R1_R2[rownames(AG_pos_analized),] # Calculate the number of reads with proper beginning and proper barcodes, and the percentage Reads_R1_R2_proper_beginning_proper_barcode <- nrow(B2_df_R1_R2) Reads_R1_R2_proper_beginning_proper_barcode_percent <- (Reads_R1_R2_proper_beginning_proper_barcode / Total_Number_of_Reads_R1) * 100 # Create a data frame for quality control metrics QC_control_df1 <- data.frame( Total_Number_of_Reads_R1, number_reads_beggining_properly, Percentage_reads_beggining_properly, Total_Number_of_Reads_R2, Reads_properly_barcodes, Reads_properly_barcodes_percent, Reads_R1_R2_proper_beginning_proper_barcode, Reads_R1_R2_proper_beginning_proper_barcode_percent ) # Calculate quantiles of the barcode counts Quantiles_number_read_per_barcode <- quantile(table(B2_df_R1_R2$barcode), 
c(0.01, 0.1, 0.9, 0.99)) # Combine QC_control_df1 with the quantiles and set column names QC_control_df <- cbind(QC_control_df1, t(as.data.frame(Quantiles_number_read_per_barcode))) colnames(QC_control_df) <- c("Total Number of Reads_R1", "Number of Reads beginning properly", "% of Reads beginning properly", "Number of Reads_R2", "Number of Reads beginning properly barcodes_R2", "% of Reads with proper barcodes_R2", "Reads R1 R2 with proper beginning and proper barcodes", "% of Reads_R1_R2 with proper beginning and proper barcodes", "1%", "10%", "90%", "99%") # Create an empirical cumulative distribution plot plot(ecdf(log10(table(B2_df_R1_R2$barcode))), xlab = "Log10 of number of reads in each barcode", pch = 20, main = "ECD: Log10 N° of reads in each barcode", yaxt = "n" ) axis(2, at = seq(0, 1, by = 0.1), las = 2) abline(v = log10(6), col = "Red") # Select the desired columns and split by barcode B2_df_R1_R2_list <- split(B2_df_R1_R2[, c("number_read", "Whole_Construct", "barcode", "desc", "site", "B2.mNG", "sample.ctrl")], f = B2_df_R1_R2$barcode) Whole_const_Seq<-toupper("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA") # Extract the A positions from the perfect double-stranded sequence A_positions_Seq <- unlist(gregexpr('A', Whole_const_Seq)) # Create a list of A positions in each molecule for the specified barcode A_positions_list <- lapply(B2_df_R1_R2_list[["CTGTAGAC"]]$Whole_Construct, function(molecule) { unlist(strsplit(molecule, ""))[A_positions_Seq] }) # Create a data frame to store the number of edits per molecule Number_of_edits_per_molecule_df <- data.frame( Molecule = 1:length(A_positions_list), Edits = sapply(A_positions_list, function(positions) sum(positions == "G")) ) # Create a boxplot of the number of edits library(ggplot2) p <- ggplot(Number_of_edits_per_molecule_df, aes(x = factor(1), y = Edits)) + geom_boxplot(fill = "lightblue", width = 
0.5) + labs(x = NULL, y = "# Edits per Molecule") + theme_minimal() + ggtitle("B2") p <- p + theme( legend.position = "none", panel.border = element_rect(color = "black", fill = NA, size = 0.5), axis.text.x = element_blank(), axis.title = element_text(face = "bold", size = 12), axis.text.y = element_text(colour = "black", size = 11) ) plot(p) # Number of analyzed barcodes Number_of_analyzed_barcodes <- length(B2_df_R1_R2_list) # Create a consensus matrix for each barcode concensus_matrix_per_barcode <- mclapply( B2_df_R1_R2_list, mc.cores = 12, function(w) { Sequence_barcode_StringSet <- DNAStringSet(w$Whole_Construct, use.names = TRUE) consensus_matrix_barcode <- as.data.frame(t(consensusMatrix(Sequence_barcode_StringSet)))[, c(1:4, 15:16)] } ) # Calculate A-to-G rate for each position in the consensus matrix concensus_matrix_per_barcode_A_to_G <- mclapply( concensus_matrix_per_barcode, mc.cores = 12, function(y) { y$position <- 1:nrow(y) y$A_to_G_rate <- (y$G / (y$A + y$G)) * 100 return(y) } ) # Extract A-to-G rates at A positions concensus_matrix_per_barcode_A_to_G_target_pos <- mclapply( names(concensus_matrix_per_barcode_A_to_G), mc.cores = 12, function(z) { df_A_POS <- as.data.frame(t(as.data.frame(concensus_matrix_per_barcode_A_to_G[[z]][A_positions_Seq, "A_to_G_rate"]))) df_A_POS$Barcode_names <- names(concensus_matrix_per_barcode_A_to_G[z]) colnames(df_A_POS) <- c(paste0("A", A_positions_Seq), "Barcode ID") rownames(df_A_POS) <- names(concensus_matrix_per_barcode_A_to_G[z]) return(df_A_POS) } ) # Combine the results df_A_G_POS_editing <- do.call(rbind, concensus_matrix_per_barcode_A_to_G_target_pos) # Match with barcode data matched_index <- match(df_A_G_POS_editing$`Barcode ID`, Barcodes_csv$barcode) df_A_G_POS_editing <- cbind(df_A_G_POS_editing, Barcodes_csv[matched_index, c("desc", "sample.ctrl", "site")]) # Correlation of editing levels ADAR2_1_barcodes <- pivot_longer(df_A_G_POS_editing, cols = starts_with("A"), values_to = "AtoI_Editing", names_to = 
"A_position") ADAR2_1_control_barcodes <- ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl == "barcode_ctrl", ] ADAR2_1_control_barcodes$merging_column <- paste0( ADAR2_1_control_barcodes$desc, ADAR2_1_control_barcodes$site, ADAR2_1_control_barcodes$A_position ) ADAR2_1_sample_barcodes <- ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl == "sample", ] ADAR2_1_sample_barcodes$merging_column <- paste0( ADAR2_1_sample_barcodes$desc, ADAR2_1_sample_barcodes$site, ADAR2_1_sample_barcodes$A_position ) ADAR2_1_sample_control_barcodes <- merge( ADAR2_1_sample_barcodes, ADAR2_1_control_barcodes, by.x = "merging_column", by.y = "merging_column" ) # Create a scatterplot with correlation information p4.1 <- ggplot(ADAR2_1_sample_control_barcodes, aes(x = AtoI_Editing.x, y = AtoI_Editing.y)) + geom_point(size = 0.1, color = "deepskyblue4") + geom_abline(intercept = 0, slope = 1, color = "black") + xlim(0, 75) + ylim(0, 75) + ggtitle(z) + theme_minimal() + theme( strip.background = element_blank(), strip.text.x = element_blank(), legend.title = element_blank(), legend.position = "bottom", legend.text = element_text(size = 7), plot.title = element_text(size = 9, face = "bold"), panel.border = element_rect(color = "black", fill = NA, size = 0.5), axis.text.x = element_text(colour = "black", size = 7), axis.title = element_text(face = "bold", size = 7), axis.text.y = element_text(colour = "black", size = 7) ) # Add p-value and correlation information cor_edit <- cor.test(ADAR2_1_sample_control_barcodes$AtoI_Editing.x, ADAR2_1_sample_control_barcodes$AtoI_Editing.y) p5 <- p4.1 + annotate(geom = "text", x = 13, y = 50, label = paste("r =", round(cor_edit$estimate, 3)), color = "black") + annotate(geom = "text", x = 13, y = 60, label = paste("p-value =", round(cor_edit$p.value, 3)), color = "black") + xlab("Sample barcodes") + ylab("Control barcodes") plot(p5) ############## B2 random disruption ############# # Retrieve the editing levels of the target constructs 
Random_disruption_df<-df_A_G_POS_editing[ df_A_G_POS_editing$desc=="random" & df_A_G_POS_editing$sample.ctrl=="sample" ,] Random_perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44] Random_perfect_ds$site<-0 No_ds_structure<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="ctrl_repeat",1:44] No_ds_structure$site<-1 Random_disruption_df<-rbind(Random_perfect_ds,Random_disruption_df,No_ds_structure) Random_disruption_df$average_editing<-rowMeans(Random_disruption_df[,1:41]) # Order the data and set labels Random_disruption_df<-Random_disruption_df[order(as.numeric(Random_disruption_df$site)),] rownames(Random_disruption_df)<-paste(Random_disruption_df$site) Random_disruption_df$site<- as.numeric(Random_disruption_df$site)*100 Random_disruption_df$Treatment<-z Random_disruption_df$site<-round(Random_disruption_df$site,2) # Create the plot Random_ggplot<-ggplot(Random_disruption_df, aes(x=site,average_editing,y=average_editing, color=Treatment))+geom_line(size=0.5)+theme_classic(base_size = 12)+xlab("% Random Disruption")+ylab("Mean editing")+scale_x_continuous(expand = c(0, 0),n.breaks = 10)+scale_y_continuous(expand = c(0, 0),n.breaks = 7, limits = c(0,max(Random_disruption_df$average_editing )+5))+theme(panel.border = element_rect(color = "black",fill = NA,size = 1),legend.position = c(0.7, 0.6),legend.title=element_blank(),axis.title = element_text(face="bold"), title =element_text(face="bold") )+ggtitle("B2") Random_ggplot<-Random_ggplot+ scale_color_manual(values=c("#0000FF")) plot(Random_ggplot) ############ Heatmap of a 1nt-,2nt-,3nt- and 4nt-mismatch running from 5’ to 3’ throughout the double-stranded RNA ##################### # Filter the data frame to select specific rows Disruption_df <- df_A_G_POS_editing[df_A_G_POS_editing$desc %in% c("mismatch1", "mismatch2", "mismatch3", "mismatch4") & df_A_G_POS_editing$sample.ctrl == "sample", ] Perfect_ds <- df_A_G_POS_editing[df_A_G_POS_editing$desc == "perfect_ds", ] # Calculate delta editing 
Delta_editing <- mclapply(1:nrow(Disruption_df), mc.cores = 8,function(x) { Delta_editing_per_construct <- Disruption_df[x, 1:41] - Perfect_ds[1, 1:41] Delta_editing_per_construct <- cbind(Delta_editing_per_construct, Disruption_df[x, 42:45]) Delta_editing_per_construct }) Delta_editing_df <- do.call(rbind, Delta_editing) # Process the delta editing data Delta_editing_df<-do.call(rbind,Delta_editing) Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ] Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc ) Delta_editing_df_mis_1_3<-Delta_editing_df[Delta_editing_df$sample.ctrl=="sample",] mismatches_ADAR_list<-split(Delta_editing_df_mis_1_3,f=Delta_editing_df_mis_1_3$desc) # Create a list of pheatmap plots pheatmap_plots_mismatches<-mclapply(names(mismatches_ADAR_list), mc.cores = 8, function(z){ Delta_editing_df_mistmatch3<-mismatches_ADAR_list[[z]][order(as.numeric(mismatches_ADAR_list[[z]]$site), decreasing = T),] rownames(Delta_editing_df_mistmatch3)<-Delta_editing_df_mistmatch3$site Delta_editing_df_mistmatch3_matrix<-as.matrix(Delta_editing_df_mistmatch3[,1:41]) Delta_editing_df_mistmatch3_matrix_scales<-scale(Delta_editing_df_mistmatch3_matrix) Delta_editing_df_mistmatch3_matrix_scales<-as.data.frame(Delta_editing_df_mistmatch3_matrix_scales) list_rows <- split(Delta_editing_df_mistmatch3_matrix_scales,seq(nrow(Delta_editing_df_mistmatch3_matrix_scales))) list_rows_capped_zscore<-lapply(list_rows, function(z){ z[which(z<=-4)]<- -4 z[which(z>=4)]<- 4 return(z)}) Delta_editing_df_mistmatch3_matrix_scales_capped<-do.call(rbind,list_rows_capped_zscore) colnames(Delta_editing_df_mistmatch3_matrix_scales_capped)<-colnames(Delta_editing_df_mistmatch3_matrix_scales) plot_pheatmap<-pheatmap(Delta_editing_df_mistmatch3_matrix_scales_capped,cluster_rows=F, cluster_cols=F,show_rownames=F, fontsize = 11, main = z ) return(plot_pheatmap[[4]]) }) # Arrange and display the pheatmap plots 
grid.arrange(grobs=pheatmap_plots_mismatches, ncol=2) ########################## ADAR1- and ADAR2-mediated editing offsets based on subsets of 3-nucleotide mismatch running throughout the mNG and B2 sequences. ################ # Filter and select relevant rows mismatch_types <- c("mismatch1", "mismatch2", "mismatch3", "mismatch4") Disruption_df <- df_A_G_POS_editing[df_A_G_POS_editing$desc %in% mismatch_types & df_A_G_POS_editing$sample.ctrl == "sample", ] Perfect_ds <- df_A_G_POS_editing[df_A_G_POS_editing$desc == "perfect_ds", ] # Calculate delta editing Delta_editing <- lapply(1:nrow(Disruption_df), function(x) { Delta_editing_per_construct <- Disruption_df[x, 1:41] - Perfect_ds[1, 1:41] Delta_editing_per_construct <- cbind(Delta_editing_per_construct, Disruption_df[x, 42:45]) Delta_editing_per_construct }) Delta_editing_df <- do.call(rbind, Delta_editing) # Process the delta editing data Delta_editing_df <- Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ] Delta_editing_df$Mistmatch_names <- paste0(Delta_editing_df$site, Delta_editing_df$sample.ctrl, Delta_editing_df$desc) # Split by mismatch kind Delta_editing_df_mis_1_3_list<-split(Delta_editing_df,f=Delta_editing_df$desc) # Process data per mismatch kind Delta_editing_df_mis_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_list, mc.cores = 8, function(y){ max_vector<-sapply(abs(y[1:41]),max) Col_consider_downstream<-names(max_vector[max_vector>1]) Delta_editing_df_mis_1_3<-y[,c(Col_consider_downstream,"Barcode ID","desc","sample.ctrl","site","Mistmatch_names")] Delta_editing_df_mis_1_3<-Delta_editing_df_mis_1_3[!Delta_editing_df_mis_1_3$sample.ctrl=="barcode_ctrl",] }) # Calculate distances and editing levels Delta_editing_mistmatch_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_df_list,mc.cores=8, function(z){ Delta_editing_list<-split(z, f=z$Mistmatch_names) Delta_editing_mistmatch<-mclapply(Delta_editing_list, mc.cores = 8, function(x){ Editing_levels<-as.numeric(x[1,1:(ncol(x)-5)]) 
Distance_from_disruption<- (146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:(ncol(x)-5)]))))*(-1) Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels))) Distance_from_disruption_df$A_position<- as.numeric(gsub("A","",colnames(x[,1:(ncol(x)-5)]))) return(Distance_from_disruption_df) }) Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch) }) # Arrange the distances per mismatch kind Delta_editing_mistmatch_1<-Delta_editing_mistmatch_1_3_df_list[[1]] Delta_editing_mistmatch_2<-Delta_editing_mistmatch_1_3_df_list[[2]] Delta_editing_mistmatch_2$Distance_from_mistmatch<-Delta_editing_mistmatch_2$Distance_from_mistmatch +0.5 Delta_editing_mistmatch_3<-Delta_editing_mistmatch_1_3_df_list[[3]] Delta_editing_mistmatch_3$Distance_from_mistmatch<-Delta_editing_mistmatch_3$Distance_from_mistmatch+1 Delta_editing_mistmatch_4<-Delta_editing_mistmatch_1_3_df_list[[4]] Delta_editing_mistmatch_4$Distance_from_mistmatch<-Delta_editing_mistmatch_4$Distance_from_mistmatch+1.5 # Combine all mismatch data Delta_editing_mistmatch_df<-rbind(Delta_editing_mistmatch_1,Delta_editing_mistmatch_2,Delta_editing_mistmatch_3,Delta_editing_mistmatch_4) mismatch.labs <- c("Mismatch 1 nucleotide","Mismatch 2 nucleotide", "Mismatch 3 nucleotides","Mismatch 4 nucleotide") names(mismatch.labs) <- c("mismatch1", "mismatch2","mismatch3", "mismatch4") Delta_editing_mistmatch_df<-Delta_editing_mistmatch_df[!is.na(Delta_editing_mistmatch_df$Editing_level_A_to_I),] Delta_editing_mistmatch_df_output<-Delta_editing_mistmatch_df Delta_editing_mistmatch_df_list<-split(Delta_editing_mistmatch_df,f=Delta_editing_mistmatch_df$Mismatch_kind) # Apply LOESS smoothing Delta_editing_mistmatch_df_list_LOESS<- mclapply(Delta_editing_mistmatch_df_list, mc.cores = 8, function(z){ loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=z, span=0.05) z$smoothed5 
<- predict(loessMod50) return(z) }) Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch_df_list_LOESS) # Split by mismatch kind for visualization ADARs_periodicity_mis_kinds_ADAR_list<-split(Delta_editing_mistmatch_df, f=Delta_editing_mistmatch_df$Mismatch_kind) # Calculate quartiles per distance across all mismatches-carrying constructs ADARs_periodicity_mis_kindsc_ADAR_variation<-mclapply(ADARs_periodicity_mis_kinds_ADAR_list, mc.cores = 8, function(z){ ADARs_dist_mis_list<-split(z, f=z$Distance_from_mistmatch) ADARs_dist_mis_quartiles_list<-mclapply(ADARs_dist_mis_list, mc.cores = 8, function(x){ df<-as.data.frame(x) quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1)))) colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1)) df<-cbind(df,quartiles_df) return(df) }) ADARs_dist_mis_quartiles_list_df<-do.call(rbind,ADARs_dist_mis_quartiles_list) return(ADARs_dist_mis_quartiles_list_df) }) ADARs_periodicity_mis_kindsc_ADAR_variation_df<-do.call(rbind,ADARs_periodicity_mis_kindsc_ADAR_variation) # Create the summary plots per mismatch kind vertical.lines<-c(-26,-35) p.1 <- ggplot(ADARs_periodicity_mis_kindsc_ADAR_variation_df, aes(Distance_from_mistmatch, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.50,colour= "Darkblue") p.1 <-p.1 +xlab("Distance from mistmatch")+ylab("Delta editing")+theme_classic()#+scale_color_manual(values=c("Black","Blue")) p.1 <-p.1 +theme(strip.background = element_blank(), legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=10),plot.title = element_text(size=16,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=11,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=11)) p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, 
color="#666699",alpha=0.6,linetype = "dashed")+ggtitle("") p.1<-p.1+scale_x_continuous(n.breaks = 8)+facet_wrap(~Mismatch_kind, ncol = 4)+scale_y_continuous(n.breaks = 6, limits=c(-40,40)) plot(p.1) ############## Size of the mismatch ############## # Filter the data for specific mismatch lengths ( For ADAR1: -35 or -35.5 and for ADAR2: -26 OR 26.5) ADAR_B2_1_to_4_nuc_mis<-ADARs_periodicity_mis_kindsc_ADAR_variation_df[ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35|ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35.5,] # Create the box plot plot_Boxplot_ADAR2<-ggplot(ADAR_B2_1_to_4_nuc_mis, aes(x=Mismatch_kind, y=Editing_level_A_to_I))+geom_boxplot()+ theme(legend.position="none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +xlab("Mismatch size")+ylab("Delta Editing")+theme_classic() plot_Boxplot_ADAR2<-plot_Boxplot_ADAR2+theme(strip.background = element_blank(),strip.text.y = element_blank(),strip.text.x = element_text(size = 8),legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=12),plot.title = element_text(size=12,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=12),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=12)) plot(plot_Boxplot_ADAR2) ############ B2 T bulges PERIODICITY ###################### # Filter the data for specific bulge types and sample control Disruption_bulges_df <- df_A_G_POS_editing %>% filter(desc %in% c("bulge-T", "bulge-TTC", "bulge-TTCTT", "bulge-TTCTTCT"), sample.ctrl == "sample") # Select perfect ds Perfect_ds <- df_A_G_POS_editing %>% filter(desc == "perfect_ds") # Calculate delta editing Delta_editing<- mclapply(rownames(Disruption_bulges_df),mc.cores = 8, function(x){ Delta_editing_per_construct<-Disruption_bulges_df[x,1:41]-Perfect_ds[1,1:41] 
Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_bulges_df[x,42:45]) return(Delta_editing_per_construct) } ) Delta_editing_df<-do.call(rbind,Delta_editing) # Calculate Distance from bulge Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ] Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc ) Delta_editing_list<-split(Delta_editing_df, f=Delta_editing_df$Mistmatch_names) Delta_editing_bulge<-mclapply(Delta_editing_list, mc.cores = 8, function(x){ Editing_levels<-as.numeric(x[1,1:41]) Distance_from_disruption<-(146-as.numeric(x[1,"site"])- as.numeric(gsub("A","",names(x[,1:41])))+0.5)*-1 Distance_from_Disruption_bulges_df<-data.frame(Distance_from_bulge=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Bulge.kinds=rep(x$desc, length(Editing_levels))) }) Delta_editing_bulge_df<-do.call(rbind,Delta_editing_bulge) bulge.labs <- c("Bulge T", "Bulge TTC", "Bulge TTCTT","Bulge TTCTTCT") names(bulge.labs) <- c("bulge-T", "bulge-TTC", "bulge-TTCTT","bulge-TTCTTCT") Delta_editing_bulge_df<-Delta_editing_bulge_df[!is.na(Delta_editing_bulge_df$Editing_level_A_to_I),] # Apply LOESS smoothing Loes_bulge_kind<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds)## Loes_bulge_kind_list<-mclapply(Loes_bulge_kind, mc.cores = 8, function(x){## loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_bulge, data=x, span=0.11)### x$smoothed5 <- predict(loessMod50) return(x)}) Delta_editing_bulge_df<-do.call(rbind,Loes_bulge_kind_list) # Calculate quartiles per distance across all mismatches-carrying constructs ggplot_quartiles_T_bulges_list<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds) Pyrimidine_bulges_mNG_quartiles<-mclapply(ggplot_quartiles_T_bulges_list, mc.cores = 8, function(x){ ADAR1_3nuc_mis<-x ADAR1_3nuc_mis_ADAR1<-split(ADAR1_3nuc_mis, f=ADAR1_3nuc_mis$Distance_from_bulge) 
ADAR1_3nuc_mis_ADAR2_list<-mclapply(ADAR1_3nuc_mis_ADAR1, mc.cores = 8, function(x){ df<-as.data.frame(x) quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1)))) colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1)) df<-cbind(df,quartiles_df) return(df) }) ADAR1_3nuc_mis_ADAR1_list_df<-do.call(rbind,ADAR1_3nuc_mis_ADAR2_list) }) Pyrimidine_bulges_mNG_quartiles_df<-do.call(rbind,Pyrimidine_bulges_mNG_quartiles) Pyrimidine_bulges_mNG_quartiles_df$ADAR<-z # Create the summary plots per bulge kind vertical.lines<-c(-35,-26) color_plot_ADAR<-brewer.pal(8, "Dark2") p.1 <- ggplot(Pyrimidine_bulges_mNG_quartiles_df, aes(Distance_from_bulge, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.40,colour= "DarkBlue") p.1 <-p.1 +xlab("")+ylab("")+theme_classic() p.1 <-p.1 +theme(legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=6),plot.title = element_text(size=10,hjust = 0.5),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=7)) p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle(unique(Pyrimidine_bulges_mNG_quartiles_df$ADAR))+facet_grid(~Bulge.kinds) +xlab("Bulge kind")+ylab("Delta Editing") plot(p.1) ############################## A-C mismatches ############################################### # Filter the data Disruption_T_to_C <- df_A_G_POS_editing %>% filter(desc == "TtoC" & sample.ctrl == "sample") # Calculate mismatch position and order the data Disruption_T_to_C$mismatch_position<- 146-as.numeric(Disruption_T_to_C$site) Disruption_T_to_C<-Disruption_T_to_C[order(as.numeric(Disruption_T_to_C$mismatch_position)),] # Set rownames for visualization rownames(Disruption_T_to_C) <- 
Disruption_T_to_C$mismatch_position # Extract Perfect_ds Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44] Perfect_ds$site<-0 # Calculate Delta editing per construct Delta_editing<- mclapply(rownames(Disruption_T_to_C),mc.cores = 8, function(x){ Delta_editing_per_construct<-Disruption_T_to_C[x,1:41]-Perfect_ds[1,1:41] Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_T_to_C[x,42:45]) return(Delta_editing_per_construct) } ) Delta_editing_T_to_C_df<-do.call(rbind,Delta_editing) # Split and calculate distance Delta_editing_T_to_C_df<-Delta_editing_T_to_C_df[order(as.numeric(Delta_editing_T_to_C_df$site)), ] Delta_editing_T_to_C_df$Mistmatch_names<-paste0(Delta_editing_T_to_C_df$site,Delta_editing_T_to_C_df$sample.ctrl,Delta_editing_T_to_C_df$desc ) Delta_editing_list_TtoC<-split(Delta_editing_T_to_C_df, f=Delta_editing_T_to_C_df$Mistmatch_names) Delta_editing_mistmatch_T_to_C<-mclapply(Delta_editing_list_TtoC, mc.cores = 8, function(x){ Editing_levels<-as.numeric(x[1,1:41]) Distance_from_disruption<-(146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:41]))))*(-1) #146- as.numeric(gsub("A","",names(x[,1:41])))- as.numeric(x[1,"site"]) Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels))) }) Delta_editing_mistmatch_TtoC_df<-do.call(rbind,Delta_editing_mistmatch_T_to_C) # Apply loess smoothing Delta_editing_mistmatch_TtoC_df<-Delta_editing_mistmatch_TtoC_df[!is.nan(Delta_editing_mistmatch_TtoC_df$Editing_level_A_to_I),] loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=Delta_editing_mistmatch_TtoC_df, span=0.07) Delta_editing_mistmatch_TtoC_df$smoothed5 <- predict(loessMod50) # Calculate quartiles per distance across all A-C carrying constructs ADAR_A_C_list<-split(Delta_editing_mistmatch_TtoC_df, f=Delta_editing_mistmatch_TtoC_df$Distance_from_mistmatch) 
A_C_ADAR_list<-mclapply(ADAR_A_C_list, mc.cores = 8, function(x){ df<-as.data.frame(x) quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1)))) colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1)) df<-cbind(df,quartiles_df) return(df) }) ADAR_A_C_list_df<-do.call(rbind,A_C_ADAR_list) ADAR_A_C_list_df$ADAR<-z ################### A-A mismatch ######################## # Filter data Disruption_extened_A_A <- df_A_G_POS_editing %>% filter(desc == "mismatch1" & sample.ctrl == "sample") # Read barcodes CSV Barcodes_csv<-Barcodes_csv[Barcodes_csv$B2.mNG=="B2",] Barcodes_csv_all_com<-Barcodes_csv[,c("barcode","seq")] # Calculate mismatch position and merge with barcode data Disruption_extened_A_A$loc_nuc_mistmatch<-146-as.numeric(Disruption_extened_A_A$site) Delta_editing_all_comb_dfmerged<-merge(Disruption_extened_A_A,Barcodes_csv_all_com, by.x="Barcode ID",by.y="barcode") Delta_editing_all_comb_dfmerged$site<-as.numeric(Delta_editing_all_comb_dfmerged$site) # Define reference sequences perf_ds_sequence<-c("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA") Variable_arm<-c("TTTTGGTTTTTCGAGACAGGGTTTCTCTGTGTAGCTCTGGCTGTCCTGGAACTCACTTTGTAGACCAGGCTGGCCTCCAACTCAGAAATCTGCCTGCCTCTGCCTCCCGAGTGCTGGGATTAAAGGCGTGTGCCACCACGCCCGGC") # Process the data Delta_editing_all_comb_nuc_list<-mclapply(rownames(Delta_editing_all_comb_dfmerged), mc.cores = 8, function(w){ z<-as.data.frame(Delta_editing_all_comb_dfmerged[w,]) z$nucletide<-substr(perf_ds_sequence, z$loc_nuc_mistmatch, z$loc_nuc_mistmatch) z$ref_nuc_to_changed<-substr(Variable_arm, z$site+1, z$site+1) z$To_nuc_changed<-substr(z$seq, z$site+1, z$site+1) return(z) }) Delta_editing_all_comb_nuc_df<-do.call(rbind,Delta_editing_all_comb_nuc_list) Delta_editing_all_A_A_mis<-Delta_editing_all_comb_nuc_df[Delta_editing_all_comb_nuc_df$nucletide=="A" & Delta_editing_all_comb_nuc_df$ref_nuc_to_changed=="T" 
&Delta_editing_all_comb_nuc_df$To_nuc_changed=="A",] Delta_editing_all_comb_nuc_df<-data.frame(Delta_editing_all_comb_nuc_df$`Barcode ID`) colnames(Delta_editing_all_comb_nuc_df)<-"Barcode ID" # Merge data with mismatch information Disruption_A_A_mismacth<-merge(Disruption_extened_A_A,Delta_editing_all_comb_nuc_df,by="Barcode ID") Disruption_A_A_mismacth<-Disruption_A_A_mismacth[,colnames(Disruption_A_A_mismacth)[c(2:42,1,44:ncol(Disruption_A_A_mismacth)-1)]] Disruption_extened_A_A<-Disruption_A_A_mismacth Disruption_extened_A_A$mismatch_position<- 146-as.numeric(Disruption_extened_A_A$site) Disruption_extened_A_A<-Disruption_extened_A_A[order(as.numeric(Disruption_extened_A_A$mismatch_position)),] # Set rownames for visualization rownames(Disruption_extened_A_A)<- as.character(Disruption_extened_A_A$mismatch_position) # Extract Perfect_ds Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44] Perfect_ds$site<-0 # Calculate Delta editing per construct Delta_editing<- mclapply(rownames(Disruption_extened_A_A),mc.cores = 8, function(x){ Delta_editing_per_construct<-Disruption_extened_A_A[x,1:41]-Perfect_ds[1,1:41] Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_extened_A_A[x,42:45]) return(Delta_editing_per_construct) } ) Delta_editing_T_to_A_df<-do.call(rbind,Delta_editing) # Calculate the distance from the disruption per construct Delta_editing_T_to_A_df<-Delta_editing_T_to_A_df[order(as.numeric(Delta_editing_T_to_A_df$site)), ] Delta_editing_T_to_A_df$Mistmatch_names<-paste0(Delta_editing_T_to_A_df$site,Delta_editing_T_to_A_df$sample.ctrl,Delta_editing_T_to_A_df$desc ) Delta_editing_list_TtoA<-split(Delta_editing_T_to_A_df, f=Delta_editing_T_to_A_df$Mistmatch_names) Delta_editing_mistmatch_T_to_A<-mclapply(Delta_editing_list_TtoA, mc.cores = 8, function(x){ Editing_levels<-as.numeric(x[1,1:41]) Distance_from_disruption<- (146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:41]))))*(-1) 
Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels))) }) Delta_editing_mistmatch_T_to_A_DF<-do.call(rbind,Delta_editing_mistmatch_T_to_A) # Smooth the data using loess Delta_editing_mistmatch_T_to_A_DF<-Delta_editing_mistmatch_T_to_A_DF[!is.nan(Delta_editing_mistmatch_T_to_A_DF$Editing_level_A_to_I),] loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=Delta_editing_mistmatch_T_to_A_DF, span=0.07) Delta_editing_mistmatch_T_to_A_DF$smoothed5 <- predict(loessMod50) Delta_editing_mistmatch_T_to_A_DF$ADAR<-z ADAR2_A_A_list<-split(Delta_editing_mistmatch_T_to_A_DF, f=Delta_editing_mistmatch_T_to_A_DF$Distance_from_mistmatch) # Calculate quartiles per distance across all A-A carrying constructs ADAR2_A_A_ADAR2_list<-mclapply(ADAR2_A_A_list, mc.cores = 8, function(x){ df<-as.data.frame(x) quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1)))) colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1)) df<-cbind(df,quartiles_df) return(df) }) ADAR2_A_A_list_df<-do.call(rbind,ADAR2_A_A_ADAR2_list) ################### A-G mismatch ######################## # Filter data Disruption_extened_T_to_G<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="TtoG"&df_A_G_POS_editing$sample.ctrl=="sample" ,] # Calculate and assign mismatch position Disruption_extened_T_to_G$mismatch_position<- 146-as.numeric(Disruption_extened_T_to_G$site) # Arrange data by mismatch position and set row names Disruption_extened_T_to_G<-Disruption_extened_T_to_G[order(as.numeric(Disruption_extened_T_to_G$mismatch_position)),] rownames(Disruption_extened_T_to_G)<- as.character(Disruption_extened_T_to_G$mismatch_position) # Extract Perfect_ds Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44] Perfect_ds$site<-0 # Calculate Delta editing per construct Delta_editing<- 
mclapply(rownames(Disruption_extened_T_to_G),mc.cores = 8, function(x){ Delta_editing_per_construct<-Disruption_extened_T_to_G[x,1:41]-Perfect_ds[1,1:41] Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_extened_T_to_G[x,42:45]) return(Delta_editing_per_construct) } ) Delta_editing_T_to_G_df<-do.call(rbind,Delta_editing) Delta_editing_T_to_G_df<-Delta_editing_T_to_G_df[order(as.numeric(Delta_editing_T_to_G_df$site)), ] Delta_editing_T_to_G_df$Mistmatch_names<-paste0(Delta_editing_T_to_G_df$site,Delta_editing_T_to_G_df$sample.ctrl,Delta_editing_T_to_G_df$desc ) # Calculate the distance from the disruption per construct Delta_editing_list_TtoG<-split(Delta_editing_T_to_G_df, f=Delta_editing_T_to_G_df$Mistmatch_names) Delta_editing_mistmatch_T_to_G<-mclapply(Delta_editing_list_TtoG, mc.cores = 8, function(x){ Editing_levels<-as.numeric(x[1,1:41]) Distance_from_disruption<-(146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:41]))))*(-1) Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels))) }) Delta_editing_mistmatch_TtoG_df<-do.call(rbind,Delta_editing_mistmatch_T_to_G) Delta_editing_mistmatch_TtoG_df<-Delta_editing_mistmatch_TtoG_df[!is.nan(Delta_editing_mistmatch_TtoG_df$Editing_level_A_to_I),] # Smooth the data using loess loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=Delta_editing_mistmatch_TtoG_df, span=0.07) Delta_editing_mistmatch_TtoG_df$smoothed5 <- predict(loessMod50) Delta_editing_mistmatch_TtoG_df$ADAR<-z # Split and summarize data by Distance_from_mistmatch ADAR2_A_C_list<-split(Delta_editing_mistmatch_TtoG_df, f=Delta_editing_mistmatch_TtoG_df$Distance_from_mistmatch) ADAR2_A_C_ADAR2_list<-mclapply(ADAR2_A_C_list, mc.cores = 8, function(x){ df<-as.data.frame(x) quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1)))) 
colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1)) df<-cbind(df,quartiles_df) return(df) }) ADAR2_A_G_list_df<-do.call(rbind,ADAR2_A_C_ADAR2_list) # Combine the data ADAR_A_C_list_df<-ADAR_A_C_list_df[,colnames(ADAR2_A_G_list_df)] A_A_A_C_A_G_mismatch_df_all_ADARs_df_summary<-rbind(ADAR_A_C_list_df,ADAR2_A_A_list_df,ADAR2_A_G_list_df) # Create the summary plots per mismatch p.1 <- ggplot(A_A_A_C_A_G_mismatch_df_all_ADARs_df_summary, aes(Distance_from_mistmatch, smoothed5, colour=Mismatch_kind))+ geom_line(size=0.5)+theme_classic() p.1 <-p.1 +theme(strip.background = element_blank(), strip.text.y = element_blank(), legend.title=element_blank(),legend.position=c(0.8,0.25),legend.text = element_text(size=10),plot.title = element_text(size=8,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=8,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 10),axis.text.y = element_text(colour="black",size=8)) p.1<-p.1+scale_x_continuous(n.breaks = 6,limits=c(-50,50)) vertical.lines<-c(-35,-26,30) p.1<-p.1+geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+scale_y_continuous(n.breaks = 8, limits=c(NA,NA)) p.1<-p.1+scale_color_manual(values=c('Black','darkblue','darkgreen'),labels=c('A-A','A-C', 'A-G'))+xlab("Bulge kind")+ylab("Delta Editing") plot(p.1) ########################### Mismatch all combinations ############### # Filter Barcodes_csv for B2.mNG=="B2" Barcodes_csv<-Barcodes_csv[Barcodes_csv$B2.mNG=="B2",] Barcodes_csv$site<-as.character(Barcodes_csv$site) # Filter data for mismatch_all_comb in the sample all_combinations<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch_all_comb" & df_A_G_POS_editing$sample.ctrl=="sample" ,] # Filter data for mismatch1 in the sample target_sites<-unique(all_combinations$site) all_combinations_1<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1" & 
df_A_G_POS_editing$sample.ctrl=="sample" ,] all_combinations_1<-all_combinations_1[na.omit(match(target_sites,all_combinations_1$site)),] # Combine the two datasets all_combinations<-rbind(all_combinations,all_combinations_1) all_combinations$loc_nuc_mistmatch<-146-as.numeric(all_combinations$site) # Extract Perfect_ds Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:41] Perfect_ds$site<-0 # Calculate Delta editing per construct Delta_editing<- mclapply(rownames(all_combinations),mc.cores = 8, function(x){ Delta_editing_per_construct<-all_combinations[x,1:41]-Perfect_ds[1,1:41] Delta_editing_per_construct<-cbind(Delta_editing_per_construct,all_combinations[x,42:46]) return(Delta_editing_per_construct) } ) Delta_editing_all_comb_df<-do.call(rbind,Delta_editing) Delta_editing_all_comb_df$site<-as.numeric(Delta_editing_all_comb_df$site) Barcodes_csv_all_com<-Barcodes_csv[,c("barcode","seq")] # Merge Delta_editing_all_comb_df with Barcodes_csv_all_com Delta_editing_all_comb_dfmerged<-merge(Delta_editing_all_comb_df,Barcodes_csv_all_com, by.x="Barcode ID",by.y="barcode") # Define perf_ds_sequence and Variable_arm perf_ds_sequence<-c("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA") Variable_arm<-c("TTTTGGTTTTTCGAGACAGGGTTTCTCTGTGTAGCTCTGGCTGTCCTGGAACTCACTTTGTAGACCAGGCTGGCCTCCAACTCAGAAATCTGCCTGCCTCTGCCTCCCGAGTGCTGGGATTAAAGGCGTGTGCCACCACGCCCGGC") Delta_editing_all_comb_nuc_list<-mclapply(rownames(Delta_editing_all_comb_dfmerged), mc.cores = 8, function(w){ z<-as.data.frame(Delta_editing_all_comb_dfmerged[w,]) z$nucletide<-substr(perf_ds_sequence, z$loc_nuc_mistmatch, z$loc_nuc_mistmatch) z$ref_nuc_to_changed<-substr(Variable_arm, z$site+1, z$site+1) z$To_nuc_changed<-substr(z$seq, z$site+1, z$site+1) return(z) }) Delta_editing_all_comb_nuc_df<-do.call(rbind,Delta_editing_all_comb_nuc_list) # Define a function for defining the nucleotide surrounding 
each A site nuc_wind_function<-function(df_row,Inv_ds_sequence,window_size){ postion_mis<-df_row[,"loc_nuc_mistmatch"] window_pos<--window_size:window_size min_dist<-mclapply(window_pos, mc.cores = 8, function(z){ Nuc_defined<-substr(Inv_ds_sequence,z+postion_mis,postion_mis+z) Loc_MisM_nuc<-substr(Inv_ds_sequence,postion_mis,postion_mis) if(Nuc_defined=="A"){ tmp_df<-data.frame(Pos_Window=z,Loc_MisM=postion_mis,Nuc_mism=Loc_MisM_nuc,Nuc_B2_pos=postion_mis+z, Nuc=Nuc_defined, Editing= df_row[1,paste0("A",z+postion_mis)]) }else{ tmp_df<-data.frame(Pos_Window=z,Loc_MisM=postion_mis,Nuc_mism=Loc_MisM_nuc,Nuc_B2_pos=postion_mis+z, Nuc=Nuc_defined, Editing=NA) } return(tmp_df) }) df_row_dis_tmp<-do.call(rbind,min_dist) rownames(df_row_dis_tmp)<-df_row_dis_tmp$Pos_Window tmp_Editing<-as.data.frame(df_row_dis_tmp$Editing) rownames(tmp_Editing)<-df_row_dis_tmp$Pos_Window df_row_dis<-cbind(df_row,t(df_row_dis_tmp$Editing)) colnames(df_row_dis)<-c(colnames(df_row),df_row_dis_tmp$Pos_Window) return(df_row_dis) } # Apply nuc_wind_function to each row in Delta_editing_all_comb_nuc_df Mismatch_all_comb<-mclapply(rownames(Delta_editing_all_comb_nuc_df),mc.cores = 8, function(z){ df_rowz_input<-Delta_editing_all_comb_nuc_df[z,] x<-nuc_wind_function(df_rowz_input,perf_ds_sequence,4 ) }) Mismatch_all_comb_df<- do.call(rbind,Mismatch_all_comb) # Split Mismatch_all_comb_df by nucleotide Mism_oppo_of_comb<-split(Mismatch_all_comb_df,f=Mismatch_all_comb_df$nucletide) # Filter data for mismatch-carrying constucts (TtoC and TtoG) in the sample all_combinations1<-df_A_G_POS_editing[(df_A_G_POS_editing$desc=="TtoC"|df_A_G_POS_editing$desc=="TtoG") & df_A_G_POS_editing$sample.ctrl=="sample" ,] target_sites<-unique(all_combinations1$site) # Filter data for mismatch1-carrying constucts in the sample all_combinations_1<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1" & df_A_G_POS_editing$sample.ctrl=="sample" ,] 
all_combinations_1<-all_combinations_1[na.omit(match(target_sites,all_combinations_1$site)),] all_combinations<-rbind(all_combinations1,all_combinations_1) all_combinations$loc_nuc_mistmatch<-146-as.numeric(all_combinations$site) # Filter data for perfect_ds Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:41] Perfect_ds$site<-0 # Calculate Delta editing per construct Delta_editing<- mclapply(rownames(all_combinations),mc.cores = 8, function(x){ Delta_editing_per_construct<-all_combinations[x,1:41]-Perfect_ds[1,1:41] Delta_editing_per_construct<-cbind(Delta_editing_per_construct,all_combinations[x,42:46]) return(Delta_editing_per_construct) } ) Delta_editing_all_comb_df<-do.call(rbind,Delta_editing) Delta_editing_all_comb_df$site<-as.numeric(Delta_editing_all_comb_df$site) Barcodes_csv_all_com<-Barcodes_csv[,c("barcode","seq")] # Merge Delta_editing_all_comb_df with Barcodes_csv Delta_editing_all_comb_dfmerged<-merge(Delta_editing_all_comb_df,Barcodes_csv_all_com, by.x="Barcode ID",by.y="barcode") Delta_editing_all_comb_nuc_list<-mclapply(rownames(Delta_editing_all_comb_dfmerged), mc.cores = 8, function(w){ z<-as.data.frame(Delta_editing_all_comb_dfmerged[w,]) z$nucletide<-substr(perf_ds_sequence, z$loc_nuc_mistmatch, z$loc_nuc_mistmatch) z$ref_nuc_to_changed<-substr(Variable_arm, z$site+1, z$site+1) z$To_nuc_changed<-substr(z$seq, z$site+1, z$site+1) return(z) }) Delta_editing_all_comb_nuc_df<-do.call(rbind,Delta_editing_all_comb_nuc_list) df_row=Delta_editing_all_comb_nuc_df[1,] Inv_ds_sequence=perf_ds_sequence window_size=4 # Define a function for defining the nucleotide surrounding each A site nuc_wind_function<-function(df_row,Inv_ds_sequence,window_size){ postion_mis<-df_row[,"loc_nuc_mistmatch"] window_pos<--window_size:window_size min_dist<-mclapply(window_pos, mc.cores = 8, function(z){ Nuc_defined<-substr(Inv_ds_sequence,z+postion_mis,postion_mis+z) Loc_MisM_nuc<-substr(Inv_ds_sequence,postion_mis,postion_mis) if(Nuc_defined=="A"){ 
tmp_df<-data.frame(Pos_Window=z,Loc_MisM=postion_mis,Nuc_mism=Loc_MisM_nuc,Nuc_B2_pos=postion_mis+z, Nuc=Nuc_defined, Editing= df_row[1,paste0("A",z+postion_mis)]) }else{ tmp_df<-data.frame(Pos_Window=z,Loc_MisM=postion_mis,Nuc_mism=Loc_MisM_nuc,Nuc_B2_pos=postion_mis+z, Nuc=Nuc_defined, Editing=NA) } return(tmp_df) }) df_row_dis_tmp<-do.call(rbind,min_dist) rownames(df_row_dis_tmp)<-df_row_dis_tmp$Pos_Window tmp_Editing<-as.data.frame(df_row_dis_tmp$Editing) rownames(tmp_Editing)<-df_row_dis_tmp$Pos_Window df_row_dis<-cbind(df_row,t(df_row_dis_tmp$Editing)) colnames(df_row_dis)<-c(colnames(df_row),df_row_dis_tmp$Pos_Window) return(df_row_dis) } Mismatch_all_comb<-mclapply(rownames(Delta_editing_all_comb_nuc_df),mc.cores = 8, function(z){ df_rowz_input<-Delta_editing_all_comb_nuc_df[z,] x<-nuc_wind_function(df_rowz_input,perf_ds_sequence,4 ) }) Mismatch_all_comb_df<- do.call(rbind,Mismatch_all_comb) Mism_oppo_of_N<-split(Mismatch_all_comb_df,f=Mismatch_all_comb_df$nucletide) Mism_oppo_of_A<-Mism_oppo_of_N[[1]] # Split Mismatch_all_comb_df by nucletide tmp11<-split(Mism_oppo_of_A,f=Mism_oppo_of_A$To_nuc_changed) A_df_all_comb<- rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T)) rownames(A_df_all_comb)<-names(tmp11) colnames(A_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4") A_df_all_comb<-A_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")] Mism_oppo_of_T<-Mism_oppo_of_comb[["T"]] tmp11<-split(Mism_oppo_of_T,f=Mism_oppo_of_T$To_nuc_changed) T_df_all_comb<- rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T)) rownames(T_df_all_comb)<-names(tmp11) colnames(T_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4") T_df_all_comb<-T_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")] Mism_oppo_of_G<-Mism_oppo_of_comb[["G"]] tmp11<-split(Mism_oppo_of_G,f=Mism_oppo_of_G$To_nuc_changed) G_df_all_comb<- 
rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T)) rownames(G_df_all_comb)<-names(tmp11) colnames(G_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4") G_df_all_comb<-G_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")] Mism_oppo_of_C<-Mism_oppo_of_comb[["C"]] tmp11<-split(Mism_oppo_of_C,f=Mism_oppo_of_C$To_nuc_changed) C_df_all_comb<- rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T)) rownames(C_df_all_comb)<-names(tmp11) colnames(C_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4") C_df_all_comb<-C_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")] max_min_df_all_com<-rbind(A_df_all_comb,T_df_all_comb,G_df_all_comb,C_df_all_comb) if(abs(min(max_min_df_all_com,na.rm = T))>abs(max(max_min_df_all_com,na.rm = T))){ Range_value<-abs(min(max_min_df_all_com,na.rm = T)) }else{ Range_value<-abs(max(max_min_df_all_com,na.rm = T)) } # Create a vector of breaks for the color scale breaksList = seq(-Range_value,Range_value ,by=0.5) # Generate a pheatmap for Mismatch opposite of A p4<-pheatmap(A_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = "Mismatch opposite of A",color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), breaks = breaksList, fontsize = 6) # Generate a pheatmap for Mismatch opposite of T p1<-pheatmap(T_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = paste0(z,": Mismatch opposite of T"), color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), breaks = breaksList, fontsize = 6) # Generate a pheatmap for Mismatch opposite of G p2<-pheatmap(G_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = "Mismatch opposite of G",color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), breaks = breaksList, fontsize = 6) # Generate a pheatmap 
for Mismatch opposite of C p3<-pheatmap(C_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = "Mismatch opposite of C",color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), breaks = breaksList, fontsize = 6) Seq_preference_B2<-function(df_A_G_POS_editing,name_treatment){ #df_A_G_POS_editing<-ADARs_A_G_Posediting[[1]] perfect_ds_fw_strand<-("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA") Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$`Barcode ID`=="CTGTAGAC",1:32] Perfect_ds_t<-t(Perfect_ds) Perfect_ds_t<-as.data.frame(Perfect_ds_t) Perfect_ds_t$A_pos<-as.numeric(gsub("A","",rownames(Perfect_ds_t))) downstream_nuc<-mclapply(rownames(Perfect_ds_t),mc.cores = 8, function(z){ df<-as.data.frame(Perfect_ds_t[z,]) pos_A<-as.numeric(df$A_pos) nuc_down<-substr(perfect_ds_fw_strand, start =pos_A, stop = pos_A+1 ) df<-cbind(df,nuc_down) colnames(df)<-c("Editing","A_position","Nuc_downstream") return(df) }) downstream_nuc_df_ggplot<-do.call(rbind,downstream_nuc) downstream_nuc_df_ggplot$Pos<-"Downstream" upstream_nuc<-mclapply(rownames(Perfect_ds_t),mc.cores = 8, function(z){ df<-Perfect_ds_t[z,] pos_A<-as.numeric(df$A_pos) nuc_up<-substr(perfect_ds_fw_strand, start =pos_A-1, stop = pos_A ) df<-cbind(df,nuc_up) colnames(df)<-c("Editing","A_position","Nuc_downstream") return(df) }) upstream_nuc_df_ggplot<-do.call(rbind,upstream_nuc) upstream_nuc_df_ggplot$Pos<-"Upstream" upstream_nuc_df_ggplot$ADAR<-name_treatment downstream_nuc_df_ggplot$ADAR<-name_treatment Nuc_preference<- rbind(upstream_nuc_df_ggplot,downstream_nuc_df_ggplot) Nuc_preference$library<-"mNG" return(Nuc_preference) } ADAR_seq_preference<-Seq_preference_B2(df_A_G_POS_editing,z) output<-list(ADAR_seq_preference,df_A_G_POS_editing) return(output) } ####ADAR2 # Define file paths fastq_file1 <- 
"/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS_S68_R1_001.fastq.gz" fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS_S68_R2_001.fastq.gz" # Set the random seed for reproducibility and Load and sample the Fastq files set.seed(123L) f1 <- FastqSampler(fastq_file1, n = 1e7) set.seed(123L) f2 <- FastqSampler(fastq_file2, n = 1e7) B2_R1 <- yield(f1) B2_R2 <- yield(f2) ADAR2_Rep1_B2<-B2_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR2") # Define file paths fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R1_001.fastq.gz" fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R2_001.fastq.gz" # Set the random seed for reproducibility and Load and sample the Fastq files set.seed(123L) f1 <- FastqSampler(fastq_file1, n = 1e7) set.seed(123L) f2 <- FastqSampler(fastq_file2, n = 1e7) B2_R1 <- yield(f1) B2_R2 <- yield(f2) # Alternatively, if you prefer to work with the entire dataset rather than a sample, you can use the following lines to load the complete data files. 
# B2_R1 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R1_001.fastq.gz")
# B2_R2 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R2_001.fastq.gz")

# ADAR2, replicate 2 (B2_R1 / B2_R2 currently hold the replicate-2 reads).
ADAR2_Rep2_B2 <- B2_library_A_to_G_analysis(B2_R1, B2_R2, "ADAR2")

# ---- Helpers shared by the ADAR2 and ADAR1 replicate comparisons ----

# Reshape one replicate's editing table (second element of the list returned
# by B2_library_A_to_G_analysis) to long format: every column whose name
# starts with "A" becomes one row, and `ID_ggplot` (Barcode ID + sample.ctrl +
# A-column name) is added as the key used to pair the two replicates.
long_editing_table <- function(editing_df) {
  long_df <- pivot_longer(
    editing_df,
    cols = starts_with("A"),
    values_to = "AtoI_Editing",
    names_to = "A_postion"
  )
  long_df$ID_ggplot <- paste0(
    long_df$`Barcode ID`, long_df$sample.ctrl, long_df$A_postion
  )
  long_df
}

# Keep only the editing value and the pairing key, renaming the value column
# to `value_column`. Columns are selected by name rather than by position so
# the result does not depend on how many annotation columns precede them.
editing_by_id <- function(long_df, value_column) {
  keyed_df <- as.data.frame(long_df)[, c("AtoI_Editing", "ID_ggplot")]
  colnames(keyed_df) <- c(value_column, "ID_ggplot")
  keyed_df
}

# Scatter plot of replicate 1 vs replicate 2 editing levels with the identity
# line, annotated with the Pearson correlation from cor.test().
# Returns list(plot = <ggplot>, correlation = <htest>).
replicate_scatter <- function(replicates_df, x_column, y_column, adar_name) {
  correlation <- cor.test(replicates_df[[x_column]], replicates_df[[y_column]])

  # Only print the "< 2.2e-16" label when the test actually supports it;
  # otherwise report the computed p-value.
  p_value <- correlation$p.value
  if (!is.na(p_value) && p_value < 2.2e-16) {
    p_label <- "p-value < 2.2e-16 "
  } else {
    p_label <- paste("p-value =", format(p_value, digits = 3))
  }

  scatter <- ggplot(
    replicates_df,
    aes(x = .data[[x_column]], y = .data[[y_column]])
  ) +
    geom_point(size = 0.1, color = "deepskyblue4") +
    geom_abline(intercept = 0, slope = 1, color = "black") +
    xlim(0, 80) +
    ylim(0, 80) +
    ggtitle(adar_name) +
    theme_classic() +
    theme(
      strip.background = element_blank(),
      strip.text.x = element_blank(),
      legend.title = element_blank(),
      legend.position = "bottom",
      legend.text = element_text(size = 7),
      plot.title = element_text(size = 9, face = "bold"),
      panel.border = element_rect(color = "black", fill = NA, size = 0.5),
      axis.text.x = element_text(
        colour = "black", size = 7, angle = 90, vjust = 0.5, hjust = 1
      ),
      axis.title = element_text(face = "bold", size = 7),
      axis.text.y = element_text(colour = "black", size = 7)
    ) +
    annotate(geom = "text", x = 13, y = 70, label = p_label, color = "black") +
    annotate(
      geom = "text", x = 12, y = 60,
      label = paste("r=", round(correlation$estimate, 4)), color = "black"
    ) +
    xlab(paste("Overexpressed", adar_name, "rep1")) +
    ylab(paste("Overexpressed", adar_name, "rep2"))

  list(plot = scatter, correlation = correlation)
}

# ---- ADAR2: replicate 1 vs replicate 2 ----
ADAR2_1_ggplot <- long_editing_table(ADAR2_Rep1_B2[[2]])
ADAR2_1_ggplot_df <- editing_by_id(ADAR2_1_ggplot, "ADAR2p_B2_Rep1_AtoI_Editing")

ADAR2_2_ggplot <- long_editing_table(ADAR2_Rep2_B2[[2]])
ADAR2_2_ggplot_df <- editing_by_id(ADAR2_2_ggplot, "ADAR2p_B2_Rep2_AtoI_Editing")

# Inner join: only barcode/position keys present in both replicates are kept.
ADAR2_plasmid_B2_repetitions <- merge(
  ADAR2_1_ggplot_df, ADAR2_2_ggplot_df,
  by.x = "ID_ggplot", by.y = "ID_ggplot"
)

ADAR2_replicate_fit <- replicate_scatter(
  ADAR2_plasmid_B2_repetitions,
  "ADAR2p_B2_Rep1_AtoI_Editing", "ADAR2p_B2_Rep2_AtoI_Editing",
  "ADAR2"
)
Correlation_Rep <- ADAR2_replicate_fit$correlation
p3 <- ADAR2_replicate_fit$plot
plot(p3)

#### ADAR1
# Replicate 1: define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/658_ADARs_B2_A_LRS/658_ADARs_B2_A_LRS_S82_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/658_ADARs_B2_A_LRS/658_ADARs_B2_A_LRS_S82_R2_001.fastq.gz"

# Seed, construct and draw each mate back to back so that R1 and R2 are
# sampled with the same random stream (the draw happens in yield(), so seeding
# only the constructors would leave the second yield() on a different RNG
# state than the first).
set.seed(123L)
f1 <- FastqSampler(fastq_file1, n = 1e7)
B2_R1 <- yield(f1)
set.seed(123L)
f2 <- FastqSampler(fastq_file2, n = 1e7)
B2_R2 <- yield(f2)

# B2_library_A_to_G_analysis() reads the globals `B2_R1` / `B2_R2`, so they
# must hold the replicate being analysed at the time of the call.
ADAR1_Rep1_B2 <- B2_library_A_to_G_analysis(B2_R1, B2_R2, "ADAR1")

# Replicate 2: define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS_S70_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS_S70_R2_001.fastq.gz"

set.seed(123L)
f1 <- FastqSampler(fastq_file1, n = 1e7)
B2_R1 <- yield(f1)
set.seed(123L)
f2 <- FastqSampler(fastq_file2, n = 1e7)
B2_R2 <- yield(f2)

ADAR1_Rep2_B2 <- B2_library_A_to_G_analysis(B2_R1, B2_R2, "ADAR1")

# ---- ADAR1: replicate 1 vs replicate 2 ----
ADAR1_1_ggplot <- long_editing_table(ADAR1_Rep1_B2[[2]])
ADAR1_1_ggplot_df <- editing_by_id(ADAR1_1_ggplot, "ADAR1p_B2_Rep1_AtoI_Editing")

# The replicate-2 table must be built from ADAR1_Rep2_B2; building it from
# ADAR1_Rep1_B2 would correlate replicate 1 with itself.
ADAR1_2_ggplot <- long_editing_table(ADAR1_Rep2_B2[[2]])
ADAR1_2_ggplot_df <- editing_by_id(ADAR1_2_ggplot, "ADAR1p_B2_Rep2_AtoI_Editing")

ADAR1_plasmid_B2_repetitions <- merge(
  ADAR1_1_ggplot_df, ADAR1_2_ggplot_df,
  by.x = "ID_ggplot", by.y = "ID_ggplot"
)

ADAR1_replicate_fit <- replicate_scatter(
  ADAR1_plasmid_B2_repetitions,
  "ADAR1p_B2_Rep1_AtoI_Editing", "ADAR1p_B2_Rep2_AtoI_Editing",
  "ADAR1"
)
Correlation_Rep <- ADAR1_replicate_fit$correlation
p3 <- ADAR1_replicate_fit$plot
plot(p3)
############################### mNG library ################################### mNG_library_A_to_G_analysis<-function(B2_R1_readFastaq,B2_R2_readFastaq,z){ #Sequences in Read1.fastq are filtered and processed to extract the "B2_sequence," including filtering sequences that start with "GCCGGG" and counting the number of properly beginning reads. mNG_R1_sequence<-as.data.frame(sread(B2_R1)) mNG_R1_sequence$number_read<-1:nrow(mNG_R1_sequence) # Calculate the total number of reads in R1 Total_Number_of_Reads_R1<-nrow(mNG_R1_sequence) # Filter sequences containing "TTTGCCA" and Extract those sequences mNG_R1_sequence<-mNG_R1_sequence[grepl("TTTGCCA",mNG_R1_sequence$x),] mNG_R1_sequence$mNG_sequence<-gsub(".*TTTGCCA","" ,mNG_R1_sequence$x) # Calculate the number of reads beginning properly number_reads_beggining_properly<- nrow(mNG_R1_sequence) colnames(mNG_R1_sequence)<-c("non_processed_R1","number_read","mNG_constructR1") mNG_R1_sequence<-mNG_R1_sequence[nchar(mNG_R1_sequence$mNG_constructR1)>=81,] # Truncate 'Read 1' to 81 characters mNG_R1_sequence$mNG_constructR1<-substr(mNG_R1_sequence$mNG_constructR1, start = 1,stop = 81) # Calculate the percentage of reads beginning properly Percentage_reads_beggining_properly<- (number_reads_beggining_properly/ Total_Number_of_Reads_R1)*100 # Reads2 # Read Read2.fastq file and create a data frame mNG_R2_sequence<-as.data.frame(sread(B2_R2)) mNG_R2_sequence$number_read<-1:nrow(mNG_R2_sequence) Total_Number_of_Reads_R2<-nrow(mNG_R2_sequence) # Create a reverse complement of B2_R2 data mNG_R2_sequence_reverse_comp<-as.data.frame(sread(reverseComplement(B2_R2))) mNG_R2_sequence_reverse_comp$number_read<-1:nrow(mNG_R2_sequence_reverse_comp) # Read barcodes data TWIST library spreadsheet obtained from Uzonyi 2021 #This table can be found in Uzonyi et al.(2021): Table S2. Sequences of the B2 and mNG oligo library pools, related to STAR Methods. 
Barcodes_csv<-read.csv("/DATA_HELA_ADAR2_over_ADAR1_knockdown_B2_libr/Table_S2_Twist_library_sequence_plans.csv", sep = ",", header = TRUE) # Filter for only mNG construct's sequences Barcodes_csv<-Barcodes_csv[Barcodes_csv$B2.mNG=="mNG",] Barcodes_csv$site<-as.character(Barcodes_csv$site) # Process mNG_R2_sequence data based on barcode location per construct mNG_R2_sequence<-mNG_R2_sequence_reverse_comp[grepl("TCCCTCA",mNG_R2_sequence_reverse_comp$x),] mNG_R2_sequence<-mNG_R2_sequence[grepl("TTCGAA",mNG_R2_sequence$x),] mNG_R2_sequence$barcode<-gsub(".*TTCGAA","",mNG_R2_sequence$x) mNG_R2_sequence$barcode<-gsub("\\TCCCTC.*","",mNG_R2_sequence$barcode) mNG_R2_sequence<-mNG_R2_sequence[grep("ACTAGTAT",mNG_R2_sequence$x),] mNG_R2_sequence$mNG_constructR2<-gsub("\\ACTAGTAT.*","",mNG_R2_sequence$x) # Truncate 'Read 2' mNG_R2_sequence<-mNG_R2_sequence[nchar(mNG_R2_sequence$mNG_constructR2)>=64,] mNG_R2_sequence$mNG_constructR2<- substr(mNG_R2_sequence$mNG_constructR2, start =nchar(mNG_R2_sequence$mNG_constructR2)-64,stop = nchar(mNG_R2_sequence$mNG_constructR2)) mNG_R2_sequence<-mNG_R2_sequence[nchar(mNG_R2_sequence$barcode)==8,] # Merge mNG_R2_sequence with Barcodes_csv mNG_R2_sequence_Barcode_ID<-merge(mNG_R2_sequence,Barcodes_csv,by.x="barcode", by.y="barcode" ) colnames(mNG_R2_sequence_Barcode_ID)<-c("barcode","No_processed_Read2", "number_read","mNG_constructR2", "ID","Buffer","F","BstBI","loop","seq","AscI", "R","desc","site","B2.mNG","sample.ctrl","Total_length","Buffer_length") Number_of_Barcodes<-length(unique(Barcodes_csv$barcode)) Reads_properly_barcodes<-nrow(mNG_R2_sequence_Barcode_ID) Reads_properly_barcodes_percent<-(Reads_properly_barcodes/Total_Number_of_Reads_R2)*100 mNG_df_R1_R2<-merge(mNG_R1_sequence,mNG_R2_sequence_Barcode_ID, by.x="number_read", by.y="number_read", all=F) mNG_df_R1_R2$Whole_Construct<- paste0(mNG_df_R1_R2$mNG_constructR1,mNG_df_R1_R2$mNG_constructR2) mNG_df_R1_R2<-mNG_df_R1_R2[nchar(mNG_df_R1_R2$Whole_Construct)==146,] # Define the 
perfect DNA strand sequence perfect_ds_fw_strand<-("AGCCAATGGCGGCTAACTATCTGAAGAACCAGCCGATGTACGTGTTCCGTAAGACGGAGCTCAAGCACTCCAAGACCGAGCTCAACTTCAAGGAGTGGCAAAAGGCCTTTACCGATGTGATGGGCATGGACGAGCTGTACAAGTAA") # Generate row names for mNG_df_R1_R2 rownames(mNG_df_R1_R2)<-paste0("R",1:nrow(mNG_df_R1_R2)) design_seq_const<-perfect_ds_fw_strand # Extract A_G positions from the perfect DNA strand A_G_positions<-unlist(gregexpr('A', design_seq_const)) Nuc_matrix<-do.call(rbind,str_split(mNG_df_R1_R2$Whole_Construct,"")) Nuc_matrix<-as.data.frame(Nuc_matrix) rownames(Nuc_matrix)<-rownames(mNG_df_R1_R2) AG_pos_analized<-Nuc_matrix[,paste0("V",A_G_positions)] rownames(AG_pos_analized)<-rownames(mNG_df_R1_R2) AG_pos_analized$Combined_cols <- do.call(paste, c(AG_pos_analized[1:ncol(AG_pos_analized)], sep = "")) # Filter A positions that do not contain 'C' or 'T' in Combined_cols AG_pos_analized<-AG_pos_analized[!grepl("C|T",AG_pos_analized$Combined_cols),] mNG_df_R1_R2<-mNG_df_R1_R2[rownames(AG_pos_analized),] # Calculate the number of reads after filtering Number_of_Reads_after_filter_out<- nrow(mNG_df_R1_R2) QC_control_df1<-data.frame(Total_Number_of_Reads_R1, number_reads_beggining_properly, Percentage_reads_beggining_properly, Total_Number_of_Reads_R2, Reads_properly_barcodes, Reads_properly_barcodes_percent) Quantiles_number_read_per_barcode<-quantile(as.numeric(table(mNG_df_R1_R2$barcode)), c(0.01,0.1,0.9,0.99)) QC_control_df<-cbind(QC_control_df1, t(as.data.frame(Quantiles_number_read_per_barcode))) colnames(QC_control_df)<-c("Total Number of Reads_R1","Number of reads beginning properly", "% of reads beginning properly","Number of Reads_R2","Number of Reads beginning properly barcodes_R2","% of Reads with proper barcodes_R2","1%","10%","90%","99%") # Create a plot of the ECDF for the number of reads in each barcode plot(ecdf(log10(as.numeric(table(mNG_df_R1_R2$barcode)))), xlab="Log10 of number of reads in each barcode", pch=20, main="ECD: Log10 N° of reads in each 
barcode",yaxt="n" ) axis(2, at = seq(0, 1, by = 0.1), las=2) abline(v=log10(6), col="Red") # Subset the original dataframe to keep only the selected columns mNG_df_R1_R2<-mNG_df_R1_R2[,1:21] mNG_df_R1_R2<-mNG_df_R1_R2[,c("number_read", "Whole_Construct","barcode","desc", "site","B2.mNG","sample.ctrl")] # Split the dataframe into a list by the 'barcode' column mNG_df_R1_R2_list<-split(mNG_df_R1_R2, f=mNG_df_R1_R2$barcode) ########## Boxplots representing the distribution of numbers of editing events in the single mNG/B2 perfect double-stranded molecules ########################## # Filter and extract relevant mNG Construct perfec_ds_filtered_mNG_df_R1_R2_list<-strsplit(mNG_df_R1_R2_list[["AAGGCCAT"]]$Whole_Construct,"") perfec_ds_filtered_mNG_df_R1_R2_list<-mclapply(perfec_ds_filtered_mNG_df_R1_R2_list, mc.cores = 8, function(x){ x[A_G_positions]}) perfec_ds_filtered_mNG_df_R1_R2_df_nuc<-as.data.frame(do.call(rbind,perfec_ds_filtered_mNG_df_R1_R2_list)) perfect_ds_fw_strand_separated<-unlist(strsplit(perfect_ds_fw_strand,"")) perfect_ds_fw_strand_separated<-perfect_ds_fw_strand_separated[A_G_positions] colnames(perfec_ds_filtered_mNG_df_R1_R2_df_nuc)<-perfect_ds_fw_strand_separated rownames(perfec_ds_filtered_mNG_df_R1_R2_df_nuc)<-1:nrow(perfec_ds_filtered_mNG_df_R1_R2_df_nuc) # Calculate the number of edits per molecule Number_of_edits_per_molecule<-mclapply(rownames(perfec_ds_filtered_mNG_df_R1_R2_df_nuc), mc.cores = 8, function(z){ Number_of_edits<-as.numeric(sum(perfec_ds_filtered_mNG_df_R1_R2_df_nuc[z,]=="G")) Number_of_edits_df<-c(perfec_ds_filtered_mNG_df_R1_R2_df_nuc[z,],Number_of_edits,paste0(perfec_ds_filtered_mNG_df_R1_R2_df_nuc[z,],collapse = "") ) Number_of_edits_df<-as.data.frame(Number_of_edits_df) colnames(Number_of_edits_df)<-1:ncol(Number_of_edits_df) return(Number_of_edits_df) }) Number_of_edits_per_molecule_df<-do.call(rbind,Number_of_edits_per_molecule) 
colnames(Number_of_edits_per_molecule_df)<-c(perfect_ds_fw_strand_separated,"number_edits","Trimmed_Read1_sequence") Number_of_edits_per_molecule_df$ADAR<-z # Create a boxplot using ggplot2 Number_of_edits_per_molecule_df$color_ADAR<-gsub("\\Rep.*","",Number_of_edits_per_molecule_df$ADAR) p<-ggplot(Number_of_edits_per_molecule_df[,45:48], aes(x=ADAR, y=number_edits,fill=color_ADAR))+geom_boxplot(outlier.shape = NA)+ scale_fill_brewer(palette="PuBu") p<-p+xlab("")+ylab("# edits per molecule")+theme_classic()+ggtitle("mNG") p<-p +theme(strip.background = element_blank(), strip.text.x = element_blank(), legend.title=element_blank(),legend.position="none",legend.text = element_text(size=11),plot.title = element_text(size=14,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=11,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=11)) plot(p) ########### A-to-I quantification per barcode ################# # Calculate the number of analyzed barcodes Number_of_analyzed_barcodes<-length(mNG_df_R1_R2_list) # Create consensus matrices per barcode concensus_matrix_per_barcode<-mclapply(mNG_df_R1_R2_list,mc.cores = 12, function(w){ Sequence_barcode_StringSet<-DNAStringSet(w$Whole_Construct,use.names=TRUE) consensus_matrix_barcode<-as.data.frame(t(consensusMatrix(Sequence_barcode_StringSet)))[,c(1:4,15:16)] }) # Calculate A-to-G rate per barcode concensus_matrix_per_barcode_A_to_G<-mclapply(concensus_matrix_per_barcode,mc.cores = 12, function(y){ y$position<-1:nrow(y) y$A_to_G_rate<- (y$G/(y$A+y$G))*100 return(y) }) # Extract A-to-G rate at target positions concensus_matrix_per_barcode_A_to_G_target_pos<- mclapply(names(concensus_matrix_per_barcode_A_to_G),mc.cores = 12,function(z){ df_A_POS<- as.data.frame(t(as.data.frame(concensus_matrix_per_barcode_A_to_G[[z]][A_G_positions,"A_to_G_rate"]))) 
df_A_POS$Barcode_names<-names(concensus_matrix_per_barcode_A_to_G[z]) colnames(df_A_POS)<- c(paste0("A",A_G_positions),"Barcode ID" ) rownames(df_A_POS)<-names(concensus_matrix_per_barcode_A_to_G[z]) return(df_A_POS) }) df_A_G_POS_editing<-do.call(rbind,concensus_matrix_per_barcode_A_to_G_target_pos) # Create a heatmap which plot the editing levels across all constructs #pheatmap(df_A_G_POS_editing[,1:44], cluster_rows=F, cluster_cols=F,show_rownames=F, main = "Considering all barcodes") # Match barcode information matched_index<-match(df_A_G_POS_editing$`Barcode ID`,Barcodes_csv$barcode) df_A_G_POS_editing<- cbind(df_A_G_POS_editing,Barcodes_csv[matched_index,c("desc","sample.ctrl","site")]) # Correlation of editing levels in mNG constructs with different barcode sequences ADAR2_1_barcodes<-pivot_longer(df_A_G_POS_editing, cols = starts_with("A"), values_to = "AtoI_Editing", names_to= "A_postion" ) ADAR2_1_control_barcodes<-ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl=="barcode_ctrl",] ADAR2_1_control_barcodes$merging_column<-paste0(ADAR2_1_control_barcodes$desc,ADAR2_1_control_barcodes$site,ADAR2_1_control_barcodes$A_postion) ADAR2_1_sample_barcodes<-ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl=="sample",] ADAR2_1_sample_barcodes$merging_column<-paste0(ADAR2_1_sample_barcodes$desc,ADAR2_1_sample_barcodes$site,ADAR2_1_sample_barcodes$A_postion) ADAR2_1_sample_control_barcodes<-merge(ADAR2_1_sample_barcodes,ADAR2_1_control_barcodes,by.x="merging_column",by.y="merging_column") # Create a scatter plot p4.1<-ggplot(ADAR2_1_sample_control_barcodes, aes(x=AtoI_Editing.x,y=AtoI_Editing.y))+geom_point(size = 0.1, color="deepskyblue4")+ geom_abline(intercept = 0, slope = 1, color="black")+xlim(0,75)+ylim(0,75)+ ggtitle(z)+theme_linedraw() p4.1<-p4.1+theme_classic()+theme(strip.background = element_blank(), strip.text.x = element_blank(), legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=7),plot.title = element_text(size=9,face = 
"bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7),axis.title = element_text(face = "bold",size = 7),axis.text.y = element_text(colour="black",size=7)) cor_test_mNG<-cor.test(ADAR2_1_sample_control_barcodes$AtoI_Editing.x,ADAR2_1_sample_control_barcodes$AtoI_Editing.y) p4.1<-p4.1+ annotate(geom="text", x=20, y=60, label=paste(paste("p-value",cor_test_mNG$p.value)), color="black") p5<-p4.1+ annotate(geom="text", x=13, y=50, label=paste("r=",round(cor_test_mNG$estimate,3)), color="black")+xlab("Sample barcodes")+ylab("Control barcodes") plot(p5) ############ mNG random disruption ##################### # Filter and extract relevant mNG Construct Random_disruption_df<-df_A_G_POS_editing[ df_A_G_POS_editing$desc=="mNG_random" & df_A_G_POS_editing$sample.ctrl=="sample" ,] Random_perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",1:48] Random_perfect_ds$site<-0 No_ds_structure<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_ctrl_repeat",1:48] No_ds_structure$site<-1 Random_disruption_df<-rbind(Random_perfect_ds,Random_disruption_df,No_ds_structure) # Calculate average editing Random_disruption_df$average_editing<-rowMeans(Random_disruption_df[,1:44]) # Sort and format data Random_disruption_df<-Random_disruption_df[order(as.numeric(Random_disruption_df$site)),] rownames(Random_disruption_df)<-paste(Random_disruption_df$site) Random_disruption_df$site<- as.numeric(Random_disruption_df$site)*100 Random_disruption_df$Treatment<-z Random_disruption_df$site<-round(Random_disruption_df$site,2) # Create a line plot Random_ggplot<-ggplot(Random_disruption_df, aes(x=site,average_editing,y=average_editing, color=Treatment))+geom_line(size=0.5)+theme_classic(base_size = 12)+xlab("% Random Disruption")+ylab("Mean editing %")+scale_x_continuous(expand = c(0, 0),n.breaks = 10)+scale_y_continuous(expand = c(0, 0),n.breaks = 7, limits = c(0,max(Random_disruption_df$average_editing 
)+5))+theme(panel.border = element_rect(color = "black",fill = NA,size = 1),legend.position = c(0.7, 0.6),legend.title=element_blank(),axis.title = element_text(face="bold"), title =element_text(face="bold") )+ggtitle("") Random_ggplot<-Random_ggplot+ scale_color_manual(values=c("#0000FF")) plot(Random_ggplot) ############ Heatmap of a 1nt-,2nt-,3nt- and 4nt-mismatch running from 5’ to 3’ throughout the double-stranded RNA ##################### # Filter data for mismatch-carrying constructs and perfect ds Disruption_df<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1"|df_A_G_POS_editing$desc=="mismatch3",] Disruption_df<-Disruption_df[Disruption_df$sample.ctrl=="sample",] Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",] # Calculate Delta editing for each construct Delta_editing<- mclapply(rownames(Disruption_df),mc.cores = 8, function(x){ Delta_editing_per_construct<-(Disruption_df[x,1:44])-(Perfect_ds[1,1:44]) Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_df[x,45:48]) return(Delta_editing_per_construct)}) Delta_editing_df<-do.call(rbind,Delta_editing) Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ] Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc ) # Filter mismatch data for "sample" Delta_editing_df_mis_1_3<-Delta_editing_df[Delta_editing_df$sample.ctrl=="sample",] # Split data into separate lists by mismatch kind mismatches_ADAR_list<-split(Delta_editing_df_mis_1_3,f=Delta_editing_df_mis_1_3$desc) # Create pheatmap plots for each mismatch pheatmap_plots_mismatches<-mclapply(names(mismatches_ADAR_list), mc.cores = 8, function(z){ Delta_editing_df_mistmatch3<-mismatches_ADAR_list[[z]][order(as.numeric(mismatches_ADAR_list[[z]]$site), decreasing = T),] rownames(Delta_editing_df_mistmatch3)<-Delta_editing_df_mistmatch3$site Delta_editing_df_mistmatch3_matrix<-as.matrix(Delta_editing_df_mistmatch3[,1:41]) 
Delta_editing_df_mistmatch3_matrix_scales<-scale(Delta_editing_df_mistmatch3_matrix) Delta_editing_df_mistmatch3_matrix_scales<-as.data.frame(Delta_editing_df_mistmatch3_matrix_scales) list_rows <- split(Delta_editing_df_mistmatch3_matrix_scales,seq(nrow(Delta_editing_df_mistmatch3_matrix_scales))) list_rows_capped_zscore<-lapply(list_rows, function(z){ z[which(z<=-4)]<- -4 z[which(z>=4)]<- 4 return(z)}) Delta_editing_df_mistmatch3_matrix_scales_capped<-do.call(rbind,list_rows_capped_zscore) colnames(Delta_editing_df_mistmatch3_matrix_scales_capped)<-colnames(Delta_editing_df_mistmatch3_matrix_scales) plot_pheatmap<-pheatmap(Delta_editing_df_mistmatch3_matrix_scales_capped,cluster_rows=F, cluster_cols=F,show_rownames=F, fontsize = 11, main = z ) return(plot_pheatmap[[4]]) }) grid.arrange(grobs=pheatmap_plots_mismatches, ncol=2) ########################## ADAR1- and ADAR2-mediated editing offsets based on subsets of 3-nucleotide mismatch running throughout the mNG and B2 sequences. ################ # Filter data for mismatch1 and mismatch3, and for the "sample" Disruption_df<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1" | df_A_G_POS_editing$desc=="mismatch3",] Disruption_df<-Disruption_df[Disruption_df$sample.ctrl=="sample",] # Filter data for the perfect ds Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",] # Calculate Delta editing for each construct Delta_editing<- mclapply(rownames(Disruption_df),mc.cores = 8, function(x){ Delta_editing_per_construct<-(Disruption_df[x,1:44])-(Perfect_ds[1,1:44]) Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_df[x,45:48]) return(Delta_editing_per_construct)}) Delta_editing_df<-do.call(rbind,Delta_editing) Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ] Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc ) Delta_editing_df_mis_1_3<-Delta_editing_df # Split the data by mismatch 
kind to process separately Delta_editing_df_mis_1_3_list<-split(Delta_editing_df_mis_1_3,f=Delta_editing_df_mis_1_3$desc) # Process and filter data for each type of mismatch Delta_editing_df_mis_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_list, mc.cores = 8, function(y){ max_vector<-sapply(abs(y[1:44]),max) Col_consider_downstream<-names(max_vector[max_vector>1]) Delta_editing_df_mis_1_3<-y[,c(Col_consider_downstream,"Barcode ID","desc","sample.ctrl","site","Mistmatch_names")] Delta_editing_df_mis_1_3<-Delta_editing_df_mis_1_3[!Delta_editing_df_mis_1_3$sample.ctrl=="barcode_ctrl",] }) # Calculate distance from the disruption for each construct Delta_editing_mistmatch_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_df_list,mc.cores=8, function(z){ Delta_editing_list<-split(z, f=z$Mistmatch_names) Delta_editing_mistmatch<-mclapply(Delta_editing_list, mc.cores = 8, function(x){ Editing_levels<-as.numeric(x[1,1:(ncol(x)-5)]) Distance_from_disruption<- (146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:(ncol(x)-5)]))))*(-1) Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels))) Distance_from_disruption_df$A_position<- as.numeric(gsub("A","",colnames(x[,1:(ncol(x)-5)]))) return(Distance_from_disruption_df) }) Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch) }) # Split the data by mismatch kind to process separately and arrange the distance for mismatch3 Delta_editing_mistmatch_1<-Delta_editing_mistmatch_1_3_df_list[[1]] Delta_editing_mistmatch_3<-Delta_editing_mistmatch_1_3_df_list[[2]] Delta_editing_mistmatch_3$Distance_from_mistmatch<-Delta_editing_mistmatch_3$Distance_from_mistmatch+1 Delta_editing_mistmatch_df<-rbind(Delta_editing_mistmatch_1,Delta_editing_mistmatch_3) # Define labels for the mismatch types mismatch.labs <- c("Mismatch 1 nucleotide", "Mismatch 3 nucleotides") names(mismatch.labs) <- 
c("mismatch1","mismatch3") # Split the data by mismatch kind and apply LOESS smoothing Delta_editing_mistmatch_df<-Delta_editing_mistmatch_df[!is.na(Delta_editing_mistmatch_df$Editing_level_A_to_I),] Delta_editing_mistmatch_df_list<-split(Delta_editing_mistmatch_df,f=Delta_editing_mistmatch_df$Mismatch_kind) Delta_editing_mistmatch_df_list_LOESS<- mclapply(Delta_editing_mistmatch_df_list, mc.cores = 8, function(z){ loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=z, span=0.05) z$smoothed5 <- predict(loessMod50) return(z) }) Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch_df_list_LOESS) ADARs_periodicity_mis_kinds_ADAR_list<-split(Delta_editing_mistmatch_df, f=Delta_editing_mistmatch_df$Mismatch_kind) # Calculate quartiles per distance ADARs_periodicity_mis_kindsc_ADAR_variation<-mclapply(ADARs_periodicity_mis_kinds_ADAR_list, mc.cores = 8, function(z){ ADARs_dist_mis_list<-split(z, f=z$Distance_from_mistmatch) ADARs_dist_mis_quartiles_list<-mclapply(ADARs_dist_mis_list, mc.cores = 8, function(x){ df<-as.data.frame(x) quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1)))) colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1)) df<-cbind(df,quartiles_df) return(df) }) ADARs_dist_mis_quartiles_list_df<-do.call(rbind,ADARs_dist_mis_quartiles_list) return(ADARs_dist_mis_quartiles_list_df) }) ADARs_periodicity_mis_kindsc_ADAR_variation_df<-do.call(rbind,ADARs_periodicity_mis_kindsc_ADAR_variation) # Create a summary plot for visualization per mismatch kind vertical.lines<-c(-26,-35) p.1 <- ggplot(ADARs_periodicity_mis_kindsc_ADAR_variation_df, aes(Distance_from_mistmatch, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.50,colour= "Darkblue") p.1 <-p.1 +xlab("Distance from mistmatch")+ylab("Delta editing")+theme_classic()#+scale_color_manual(values=c("Black","Blue")) p.1 <-p.1 +theme(strip.background = element_blank(), 
legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=11),plot.title = element_text(size=14,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=11,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=11)) p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle("")#+facet_grid(~ADAR) p.1<-p.1+scale_x_continuous(n.breaks = 8)+facet_wrap(~Mismatch_kind, ncol = 4)+scale_y_continuous(n.breaks = 6, limits=c(-40,40)) plot(p.1) # Create a summary plot for mismatch3 ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df<-ADARs_periodicity_mis_kindsc_ADAR_variation_df[ADARs_periodicity_mis_kindsc_ADAR_variation_df$Mismatch_kind=="mismatch3",] p.1 <- ggplot(ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df, aes(Distance_from_mistmatch, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.40,colour= "Darkblue") p.1 <-p.1 +xlab("Distance from the mismatch")+ylab("Delta Editing")+theme_classic()#+scale_color_manual(values=c("Black","Blue")) p.1 <-p.1 +theme(strip.background = element_blank(), strip.text.x = element_blank(), legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=6),plot.title = element_text(size=7,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=7)) p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle(paste("mNG: 3-nuc mismatch"))#+facet_grid(~ADAR) p.1<-p.1+scale_x_continuous(n.breaks = 8, limits = c(-50,50))+scale_y_continuous(n.breaks = 6, 
limits=c(min(ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df$Q_0.25-1),max(ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df$Q_0.75+5))) plot(p.1) ############## Size of mismatch #################### # Filter the data for specific mismatch lengths ( For ADAR1: -35 or -35.5 and for ADAR2: -26 OR 26.5) ADAR_B2_1_to_3_nuc_mis<-ADARs_periodicity_mis_kindsc_ADAR_variation_df[ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35|ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35.5,] # Create a boxplot plot_Boxplot_ADAR2<-ggplot(ADAR_B2_1_to_3_nuc_mis, aes(x=Mismatch_kind, y=Editing_level_A_to_I))+geom_boxplot()+ theme(legend.position="none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +xlab("Mismatch size")+ylab("Delta Editing")+theme_classic() plot_Boxplot_ADAR2<-plot_Boxplot_ADAR2+theme(strip.background = element_blank(),strip.text.y = element_blank(),strip.text.x = element_text(size = 8), legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=12),plot.title = element_text(size=12,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=12),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=12)) plot(plot_Boxplot_ADAR2) ############ T bulges PERIODICITY ###################### # Define the bulge kinds and filter the data for bulges-carrying constructs Disruption_bulges_df<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="bulge-T" | df_A_G_POS_editing$desc=="bulge-TTC" | df_A_G_POS_editing$desc=="bulge-TTCTT" | df_A_G_POS_editing$desc=="bulge-TTCTTCT",] Disruption_bulges_df<-Disruption_bulges_df[Disruption_bulges_df$sample.ctrl=="sample",] Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",] Delta_editing<- mclapply(rownames(Disruption_bulges_df),mc.cores = 8, function(x){ 
Delta_editing_per_construct<-Disruption_bulges_df[x,1:44]-Perfect_ds[1,1:44] Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_bulges_df[x,45:48]) return(Delta_editing_per_construct) } ) # Calculate Delta editing per construct Delta_editing_df<-do.call(rbind,Delta_editing) Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ] Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc ) Delta_editing_list<-split(Delta_editing_df, f=Delta_editing_df$Mistmatch_names) # Calculate distance from bulge and adjust Delta_editing_bulge<-mclapply(Delta_editing_list, mc.cores = 8, function(x){ Editing_levels<-as.numeric(x[1,1:44]) Distance_from_disruption<-(146-as.numeric(x[1,"site"])- as.numeric(gsub("A","",names(x[,1:44])))+0.5)*-1 Distance_from_Disruption_bulges_df<-data.frame(Distance_from_bulge=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Bulge.kinds=rep(x$desc, length(Editing_levels))) }) Delta_editing_bulge_df<-do.call(rbind,Delta_editing_bulge) bulge.labs <- c("Bulge T", "Bulge TTC", "Bulge TTCTT","Bulge TTCTTCT") names(bulge.labs) <- c("bulge-T", "bulge-TTC", "bulge-TTCTT","bulge-TTCTTCT") Delta_editing_bulge_df<-Delta_editing_bulge_df[!is.na(Delta_editing_bulge_df$Editing_level_A_to_I),] # Apply LOESS smoothing Loes_bulge_kind<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds)## Loes_bulge_kind_list<-mclapply(Loes_bulge_kind, mc.cores = 8, function(x){## loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_bulge, data=x, span=0.11)### x$smoothed5 <- predict(loessMod50)## return(x)}) Delta_editing_bulge_df<-do.call(rbind,Loes_bulge_kind_list)### # Calculate quartiles ggplot_quartiles_T_bulges_list<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds) Pyrimidine_bulges_mNG_quartiles<-mclapply(ggplot_quartiles_T_bulges_list, mc.cores = 8, function(x){ ADAR1_3nuc_mis<-x ADAR1_3nuc_mis_ADAR1<-split(ADAR1_3nuc_mis, 
f=ADAR1_3nuc_mis$Distance_from_bulge)
    # Within one bulge kind: for every distance bin, append the
    # 0/25/50/75/100 % quantiles of delta editing to each row of that bin.
    ADAR1_3nuc_mis_ADAR2_list<-mclapply(ADAR1_3nuc_mis_ADAR1, mc.cores = 8, function(x){
      df<-as.data.frame(x)
      quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
      colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
      df<-cbind(df,quartiles_df)
      return(df)
    })
    ADAR1_3nuc_mis_ADAR1_list_df<-do.call(rbind,ADAR1_3nuc_mis_ADAR2_list)
  })
  Pyrimidine_bulges_mNG_quartiles_df<-do.call(rbind,Pyrimidine_bulges_mNG_quartiles)
  Pyrimidine_bulges_mNG_quartiles_df$ADAR<-z
  vertical.lines<-c(-35,-26,30)
  color_plot_ADAR<-brewer.pal(8, "Dark2")

  # Create and customize the summary plot per T-bulge kind
  p.1 <- ggplot(Pyrimidine_bulges_mNG_quartiles_df, aes(Distance_from_bulge, smoothed5))+
    geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+
    geom_line(size=0.40,colour= "DarkBlue")
  p.1 <-p.1 +xlab("")+ylab("")+theme_classic()
  p.1 <-p.1 +theme(legend.title=element_blank(),
                   legend.position="bottom",
                   legend.text = element_text(size=6),
                   plot.title = element_text(size=10,hjust = 0.5),
                   panel.border = element_rect(color = "black",fill = NA,size = 0.5),
                   axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),
                   axis.title = element_text(face = "bold",size = 12),
                   axis.text.y = element_text(colour="black",size=7))
  p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+
    ggtitle(unique(Pyrimidine_bulges_mNG_quartiles_df$ADAR))+
    facet_grid(~Bulge.kinds)
  plot(p.1)

  ######################################## mNG Sequence Preference ##########
  # Tabulate the editing level of each adenosine of the perfect
  # double-stranded mNG construct together with its neighbouring base.
  #
  # x        Per-barcode editing table: editing columns named "A<position>"
  #          followed by a `Barcode ID` column (as built earlier in this
  #          function).
  # ADAR_ID  Label stored in the `ADAR` column of the result.
  #
  # Returns a data frame with the columns Editing, A_position,
  # Nuc_downstream, ADAR, Pos and library. "Upstream" rows come first and
  # carry the dinucleotide <base before A><A>; "Downstream" rows carry
  # <A><base after A>. Both are stored in `Nuc_downstream` so the two parts
  # can be row-bound.
  mNG_all_ADARs_function_seq_pref <- function(x, ADAR_ID) {
    perfect_ds_fw_strand <- "AGCCAATGGCGGCTAACTATCTGAAGAACCAGCCGATGTACGTGTTCCGTAAGACGGAGCTCAAGCACTCCAAGACCGAGCTCAACTTCAAGGAGTGGCAAAAGGCCTTTACCGATGTGATGGGCATGGACGAGCTGTACAAGTAA"
    perfect_ds_barcode <- "AAGGCCAT"

    # NOTE(review): only columns 1:39 are used here, while other sections of
    # this script index the editing columns as 1:44 -- confirm that leaving
    # out the last positions is intentional.
    Perfect_ds <- x[x$`Barcode ID` == perfect_ds_barcode, 1:39]
    if (nrow(Perfect_ds) != 1L) {
      stop("Expected exactly one row for the perfect-ds barcode ",
           perfect_ds_barcode, ", found ", nrow(Perfect_ds), ".",
           call. = FALSE)
    }

    a_names <- colnames(Perfect_ds)
    a_pos <- as.numeric(gsub("A", "", a_names))
    editing <- as.numeric(unlist(Perfect_ds[1, ], use.names = FALSE))

    # Build one table for the adenosines selected by `idx`; `offset` shifts
    # the two-base window (0: A plus next base, -1: previous base plus A).
    neighbour_table <- function(idx, offset, pos_label) {
      tbl <- data.frame(
        Editing = editing[idx],
        A_position = a_pos[idx],
        Nuc_downstream = substring(perfect_ds_fw_strand,
                                   a_pos[idx] + offset,
                                   a_pos[idx] + offset + 1),
        row.names = a_names[idx]
      )
      tbl$ADAR <- ADAR_ID
      tbl$Pos <- pos_label
      tbl
    }

    downstream_nuc_df <- neighbour_table(seq_along(a_pos), 0, "Downstream")
    # The first adenosine is dropped for the upstream table, as in the
    # original per-row implementation.
    upstream_nuc_df <- neighbour_table(seq_along(a_pos)[-1], -1, "Upstream")

    Pref_down_Upst <- rbind(upstream_nuc_df, downstream_nuc_df)
    Pref_down_Upst$library <- "mNG"
    Pref_down_Upst
  }

  ADAR_seq_pref<-mNG_all_ADARs_function_seq_pref(df_A_G_POS_editing,z)
  output<-list(ADAR_seq_pref,df_A_G_POS_editing)
  return(output)
}

#ADAR2
# Define file paths
fastq_file1 = "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R1_001.fastq.gz"
fastq_file2 = "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R2_001.fastq.gz"

# Load and sample the Fastq files. The subsample is drawn by yield(), not by
# the FastqSampler() constructor, so the seed is reset immediately before
# each yield(): both mates then receive the same record indices and can be
# paired by row number downstream.
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)
close(f1)
close(f2)
stopifnot(length(B2_R1) == length(B2_R2))

# Alternatively, if you prefer to work with the entire dataset rather than a
# sample, you can use the following lines to load the complete data files.
# B2_R1 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R1_001.fastq.gz")
# B2_R2 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R2_001.fastq.gz")
ADAR2_Rep1_mNG <- mNG_library_A_to_G_analysis(B2_R1, B2_R2, "ADAR2")

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS_S73_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS_S73_R2_001.fastq.gz"

# Load and sample the Fastq files (seed reset right before each yield, see
# the replicate-1 sampling in this section).
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)
close(f1)
close(f2)
stopifnot(length(B2_R1) == length(B2_R2))
ADAR2_Rep2_mNG <- mNG_library_A_to_G_analysis(B2_R1, B2_R2, "ADAR2")

# Long format of replicate 1: one row per barcode and adenosine position,
# keyed by barcode + sample/control flag + position.
ADAR2_1_ggplot <- pivot_longer(ADAR2_Rep1_mNG[[2]], cols = starts_with("A"),
                               values_to = "AtoI_Editing", names_to = "A_postion")
ADAR2_1_ggplot$ID_ggplot <- paste0(ADAR2_1_ggplot$`Barcode ID`,
                                   ADAR2_1_ggplot$sample.ctrl,
                                   ADAR2_1_ggplot$A_postion)
ADAR2_1_ggplot_df <- as.data.frame(ADAR2_1_ggplot)
# Columns 6:7 of the long table, selected by name instead of by position.
ADAR2_1_ggplot_df <- ADAR2_1_ggplot_df[, c("AtoI_Editing", "ID_ggplot")]
colnames(ADAR2_1_ggplot_df) <- c("ADAR2p_B2_Rep1_AtoI_Editing", "ID_ggplot")

# Long format of ADAR2 replicate 2, keyed the same way as replicate 1.
ADAR2_2_ggplot <- pivot_longer(ADAR2_Rep2_mNG[[2]], cols = starts_with("A"),
                               values_to = "AtoI_Editing", names_to = "A_postion")
ADAR2_2_ggplot$ID_ggplot <- paste0(ADAR2_2_ggplot$`Barcode ID`,
                                   ADAR2_2_ggplot$sample.ctrl,
                                   ADAR2_2_ggplot$A_postion)
ADAR2_2_ggplot_df <- as.data.frame(ADAR2_2_ggplot)
# Columns 6:7 of the long table, selected by name instead of by position.
ADAR2_2_ggplot_df <- ADAR2_2_ggplot_df[, c("AtoI_Editing", "ID_ggplot")]
colnames(ADAR2_2_ggplot_df) <- c("ADAR2p_B2_Rep2_AtoI_Editing", "ID_ggplot")

# Pair the two replicates per barcode/position and plot rep1 against rep2.
ADAR2_plasmid_B2_repetitions <- merge(ADAR2_1_ggplot_df, ADAR2_2_ggplot_df,
                                      by.x = "ID_ggplot", by.y = "ID_ggplot")
ADAR2_plasmid_B2_repetitions <- as.data.frame(ADAR2_plasmid_B2_repetitions)
p3 <- ggplot(ADAR2_plasmid_B2_repetitions,
             aes(x = ADAR2p_B2_Rep1_AtoI_Editing, y = ADAR2p_B2_Rep2_AtoI_Editing)) +
  geom_point(size = 0.1, color = "deepskyblue4") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  xlim(0, 100) + ylim(0, 100) +
  ggtitle("ADAR2") + theme_linedraw()
p3 <- p3 + theme_classic() +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank(),
        legend.title = element_blank(),
        legend.position = "bottom",
        legend.text = element_text(size = 7),
        plot.title = element_text(size = 9, face = "bold"),
        panel.border = element_rect(color = "black", fill = NA, size = 0.5),
        axis.text.x = element_text(colour = "black", size = 7, angle = 90, vjust = 0.5, hjust = 1),
        axis.title = element_text(face = "bold", size = 7),
        axis.text.y = element_text(colour = "black", size = 7))
# NOTE(review): this p-value label is a fixed string; it is not derived from
# the cor.test() result computed below.
p3 <- p3 + annotate(geom = "text", x = 13, y = 70, label = paste("p-value < 2.2e-16 "), color = "black")
Correlation_Rep <- cor.test(ADAR2_plasmid_B2_repetitions$ADAR2p_B2_Rep1_AtoI_Editing,
                            ADAR2_plasmid_B2_repetitions$ADAR2p_B2_Rep2_AtoI_Editing)
p3 <- p3 +
  annotate(geom = "text", x = 12, y = 60,
           label = paste("r=", round(Correlation_Rep$estimate, 4)), color = "black") +
  xlab("Overexpressed ADAR rep1") + ylab("Overexpressed ADAR rep2")
plot(p3)

#ADAR1
# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS_S65_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS_S65_R2_001.fastq.gz"

# Load and sample the Fastq files. The subsample is drawn by yield(), not by
# the FastqSampler() constructor, so the seed is reset immediately before
# each yield(): both mates then receive the same record indices and can be
# paired by row number downstream.
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)
close(f1)
close(f2)
stopifnot(length(B2_R1) == length(B2_R2))
ADAR1_Rep1_mNG <- mNG_library_A_to_G_analysis(B2_R1, B2_R2, "ADAR1")

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS_S74_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS_S74_R2_001.fastq.gz"

# Load and sample the Fastq files (seed reset right before each yield, as for
# replicate 1).
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)
close(f1)
close(f2)
stopifnot(length(B2_R1) == length(B2_R2))
ADAR1_Rep2_mNG <- mNG_library_A_to_G_analysis(B2_R1, B2_R2, "ADAR1")

# Long format of ADAR1 replicate 1: one row per barcode and adenosine
# position, keyed by barcode + sample/control flag + position.
ADAR1_1_ggplot <- pivot_longer(ADAR1_Rep1_mNG[[2]], cols = starts_with("A"),
                               values_to = "AtoI_Editing", names_to = "A_postion")
ADAR1_1_ggplot$ID_ggplot <- paste0(ADAR1_1_ggplot$`Barcode ID`,
                                   ADAR1_1_ggplot$sample.ctrl,
                                   ADAR1_1_ggplot$A_postion)
ADAR1_1_ggplot_df <- as.data.frame(ADAR1_1_ggplot)
# Columns 6:7 of the long table, selected by name instead of by position.
ADAR1_1_ggplot_df <- ADAR1_1_ggplot_df[, c("AtoI_Editing", "ID_ggplot")]
colnames(ADAR1_1_ggplot_df) <- c("ADAR1p_B2_Rep1_AtoI_Editing", "ID_ggplot")

# Long format of ADAR1 replicate 2.
ADAR1_2_ggplot <- pivot_longer(ADAR1_Rep2_mNG[[2]], cols = starts_with("A"),
                               values_to = "AtoI_Editing", names_to = "A_postion")
ADAR1_2_ggplot$ID_ggplot <- paste0(ADAR1_2_ggplot$`Barcode ID`,
                                   ADAR1_2_ggplot$sample.ctrl,
                                   ADAR1_2_ggplot$A_postion)
ADAR1_2_ggplot_df <- as.data.frame(ADAR1_2_ggplot)
# Keep the editing value and the join key of ADAR1 replicate 2 (columns 6:7
# of the long table) and give them replicate-specific names.
ADAR1_2_ggplot_df <- setNames(
  ADAR1_2_ggplot_df[, 6:7],
  c("ADAR1p_B2_Rep2_AtoI_Editing", "ID_ggplot")
)

# Pair both ADAR1 replicates on the shared key.
ADAR1_plasmid_B2_repetitions <- as.data.frame(
  merge(ADAR1_1_ggplot_df, ADAR1_2_ggplot_df, by = "ID_ggplot")
)

# Pearson correlation between the replicates (cor.test default), shown on
# the plot.
Correlation_Rep <- cor.test(
  ADAR1_plasmid_B2_repetitions$ADAR1p_B2_Rep1_AtoI_Editing,
  ADAR1_plasmid_B2_repetitions$ADAR1p_B2_Rep2_AtoI_Editing
)

# Scatter plot of replicate 1 against replicate 2 with the identity line.
p3 <- ggplot(
  ADAR1_plasmid_B2_repetitions,
  aes(x = ADAR1p_B2_Rep1_AtoI_Editing, y = ADAR1p_B2_Rep2_AtoI_Editing)
) +
  geom_point(size = 0.1, color = "deepskyblue4") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  xlim(0, 100) +
  ylim(0, 100) +
  ggtitle("ADAR1") +
  theme_linedraw() +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    legend.title = element_blank(),
    legend.position = "bottom",
    legend.text = element_text(size = 7),
    plot.title = element_text(size = 9, face = "bold"),
    panel.border = element_rect(color = "black", fill = NA, size = 0.5),
    axis.text.x = element_text(colour = "black", size = 7, angle = 90, vjust = 0.5, hjust = 1),
    axis.title = element_text(face = "bold", size = 7),
    axis.text.y = element_text(colour = "black", size = 7)
  ) +
  annotate(geom = "text", x = 13, y = 70, label = paste("p-value < 2.2e-16 "), color = "black") +
  annotate(
    geom = "text", x = 12, y = 60,
    label = paste("r=", round(Correlation_Rep$estimate, 4)),
    color = "black"
  ) +
  xlab("Overexpressed ADAR rep1") +
  ylab("Overexpressed ADAR rep2")
plot(p3)

# Sequence preference function (example for ADAR2)
# Stack the B2 and mNG sequence-preference tables; the mNG table is first
# reordered to the column layout of the B2 table.
All_ADARs_seq_pref_mNG_df <- ADAR2_Rep1_mNG[[1]][, colnames(ADAR2_Rep1_B2[[1]])]
All_ADARs_seq_pref_mNG_B2_df <- rbind(ADAR2_Rep1_B2[[1]], All_ADARs_seq_pref_mNG_df)

# Process upstream and downstream data.
# Note: at this stage the `_downs` object receives the Pos == "Upstream" rows.
All_ADARs_seq_pref_mNG_B2_df_downs <- All_ADARs_seq_pref_mNG_B2_df[
  All_ADARs_seq_pref_mNG_B2_df$Pos == "Upstream",
]
# Neighbouring nucleotide of each adenosine.
# `Nuc_downstream` is expected to hold two characters per row: rows flagged
# "Upstream" use the first character, rows flagged "Downstream" the second.
# substr() is used instead of splitting into a character matrix so that an
# empty subset yields an empty column instead of an error.
# NOTE: at this stage "_downs" holds the "Upstream" rows and "_ups" holds the
# "Downstream" rows; both objects are re-derived from the combined table below.
All_ADARs_seq_pref_mNG_B2_df_downs$Nucleotide <- substr(
  All_ADARs_seq_pref_mNG_B2_df_downs$Nuc_downstream, 1, 1
)
All_ADARs_seq_pref_mNG_B2_df_ups <- All_ADARs_seq_pref_mNG_B2_df[All_ADARs_seq_pref_mNG_B2_df$Pos == "Downstream", ]
All_ADARs_seq_pref_mNG_B2_df_ups$Nucleotide <- substr(
  All_ADARs_seq_pref_mNG_B2_df_ups$Nuc_downstream, 2, 2
)

# Combine upstream and downstream data and fix the plotting order of the bases
All_ADARs_seq_pref_mNG_B2_df_ups_down <- rbind(
  All_ADARs_seq_pref_mNG_B2_df_downs,
  All_ADARs_seq_pref_mNG_B2_df_ups
)
All_ADARs_seq_pref_mNG_B2_df_ups_down$Nucleotide <- factor(
  All_ADARs_seq_pref_mNG_B2_df_ups_down$Nucleotide,
  levels = c("G", "C", "T", "A")
)

# Theme overrides shared by the 5' and 3' boxplots
seq_pref_boxplot_theme <- theme(
  strip.background = element_blank(),
  strip.text.y = element_blank(),
  strip.text.x = element_text(size = 8),
  legend.title = element_blank(),
  legend.position = "none",
  legend.text = element_text(size = 8),
  plot.title = element_text(size = 8, face = "bold"),
  panel.border = element_rect(color = "black", fill = NA, size = 0.5),
  axis.text.x = element_text(colour = "black", size = 8, angle = 0, vjust = 0.5, hjust = 0.5),
  axis.title = element_text(face = "bold", size = 10),
  axis.text.y = element_text(colour = "black", size = 8)
)

# Plot upstream data (5' neighbour)
All_ADARs_seq_pref_mNG_B2_df_ups <- All_ADARs_seq_pref_mNG_B2_df_ups_down[All_ADARs_seq_pref_mNG_B2_df_ups_down$Pos == "Upstream", ]
p.1_ups <- ggplot(
  All_ADARs_seq_pref_mNG_B2_df_ups,
  aes(x = Nucleotide, y = Editing, color = Nucleotide)
) +
  geom_boxplot(outlier.size = 0.5, position = position_dodge(0.9))
p.1_ups <- p.1_ups + xlab("") + ylab("Editing %") + theme_classic()
p.1_ups <- p.1_ups + seq_pref_boxplot_theme
p.1_ups <- p.1_ups + ggtitle("5'") + scale_color_brewer(palette = "Set1")

# Plot downstream data (3' neighbour). The Nucleotide column is already a
# factor with the levels set above, so no re-levelling is needed here.
All_ADARs_seq_pref_mNG_B2_df_downs <- All_ADARs_seq_pref_mNG_B2_df_ups_down[All_ADARs_seq_pref_mNG_B2_df_ups_down$Pos == "Downstream", ]
p.1_downs <- ggplot(
  All_ADARs_seq_pref_mNG_B2_df_downs,
  aes(x = Nucleotide, y = Editing, color = Nucleotide)
) +
  geom_boxplot(outlier.size = 0.5, position = position_dodge(0.9))
p.1_downs <- p.1_downs + xlab("") + ylab("") + theme_classic()
p.1_downs <- p.1_downs + seq_pref_boxplot_theme
p.1_downs <- p.1_downs + ggtitle("3'") + scale_color_brewer(palette = "Set1")

# Arrange and display the plots
grid.arrange(p.1_ups, p.1_downs, ncol = 2)

############################## Periodicity: ADAR2 Variable Arm (Bottom Arm) ##########################
# Read the complete paired FASTQ files (no sub-sampling in this section)
B2_R1 <- readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7_S90_R1_001.fastq.gz")
B2_R2 <- readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7_S90_R2_001.fastq.gz")

# Name of ADAR data to be analyzed
z <- "ADAR2"

# Read 1 sequences as a data frame (column `x`), plus the index of each read
# in the FASTQ file. seq_len() keeps this valid when the file holds no reads.
B2_R1_sequence <- as.data.frame(sread(B2_R1))
B2_R1_sequence$number_read <- seq_len(nrow(B2_R1_sequence))
Total_Number_of_Reads_R1 <- nrow(B2_R1_sequence)

# Keep the reads containing the loop sequence "CCTTCGAA" and strip everything
# up to and including its last occurrence.
B2_R1_sequence <- B2_R1_sequence[grepl("CCTTCGAA", B2_R1_sequence$x), ]
B2_R1_sequence$B2_sequence <- gsub(".*CCTTCGAA", "", B2_R1_sequence$x)
number_reads_beggining_properly <- nrow(B2_R1_sequence)
colnames(B2_R1_sequence) <- c("non_processed_R1", "number_read", "B2_constructR1")
# Keep reads whose trimmed Read 1 is at least 127 nt long and contains the
# constant sequence that separates the barcode from the variable arm.
B2_R1_sequence_1 <- B2_R1_sequence[nchar(B2_R1_sequence$B2_constructR1) >= 127, ]
B2_R1_sequence <- B2_R1_sequence_1[grep("TCCCTCACTACCCTCAACCCA", B2_R1_sequence_1$B2_constructR1), ]

# Calculate the percentage of reads beginning properly
Percentage_reads_beggining_properly <- (number_reads_beggining_properly / Total_Number_of_Reads_R1) * 100

# Extract barcode and B2 variable arm sequence.
# The barcode is everything before the first occurrence of the COMPLETE
# constant sequence. Cutting at the first "TCCC" (as was done previously)
# truncates any barcode that itself contains "TCCC", and such reads are then
# silently lost in the merge with the barcode table.
B2_R1_sequence$barcode <- sub("TCCCTCACTACCCTCAACCCA.*", "", B2_R1_sequence$B2_constructR1)
# The variable arm is everything after the last occurrence of the constant sequence.
B2_R1_sequence$B2_constructR1 <- gsub(".*TCCCTCACTACCCTCAACCCA", "", B2_R1_sequence$B2_constructR1)

# Read the barcode table and keep the B2 barcodes.
## This table can be found in Uzonyi et al.(2021): Table S2. Sequences of the
## B2 and mNG oligo library pools, related to STAR Methods.
Barcodes_csv <- read.csv(
  "/DATA_HELA_ADAR2_over_ADAR1_knockdown_B2_libr/Table_S2_Twist_library_sequence_plans.csv",
  sep = ",",
  header = TRUE
)
Barcodes_csv <- Barcodes_csv[Barcodes_csv$B2.mNG == "B2", ]
Barcodes_csv$site <- as.character(Barcodes_csv$site)

# Merge B2_R1_sequence with Barcodes_csv based on 'barcode' (inner join:
# reads whose barcode is not in the table are dropped)
B2_R1_sequence_Barcode_ID <- merge(B2_R1_sequence, Barcodes_csv, by.x = "barcode", by.y = "barcode")

# Read 2 sequences, indexed by their position in the FASTQ file.
# seq_len() keeps these assignments valid when the file holds no reads.
B2_R2_sequence <- as.data.frame(sread(B2_R2))
rownames(B2_R2_sequence) <- seq_len(nrow(B2_R2_sequence))
B2_R2_sequence$number_read <- seq_len(nrow(B2_R2_sequence))
Total_Number_of_Reads_R2 <- nrow(B2_R2_sequence)

# Reverse complement of Read 2, carrying the same read index
B2_R2_sequence_reverse_comp <- as.data.frame(sread(reverseComplement(B2_R2)))
B2_R2_sequence_reverse_comp$number_read <- seq_len(nrow(B2_R2_sequence_reverse_comp))

# Keep the reads containing "GGCGCGC"; B2_constructR2 is the part of the
# read that precedes its first occurrence.
B2_R2_sequence_reverse_comp <- B2_R2_sequence_reverse_comp[grepl("GGCGCGC", B2_R2_sequence_reverse_comp$x), ]
B2_R2_sequence_reverse_comp$B2_constructR2 <- gsub("GGCGCGC.*", "", B2_R2_sequence_reverse_comp$x)

# Merge B2_R1 and B2_R2 sequences based on 'number_read'
# Pair Read 1 and Read 2 by their index in the FASTQ files (inner join)
B2_df_R1_R2 <- merge(
  B2_R1_sequence_Barcode_ID,
  B2_R2_sequence_reverse_comp,
  by.x = "number_read",
  by.y = "number_read",
  all = FALSE
)

# Length of the designed sequence attached to each barcode
B2_df_R1_R2$Expected_length <- nchar(B2_df_R1_R2$seq)

# Keep rows whose B2_constructR1 is longer than 98 characters and truncate
# them to their first 98 characters
B2_df_R1_R2 <- B2_df_R1_R2[nchar(B2_df_R1_R2$B2_constructR1) > 98, ]
B2_df_R1_R2$B2_constructR1 <- substr(B2_df_R1_R2$B2_constructR1, start = 1, stop = 98)
rownames(B2_df_R1_R2) <- NULL # reset to 1..n

# Number of trailing characters that Read 2 has to contribute
B2_df_R1_R2$length_required_R2 <- B2_df_R1_R2$Expected_length - 98

# Keep rows whose B2_constructR2 is longer than 60 characters and keep their
# last 60 characters
B2_df_R1_R2 <- B2_df_R1_R2[nchar(B2_df_R1_R2$B2_constructR2) > 60, ]
B2_df_R1_R2$B2_constructR2 <- str_sub(B2_df_R1_R2$B2_constructR2, start = -60)

# Split the data by 'length_required_R2'
B2_df_R1_R2_list <- split(B2_df_R1_R2, f = B2_df_R1_R2$length_required_R2)

# Build the full construct: 98 nt from Read 1 followed by the last
# `length_required_R2` nt of Read 2.
# NOTE(review): a required length outside 1..60 cannot be satisfied by the
# 60-nt Read 2 tail and gives a construct of unexpected length; confirm that
# all designs fall within that range.
Whole_B2_df_constructs_list <- mclapply(names(B2_df_R1_R2_list), mc.cores = 8, function(x) {
  df <- B2_df_R1_R2_list[[x]]
  df$B2_constructR2 <- str_sub(df$B2_constructR2, start = -as.numeric(x))
  df$whole_construct <- paste0(df$B2_constructR1, df$B2_constructR2)
  df
})
Whole_B2_df_constructs_df <- do.call(rbind, Whole_B2_df_constructs_list)

# Keep the mismatch-carrying constructs and the perfect double-stranded one.
# %in% is used so that rows with a missing `desc` are dropped instead of
# being returned as all-NA rows.
Whole_B2_df_constructs_df <- Whole_B2_df_constructs_df[
  Whole_B2_df_constructs_df$desc %in%
    c("mismatch1", "mismatch2", "mismatch3", "mismatch4", "perfect_ds"),
]
B2_df_R1_R2_list <- split(Whole_B2_df_constructs_df, f = Whole_B2_df_constructs_df$barcode)

# Keep the reads that carry `base` at every position where the designed
# sequence (`seq`) of the barcode carries `base`.
#   reads_df: reads of one barcode, with columns `seq` and `whole_construct`
#   base:     single nucleotide, e.g. "T" or "C"
# Returns the retained rows of `reads_df`. A read that is too short to cover
# a design position is dropped; a design without `base` keeps all reads.
keep_reads_matching_design_base <- function(reads_df, base) {
  design_seq_const <- unique(reads_df$seq)
  base_positions <- unlist(gregexpr(base, design_seq_const, fixed = TRUE))
  base_positions <- base_positions[base_positions > 0] # gregexpr gives -1 for "no match"
  matches_design <- Reduce(
    function(keep, pos) keep & substr(reads_df$whole_construct, pos, pos) == base,
    base_positions,
    rep(TRUE, nrow(reads_df))
  )
  reads_df[which(matches_design), , drop = FALSE]
}

# Discard misaligned reads: first require all designed T positions to read T ...
cured_aligment_whole_contruct <- mclapply(B2_df_R1_R2_list, mc.cores = 8, function(z) {
  rownames(z) <- paste0("R", seq_len(nrow(z)))
  keep_reads_matching_design_base(z, "T")
})
# ... then require all designed C positions to read C.
Cured_aligment_whole_contruct_C <- mclapply(cured_aligment_whole_contruct, mc.cores = 8, function(z) {
  keep_reads_matching_design_base(z, "C")
})
B2_df_R1_R2_final <- do.call(rbind, Cured_aligment_whole_contruct_C)

# Split data by 'barcode'
B2_df_R1_R2_list <- split(B2_df_R1_R2_final, f = B2_df_R1_R2_final$barcode)

# Compute consensus matrices per construct: one row per position, keeping
# columns 1:4 and 15:16 of the transposed consensus matrix (the columns A and
# G are used below; presumably the selection is A, C, G, T, N and "-").
concensus_matrix_per_barcode <- mclapply(B2_df_R1_R2_list, mc.cores = 8, function(w) {
  Sequence_barcode_StringSet <- DNAStringSet(w$whole_construct, use.names = TRUE)
  as.data.frame(t(consensusMatrix(Sequence_barcode_StringSet)))[, c(1:4, 15:16)]
})

# Compute A-to-G rate per construct and position: G / (A + G) * 100
concensus_matrix_per_barcode_A_to_G <- mclapply(concensus_matrix_per_barcode, mc.cores = 8, function(y) {
  y$position <- seq_len(nrow(y))
  y$A_to_G_rate <- y$G / (y$A + y$G) * 100
  y
})

# Define the A positions on "Arm perfect double-stranded reporter"
A_positions_B2 <- c(14, 16, 18, 33, 50, 51, 55, 62, 64, 67, 79, 80, 84, 86, 87, 88, 110, 119, 122, 123, 124, 135, 138)

# Select the editing level at the target positions: one row per barcode with
# one column per A position plus the barcode itself
concensus_matrix_per_barcode_A_to_G_target_pos <- mclapply(
  names(concensus_matrix_per_barcode_A_to_G),
  mc.cores = 8,
  function(z) {
    target_rates <- concensus_matrix_per_barcode_A_to_G[[z]][A_positions_B2, "A_to_G_rate"]
    df_A_POS <- as.data.frame(t(target_rates))
    df_A_POS$Barcode_names <- z
    colnames(df_A_POS) <- c(paste0("A", A_positions_B2), "Barcode ID")
    rownames(df_A_POS) <- z
    df_A_POS
  }
)
df_A_G_POS_editing <- do.call(rbind, concensus_matrix_per_barcode_A_to_G_target_pos)

# Merge with Barcodes_csv. merge() puts the key column first, so the columns
# are reordered to: 23 editing columns, "Barcode ID", then the 4 annotations.
Disruption_df <- merge(
  df_A_G_POS_editing,
  Barcodes_csv[, c("barcode", "desc", "site", "B2.mNG", "sample.ctrl")],
  by.x = "Barcode ID",
  by.y = "barcode"
)
df_A_G_POS_editing <- Disruption_df[, c(2:24, 1, 25:28)]

# Retrieve the mismatch-carrying sample constructs and the perfect
# double-stranded reference
Disruption_df <- df_A_G_POS_editing[
  df_A_G_POS_editing$desc %in% c("mismatch1", "mismatch2", "mismatch3", "mismatch4"),
]
Disruption_df <- Disruption_df[Disruption_df$sample.ctrl %in% "sample", ]
Perfect_ds <- df_A_G_POS_editing[df_A_G_POS_editing$desc %in% "perfect_ds", ]
if (nrow(Perfect_ds) == 0) {
  stop("No 'perfect_ds' construct available: delta editing cannot be computed.", call. = FALSE)
}

# Calculate delta editing: editing of each disrupted construct minus the
# editing of the reference at the same positions.
# NOTE(review): the reference is the FIRST perfect_ds row and Perfect_ds is
# not filtered on sample.ctrl; confirm this is the intended reference.
Delta_editing <- mclapply(rownames(Disruption_df), mc.cores = 8, function(x) {
  Delta_editing_per_construct <- Disruption_df[x, 1:23] - Perfect_ds[1, 1:23]
  cbind(Delta_editing_per_construct, Disruption_df[x, 24:28])
})
Delta_editing_df <- do.call(rbind, Delta_editing)
Delta_editing_df <- Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]
Delta_editing_df$Mistmatch_names <- paste0(
  Delta_editing_df$site,
  Delta_editing_df$sample.ctrl,
  Delta_editing_df$desc
)
Delta_editing_list <- split(Delta_editing_df, f = Delta_editing_df$Mistmatch_names)

# Calculate distance from the disruption.
# For every site/sample/mismatch group, the 23 delta-editing values of its
# FIRST row are paired with the distance of each A position to the site
# (A position - site - 1). The mismatch kind is taken from that same first
# row, so a group with more than one row cannot recycle the frame to a
# multiple of 23 rows.
Delta_editing_mistmatch <- mclapply(Delta_editing_list, mc.cores = 8, function(x) {
  Editing_levels <- as.numeric(x[1, 1:23])
  Distance_from_disruption <- as.numeric(gsub("A", "", names(x[, 1:23]))) - as.numeric(x[1, "site"]) - 1
  data.frame(
    Distance_from_mistmatch = Distance_from_disruption,
    Editing_level_A_to_I = Editing_levels,
    Mismatch_kind = rep(x$desc[1], length(Editing_levels))
  )
})
Delta_editing_mistmatch_df <- do.call(rbind, Delta_editing_mistmatch)

# Split the data based on the mismatch kind to adjust the proper distance.
# The groups are picked BY NAME: positional indexing ([[1]], [[3]]) selects
# the wrong kind whenever one of the mismatch kinds is absent from the data.
Delta_editing_mistmatch_df_1_3_list <- split(
  Delta_editing_mistmatch_df,
  f = Delta_editing_mistmatch_df$Mismatch_kind
)
missing_mismatch_kinds <- setdiff(
  c("mismatch1", "mismatch3"),
  names(Delta_editing_mistmatch_df_1_3_list)
)
if (length(missing_mismatch_kinds) > 0) {
  stop(
    "No constructs found for: ", paste(missing_mismatch_kinds, collapse = ", "),
    call. = FALSE
  )
}
Delta_editing_mistmatch_1 <- Delta_editing_mistmatch_df_1_3_list[["mismatch1"]]
Delta_editing_mistmatch_3 <- Delta_editing_mistmatch_df_1_3_list[["mismatch3"]]
# The distance of the 3-nt mismatch is shifted by one further position
Delta_editing_mistmatch_3$Distance_from_mistmatch <- Delta_editing_mistmatch_3$Distance_from_mistmatch - 1
Delta_editing_mistmatch_df <- rbind(Delta_editing_mistmatch_1, Delta_editing_mistmatch_3)
# Drop positions without an editing value (NA or NaN)
Delta_editing_mistmatch_df <- Delta_editing_mistmatch_df[!is.na(Delta_editing_mistmatch_df$Editing_level_A_to_I), ]

# Create a mapping for mismatch labels
mismatch.labs <- c("Mismatch 1 nucleotide", "Mismatch 3 nucleotides")
names(mismatch.labs) <- c("mismatch1", "mismatch3")

# Split the data by mismatch kind
Delta_editing_mistmatch_df_list <- split(
  Delta_editing_mistmatch_df,
  f = Delta_editing_mistmatch_df$Mismatch_kind
)

# Apply LOESS smoothing (span = 0.05) to each subset; the fitted values are
# stored in `smoothed5`.
Delta_editing_mistmatch_df_list_LOESS <- mclapply(Delta_editing_mistmatch_df_list, mc.cores = 8, function(z) {
  loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data = z, span = 0.05)
  z$smoothed5 <- predict(loessMod50)
  z
})
Delta_editing_mistmatch_df <- do.call(rbind, Delta_editing_mistmatch_df_list_LOESS)

# Filter data for mismatch3
# Rows of the 3-nt mismatch constructs only
ADAR1_ADAR2_3nuc_mis_list_df <- Delta_editing_mistmatch_df[
  Delta_editing_mistmatch_df$Mismatch_kind == "mismatch3",
]

# One group per distance from the mismatch
ADAR2nuc_mis_ADAR1 <- split(
  ADAR1_ADAR2_3nuc_mis_list_df,
  f = ADAR1_ADAR2_3nuc_mis_list_df$Distance_from_mistmatch
)

# Attach the quantiles (0, 0.25, 0.5, 0.75, 1) of the delta editing observed
# at each distance as columns Q_0 ... Q_1 (same value on every row of a group)
ADAR1_3nuc_mis_ADAR2_list <- mclapply(ADAR2nuc_mis_ADAR1, mc.cores = 8, function(distance_group) {
  quartile_probs <- c(0, 0.25, 0.5, 0.75, 1)
  distance_group <- as.data.frame(distance_group)
  quartiles_df <- data.frame(
    t(quantile(distance_group$Editing_level_A_to_I, probs = quartile_probs))
  )
  colnames(quartiles_df) <- paste0("Q_", quartile_probs)
  cbind(distance_group, quartiles_df)
})
ADAR2_3nuc_mis_Var_Arm_list_df <- do.call(rbind, ADAR1_3nuc_mis_ADAR2_list)
ADAR2_3nuc_mis_Var_Arm_list_df$Editing_level_A_to_I <- as.numeric(
  ADAR2_3nuc_mis_Var_Arm_list_df$Editing_level_A_to_I
)

# x positions marked with dashed vertical lines
vertical.lines <- c(-26, 30, -2, -13)

# Summary plot for the 3-nt mismatch constructs: LOESS-smoothed delta editing
# as a line, inter-quartile range (Q_0.25 to Q_0.75) as a ribbon
p.1 <- ggplot(
  ADAR2_3nuc_mis_Var_Arm_list_df,
  aes(Distance_from_mistmatch, smoothed5)
) +
  geom_line(size = 0.75, colour = "Blue") +
  geom_ribbon(aes(ymin = Q_0.25, ymax = Q_0.75), alpha = 0.3) +
  xlab("Distance from mistmatch") +
  ylab("Delta editing") +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    legend.title = element_blank(),
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(colour = "black", size = 11, angle = 90, vjust = 0.5, hjust = 1),
    axis.title = element_text(face = "bold", size = 12),
    axis.text.y = element_text(colour = "black", size = 11)
  ) +
  geom_vline(
    size = 0.5,
    xintercept = vertical.lines,
    color = "#666699",
    alpha = 0.6,
    linetype = "dashed"
  ) +
  ggtitle("B2 Variable arm: mismatch3 ") + # optionally: + facet_grid(~ADAR)
  scale_x_continuous(n.breaks = 12) +
  scale_y_continuous(n.breaks = 8, limits = c(-80, 15))
plot(p.1)