#Supplementary Data 2. The R script for analyzing NGS sequencing data from the B2 and mNG oligo libraries
#transfected along with ADAR-specific plasmid in ADAR1 KO-HEK293T cell lines

# Dissecting the basis for differential substrate specificity of ADAR1 and ADAR2 
#Author list
#Marlon S. Zambrano-Mila, Monika Witzenberger, Zohar Rosenwasser, Anna Uzonyi, Ronit Nir, Shay Ben-Aroya, Erez Y. Levanon, Schraga Schwartz


#Data analysis of NGS data 
#FASTQ files were processed with a custom R script (Supplementary Data 2). The read-filtering process removed reads
#with an incorrect start or end, reads lacking one of the established barcodes, and reads misaligning at adenosine positions (a C or T where an A or G is expected).
#Read 1 and Read 2 were merged into a single sequence by custom truncation and matching. For each barcode, the editing percentage
#was quantified as (G/(A+G))*100 at each adenosine position. Δ editing was calculated as the difference in editing level
#at each adenosine position between each structurally altered construct and the perfect double-stranded construct.


# Load the necessary libraries
library(ShortRead)
library(Biostrings)
library(parallel)
library(ggplot2)
library(Biostrings)
library(pheatmap)
library(EnvStats)
library(purrr)
library(RVenn)
library(ggplot2)
library(gridExtra)
library(ggvenn)
library(egg)
library("GGally")
library(stringr)
library(tidyverse)
library("RColorBrewer")


############################### B2 library ###################################
B2_library_A_to_G_analysis<-function(B2_R1_readFastaq,B2_R2_readFastaq,z){

# Process Read1.fastq files
B2_R1_sequence<-as.data.frame(sread(B2_R1))
B2_R1_sequence$number_read<-1:nrow(B2_R1_sequence)
Total_Number_of_Reads_R1<-nrow(B2_R1_sequence)

# Read1 sequences are filtered and processed to extract the B2 construct: reads must contain "GCCGGGC",
# everything up to and including "AAACTAGT" is removed, and the remaining sequence must start with "GCCGGG".
# The number of reads that begin properly is then counted.
B2_R1_sequence<-B2_R1_sequence[grepl("GCCGGGC",B2_R1_sequence$x),]
B2_R1_sequence$B2_sequence<-gsub(".*AAACTAGT","" ,B2_R1_sequence$x)
B2_R1_sequence<-B2_R1_sequence[grepl("^GCCGGG",B2_R1_sequence$B2_sequence),]
number_reads_beggining_properly<- nrow(B2_R1_sequence)
colnames(B2_R1_sequence)<-c("non_processed_R1","number_read","B2_constructR1")
B2_R1_sequence<-B2_R1_sequence[nchar(B2_R1_sequence$B2_constructR1)>=82,]
B2_R1_sequence$B2_constructR1<-substr(B2_R1_sequence$B2_constructR1, start = 1,stop = 82)
Percentage_reads_beggining_properly<- (number_reads_beggining_properly/ Total_Number_of_Reads_R1)*100

# Process Read2.fastq files
B2_R2_sequence <- as.data.frame(sread(B2_R2))
B2_R2_sequence$number_read <- 1:nrow(B2_R2_sequence)
Total_Number_of_Reads_R2 <- nrow(B2_R2_sequence)

# Reverse complement and filter sequences
B2_R2_sequence_reverse_comp <- as.data.frame(sread(reverseComplement(B2_R2)))
B2_R2_sequence_reverse_comp$number_read <- 1:nrow(B2_R2_sequence_reverse_comp)

# Read and filter Barcodes. This table can be found in Uzonyi et al.(2021): Table S2. Sequences of the B2 and mNG oligo library pools, related to STAR Methods.
Barcodes_csv <- read.csv("/DATA_HELA_ADAR2_over_ADAR1_knockdown_B2_libr/Table_S2_Twist_library_sequence_plans.csv", sep = ",", header = TRUE)
Barcodes_csv <- Barcodes_csv[Barcodes_csv$B2.mNG == "B2", ]
Barcodes_csv$site <- as.character(Barcodes_csv$site)

# Further filter B2_R2_sequence
B2_R2_sequence <- B2_R2_sequence_reverse_comp[grepl("TCCCTC|TTCGAA", B2_R2_sequence_reverse_comp$x), ]
B2_R2_sequence$barcode <- gsub(".*TTCGAA|\\TCCC.*", "", B2_R2_sequence$x)
B2_R2_sequence<-B2_R2_sequence[grep("CCCCTAA",B2_R2_sequence$x),]
B2_R2_sequence$B2_constructR2<-gsub("\\CCCCTAA.*","",B2_R2_sequence$x)


# Filter based on sequence and barcode length
B2_R2_sequence<-B2_R2_sequence[nchar(B2_R2_sequence$B2_constructR2)>=63,]
B2_R2_sequence$B2_constructR2<- substr(B2_R2_sequence$B2_constructR2, start =nchar(B2_R2_sequence$B2_constructR2)-63,stop = nchar(B2_R2_sequence$B2_constructR2))
B2_R2_sequence<-B2_R2_sequence[nchar(B2_R2_sequence$barcode)==8,]


# Merge B2_R2_sequence with Barcodes_csv based on the 'barcode' column
B2_R2_sequence_Barcode_ID <- merge(B2_R2_sequence, Barcodes_csv, by.x = "barcode", by.y = "barcode")

# Rename the columns for clarity
colnames(B2_R2_sequence_Barcode_ID) <- c("barcode", "No_processed_Read2", "number_read", "B2_constructR2", "ID", "Buffer", "F", "BstBI", "loop", "seq", "AscI", "R", "desc", "site", "B2.mNG", "sample.ctrl", "Total_length", "Buffer_length")

# Calculate the number of unique barcodes
Number_of_Barcodes <- length(unique(Barcodes_csv$barcode))

# Calculate the total number of reads with proper barcodes and the percentage
Reads_properly_barcodes <- nrow(B2_R2_sequence_Barcode_ID)
Reads_properly_barcodes_percent <- (Reads_properly_barcodes / Total_Number_of_Reads_R2) * 100

# Merge B2_R1_sequence with B2_R2_sequence_Barcode_ID based on 'number_read'
B2_df_R1_R2 <- merge(B2_R1_sequence, B2_R2_sequence_Barcode_ID, by.x = "number_read", by.y = "number_read", all = FALSE)

# Combine B2_constructR1 and B2_constructR2 into Whole_Construct
B2_df_R1_R2$Whole_Construct <- paste0(B2_df_R1_R2$B2_constructR1, B2_df_R1_R2$B2_constructR2)

# Filter based on the length of Whole_Construct
B2_df_R1_R2 <- B2_df_R1_R2[nchar(B2_df_R1_R2$Whole_Construct) == 146, ]

# Define the perfect DNA strand sequence
perfect_ds_fw_strand<-("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA")

# Generate row names for B2_df_R1_R2
rownames(B2_df_R1_R2)<-paste0("R",1:nrow(B2_df_R1_R2))
design_seq_const<-perfect_ds_fw_strand

# Extract A_G positions from the perfect DNA strand
A_G_positions<-unlist(gregexpr('A', design_seq_const))
Nuc_matrix<-do.call(rbind,str_split(B2_df_R1_R2$Whole_Construct,""))
Nuc_matrix<-as.data.frame(Nuc_matrix)
rownames(Nuc_matrix)<-rownames(B2_df_R1_R2)
AG_pos_analized<-Nuc_matrix[,paste0("V",A_G_positions)]
rownames(AG_pos_analized)<-rownames(B2_df_R1_R2)
AG_pos_analized$Combined_cols <- do.call(paste, c(AG_pos_analized[1:ncol(AG_pos_analized)], sep = ""))

# Keep only reads that contain no 'C' or 'T' at any expected A position (checked via Combined_cols)
AG_pos_analized<-AG_pos_analized[!grepl("C|T",AG_pos_analized$Combined_cols),]
B2_df_R1_R2<-B2_df_R1_R2[rownames(AG_pos_analized),]


# Calculate the number of reads with proper beginning and proper barcodes, and the percentage
Reads_R1_R2_proper_beginning_proper_barcode <- nrow(B2_df_R1_R2)
Reads_R1_R2_proper_beginning_proper_barcode_percent <- (Reads_R1_R2_proper_beginning_proper_barcode / Total_Number_of_Reads_R1) * 100

# Create a data frame for quality control metrics
QC_control_df1 <- data.frame(
  Total_Number_of_Reads_R1,
  number_reads_beggining_properly,
  Percentage_reads_beggining_properly,
  Total_Number_of_Reads_R2,
  Reads_properly_barcodes,
  Reads_properly_barcodes_percent,
  Reads_R1_R2_proper_beginning_proper_barcode,
  Reads_R1_R2_proper_beginning_proper_barcode_percent
)

# Calculate quantiles of the barcode counts
Quantiles_number_read_per_barcode <- quantile(table(B2_df_R1_R2$barcode), c(0.01, 0.1, 0.9, 0.99))

# Combine QC_control_df1 with the quantiles and set column names
QC_control_df <- cbind(QC_control_df1, t(as.data.frame(Quantiles_number_read_per_barcode)))
colnames(QC_control_df) <- c("Total Number of Reads_R1", "Number of Reads beginning properly", "% of Reads beginning properly", "Number of Reads_R2", "Number of Reads beginning properly barcodes_R2", "% of Reads with proper barcodes_R2", "Reads R1 R2 with proper beginning and proper barcodes", "% of Reads_R1_R2 with proper beginning and proper barcodes", "1%", "10%", "90%", "99%")

# Create an empirical cumulative distribution plot
plot(ecdf(log10(table(B2_df_R1_R2$barcode))),
     xlab = "Log10  of number of reads in each barcode",
     pch = 20,
     main = "ECD: Log10 N° of reads in each barcode",
     yaxt = "n"
)
axis(2, at = seq(0, 1, by = 0.1), las = 2)
abline(v = log10(6), col = "Red")


# Select the desired columns and split by barcode
B2_df_R1_R2_list <- split(B2_df_R1_R2[, c("number_read", "Whole_Construct", "barcode", "desc", "site", "B2.mNG", "sample.ctrl")], f = B2_df_R1_R2$barcode)

Whole_const_Seq<-toupper("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA")


# Extract the A positions from the perfect double-stranded sequence
A_positions_Seq <- unlist(gregexpr('A', Whole_const_Seq))

# Create a list of A positions in each molecule for the specified barcode
A_positions_list <- lapply(B2_df_R1_R2_list[["CTGTAGAC"]]$Whole_Construct, function(molecule) {
  unlist(strsplit(molecule, ""))[A_positions_Seq]
})

# Create a data frame to store the number of edits per molecule
Number_of_edits_per_molecule_df <- data.frame(
  Molecule = 1:length(A_positions_list),
  Edits = sapply(A_positions_list, function(positions) sum(positions == "G"))
)

# Create a boxplot of the number of edits
library(ggplot2)
p <- ggplot(Number_of_edits_per_molecule_df, aes(x = factor(1), y = Edits)) +
  geom_boxplot(fill = "lightblue", width = 0.5) +
  labs(x = NULL, y = "# Edits per Molecule") +
  theme_minimal() +
  ggtitle("B2")

p <- p + theme(
  legend.position = "none",
  panel.border = element_rect(color = "black", fill = NA, size = 0.5),
  axis.text.x = element_blank(),
  axis.title = element_text(face = "bold", size = 12),
  axis.text.y = element_text(colour = "black", size = 11)
)

plot(p)

# Number of analyzed barcodes
Number_of_analyzed_barcodes <- length(B2_df_R1_R2_list)

# Create a consensus matrix for each barcode
concensus_matrix_per_barcode <- mclapply(
  B2_df_R1_R2_list, mc.cores = 12, function(w) {
    Sequence_barcode_StringSet <- DNAStringSet(w$Whole_Construct, use.names = TRUE)
    consensus_matrix_barcode <- as.data.frame(t(consensusMatrix(Sequence_barcode_StringSet)))[, c(1:4, 15:16)]
  }
)

# Calculate A-to-G rate for each position in the consensus matrix
concensus_matrix_per_barcode_A_to_G <- mclapply(
  concensus_matrix_per_barcode, mc.cores = 12, function(y) {
    y$position <- 1:nrow(y)
    y$A_to_G_rate <- (y$G / (y$A + y$G)) * 100
    return(y)
  }
)

# Extract A-to-G rates at A positions
concensus_matrix_per_barcode_A_to_G_target_pos <- mclapply(
  names(concensus_matrix_per_barcode_A_to_G), mc.cores = 12, function(z) {
    df_A_POS <- as.data.frame(t(as.data.frame(concensus_matrix_per_barcode_A_to_G[[z]][A_positions_Seq, "A_to_G_rate"])))
    df_A_POS$Barcode_names <- names(concensus_matrix_per_barcode_A_to_G[z])
    colnames(df_A_POS) <- c(paste0("A", A_positions_Seq), "Barcode ID")
    rownames(df_A_POS) <- names(concensus_matrix_per_barcode_A_to_G[z])
    return(df_A_POS)
  }
)

# Combine the results
df_A_G_POS_editing <- do.call(rbind, concensus_matrix_per_barcode_A_to_G_target_pos)

# Match with barcode data
matched_index <- match(df_A_G_POS_editing$`Barcode ID`, Barcodes_csv$barcode)
df_A_G_POS_editing <- cbind(df_A_G_POS_editing, Barcodes_csv[matched_index, c("desc", "sample.ctrl", "site")])

# Correlation of editing levels
ADAR2_1_barcodes <- pivot_longer(df_A_G_POS_editing, cols = starts_with("A"), values_to = "AtoI_Editing", names_to = "A_position")
ADAR2_1_control_barcodes <- ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl == "barcode_ctrl", ]
ADAR2_1_control_barcodes$merging_column <- paste0(
  ADAR2_1_control_barcodes$desc, ADAR2_1_control_barcodes$site, ADAR2_1_control_barcodes$A_position
)
ADAR2_1_sample_barcodes <- ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl == "sample", ]
ADAR2_1_sample_barcodes$merging_column <- paste0(
  ADAR2_1_sample_barcodes$desc, ADAR2_1_sample_barcodes$site, ADAR2_1_sample_barcodes$A_position
)
ADAR2_1_sample_control_barcodes <- merge(
  ADAR2_1_sample_barcodes, ADAR2_1_control_barcodes, by.x = "merging_column", by.y = "merging_column"
)

# Create a scatterplot with correlation information
p4.1 <- ggplot(ADAR2_1_sample_control_barcodes, aes(x = AtoI_Editing.x, y = AtoI_Editing.y)) +
  geom_point(size = 0.1, color = "deepskyblue4") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  xlim(0, 75) + ylim(0, 75) +
  ggtitle(z) +
  theme_minimal() +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    legend.title = element_blank(),
    legend.position = "bottom",
    legend.text = element_text(size = 7),
    plot.title = element_text(size = 9, face = "bold"),
    panel.border = element_rect(color = "black", fill = NA, size = 0.5),
    axis.text.x = element_text(colour = "black", size = 7),
    axis.title = element_text(face = "bold", size = 7),
    axis.text.y = element_text(colour = "black", size = 7)
  )

# Add p-value and correlation information
cor_edit <- cor.test(ADAR2_1_sample_control_barcodes$AtoI_Editing.x, ADAR2_1_sample_control_barcodes$AtoI_Editing.y)
p5 <- p4.1 +
  annotate(geom = "text", x = 13, y = 50, label = paste("r =", round(cor_edit$estimate, 3)), color = "black") +
  annotate(geom = "text", x = 13, y = 60, label = paste("p-value =", round(cor_edit$p.value, 3)), color = "black") +
  xlab("Sample barcodes") +
  ylab("Control barcodes")

plot(p5)

############## B2 random disruption #############

# Retrieve the editing levels of the target constructs
Random_disruption_df<-df_A_G_POS_editing[ df_A_G_POS_editing$desc=="random" & df_A_G_POS_editing$sample.ctrl=="sample" ,]
Random_perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44]
Random_perfect_ds$site<-0
No_ds_structure<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="ctrl_repeat",1:44]
No_ds_structure$site<-1
Random_disruption_df<-rbind(Random_perfect_ds,Random_disruption_df,No_ds_structure)
Random_disruption_df$average_editing<-rowMeans(Random_disruption_df[,1:41])

# Order the data and set labels
Random_disruption_df<-Random_disruption_df[order(as.numeric(Random_disruption_df$site)),]
rownames(Random_disruption_df)<-paste(Random_disruption_df$site)
Random_disruption_df$site<- as.numeric(Random_disruption_df$site)*100
Random_disruption_df$Treatment<-z
Random_disruption_df$site<-round(Random_disruption_df$site,2)

# Create the plot
Random_ggplot<-ggplot(Random_disruption_df, aes(x=site,average_editing,y=average_editing, color=Treatment))+geom_line(size=0.5)+theme_classic(base_size = 12)+xlab("% Random Disruption")+ylab("Mean editing")+scale_x_continuous(expand = c(0, 0),n.breaks = 10)+scale_y_continuous(expand = c(0, 0),n.breaks = 7, limits = c(0,max(Random_disruption_df$average_editing )+5))+theme(panel.border = element_rect(color = "black",fill = NA,size = 1),legend.position = c(0.7, 0.6),legend.title=element_blank(),axis.title = element_text(face="bold"), title =element_text(face="bold") )+ggtitle("B2")
Random_ggplot<-Random_ggplot+ scale_color_manual(values=c("#0000FF"))
plot(Random_ggplot)

############  Heatmap of a 1nt-,2nt-,3nt- and 4nt-mismatch running from 5’ to 3’ throughout the double-stranded RNA #####################


# Filter the data frame to select specific rows
Disruption_df <- df_A_G_POS_editing[df_A_G_POS_editing$desc %in% c("mismatch1", "mismatch2", "mismatch3", "mismatch4") & df_A_G_POS_editing$sample.ctrl == "sample", ]
Perfect_ds <- df_A_G_POS_editing[df_A_G_POS_editing$desc == "perfect_ds", ]

# Calculate delta editing
Delta_editing <- mclapply(1:nrow(Disruption_df), mc.cores = 8,function(x) {
  Delta_editing_per_construct <- Disruption_df[x, 1:41] - Perfect_ds[1, 1:41]
  Delta_editing_per_construct <- cbind(Delta_editing_per_construct, Disruption_df[x, 42:45])
  Delta_editing_per_construct
})
Delta_editing_df <- do.call(rbind, Delta_editing)

         
# Process the delta editing data
Delta_editing_df<-do.call(rbind,Delta_editing)
Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]
Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc )
Delta_editing_df_mis_1_3<-Delta_editing_df[Delta_editing_df$sample.ctrl=="sample",]
mismatches_ADAR_list<-split(Delta_editing_df_mis_1_3,f=Delta_editing_df_mis_1_3$desc)

# Create a list of pheatmap plots
pheatmap_plots_mismatches<-mclapply(names(mismatches_ADAR_list), mc.cores = 8, function(z){
  Delta_editing_df_mistmatch3<-mismatches_ADAR_list[[z]][order(as.numeric(mismatches_ADAR_list[[z]]$site), decreasing = T),]
  rownames(Delta_editing_df_mistmatch3)<-Delta_editing_df_mistmatch3$site
  Delta_editing_df_mistmatch3_matrix<-as.matrix(Delta_editing_df_mistmatch3[,1:41])
  Delta_editing_df_mistmatch3_matrix_scales<-scale(Delta_editing_df_mistmatch3_matrix)
  Delta_editing_df_mistmatch3_matrix_scales<-as.data.frame(Delta_editing_df_mistmatch3_matrix_scales)
  list_rows <- split(Delta_editing_df_mistmatch3_matrix_scales,seq(nrow(Delta_editing_df_mistmatch3_matrix_scales)))
  list_rows_capped_zscore<-lapply(list_rows, function(z){
    z[which(z<=-4)]<- -4
    z[which(z>=4)]<- 4
    return(z)})
  Delta_editing_df_mistmatch3_matrix_scales_capped<-do.call(rbind,list_rows_capped_zscore)
  colnames(Delta_editing_df_mistmatch3_matrix_scales_capped)<-colnames(Delta_editing_df_mistmatch3_matrix_scales)
  plot_pheatmap<-pheatmap(Delta_editing_df_mistmatch3_matrix_scales_capped,cluster_rows=F, cluster_cols=F,show_rownames=F, fontsize = 11, main = z )
  return(plot_pheatmap[[4]])
})

# Arrange and display the pheatmap plots
grid.arrange(grobs=pheatmap_plots_mismatches, ncol=2)

########################## ADAR1- and ADAR2-mediated editing offsets based on subsets of 3-nucleotide mismatch running throughout the mNG and B2 sequences. ################


# Filter and select relevant rows
mismatch_types <- c("mismatch1", "mismatch2", "mismatch3", "mismatch4")
Disruption_df <- df_A_G_POS_editing[df_A_G_POS_editing$desc %in% mismatch_types & df_A_G_POS_editing$sample.ctrl == "sample", ]
Perfect_ds <- df_A_G_POS_editing[df_A_G_POS_editing$desc == "perfect_ds", ]

# Calculate delta editing
Delta_editing <- lapply(1:nrow(Disruption_df), function(x) {
  Delta_editing_per_construct <- Disruption_df[x, 1:41] - Perfect_ds[1, 1:41]
  Delta_editing_per_construct <- cbind(Delta_editing_per_construct, Disruption_df[x, 42:45])
  Delta_editing_per_construct
})
Delta_editing_df <- do.call(rbind, Delta_editing)

# Process the delta editing data
Delta_editing_df <- Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]
Delta_editing_df$Mistmatch_names <- paste0(Delta_editing_df$site, Delta_editing_df$sample.ctrl, Delta_editing_df$desc)

# Split by mismatch kind
Delta_editing_df_mis_1_3_list<-split(Delta_editing_df,f=Delta_editing_df$desc)

# Process data per mismatch kind
Delta_editing_df_mis_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_list, mc.cores = 8, function(y){
  max_vector<-sapply(abs(y[1:41]),max)
  Col_consider_downstream<-names(max_vector[max_vector>1])
  Delta_editing_df_mis_1_3<-y[,c(Col_consider_downstream,"Barcode ID","desc","sample.ctrl","site","Mistmatch_names")]
  Delta_editing_df_mis_1_3<-Delta_editing_df_mis_1_3[!Delta_editing_df_mis_1_3$sample.ctrl=="barcode_ctrl",]
})

# Calculate distances and editing levels
Delta_editing_mistmatch_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_df_list,mc.cores=8, function(z){
  Delta_editing_list<-split(z, f=z$Mistmatch_names)
  Delta_editing_mistmatch<-mclapply(Delta_editing_list, mc.cores = 8, function(x){
    Editing_levels<-as.numeric(x[1,1:(ncol(x)-5)])
    Distance_from_disruption<- (146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:(ncol(x)-5)]))))*(-1)
    Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels)))
    Distance_from_disruption_df$A_position<- as.numeric(gsub("A","",colnames(x[,1:(ncol(x)-5)])))
    return(Distance_from_disruption_df)
  })
  Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch)
})

# Arrange the distances per mismatch kind 
Delta_editing_mistmatch_1<-Delta_editing_mistmatch_1_3_df_list[[1]]
Delta_editing_mistmatch_2<-Delta_editing_mistmatch_1_3_df_list[[2]]
Delta_editing_mistmatch_2$Distance_from_mistmatch<-Delta_editing_mistmatch_2$Distance_from_mistmatch +0.5
Delta_editing_mistmatch_3<-Delta_editing_mistmatch_1_3_df_list[[3]]
Delta_editing_mistmatch_3$Distance_from_mistmatch<-Delta_editing_mistmatch_3$Distance_from_mistmatch+1
Delta_editing_mistmatch_4<-Delta_editing_mistmatch_1_3_df_list[[4]]
Delta_editing_mistmatch_4$Distance_from_mistmatch<-Delta_editing_mistmatch_4$Distance_from_mistmatch+1.5

# Combine all mismatch data
Delta_editing_mistmatch_df<-rbind(Delta_editing_mistmatch_1,Delta_editing_mistmatch_2,Delta_editing_mistmatch_3,Delta_editing_mistmatch_4)

mismatch.labs <- c("Mismatch 1 nucleotide","Mismatch 2 nucleotide", "Mismatch 3 nucleotides","Mismatch 4 nucleotide")
names(mismatch.labs) <- c("mismatch1", "mismatch2","mismatch3", "mismatch4")
Delta_editing_mistmatch_df<-Delta_editing_mistmatch_df[!is.na(Delta_editing_mistmatch_df$Editing_level_A_to_I),]
Delta_editing_mistmatch_df_output<-Delta_editing_mistmatch_df
Delta_editing_mistmatch_df_list<-split(Delta_editing_mistmatch_df,f=Delta_editing_mistmatch_df$Mismatch_kind)


# Apply LOESS smoothing
Delta_editing_mistmatch_df_list_LOESS<- mclapply(Delta_editing_mistmatch_df_list, mc.cores = 8, function(z){
  loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=z, span=0.05)
  z$smoothed5 <- predict(loessMod50)
  return(z)
})
Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch_df_list_LOESS)

# Split by mismatch kind for visualization
ADARs_periodicity_mis_kinds_ADAR_list<-split(Delta_editing_mistmatch_df, f=Delta_editing_mistmatch_df$Mismatch_kind)

# Calculate quartiles per distance across all mismatch-carrying constructs
ADARs_periodicity_mis_kindsc_ADAR_variation<-mclapply(ADARs_periodicity_mis_kinds_ADAR_list, mc.cores = 8, function(z){
  ADARs_dist_mis_list<-split(z, f=z$Distance_from_mistmatch)
  ADARs_dist_mis_quartiles_list<-mclapply(ADARs_dist_mis_list, mc.cores = 8, function(x){
    df<-as.data.frame(x)
    quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
    colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
    df<-cbind(df,quartiles_df)
    return(df)
  }) 
  ADARs_dist_mis_quartiles_list_df<-do.call(rbind,ADARs_dist_mis_quartiles_list)
  return(ADARs_dist_mis_quartiles_list_df)
})
ADARs_periodicity_mis_kindsc_ADAR_variation_df<-do.call(rbind,ADARs_periodicity_mis_kindsc_ADAR_variation)

# Create the summary plots per mismatch kind
vertical.lines<-c(-26,-35)
p.1 <- ggplot(ADARs_periodicity_mis_kindsc_ADAR_variation_df, aes(Distance_from_mistmatch, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.50,colour= "Darkblue")
p.1 <-p.1 +xlab("Distance from mistmatch")+ylab("Delta editing")+theme_classic()#+scale_color_manual(values=c("Black","Blue"))
p.1 <-p.1 +theme(strip.background = element_blank(),
                 legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=10),plot.title = element_text(size=16,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=11,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=11)) 
p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle("")
p.1<-p.1+scale_x_continuous(n.breaks = 8)+facet_wrap(~Mismatch_kind, ncol = 4)+scale_y_continuous(n.breaks = 6, limits=c(-40,40))
plot(p.1)


############## Size of the mismatch ##############

# Filter the data for specific distances from the mismatch (for ADAR1: -35 or -35.5; for ADAR2: -26 or -26.5)
ADAR_B2_1_to_4_nuc_mis<-ADARs_periodicity_mis_kindsc_ADAR_variation_df[ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35|ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35.5,]

# Create the box plot
plot_Boxplot_ADAR2<-ggplot(ADAR_B2_1_to_4_nuc_mis, aes(x=Mismatch_kind, y=Editing_level_A_to_I))+geom_boxplot()+ theme(legend.position="none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +xlab("Mismatch size")+ylab("Delta Editing")+theme_classic()
plot_Boxplot_ADAR2<-plot_Boxplot_ADAR2+theme(strip.background = element_blank(),strip.text.y = element_blank(),strip.text.x = element_text(size = 8),legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=12),plot.title = element_text(size=12,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=12),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=12)) 
plot(plot_Boxplot_ADAR2)

############ B2 T bulges PERIODICITY ######################

# Filter the data for specific bulge types and sample control
Disruption_bulges_df <- df_A_G_POS_editing %>%
  filter(desc %in% c("bulge-T", "bulge-TTC", "bulge-TTCTT", "bulge-TTCTTCT"), sample.ctrl == "sample")

# Select perfect ds
Perfect_ds <- df_A_G_POS_editing %>%
  filter(desc == "perfect_ds")

# Calculate delta editing
Delta_editing<- mclapply(rownames(Disruption_bulges_df),mc.cores = 8, function(x){
  Delta_editing_per_construct<-Disruption_bulges_df[x,1:41]-Perfect_ds[1,1:41]
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_bulges_df[x,42:45])
  return(Delta_editing_per_construct)
} )
Delta_editing_df<-do.call(rbind,Delta_editing)

# Calculate Distance from bulge
Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]
Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc )
Delta_editing_list<-split(Delta_editing_df, f=Delta_editing_df$Mistmatch_names)
Delta_editing_bulge<-mclapply(Delta_editing_list, mc.cores = 8, function(x){
  Editing_levels<-as.numeric(x[1,1:41])
  Distance_from_disruption<-(146-as.numeric(x[1,"site"])- as.numeric(gsub("A","",names(x[,1:41])))+0.5)*-1
  Distance_from_Disruption_bulges_df<-data.frame(Distance_from_bulge=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Bulge.kinds=rep(x$desc, length(Editing_levels)))
})
Delta_editing_bulge_df<-do.call(rbind,Delta_editing_bulge)

bulge.labs <- c("Bulge T", "Bulge TTC", "Bulge TTCTT","Bulge TTCTTCT")
names(bulge.labs) <- c("bulge-T", "bulge-TTC", "bulge-TTCTT","bulge-TTCTTCT")
Delta_editing_bulge_df<-Delta_editing_bulge_df[!is.na(Delta_editing_bulge_df$Editing_level_A_to_I),]

# Apply LOESS smoothing
Loes_bulge_kind<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds)##
Loes_bulge_kind_list<-mclapply(Loes_bulge_kind, mc.cores = 8, function(x){##
  loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_bulge, data=x, span=0.11)###
  x$smoothed5 <- predict(loessMod50)
  return(x)})
Delta_editing_bulge_df<-do.call(rbind,Loes_bulge_kind_list)

# Calculate quartiles per distance across all bulge-carrying constructs
ggplot_quartiles_T_bulges_list<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds)
Pyrimidine_bulges_mNG_quartiles<-mclapply(ggplot_quartiles_T_bulges_list, mc.cores = 8, function(x){
  ADAR1_3nuc_mis<-x
  ADAR1_3nuc_mis_ADAR1<-split(ADAR1_3nuc_mis, f=ADAR1_3nuc_mis$Distance_from_bulge)
  ADAR1_3nuc_mis_ADAR2_list<-mclapply(ADAR1_3nuc_mis_ADAR1, mc.cores = 8, function(x){
    df<-as.data.frame(x)
    quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
    colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
    df<-cbind(df,quartiles_df)
    return(df)
  })
  ADAR1_3nuc_mis_ADAR1_list_df<-do.call(rbind,ADAR1_3nuc_mis_ADAR2_list)
})
Pyrimidine_bulges_mNG_quartiles_df<-do.call(rbind,Pyrimidine_bulges_mNG_quartiles)
Pyrimidine_bulges_mNG_quartiles_df$ADAR<-z

# Create the summary plots per bulge kind
vertical.lines<-c(-35,-26)
color_plot_ADAR<-brewer.pal(8, "Dark2")
p.1 <- ggplot(Pyrimidine_bulges_mNG_quartiles_df, aes(Distance_from_bulge, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.40,colour= "DarkBlue")
p.1 <-p.1 +xlab("")+ylab("")+theme_classic()
p.1 <-p.1 +theme(legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=6),plot.title = element_text(size=10,hjust = 0.5),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=7)) 
p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle(unique(Pyrimidine_bulges_mNG_quartiles_df$ADAR))+facet_grid(~Bulge.kinds) +xlab("Bulge kind")+ylab("Delta Editing")
plot(p.1)

############################## A-C mismatches ###############################################

# Filter the data
Disruption_T_to_C <- df_A_G_POS_editing %>%
  filter(desc == "TtoC" & sample.ctrl == "sample")

# Calculate mismatch position and order the data
Disruption_T_to_C$mismatch_position<- 146-as.numeric(Disruption_T_to_C$site)
Disruption_T_to_C<-Disruption_T_to_C[order(as.numeric(Disruption_T_to_C$mismatch_position)),]

# Set rownames for visualization
rownames(Disruption_T_to_C) <- Disruption_T_to_C$mismatch_position

# Extract Perfect_ds
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44]
Perfect_ds$site<-0

# Calculate Delta editing per construct
Delta_editing<- mclapply(rownames(Disruption_T_to_C),mc.cores = 8, function(x){
  Delta_editing_per_construct<-Disruption_T_to_C[x,1:41]-Perfect_ds[1,1:41]
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_T_to_C[x,42:45])
  return(Delta_editing_per_construct)
} )

Delta_editing_T_to_C_df<-do.call(rbind,Delta_editing)

# Split and calculate distance
Delta_editing_T_to_C_df<-Delta_editing_T_to_C_df[order(as.numeric(Delta_editing_T_to_C_df$site)), ]
Delta_editing_T_to_C_df$Mistmatch_names<-paste0(Delta_editing_T_to_C_df$site,Delta_editing_T_to_C_df$sample.ctrl,Delta_editing_T_to_C_df$desc )
Delta_editing_list_TtoC<-split(Delta_editing_T_to_C_df, f=Delta_editing_T_to_C_df$Mistmatch_names)
Delta_editing_mistmatch_T_to_C<-mclapply(Delta_editing_list_TtoC, mc.cores = 8, function(x){
  Editing_levels<-as.numeric(x[1,1:41])
  Distance_from_disruption<-(146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:41]))))*(-1) #146- as.numeric(gsub("A","",names(x[,1:41])))- as.numeric(x[1,"site"])
  Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels)))
})
Delta_editing_mistmatch_TtoC_df<-do.call(rbind,Delta_editing_mistmatch_T_to_C)

# Apply loess smoothing
Delta_editing_mistmatch_TtoC_df<-Delta_editing_mistmatch_TtoC_df[!is.nan(Delta_editing_mistmatch_TtoC_df$Editing_level_A_to_I),]
loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=Delta_editing_mistmatch_TtoC_df, span=0.07)
Delta_editing_mistmatch_TtoC_df$smoothed5 <- predict(loessMod50)

# Calculate quartiles per distance across all A-C carrying constructs
ADAR_A_C_list<-split(Delta_editing_mistmatch_TtoC_df, f=Delta_editing_mistmatch_TtoC_df$Distance_from_mistmatch)
A_C_ADAR_list<-mclapply(ADAR_A_C_list, mc.cores = 8, function(x){
  df<-as.data.frame(x)
  quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
  colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
  df<-cbind(df,quartiles_df)
  return(df)
})
ADAR_A_C_list_df<-do.call(rbind,A_C_ADAR_list)
ADAR_A_C_list_df$ADAR<-z

################### A-A mismatch ########################    

# Filter data
Disruption_extened_A_A <- df_A_G_POS_editing %>% filter(desc == "mismatch1" & sample.ctrl == "sample")

# Restrict the barcode table to B2 entries and keep only the barcode and sequence columns
Barcodes_csv<-Barcodes_csv[Barcodes_csv$B2.mNG=="B2",]
Barcodes_csv_all_com<-Barcodes_csv[,c("barcode","seq")]

# Calculate mismatch position and merge with barcode data
Disruption_extened_A_A$loc_nuc_mistmatch<-146-as.numeric(Disruption_extened_A_A$site)
Delta_editing_all_comb_dfmerged<-merge(Disruption_extened_A_A,Barcodes_csv_all_com, by.x="Barcode ID",by.y="barcode")
Delta_editing_all_comb_dfmerged$site<-as.numeric(Delta_editing_all_comb_dfmerged$site)

# Define reference sequences
perf_ds_sequence<-c("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA")
Variable_arm<-c("TTTTGGTTTTTCGAGACAGGGTTTCTCTGTGTAGCTCTGGCTGTCCTGGAACTCACTTTGTAGACCAGGCTGGCCTCCAACTCAGAAATCTGCCTGCCTCTGCCTCCCGAGTGCTGGGATTAAAGGCGTGTGCCACCACGCCCGGC")

# Process the data
Delta_editing_all_comb_nuc_list<-mclapply(rownames(Delta_editing_all_comb_dfmerged), mc.cores = 8, function(w){
  z<-as.data.frame(Delta_editing_all_comb_dfmerged[w,])
  z$nucletide<-substr(perf_ds_sequence, z$loc_nuc_mistmatch, z$loc_nuc_mistmatch)
  z$ref_nuc_to_changed<-substr(Variable_arm, z$site+1, z$site+1)
  z$To_nuc_changed<-substr(z$seq, z$site+1, z$site+1)
  return(z)
})
Delta_editing_all_comb_nuc_df<-do.call(rbind,Delta_editing_all_comb_nuc_list)

Delta_editing_all_A_A_mis<-Delta_editing_all_comb_nuc_df[Delta_editing_all_comb_nuc_df$nucletide=="A" & Delta_editing_all_comb_nuc_df$ref_nuc_to_changed=="T" &Delta_editing_all_comb_nuc_df$To_nuc_changed=="A",]
Delta_editing_all_comb_nuc_df<-data.frame(Delta_editing_all_comb_nuc_df$`Barcode ID`) 
colnames(Delta_editing_all_comb_nuc_df)<-"Barcode ID"

# Merge data with mismatch information
Disruption_A_A_mismacth<-merge(Disruption_extened_A_A,Delta_editing_all_comb_nuc_df,by="Barcode ID")
Disruption_A_A_mismacth<-Disruption_A_A_mismacth[,colnames(Disruption_A_A_mismacth)[c(2:42,1,44:ncol(Disruption_A_A_mismacth)-1)]]
Disruption_extened_A_A<-Disruption_A_A_mismacth
Disruption_extened_A_A$mismatch_position<- 146-as.numeric(Disruption_extened_A_A$site)
Disruption_extened_A_A<-Disruption_extened_A_A[order(as.numeric(Disruption_extened_A_A$mismatch_position)),]

# Set rownames for visualization
rownames(Disruption_extened_A_A)<- as.character(Disruption_extened_A_A$mismatch_position)

# Extract Perfect_ds
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44]
Perfect_ds$site<-0

# Calculate Delta editing per construct
Delta_editing<- mclapply(rownames(Disruption_extened_A_A),mc.cores = 8, function(x){
  Delta_editing_per_construct<-Disruption_extened_A_A[x,1:41]-Perfect_ds[1,1:41]
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_extened_A_A[x,42:45])
  return(Delta_editing_per_construct)
} )
Delta_editing_T_to_A_df<-do.call(rbind,Delta_editing)

# Calculate the distance from the disruption per construct
Delta_editing_T_to_A_df<-Delta_editing_T_to_A_df[order(as.numeric(Delta_editing_T_to_A_df$site)), ]
Delta_editing_T_to_A_df$Mistmatch_names<-paste0(Delta_editing_T_to_A_df$site,Delta_editing_T_to_A_df$sample.ctrl,Delta_editing_T_to_A_df$desc )
Delta_editing_list_TtoA<-split(Delta_editing_T_to_A_df, f=Delta_editing_T_to_A_df$Mistmatch_names)
Delta_editing_mistmatch_T_to_A<-mclapply(Delta_editing_list_TtoA, mc.cores = 8, function(x){
  Editing_levels<-as.numeric(x[1,1:41])
  Distance_from_disruption<- (146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:41]))))*(-1)
  Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels)))
})
Delta_editing_mistmatch_T_to_A_DF<-do.call(rbind,Delta_editing_mistmatch_T_to_A)

# Smooth the data using loess
Delta_editing_mistmatch_T_to_A_DF<-Delta_editing_mistmatch_T_to_A_DF[!is.nan(Delta_editing_mistmatch_T_to_A_DF$Editing_level_A_to_I),]
loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=Delta_editing_mistmatch_T_to_A_DF, span=0.07)
Delta_editing_mistmatch_T_to_A_DF$smoothed5 <- predict(loessMod50)
Delta_editing_mistmatch_T_to_A_DF$ADAR<-z
ADAR2_A_A_list<-split(Delta_editing_mistmatch_T_to_A_DF, f=Delta_editing_mistmatch_T_to_A_DF$Distance_from_mistmatch)

# Calculate quartiles per distance across all A-A carrying constructs
ADAR2_A_A_ADAR2_list<-mclapply(ADAR2_A_A_list, mc.cores = 8, function(x){
  df<-as.data.frame(x)
  quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
  colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
  df<-cbind(df,quartiles_df)
  return(df)
})
ADAR2_A_A_list_df<-do.call(rbind,ADAR2_A_A_ADAR2_list)

################### A-G mismatch ########################    

# Filter data
Disruption_extened_T_to_G<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="TtoG"&df_A_G_POS_editing$sample.ctrl=="sample" ,]

# Calculate and assign mismatch position
Disruption_extened_T_to_G$mismatch_position<- 146-as.numeric(Disruption_extened_T_to_G$site)

# Arrange data by mismatch position and set row names
Disruption_extened_T_to_G<-Disruption_extened_T_to_G[order(as.numeric(Disruption_extened_T_to_G$mismatch_position)),]
rownames(Disruption_extened_T_to_G)<- as.character(Disruption_extened_T_to_G$mismatch_position)

# Extract Perfect_ds
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:44]
Perfect_ds$site<-0

# Calculate Delta editing per construct
Delta_editing<- mclapply(rownames(Disruption_extened_T_to_G),mc.cores = 8, function(x){
  Delta_editing_per_construct<-Disruption_extened_T_to_G[x,1:41]-Perfect_ds[1,1:41]
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_extened_T_to_G[x,42:45])
  return(Delta_editing_per_construct)
} )
Delta_editing_T_to_G_df<-do.call(rbind,Delta_editing)
Delta_editing_T_to_G_df<-Delta_editing_T_to_G_df[order(as.numeric(Delta_editing_T_to_G_df$site)), ]
Delta_editing_T_to_G_df$Mistmatch_names<-paste0(Delta_editing_T_to_G_df$site,Delta_editing_T_to_G_df$sample.ctrl,Delta_editing_T_to_G_df$desc )

# Calculate the distance from the disruption per construct
Delta_editing_list_TtoG<-split(Delta_editing_T_to_G_df, f=Delta_editing_T_to_G_df$Mistmatch_names)
Delta_editing_mistmatch_T_to_G<-mclapply(Delta_editing_list_TtoG, mc.cores = 8, function(x){
  Editing_levels<-as.numeric(x[1,1:41])
  Distance_from_disruption<-(146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:41]))))*(-1) 
  Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels)))
})
Delta_editing_mistmatch_TtoG_df<-do.call(rbind,Delta_editing_mistmatch_T_to_G)
Delta_editing_mistmatch_TtoG_df<-Delta_editing_mistmatch_TtoG_df[!is.nan(Delta_editing_mistmatch_TtoG_df$Editing_level_A_to_I),]

# Smooth the data using loess
loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=Delta_editing_mistmatch_TtoG_df, span=0.07)
Delta_editing_mistmatch_TtoG_df$smoothed5 <- predict(loessMod50)
Delta_editing_mistmatch_TtoG_df$ADAR<-z

# Split and summarize data by Distance_from_mistmatch
ADAR2_A_C_list<-split(Delta_editing_mistmatch_TtoG_df, f=Delta_editing_mistmatch_TtoG_df$Distance_from_mistmatch)
ADAR2_A_C_ADAR2_list<-mclapply(ADAR2_A_C_list, mc.cores = 8, function(x){
  df<-as.data.frame(x)
  quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
  colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
  df<-cbind(df,quartiles_df)
  return(df)
})
ADAR2_A_G_list_df<-do.call(rbind,ADAR2_A_C_ADAR2_list)

# Combine the data
ADAR_A_C_list_df<-ADAR_A_C_list_df[,colnames(ADAR2_A_G_list_df)]
A_A_A_C_A_G_mismatch_df_all_ADARs_df_summary<-rbind(ADAR_A_C_list_df,ADAR2_A_A_list_df,ADAR2_A_G_list_df)

# Create the summary plots per mismatch
p.1 <- ggplot(A_A_A_C_A_G_mismatch_df_all_ADARs_df_summary, aes(Distance_from_mistmatch, smoothed5, colour=Mismatch_kind))+ geom_line(size=0.5)+theme_classic()
p.1 <-p.1 +theme(strip.background = element_blank(),
                 strip.text.y = element_blank(),
                 legend.title=element_blank(),legend.position=c(0.8,0.25),legend.text = element_text(size=10),plot.title = element_text(size=8,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=8,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 10),axis.text.y = element_text(colour="black",size=8)) 
p.1<-p.1+scale_x_continuous(n.breaks = 6,limits=c(-50,50))
vertical.lines<-c(-35,-26,30)
p.1<-p.1+geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+scale_y_continuous(n.breaks = 8, limits=c(NA,NA))
p.1<-p.1+scale_color_manual(values=c('Black','darkblue','darkgreen'),labels=c('A-A','A-C', 'A-G'))+xlab("Bulge kind")+ylab("Delta Editing")
plot(p.1)

########################### Mismatch all combinations ###############

# Filter Barcodes_csv for B2.mNG=="B2"
Barcodes_csv<-Barcodes_csv[Barcodes_csv$B2.mNG=="B2",]
Barcodes_csv$site<-as.character(Barcodes_csv$site)

# Filter data for mismatch_all_comb in the sample
all_combinations<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch_all_comb" & df_A_G_POS_editing$sample.ctrl=="sample" ,]

# Filter data for mismatch1 in the sample
target_sites<-unique(all_combinations$site)
all_combinations_1<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1" & df_A_G_POS_editing$sample.ctrl=="sample" ,]
all_combinations_1<-all_combinations_1[na.omit(match(target_sites,all_combinations_1$site)),]

# Combine the two datasets
all_combinations<-rbind(all_combinations,all_combinations_1)
all_combinations$loc_nuc_mistmatch<-146-as.numeric(all_combinations$site)

# Extract Perfect_ds
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:41]
Perfect_ds$site<-0

# Calculate Delta editing per construct
Delta_editing<- mclapply(rownames(all_combinations),mc.cores = 8, function(x){
  Delta_editing_per_construct<-all_combinations[x,1:41]-Perfect_ds[1,1:41]
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,all_combinations[x,42:46])
  return(Delta_editing_per_construct)
} )
Delta_editing_all_comb_df<-do.call(rbind,Delta_editing)
Delta_editing_all_comb_df$site<-as.numeric(Delta_editing_all_comb_df$site)
Barcodes_csv_all_com<-Barcodes_csv[,c("barcode","seq")]

# Merge Delta_editing_all_comb_df with Barcodes_csv_all_com
Delta_editing_all_comb_dfmerged<-merge(Delta_editing_all_comb_df,Barcodes_csv_all_com, by.x="Barcode ID",by.y="barcode")

# Define perf_ds_sequence and Variable_arm
perf_ds_sequence<-c("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA")
Variable_arm<-c("TTTTGGTTTTTCGAGACAGGGTTTCTCTGTGTAGCTCTGGCTGTCCTGGAACTCACTTTGTAGACCAGGCTGGCCTCCAACTCAGAAATCTGCCTGCCTCTGCCTCCCGAGTGCTGGGATTAAAGGCGTGTGCCACCACGCCCGGC")

Delta_editing_all_comb_nuc_list<-mclapply(rownames(Delta_editing_all_comb_dfmerged), mc.cores = 8, function(w){
  z<-as.data.frame(Delta_editing_all_comb_dfmerged[w,])
  z$nucletide<-substr(perf_ds_sequence, z$loc_nuc_mistmatch, z$loc_nuc_mistmatch)
  z$ref_nuc_to_changed<-substr(Variable_arm, z$site+1, z$site+1)
  z$To_nuc_changed<-substr(z$seq, z$site+1, z$site+1)
  return(z)
})
Delta_editing_all_comb_nuc_df<-do.call(rbind,Delta_editing_all_comb_nuc_list)

# Define a function for defining the nucleotide surrounding each A site 
  # Collect the editing values in a window around the mismatch of one construct.
  #
  # df_row           one-row data frame; must contain "loc_nuc_mistmatch" and the
  #                  per-adenosine columns named "A<position>".
  # Inv_ds_sequence  reference sequence that is scanned around the mismatch.
  # window_size      number of positions inspected on each side of the mismatch.
  #
  # Returns df_row with 2*window_size+1 extra columns named by their offset
  # (-window_size ... window_size). A column holds the value of df_row's
  # "A<position>" column when the reference nucleotide at that offset is "A",
  # and NA otherwise.
  nuc_wind_function<-function(df_row,Inv_ds_sequence,window_size){
    postion_mis<-df_row[,"loc_nuc_mistmatch"]
    window_pos<- -window_size:window_size
    # A plain lapply is used on purpose: the window has only a handful of
    # positions and this function is itself called from an mclapply worker, so
    # forking again per row costs far more than the lookup it parallelises.
    window_editing<-lapply(window_pos, function(offset){
      target_pos<-postion_mis+offset
      if(substr(Inv_ds_sequence,target_pos,target_pos)=="A"){
        df_row[1,paste0("A",target_pos)]
      }else{
        NA
      }
    })
    # Append the window values as one column per offset
    df_row_dis<-cbind(df_row,t(unlist(window_editing)))
    colnames(df_row_dis)<-c(colnames(df_row),window_pos)
    return(df_row_dis)
  }

# Apply nuc_wind_function to each row in Delta_editing_all_comb_nuc_df
Mismatch_all_comb<-mclapply(rownames(Delta_editing_all_comb_nuc_df),mc.cores = 8, function(z){
  df_rowz_input<-Delta_editing_all_comb_nuc_df[z,]
  x<-nuc_wind_function(df_rowz_input,perf_ds_sequence,4 )
})
Mismatch_all_comb_df<- do.call(rbind,Mismatch_all_comb)

# Split Mismatch_all_comb_df by nucleotide
Mism_oppo_of_comb<-split(Mismatch_all_comb_df,f=Mismatch_all_comb_df$nucletide)


# Filter data for mismatch-carrying constucts (TtoC and TtoG) in the sample
all_combinations1<-df_A_G_POS_editing[(df_A_G_POS_editing$desc=="TtoC"|df_A_G_POS_editing$desc=="TtoG") & df_A_G_POS_editing$sample.ctrl=="sample" ,]
target_sites<-unique(all_combinations1$site)

# Filter data for mismatch1-carrying constucts in the sample
all_combinations_1<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1" & df_A_G_POS_editing$sample.ctrl=="sample" ,]
all_combinations_1<-all_combinations_1[na.omit(match(target_sites,all_combinations_1$site)),]
all_combinations<-rbind(all_combinations1,all_combinations_1)
all_combinations$loc_nuc_mistmatch<-146-as.numeric(all_combinations$site)

# Filter data for perfect_ds
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="perfect_ds",1:41]
Perfect_ds$site<-0

# Calculate Delta editing per construct
Delta_editing<- mclapply(rownames(all_combinations),mc.cores = 8, function(x){
  Delta_editing_per_construct<-all_combinations[x,1:41]-Perfect_ds[1,1:41]
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,all_combinations[x,42:46])
  return(Delta_editing_per_construct)
} )
Delta_editing_all_comb_df<-do.call(rbind,Delta_editing)

Delta_editing_all_comb_df$site<-as.numeric(Delta_editing_all_comb_df$site)
Barcodes_csv_all_com<-Barcodes_csv[,c("barcode","seq")]

# Merge Delta_editing_all_comb_df with Barcodes_csv
Delta_editing_all_comb_dfmerged<-merge(Delta_editing_all_comb_df,Barcodes_csv_all_com, by.x="Barcode ID",by.y="barcode")


Delta_editing_all_comb_nuc_list<-mclapply(rownames(Delta_editing_all_comb_dfmerged), mc.cores = 8, function(w){
  z<-as.data.frame(Delta_editing_all_comb_dfmerged[w,])
  z$nucletide<-substr(perf_ds_sequence, z$loc_nuc_mistmatch, z$loc_nuc_mistmatch)
  z$ref_nuc_to_changed<-substr(Variable_arm, z$site+1, z$site+1)
  z$To_nuc_changed<-substr(z$seq, z$site+1, z$site+1)
  return(z)
})
Delta_editing_all_comb_nuc_df<-do.call(rbind,Delta_editing_all_comb_nuc_list)
df_row=Delta_editing_all_comb_nuc_df[1,]
Inv_ds_sequence=perf_ds_sequence
window_size=4

# Define a function for defining the nucleotide surrounding each A site 
  # Collect the editing values in a window around the mismatch of one construct.
  #
  # df_row           one-row data frame; must contain "loc_nuc_mistmatch" and the
  #                  per-adenosine columns named "A<position>".
  # Inv_ds_sequence  reference sequence that is scanned around the mismatch.
  # window_size      number of positions inspected on each side of the mismatch.
  #
  # Returns df_row with 2*window_size+1 extra columns named by their offset
  # (-window_size ... window_size). A column holds the value of df_row's
  # "A<position>" column when the reference nucleotide at that offset is "A",
  # and NA otherwise.
  nuc_wind_function<-function(df_row,Inv_ds_sequence,window_size){
    postion_mis<-df_row[,"loc_nuc_mistmatch"]
    window_pos<- -window_size:window_size
    # A plain lapply is used on purpose: the window has only a handful of
    # positions and this function is itself called from an mclapply worker, so
    # forking again per row costs far more than the lookup it parallelises.
    window_editing<-lapply(window_pos, function(offset){
      target_pos<-postion_mis+offset
      if(substr(Inv_ds_sequence,target_pos,target_pos)=="A"){
        df_row[1,paste0("A",target_pos)]
      }else{
        NA
      }
    })
    # Append the window values as one column per offset
    df_row_dis<-cbind(df_row,t(unlist(window_editing)))
    colnames(df_row_dis)<-c(colnames(df_row),window_pos)
    return(df_row_dis)
  }


Mismatch_all_comb<-mclapply(rownames(Delta_editing_all_comb_nuc_df),mc.cores = 8, function(z){
  df_rowz_input<-Delta_editing_all_comb_nuc_df[z,]
  x<-nuc_wind_function(df_rowz_input,perf_ds_sequence,4 )
})
Mismatch_all_comb_df<- do.call(rbind,Mismatch_all_comb)
Mism_oppo_of_N<-split(Mismatch_all_comb_df,f=Mismatch_all_comb_df$nucletide)
Mism_oppo_of_A<-Mism_oppo_of_N[[1]]

# Split Mism_oppo_of_A by the substituted nucleotide (To_nuc_changed)
tmp11<-split(Mism_oppo_of_A,f=Mism_oppo_of_A$To_nuc_changed)

A_df_all_comb<- rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T))
rownames(A_df_all_comb)<-names(tmp11)
colnames(A_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4")
A_df_all_comb<-A_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")]


Mism_oppo_of_T<-Mism_oppo_of_comb[["T"]]
tmp11<-split(Mism_oppo_of_T,f=Mism_oppo_of_T$To_nuc_changed)
T_df_all_comb<- rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T))
rownames(T_df_all_comb)<-names(tmp11)
colnames(T_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4")
T_df_all_comb<-T_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")]


Mism_oppo_of_G<-Mism_oppo_of_comb[["G"]]
tmp11<-split(Mism_oppo_of_G,f=Mism_oppo_of_G$To_nuc_changed)
G_df_all_comb<- rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T))
rownames(G_df_all_comb)<-names(tmp11)
colnames(G_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4")
G_df_all_comb<-G_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")]


Mism_oppo_of_C<-Mism_oppo_of_comb[["C"]]
tmp11<-split(Mism_oppo_of_C,f=Mism_oppo_of_C$To_nuc_changed)
C_df_all_comb<- rbind(colMeans(tmp11[[1]][,51:59], na.rm = T),colMeans(tmp11[[2]][,51:59], na.rm = T),colMeans(tmp11[[3]][,51:59], na.rm = T))
rownames(C_df_all_comb)<-names(tmp11)
colnames(C_df_all_comb)<-c("4","3","2","1","A","-1","-2","-3","-4")
C_df_all_comb<-C_df_all_comb[,c("-4","-3","-2","-1","A","1","2","3","4")]

max_min_df_all_com<-rbind(A_df_all_comb,T_df_all_comb,G_df_all_comb,C_df_all_comb)

if(abs(min(max_min_df_all_com,na.rm = T))>abs(max(max_min_df_all_com,na.rm = T))){
  Range_value<-abs(min(max_min_df_all_com,na.rm = T))
  
}else{
  Range_value<-abs(max(max_min_df_all_com,na.rm = T))
  
}

# Create a vector of breaks for the color scale
breaksList = seq(-Range_value,Range_value ,by=0.5)

# Generate a pheatmap for Mismatch opposite of A
p4<-pheatmap(A_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = "Mismatch opposite of A",color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), 
             breaks = breaksList, fontsize = 6)

# Generate a pheatmap for Mismatch opposite of T
p1<-pheatmap(T_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = paste0(z,": Mismatch opposite of T"),
             color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), 
             breaks = breaksList, fontsize = 6)

# Generate a pheatmap for Mismatch opposite of G
p2<-pheatmap(G_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = "Mismatch opposite of G",color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), 
             breaks = breaksList, fontsize = 6)

# Generate a pheatmap for Mismatch opposite of C
p3<-pheatmap(C_df_all_comb,cluster_rows=FALSE, cluster_cols=FALSE, main = "Mismatch opposite of C",color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")))(length(breaksList)), 
             breaks = breaksList, fontsize = 6)

# Tabulate, for the construct with barcode "CTGTAGAC", the editing level of
# each adenosine together with its flanking dinucleotide on the forward strand.
#
# df_A_G_POS_editing  editing table; the rows whose `Barcode ID` equals
#                     "CTGTAGAC" and the first 32 columns are used. Column names
#                     are expected to be "A<position>".
# name_treatment      value written to the ADAR column of the result.
#
# Returns a data frame with the columns Editing, A_position, Nuc_downstream,
# Pos, ADAR and library: first the "Upstream" rows (nucleotide before the A plus
# the A itself), then the "Downstream" rows (the A plus the nucleotide after it).
#
# NOTE(review): the dinucleotide is stored in Nuc_downstream for both sides so
# that the two tables can be row-bound. The library column is always "mNG"
# although this helper handles the B2 sequence, and only the first 32 columns
# are taken while other B2 steps use 41 - confirm both are intended.
Seq_preference_B2<-function(df_A_G_POS_editing,name_treatment){
  reference_strand<-("GCCGGGCGTGGTGGCACACGCCTTTAATCCCAGCACTCGGGAGGCAGAGGCAGGCAGATTTCTGAGTTGGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAA")

  # One row per adenosine column, with the numeric position parsed from its name
  site_table<-as.data.frame(t(df_A_G_POS_editing[df_A_G_POS_editing$`Barcode ID`=="CTGTAGAC",1:32]))
  site_table$A_pos<-as.numeric(gsub("A","",rownames(site_table)))

  # Build the table for one side: the dinucleotide spans
  # [A position + shift_start, A position + shift_stop] on the reference strand.
  flank_table<-function(shift_start,shift_stop,side){
    per_site<-mclapply(rownames(site_table),mc.cores = 8, function(site_id){
      site_row<-as.data.frame(site_table[site_id,])
      a_position<-as.numeric(site_row$A_pos)
      dinucleotide<-substr(reference_strand, start = a_position+shift_start, stop = a_position+shift_stop)
      site_row<-cbind(site_row,dinucleotide)
      colnames(site_row)<-c("Editing","A_position","Nuc_downstream")
      site_row
    })
    flanks<-do.call(rbind,per_site)
    flanks$Pos<-side
    flanks$ADAR<-name_treatment
    flanks
  }

  Nuc_preference<-rbind(flank_table(-1,0,"Upstream"),flank_table(0,1,"Downstream"))
  Nuc_preference$library<-"mNG"

  Nuc_preference
}

ADAR_seq_preference<-Seq_preference_B2(df_A_G_POS_editing,z)

output<-list(ADAR_seq_preference,df_A_G_POS_editing)

return(output)
}


####ADAR2

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS_S68_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS/657_ADARs_mNG_library_ADAR1_KO_5_658_ADARs_B2_G_LRS_S68_R2_001.fastq.gz"

# Sample up to 1e7 reads from each mate file. The seed is reset immediately
# before each sampler is created and drained, so both files start from the same
# random state and the same records are drawn from R1 and R2 (the analysis
# function pairs the mates by their index in the sample).
set.seed(123L)
f1 <- FastqSampler(fastq_file1, n = 1e7)
B2_R1 <- yield(f1)
close(f1)
set.seed(123L)
f2 <- FastqSampler(fastq_file2, n = 1e7)
B2_R2 <- yield(f2)
close(f2)
ADAR2_Rep1_B2<-B2_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR2")

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R2_001.fastq.gz"

# Sample up to 1e7 reads from each mate file. The seed is reset immediately
# before each sampler is created and drained, so both files start from the same
# random state and the same records are drawn from R1 and R2 (the analysis
# function pairs the mates by their index in the sample).
set.seed(123L)
f1 <- FastqSampler(fastq_file1, n = 1e7)
B2_R1 <- yield(f1)
close(f1)
set.seed(123L)
f2 <- FastqSampler(fastq_file2, n = 1e7)
B2_R2 <- yield(f2)
close(f2)

# Alternatively, if you prefer to work with the entire dataset rather than a sample, you can use the following lines to load the complete data files.
# B2_R1 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R1_001.fastq.gz")
# B2_R2 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS/657_ADARs_mNG_library_ADAR1_KO_13_658_ADARs_B2_O_LRS_S76_R2_001.fastq.gz")

ADAR2_Rep2_B2<-B2_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR2")

# Replicate 1: long format, one row per construct and "A..." column, keyed by
# barcode + sample/control flag + adenosine column name.
ADAR2_1_ggplot <- ADAR2_Rep1_B2[[2]] %>%
  pivot_longer(cols = starts_with("A"), names_to = "A_postion", values_to = "AtoI_Editing")
ADAR2_1_ggplot$ID_ggplot <- paste0(ADAR2_1_ggplot$`Barcode ID`, ADAR2_1_ggplot$sample.ctrl, ADAR2_1_ggplot$A_postion)
# Columns 6:7 are kept by position; presumably the editing value and the key
# built above - TODO confirm against the layout of the analysis output.
ADAR2_1_ggplot_df <- as.data.frame(ADAR2_1_ggplot)[, 6:7]
colnames(ADAR2_1_ggplot_df) <- c("ADAR2p_B2_Rep1_AtoI_Editing", "ID_ggplot")

# Replicate 2: same reshaping
ADAR2_2_ggplot <- ADAR2_Rep2_B2[[2]] %>%
  pivot_longer(cols = starts_with("A"), names_to = "A_postion", values_to = "AtoI_Editing")
ADAR2_2_ggplot$ID_ggplot <- paste0(ADAR2_2_ggplot$`Barcode ID`, ADAR2_2_ggplot$sample.ctrl, ADAR2_2_ggplot$A_postion)
ADAR2_2_ggplot_df <- as.data.frame(ADAR2_2_ggplot)[, 6:7]
colnames(ADAR2_2_ggplot_df) <- c("ADAR2p_B2_Rep2_AtoI_Editing", "ID_ggplot")

# Pair the two replicates on the shared key
ADAR2_plasmid_B2_repetitions <- as.data.frame(
  merge(ADAR2_1_ggplot_df, ADAR2_2_ggplot_df, by = "ID_ggplot")
)

# Scatter plot of replicate 1 against replicate 2 with the identity line
p3 <- ggplot(ADAR2_plasmid_B2_repetitions,
             aes(x = ADAR2p_B2_Rep1_AtoI_Editing, y = ADAR2p_B2_Rep2_AtoI_Editing)) +
  geom_point(size = 0.1, color = "deepskyblue4") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  xlim(0, 80) +
  ylim(0, 80) +
  ggtitle("ADAR2") +
  theme_linedraw() +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    legend.title = element_blank(),
    legend.position = "bottom",
    legend.text = element_text(size = 7),
    plot.title = element_text(size = 9, face = "bold"),
    panel.border = element_rect(color = "black", fill = NA, size = 0.5),
    axis.text.x = element_text(colour = "black", size = 7, angle = 90, vjust = 0.5, hjust = 1),
    axis.title = element_text(face = "bold", size = 7),
    axis.text.y = element_text(colour = "black", size = 7)
  ) +
  # The p-value label is a fixed string, not taken from the test below
  annotate(geom = "text", x = 13, y = 70, label = "p-value < 2.2e-16 ", color = "black")

# Correlation between the replicates; its estimate is printed on the plot
Correlation_Rep <- cor.test(
  ADAR2_plasmid_B2_repetitions$ADAR2p_B2_Rep1_AtoI_Editing,
  ADAR2_plasmid_B2_repetitions$ADAR2p_B2_Rep2_AtoI_Editing
)
p3 <- p3 +
  annotate(geom = "text", x = 12, y = 60, label = paste("r=", round(Correlation_Rep$estimate, 4)), color = "black") +
  xlab("Overexpressed ADAR2 rep1") +
  ylab("Overexpressed ADAR2 rep2")
plot(p3)

#### ADAR1

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/658_ADARs_B2_A_LRS/658_ADARs_B2_A_LRS_S82_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/658_ADARs_B2_A_LRS/658_ADARs_B2_A_LRS_S82_R2_001.fastq.gz"

# Sample up to 1e7 reads from each mate file. The seed is reset immediately
# before each sampler is created and drained, so both files start from the same
# random state and the same records are drawn from R1 and R2 (the analysis
# function pairs the mates by their index in the sample).
set.seed(123L)
f1 <- FastqSampler(fastq_file1, n = 1e7)
B2_R1 <- yield(f1)
close(f1)
set.seed(123L)
f2 <- FastqSampler(fastq_file2, n = 1e7)
B2_R2 <- yield(f2)
close(f2)
ADAR1_Rep1_B2<-B2_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR1")

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS_S70_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS/657_ADARs_mNG_library_ADAR1_KO_7_658_ADARs_B2_i_LRS_S70_R2_001.fastq.gz"

# Sample up to 1e7 reads from each mate file. The seed is reset immediately
# before each sampler is created and drained, so both files start from the same
# random state and the same records are drawn from R1 and R2 (the analysis
# function pairs the mates by their index in the sample).
set.seed(123L)
f1 <- FastqSampler(fastq_file1, n = 1e7)
B2_R1 <- yield(f1)
close(f1)
set.seed(123L)
f2 <- FastqSampler(fastq_file2, n = 1e7)
B2_R2 <- yield(f2)
close(f2)

ADAR1_Rep2_B2<-B2_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR1")

# Replicate 1: long format, one row per construct and "A..." column, keyed by
# barcode + sample/control flag + adenosine column name.
ADAR1_1_ggplot<-pivot_longer(ADAR1_Rep1_B2[[2]], cols = starts_with("A"), values_to = "AtoI_Editing", names_to= "A_postion" )
ADAR1_1_ggplot$ID_ggplot<-paste0(ADAR1_1_ggplot$`Barcode ID`,ADAR1_1_ggplot$sample.ctrl,ADAR1_1_ggplot$A_postion)
ADAR1_1_ggplot_df<-as.data.frame(ADAR1_1_ggplot)
# Columns 6:7 are kept by position; presumably the editing value and the key
# built above - TODO confirm against the layout of the analysis output.
ADAR1_1_ggplot_df<-ADAR1_1_ggplot_df[,6:7]
colnames(ADAR1_1_ggplot_df)<-c("ADAR1p_B2_Rep1_AtoI_Editing","ID_ggplot")

# Replicate 2: must be built from ADAR1_Rep2_B2. Reshaping replicate 1 a second
# time would correlate replicate 1 with itself.
ADAR1_2_ggplot<-pivot_longer(ADAR1_Rep2_B2[[2]], cols = starts_with("A"),values_to = "AtoI_Editing", names_to= "A_postion" )
ADAR1_2_ggplot$ID_ggplot<-paste0(ADAR1_2_ggplot$`Barcode ID`,ADAR1_2_ggplot$sample.ctrl,ADAR1_2_ggplot$A_postion)
ADAR1_2_ggplot_df<-as.data.frame(ADAR1_2_ggplot)
ADAR1_2_ggplot_df<-ADAR1_2_ggplot_df[,6:7]
colnames(ADAR1_2_ggplot_df)<-c("ADAR1p_B2_Rep2_AtoI_Editing","ID_ggplot")

# Pair the two replicates on the shared key
ADAR1_plasmid_B2_repetitions<-merge(ADAR1_1_ggplot_df,ADAR1_2_ggplot_df, by.x="ID_ggplot",by.y="ID_ggplot"  )
ADAR1_plasmid_B2_repetitions<-as.data.frame(ADAR1_plasmid_B2_repetitions)

# Scatter plot of replicate 1 against replicate 2 with the identity line
p3<-ggplot(ADAR1_plasmid_B2_repetitions, aes(x=ADAR1p_B2_Rep1_AtoI_Editing,y=ADAR1p_B2_Rep2_AtoI_Editing))+geom_point(size = 0.1, color="deepskyblue4")+ geom_abline(intercept = 0, slope = 1, color="black")+xlim(0,80)+ylim(0,80)+ ggtitle("ADAR1")+theme_linedraw()
p3<-p3+theme_classic()+theme(strip.background = element_blank(),
                             strip.text.x = element_blank(),
                             legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=7),plot.title = element_text(size=9,face = "bold"),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 7),axis.text.y = element_text(colour="black",size=7)) 
# NOTE(review): the p-value label is a fixed string, not taken from the test
# below - confirm it still holds for the replicate 1 vs replicate 2 comparison.
p3<-p3+ annotate(geom="text", x=13, y=70, label=paste("p-value < 2.2e-16 "), color="black")
# Correlation between the replicates; its estimate is printed on the plot
Correlation_Rep<-cor.test(ADAR1_plasmid_B2_repetitions$ADAR1p_B2_Rep1_AtoI_Editing,ADAR1_plasmid_B2_repetitions$ADAR1p_B2_Rep2_AtoI_Editing)
p3<-p3+ annotate(geom="text", x=12, y=60, label=paste("r=",round(Correlation_Rep$estimate,4)), color="black")+xlab("Overexpressed ADAR1 rep1")+ylab("Overexpressed ADAR1 rep2")
plot(p3)


############################### mNG library ###################################

mNG_library_A_to_G_analysis<-function(B2_R1_readFastaq,B2_R2_readFastaq,z){

  # Process Read1.fastq: put the read sequences in a data frame and number the reads
mNG_R1_sequence<-as.data.frame(sread(B2_R1))
mNG_R1_sequence$number_read<-1:nrow(mNG_R1_sequence)

# Calculate the total number of reads in R1
Total_Number_of_Reads_R1<-nrow(mNG_R1_sequence)

# Filter sequences containing "TTTGCCA" and Extract those sequences
mNG_R1_sequence<-mNG_R1_sequence[grepl("TTTGCCA",mNG_R1_sequence$x),]
mNG_R1_sequence$mNG_sequence<-gsub(".*TTTGCCA","" ,mNG_R1_sequence$x)

# Calculate the number of reads beginning properly
number_reads_beggining_properly<- nrow(mNG_R1_sequence)
colnames(mNG_R1_sequence)<-c("non_processed_R1","number_read","mNG_constructR1")
mNG_R1_sequence<-mNG_R1_sequence[nchar(mNG_R1_sequence$mNG_constructR1)>=81,]

# Truncate 'Read 1' to 81 characters
mNG_R1_sequence$mNG_constructR1<-substr(mNG_R1_sequence$mNG_constructR1, start = 1,stop = 81)

# Calculate the percentage of reads beginning properly
Percentage_reads_beggining_properly<- (number_reads_beggining_properly/ Total_Number_of_Reads_R1)*100

# Reads2
# Read Read2.fastq file and create a data frame
mNG_R2_sequence<-as.data.frame(sread(B2_R2))
mNG_R2_sequence$number_read<-1:nrow(mNG_R2_sequence)
Total_Number_of_Reads_R2<-nrow(mNG_R2_sequence)

# Create a reverse complement of B2_R2 data
mNG_R2_sequence_reverse_comp<-as.data.frame(sread(reverseComplement(B2_R2)))
mNG_R2_sequence_reverse_comp$number_read<-1:nrow(mNG_R2_sequence_reverse_comp)

# Read barcodes data TWIST library spreadsheet obtained from Uzonyi 2021
#This table can be found in Uzonyi et al.(2021): Table S2. Sequences of the B2 and mNG oligo library pools, related to STAR Methods.
Barcodes_csv<-read.csv("/DATA_HELA_ADAR2_over_ADAR1_knockdown_B2_libr/Table_S2_Twist_library_sequence_plans.csv", sep = ",", header = TRUE)

# Filter for only mNG construct's sequences
Barcodes_csv<-Barcodes_csv[Barcodes_csv$B2.mNG=="mNG",]
Barcodes_csv$site<-as.character(Barcodes_csv$site) 

# Process mNG_R2_sequence data based on barcode location per construct
mNG_R2_sequence<-mNG_R2_sequence_reverse_comp[grepl("TCCCTCA",mNG_R2_sequence_reverse_comp$x),]
mNG_R2_sequence<-mNG_R2_sequence[grepl("TTCGAA",mNG_R2_sequence$x),]
mNG_R2_sequence$barcode<-gsub(".*TTCGAA","",mNG_R2_sequence$x)
mNG_R2_sequence$barcode<-gsub("\\TCCCTC.*","",mNG_R2_sequence$barcode)
mNG_R2_sequence<-mNG_R2_sequence[grep("ACTAGTAT",mNG_R2_sequence$x),]
mNG_R2_sequence$mNG_constructR2<-gsub("\\ACTAGTAT.*","",mNG_R2_sequence$x)

# Truncate 'Read 2'
mNG_R2_sequence<-mNG_R2_sequence[nchar(mNG_R2_sequence$mNG_constructR2)>=64,]
mNG_R2_sequence$mNG_constructR2<- substr(mNG_R2_sequence$mNG_constructR2, start =nchar(mNG_R2_sequence$mNG_constructR2)-64,stop = nchar(mNG_R2_sequence$mNG_constructR2))

mNG_R2_sequence<-mNG_R2_sequence[nchar(mNG_R2_sequence$barcode)==8,]

# Merge mNG_R2_sequence with Barcodes_csv
mNG_R2_sequence_Barcode_ID<-merge(mNG_R2_sequence,Barcodes_csv,by.x="barcode", by.y="barcode" )
colnames(mNG_R2_sequence_Barcode_ID)<-c("barcode","No_processed_Read2", "number_read","mNG_constructR2", "ID","Buffer","F","BstBI","loop","seq","AscI", "R","desc","site","B2.mNG","sample.ctrl","Total_length","Buffer_length")
Number_of_Barcodes<-length(unique(Barcodes_csv$barcode))

Reads_properly_barcodes<-nrow(mNG_R2_sequence_Barcode_ID)
Reads_properly_barcodes_percent<-(Reads_properly_barcodes/Total_Number_of_Reads_R2)*100

mNG_df_R1_R2<-merge(mNG_R1_sequence,mNG_R2_sequence_Barcode_ID, by.x="number_read", by.y="number_read", all=F)


mNG_df_R1_R2$Whole_Construct<- paste0(mNG_df_R1_R2$mNG_constructR1,mNG_df_R1_R2$mNG_constructR2)
mNG_df_R1_R2<-mNG_df_R1_R2[nchar(mNG_df_R1_R2$Whole_Construct)==146,]


# Define the perfect DNA strand sequence
perfect_ds_fw_strand<-("AGCCAATGGCGGCTAACTATCTGAAGAACCAGCCGATGTACGTGTTCCGTAAGACGGAGCTCAAGCACTCCAAGACCGAGCTCAACTTCAAGGAGTGGCAAAAGGCCTTTACCGATGTGATGGGCATGGACGAGCTGTACAAGTAA")

# Generate row names for mNG_df_R1_R2
rownames(mNG_df_R1_R2)<-paste0("R",1:nrow(mNG_df_R1_R2))
design_seq_const<-perfect_ds_fw_strand

# Extract A_G positions from the perfect DNA strand
A_G_positions<-unlist(gregexpr('A', design_seq_const))
Nuc_matrix<-do.call(rbind,str_split(mNG_df_R1_R2$Whole_Construct,""))
Nuc_matrix<-as.data.frame(Nuc_matrix)
rownames(Nuc_matrix)<-rownames(mNG_df_R1_R2)
AG_pos_analized<-Nuc_matrix[,paste0("V",A_G_positions)]
rownames(AG_pos_analized)<-rownames(mNG_df_R1_R2)
AG_pos_analized$Combined_cols <- do.call(paste, c(AG_pos_analized[1:ncol(AG_pos_analized)], sep = ""))

# Filter A positions that do not contain 'C' or 'T' in Combined_cols
AG_pos_analized<-AG_pos_analized[!grepl("C|T",AG_pos_analized$Combined_cols),]
mNG_df_R1_R2<-mNG_df_R1_R2[rownames(AG_pos_analized),]

# Calculate the number of reads after filtering
Number_of_Reads_after_filter_out<- nrow(mNG_df_R1_R2)

QC_control_df1<-data.frame(Total_Number_of_Reads_R1, number_reads_beggining_properly, Percentage_reads_beggining_properly, Total_Number_of_Reads_R2, Reads_properly_barcodes, Reads_properly_barcodes_percent)
Quantiles_number_read_per_barcode<-quantile(as.numeric(table(mNG_df_R1_R2$barcode)), c(0.01,0.1,0.9,0.99))
QC_control_df<-cbind(QC_control_df1, t(as.data.frame(Quantiles_number_read_per_barcode)))
colnames(QC_control_df)<-c("Total Number of Reads_R1","Number of reads beginning properly", "% of reads beginning properly","Number of Reads_R2","Number of Reads beginning properly barcodes_R2","% of Reads with proper barcodes_R2","1%","10%","90%","99%")

# Create a plot of the ECDF for the number of reads in each barcode
plot(ecdf(log10(as.numeric(table(mNG_df_R1_R2$barcode)))), xlab="Log10  of number of reads in each barcode", pch=20, main="ECD: Log10 N° of reads in each barcode",yaxt="n" )
axis(2, at = seq(0, 1, by = 0.1), las=2)
abline(v=log10(6), col="Red")


# Subset the original dataframe to keep only the selected columns
mNG_df_R1_R2<-mNG_df_R1_R2[,1:21]
mNG_df_R1_R2<-mNG_df_R1_R2[,c("number_read", "Whole_Construct","barcode","desc", "site","B2.mNG","sample.ctrl")]

# Split the dataframe into a list by the 'barcode' column
mNG_df_R1_R2_list<-split(mNG_df_R1_R2, f=mNG_df_R1_R2$barcode)


########## Boxplots representing the distribution of numbers of editing events in the single mNG/B2 perfect double-stranded molecules ##########################

# Filter and extract relevant mNG Construct
perfec_ds_filtered_mNG_df_R1_R2_list<-strsplit(mNG_df_R1_R2_list[["AAGGCCAT"]]$Whole_Construct,"")
perfec_ds_filtered_mNG_df_R1_R2_list<-mclapply(perfec_ds_filtered_mNG_df_R1_R2_list, mc.cores = 8, function(x){
  x[A_G_positions]})

perfec_ds_filtered_mNG_df_R1_R2_df_nuc<-as.data.frame(do.call(rbind,perfec_ds_filtered_mNG_df_R1_R2_list))
perfect_ds_fw_strand_separated<-unlist(strsplit(perfect_ds_fw_strand,""))
perfect_ds_fw_strand_separated<-perfect_ds_fw_strand_separated[A_G_positions]
colnames(perfec_ds_filtered_mNG_df_R1_R2_df_nuc)<-perfect_ds_fw_strand_separated
rownames(perfec_ds_filtered_mNG_df_R1_R2_df_nuc)<-1:nrow(perfec_ds_filtered_mNG_df_R1_R2_df_nuc)

# Calculate the number of edits per molecule
Number_of_edits_per_molecule<-mclapply(rownames(perfec_ds_filtered_mNG_df_R1_R2_df_nuc), mc.cores = 8, function(z){
  Number_of_edits<-as.numeric(sum(perfec_ds_filtered_mNG_df_R1_R2_df_nuc[z,]=="G"))
  Number_of_edits_df<-c(perfec_ds_filtered_mNG_df_R1_R2_df_nuc[z,],Number_of_edits,paste0(perfec_ds_filtered_mNG_df_R1_R2_df_nuc[z,],collapse = "") )
  Number_of_edits_df<-as.data.frame(Number_of_edits_df)
  colnames(Number_of_edits_df)<-1:ncol(Number_of_edits_df)
  return(Number_of_edits_df)
})

Number_of_edits_per_molecule_df<-do.call(rbind,Number_of_edits_per_molecule)
colnames(Number_of_edits_per_molecule_df)<-c(perfect_ds_fw_strand_separated,"number_edits","Trimmed_Read1_sequence")
Number_of_edits_per_molecule_df$ADAR<-z

# Create a boxplot using ggplot2
Number_of_edits_per_molecule_df$color_ADAR<-gsub("\\Rep.*","",Number_of_edits_per_molecule_df$ADAR)
p<-ggplot(Number_of_edits_per_molecule_df[,45:48], aes(x=ADAR, y=number_edits,fill=color_ADAR))+geom_boxplot(outlier.shape = NA)+ scale_fill_brewer(palette="PuBu")
p<-p+xlab("")+ylab("# edits per molecule")+theme_classic()+ggtitle("mNG")
p<-p +theme(strip.background = element_blank(),
            strip.text.x = element_blank(),
            legend.title=element_blank(),legend.position="none",legend.text = element_text(size=11),plot.title = element_text(size=14,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=11,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=11)) 
plot(p)


########### A-to-I quantification per barcode #################

# Calculate the number of analyzed barcodes
Number_of_analyzed_barcodes<-length(mNG_df_R1_R2_list)

# Create consensus matrices per barcode
concensus_matrix_per_barcode<-mclapply(mNG_df_R1_R2_list,mc.cores = 12, function(w){
  Sequence_barcode_StringSet<-DNAStringSet(w$Whole_Construct,use.names=TRUE)
  consensus_matrix_barcode<-as.data.frame(t(consensusMatrix(Sequence_barcode_StringSet)))[,c(1:4,15:16)]
})

# Calculate A-to-G rate per barcode
concensus_matrix_per_barcode_A_to_G<-mclapply(concensus_matrix_per_barcode,mc.cores = 12, function(y){
  y$position<-1:nrow(y)
  y$A_to_G_rate<- (y$G/(y$A+y$G))*100
  return(y)
})

# Extract A-to-G rate at target positions
concensus_matrix_per_barcode_A_to_G_target_pos<- mclapply(names(concensus_matrix_per_barcode_A_to_G),mc.cores = 12,function(z){
  df_A_POS<- as.data.frame(t(as.data.frame(concensus_matrix_per_barcode_A_to_G[[z]][A_G_positions,"A_to_G_rate"])))
  df_A_POS$Barcode_names<-names(concensus_matrix_per_barcode_A_to_G[z])
  colnames(df_A_POS)<- c(paste0("A",A_G_positions),"Barcode ID" )
  rownames(df_A_POS)<-names(concensus_matrix_per_barcode_A_to_G[z])
  return(df_A_POS)
})
df_A_G_POS_editing<-do.call(rbind,concensus_matrix_per_barcode_A_to_G_target_pos)

# Create a heatmap which plot the editing levels across all constructs
#pheatmap(df_A_G_POS_editing[,1:44], cluster_rows=F, cluster_cols=F,show_rownames=F, main = "Considering all barcodes")

# Match barcode information
matched_index<-match(df_A_G_POS_editing$`Barcode ID`,Barcodes_csv$barcode)
df_A_G_POS_editing<- cbind(df_A_G_POS_editing,Barcodes_csv[matched_index,c("desc","sample.ctrl","site")])

# Correlation of editing levels in mNG constructs with different barcode sequences
ADAR2_1_barcodes<-pivot_longer(df_A_G_POS_editing, cols = starts_with("A"), values_to = "AtoI_Editing", names_to= "A_postion" )
ADAR2_1_control_barcodes<-ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl=="barcode_ctrl",]
ADAR2_1_control_barcodes$merging_column<-paste0(ADAR2_1_control_barcodes$desc,ADAR2_1_control_barcodes$site,ADAR2_1_control_barcodes$A_postion)
ADAR2_1_sample_barcodes<-ADAR2_1_barcodes[ADAR2_1_barcodes$sample.ctrl=="sample",]
ADAR2_1_sample_barcodes$merging_column<-paste0(ADAR2_1_sample_barcodes$desc,ADAR2_1_sample_barcodes$site,ADAR2_1_sample_barcodes$A_postion)
ADAR2_1_sample_control_barcodes<-merge(ADAR2_1_sample_barcodes,ADAR2_1_control_barcodes,by.x="merging_column",by.y="merging_column")

# Create a scatter plot
p4.1<-ggplot(ADAR2_1_sample_control_barcodes, aes(x=AtoI_Editing.x,y=AtoI_Editing.y))+geom_point(size = 0.1, color="deepskyblue4")+ geom_abline(intercept = 0, slope = 1, color="black")+xlim(0,75)+ylim(0,75)+ ggtitle(z)+theme_linedraw()
p4.1<-p4.1+theme_classic()+theme(strip.background = element_blank(), strip.text.x = element_blank(), legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=7),plot.title = element_text(size=9,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7),axis.title = element_text(face = "bold",size = 7),axis.text.y = element_text(colour="black",size=7)) 
cor_test_mNG<-cor.test(ADAR2_1_sample_control_barcodes$AtoI_Editing.x,ADAR2_1_sample_control_barcodes$AtoI_Editing.y)
p4.1<-p4.1+ annotate(geom="text", x=20, y=60, label=paste(paste("p-value",cor_test_mNG$p.value)), color="black")
p5<-p4.1+ annotate(geom="text", x=13, y=50, label=paste("r=",round(cor_test_mNG$estimate,3)), color="black")+xlab("Sample barcodes")+ylab("Control barcodes")
plot(p5)

############ mNG random disruption #####################

# Filter and extract relevant mNG Construct
Random_disruption_df<-df_A_G_POS_editing[ df_A_G_POS_editing$desc=="mNG_random" & df_A_G_POS_editing$sample.ctrl=="sample" ,]
Random_perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",1:48]
Random_perfect_ds$site<-0
No_ds_structure<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_ctrl_repeat",1:48] 
No_ds_structure$site<-1
Random_disruption_df<-rbind(Random_perfect_ds,Random_disruption_df,No_ds_structure)

# Calculate average editing
Random_disruption_df$average_editing<-rowMeans(Random_disruption_df[,1:44])

# Sort and format data
Random_disruption_df<-Random_disruption_df[order(as.numeric(Random_disruption_df$site)),]
rownames(Random_disruption_df)<-paste(Random_disruption_df$site)
Random_disruption_df$site<- as.numeric(Random_disruption_df$site)*100
Random_disruption_df$Treatment<-z
Random_disruption_df$site<-round(Random_disruption_df$site,2)

# Create a line plot
Random_ggplot<-ggplot(Random_disruption_df, aes(x=site,average_editing,y=average_editing, color=Treatment))+geom_line(size=0.5)+theme_classic(base_size = 12)+xlab("% Random Disruption")+ylab("Mean editing %")+scale_x_continuous(expand = c(0, 0),n.breaks = 10)+scale_y_continuous(expand = c(0, 0),n.breaks = 7, limits = c(0,max(Random_disruption_df$average_editing )+5))+theme(panel.border = element_rect(color = "black",fill = NA,size = 1),legend.position = c(0.7, 0.6),legend.title=element_blank(),axis.title = element_text(face="bold"), title =element_text(face="bold") )+ggtitle("")
Random_ggplot<-Random_ggplot+ scale_color_manual(values=c("#0000FF"))
plot(Random_ggplot)

############  Heatmap of a 1nt-,2nt-,3nt- and 4nt-mismatch running from 5’ to 3’ throughout the double-stranded RNA #####################

# Filter data for mismatch-carrying constructs and perfect ds
Disruption_df<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1"|df_A_G_POS_editing$desc=="mismatch3",]
Disruption_df<-Disruption_df[Disruption_df$sample.ctrl=="sample",]
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",]

# Calculate Delta editing for each construct
Delta_editing<- mclapply(rownames(Disruption_df),mc.cores = 8, function(x){
  Delta_editing_per_construct<-(Disruption_df[x,1:44])-(Perfect_ds[1,1:44])
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_df[x,45:48])
  return(Delta_editing_per_construct)})
Delta_editing_df<-do.call(rbind,Delta_editing)

Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]
Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc )

# Filter mismatch data for "sample"
Delta_editing_df_mis_1_3<-Delta_editing_df[Delta_editing_df$sample.ctrl=="sample",]

# Split data into separate lists by mismatch kind
mismatches_ADAR_list<-split(Delta_editing_df_mis_1_3,f=Delta_editing_df_mis_1_3$desc)

# Create pheatmap plots for each mismatch
pheatmap_plots_mismatches<-mclapply(names(mismatches_ADAR_list), mc.cores = 8, function(z){
  Delta_editing_df_mistmatch3<-mismatches_ADAR_list[[z]][order(as.numeric(mismatches_ADAR_list[[z]]$site), decreasing = T),]
  rownames(Delta_editing_df_mistmatch3)<-Delta_editing_df_mistmatch3$site
  Delta_editing_df_mistmatch3_matrix<-as.matrix(Delta_editing_df_mistmatch3[,1:41])
  Delta_editing_df_mistmatch3_matrix_scales<-scale(Delta_editing_df_mistmatch3_matrix)
  Delta_editing_df_mistmatch3_matrix_scales<-as.data.frame(Delta_editing_df_mistmatch3_matrix_scales)
  list_rows <- split(Delta_editing_df_mistmatch3_matrix_scales,seq(nrow(Delta_editing_df_mistmatch3_matrix_scales)))
  list_rows_capped_zscore<-lapply(list_rows, function(z){
    z[which(z<=-4)]<- -4
    z[which(z>=4)]<- 4
    return(z)})
  Delta_editing_df_mistmatch3_matrix_scales_capped<-do.call(rbind,list_rows_capped_zscore)
  colnames(Delta_editing_df_mistmatch3_matrix_scales_capped)<-colnames(Delta_editing_df_mistmatch3_matrix_scales)
  plot_pheatmap<-pheatmap(Delta_editing_df_mistmatch3_matrix_scales_capped,cluster_rows=F, cluster_cols=F,show_rownames=F, fontsize = 11, main = z )
  return(plot_pheatmap[[4]])
})
grid.arrange(grobs=pheatmap_plots_mismatches, ncol=2)


########################## ADAR1- and ADAR2-mediated editing offsets based on subsets of 3-nucleotide mismatch running throughout the mNG and B2 sequences. ################

# Filter data for mismatch1 and mismatch3, and for the "sample" 
Disruption_df<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mismatch1" | df_A_G_POS_editing$desc=="mismatch3",]
Disruption_df<-Disruption_df[Disruption_df$sample.ctrl=="sample",]

# Filter data for the perfect ds
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",]

# Calculate Delta editing for each construct
Delta_editing<- mclapply(rownames(Disruption_df),mc.cores = 8, function(x){
  Delta_editing_per_construct<-(Disruption_df[x,1:44])-(Perfect_ds[1,1:44])
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_df[x,45:48])
  return(Delta_editing_per_construct)})
Delta_editing_df<-do.call(rbind,Delta_editing)
Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]
Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc )
Delta_editing_df_mis_1_3<-Delta_editing_df

# Split the data by mismatch kind to process separately
Delta_editing_df_mis_1_3_list<-split(Delta_editing_df_mis_1_3,f=Delta_editing_df_mis_1_3$desc)

# Process and filter data for each type of mismatch
Delta_editing_df_mis_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_list, mc.cores = 8, function(y){
  max_vector<-sapply(abs(y[1:44]),max)
  Col_consider_downstream<-names(max_vector[max_vector>1])
  Delta_editing_df_mis_1_3<-y[,c(Col_consider_downstream,"Barcode ID","desc","sample.ctrl","site","Mistmatch_names")]
  Delta_editing_df_mis_1_3<-Delta_editing_df_mis_1_3[!Delta_editing_df_mis_1_3$sample.ctrl=="barcode_ctrl",]
})

# Calculate distance from the disruption for each construct
Delta_editing_mistmatch_1_3_df_list<-mclapply(Delta_editing_df_mis_1_3_df_list,mc.cores=8, function(z){
  Delta_editing_list<-split(z, f=z$Mistmatch_names)
  Delta_editing_mistmatch<-mclapply(Delta_editing_list, mc.cores = 8, function(x){
    Editing_levels<-as.numeric(x[1,1:(ncol(x)-5)])
    Distance_from_disruption<- (146-as.numeric(x[1,"site"])-as.numeric(gsub("A","",names(x[,1:(ncol(x)-5)]))))*(-1)
    Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc, length(Editing_levels)))
    Distance_from_disruption_df$A_position<- as.numeric(gsub("A","",colnames(x[,1:(ncol(x)-5)])))
    return(Distance_from_disruption_df)
  })
  Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch)
})

# Split the data by mismatch kind to process separately and arrange the distance for mismatch3
Delta_editing_mistmatch_1<-Delta_editing_mistmatch_1_3_df_list[[1]]
Delta_editing_mistmatch_3<-Delta_editing_mistmatch_1_3_df_list[[2]]
Delta_editing_mistmatch_3$Distance_from_mistmatch<-Delta_editing_mistmatch_3$Distance_from_mistmatch+1


Delta_editing_mistmatch_df<-rbind(Delta_editing_mistmatch_1,Delta_editing_mistmatch_3)

# Define labels for the mismatch types
mismatch.labs <- c("Mismatch 1 nucleotide", "Mismatch 3 nucleotides")
names(mismatch.labs) <- c("mismatch1","mismatch3")

# Split the data by mismatch kind and apply LOESS smoothing
Delta_editing_mistmatch_df<-Delta_editing_mistmatch_df[!is.na(Delta_editing_mistmatch_df$Editing_level_A_to_I),]
Delta_editing_mistmatch_df_list<-split(Delta_editing_mistmatch_df,f=Delta_editing_mistmatch_df$Mismatch_kind)
Delta_editing_mistmatch_df_list_LOESS<- mclapply(Delta_editing_mistmatch_df_list, mc.cores = 8, function(z){
  loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=z, span=0.05)
  z$smoothed5 <- predict(loessMod50)
  return(z)
  
})
Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch_df_list_LOESS)


ADARs_periodicity_mis_kinds_ADAR_list<-split(Delta_editing_mistmatch_df, f=Delta_editing_mistmatch_df$Mismatch_kind)


# Calculate quartiles per distance
ADARs_periodicity_mis_kindsc_ADAR_variation<-mclapply(ADARs_periodicity_mis_kinds_ADAR_list, mc.cores = 8, function(z){
  ADARs_dist_mis_list<-split(z, f=z$Distance_from_mistmatch)
  ADARs_dist_mis_quartiles_list<-mclapply(ADARs_dist_mis_list, mc.cores = 8, function(x){
    df<-as.data.frame(x)
    quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
    colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
    df<-cbind(df,quartiles_df)
    return(df)
  }) 
  ADARs_dist_mis_quartiles_list_df<-do.call(rbind,ADARs_dist_mis_quartiles_list)
  return(ADARs_dist_mis_quartiles_list_df)
})
ADARs_periodicity_mis_kindsc_ADAR_variation_df<-do.call(rbind,ADARs_periodicity_mis_kindsc_ADAR_variation)

# Create a summary plot for visualization per mismatch kind
vertical.lines<-c(-26,-35)
p.1 <- ggplot(ADARs_periodicity_mis_kindsc_ADAR_variation_df, aes(Distance_from_mistmatch, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.50,colour= "Darkblue")
p.1 <-p.1 +xlab("Distance from mistmatch")+ylab("Delta editing")+theme_classic()#+scale_color_manual(values=c("Black","Blue"))
p.1 <-p.1 +theme(strip.background = element_blank(),
                 legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=11),plot.title = element_text(size=14,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=11,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=11)) 

p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle("")#+facet_grid(~ADAR)
p.1<-p.1+scale_x_continuous(n.breaks = 8)+facet_wrap(~Mismatch_kind, ncol = 4)+scale_y_continuous(n.breaks = 6, limits=c(-40,40))
plot(p.1)

# Create a summary plot for mismatch3
ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df<-ADARs_periodicity_mis_kindsc_ADAR_variation_df[ADARs_periodicity_mis_kindsc_ADAR_variation_df$Mismatch_kind=="mismatch3",]
p.1 <- ggplot(ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df, aes(Distance_from_mistmatch, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.40,colour= "Darkblue")
p.1 <-p.1 +xlab("Distance from the mismatch")+ylab("Delta Editing")+theme_classic()#+scale_color_manual(values=c("Black","Blue"))
p.1 <-p.1 +theme(strip.background = element_blank(),
                 strip.text.x = element_blank(),
                 legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=6),plot.title = element_text(size=7,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=7)) 
p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle(paste("mNG: 3-nuc mismatch"))#+facet_grid(~ADAR)
p.1<-p.1+scale_x_continuous(n.breaks = 8, limits = c(-50,50))+scale_y_continuous(n.breaks = 6, limits=c(min(ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df$Q_0.25-1),max(ADARs_periodicity_3nt_mis_kindsc_ADAR_variation_df$Q_0.75+5)))
plot(p.1)

############## Size of mismatch ####################

# Filter the data for specific mismatch lengths ( For ADAR1: -35 or -35.5 and for ADAR2: -26 OR 26.5) 
ADAR_B2_1_to_3_nuc_mis<-ADARs_periodicity_mis_kindsc_ADAR_variation_df[ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35|ADARs_periodicity_mis_kindsc_ADAR_variation_df$Distance_from_mistmatch==-35.5,]

# Create a boxplot
plot_Boxplot_ADAR2<-ggplot(ADAR_B2_1_to_3_nuc_mis, aes(x=Mismatch_kind, y=Editing_level_A_to_I))+geom_boxplot()+ theme(legend.position="none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +xlab("Mismatch size")+ylab("Delta Editing")+theme_classic()
plot_Boxplot_ADAR2<-plot_Boxplot_ADAR2+theme(strip.background = element_blank(),strip.text.y = element_blank(),strip.text.x = element_text(size = 8),
                                             legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=12),plot.title = element_text(size=12,face = "bold",),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=12),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=12)) 
plot(plot_Boxplot_ADAR2)

############ T bulges PERIODICITY ######################

# Define the bulge kinds and filter the data for bulges-carrying constructs
Disruption_bulges_df<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="bulge-T" | df_A_G_POS_editing$desc=="bulge-TTC" | df_A_G_POS_editing$desc=="bulge-TTCTT" | df_A_G_POS_editing$desc=="bulge-TTCTTCT",]
Disruption_bulges_df<-Disruption_bulges_df[Disruption_bulges_df$sample.ctrl=="sample",]
Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$desc=="mNG_perfect_ds",]


Delta_editing<- mclapply(rownames(Disruption_bulges_df),mc.cores = 8, function(x){
  Delta_editing_per_construct<-Disruption_bulges_df[x,1:44]-Perfect_ds[1,1:44]
  Delta_editing_per_construct<-cbind(Delta_editing_per_construct,Disruption_bulges_df[x,45:48])
  return(Delta_editing_per_construct)
} )

# Calculate Delta editing per construct
Delta_editing_df<-do.call(rbind,Delta_editing)
Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]
Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc )
Delta_editing_list<-split(Delta_editing_df, f=Delta_editing_df$Mistmatch_names)

# Calculate distance from bulge and adjust
Delta_editing_bulge<-mclapply(Delta_editing_list, mc.cores = 8, function(x){
  Editing_levels<-as.numeric(x[1,1:44])
  Distance_from_disruption<-(146-as.numeric(x[1,"site"])- as.numeric(gsub("A","",names(x[,1:44])))+0.5)*-1
  Distance_from_Disruption_bulges_df<-data.frame(Distance_from_bulge=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Bulge.kinds=rep(x$desc, length(Editing_levels)))
})

Delta_editing_bulge_df<-do.call(rbind,Delta_editing_bulge)
bulge.labs <- c("Bulge T", "Bulge TTC", "Bulge TTCTT","Bulge TTCTTCT")
names(bulge.labs) <- c("bulge-T", "bulge-TTC", "bulge-TTCTT","bulge-TTCTTCT")
Delta_editing_bulge_df<-Delta_editing_bulge_df[!is.na(Delta_editing_bulge_df$Editing_level_A_to_I),]

# Apply LOESS smoothing
Loes_bulge_kind<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds)##
Loes_bulge_kind_list<-mclapply(Loes_bulge_kind, mc.cores = 8, function(x){##
  loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_bulge, data=x, span=0.11)###
  x$smoothed5 <- predict(loessMod50)##
  return(x)})
Delta_editing_bulge_df<-do.call(rbind,Loes_bulge_kind_list)###

# Calculate quartiles
ggplot_quartiles_T_bulges_list<-split(Delta_editing_bulge_df, f=Delta_editing_bulge_df$Bulge.kinds)
Pyrimidine_bulges_mNG_quartiles<-mclapply(ggplot_quartiles_T_bulges_list, mc.cores = 8, function(x){
  ADAR1_3nuc_mis<-x
  ADAR1_3nuc_mis_ADAR1<-split(ADAR1_3nuc_mis, f=ADAR1_3nuc_mis$Distance_from_bulge)
  ADAR1_3nuc_mis_ADAR2_list<-mclapply(ADAR1_3nuc_mis_ADAR1, mc.cores = 8, function(x){
    df<-as.data.frame(x)
    quartiles_df<-data.frame(t(quantile(df$Editing_level_A_to_I, probs = c(0,0.25,0.5,0.75,1))))
    colnames(quartiles_df)<-paste0("Q_",c(0,0.25,0.5,0.75,1))
    df<-cbind(df,quartiles_df)
    return(df)
  })
  ADAR1_3nuc_mis_ADAR1_list_df<-do.call(rbind,ADAR1_3nuc_mis_ADAR2_list)
})
Pyrimidine_bulges_mNG_quartiles_df<-do.call(rbind,Pyrimidine_bulges_mNG_quartiles)
Pyrimidine_bulges_mNG_quartiles_df$ADAR<-z
vertical.lines<-c(-35,-26,30)
color_plot_ADAR<-brewer.pal(8, "Dark2")

# Create and customize the summary plot per T-bulge kind
p.1 <- ggplot(Pyrimidine_bulges_mNG_quartiles_df, aes(Distance_from_bulge, smoothed5))+ geom_ribbon(aes(ymin=Q_0.25,ymax=Q_0.75),alpha=0.15)+ geom_line(size=0.40,colour= "DarkBlue")
p.1 <-p.1 +xlab("")+ylab("")+theme_classic()
p.1 <-p.1 +theme(legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=6),plot.title = element_text(size=10,hjust = 0.5),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 12),axis.text.y = element_text(colour="black",size=7)) 
p.1 <-p.1 +geom_vline(size=0.5,xintercept = vertical.lines, color="#666699",alpha=0.6,linetype = "dashed")+ggtitle(unique(Pyrimidine_bulges_mNG_quartiles_df$ADAR))+facet_grid(~Bulge.kinds)
plot(p.1)


######################################## mNG Sequence Preference ##########

mNG_all_ADARs_function_seq_pref<-function(x,ADAR_ID){
  df_A_G_POS_editing<-x
  perfect_ds_fw_strand<-("AGCCAATGGCGGCTAACTATCTGAAGAACCAGCCGATGTACGTGTTCCGTAAGACGGAGCTCAAGCACTCCAAGACCGAGCTCAACTTCAAGGAGTGGCAAAAGGCCTTTACCGATGTGATGGGCATGGACGAGCTGTACAAGTAA")
  Perfect_ds<-df_A_G_POS_editing[df_A_G_POS_editing$`Barcode ID`=="AAGGCCAT",1:39]
  Perfect_ds_t<-t(Perfect_ds)
  Perfect_ds_t<-as.data.frame(Perfect_ds_t)
  Perfect_ds_t$A_pos<-as.numeric(gsub("A","",rownames(Perfect_ds_t)))
  downstream_nuc<-mclapply(rownames(Perfect_ds_t),mc.cores = 8, function(z){
    df<-as.data.frame(Perfect_ds_t[z,])
    pos_A<-df$A_pos
    nuc_down<-substr(perfect_ds_fw_strand, start =pos_A, stop = pos_A+1 ) 
    df<-cbind(df,nuc_down)
    colnames(df)<-c("Editing","A_position","Nuc_downstream")
    return(df)
  })
  downstream_nuc_df_ggplot<-do.call(rbind,downstream_nuc)
  downstream_nuc_df_ggplot$ADAR<-ADAR_ID
  downstream_nuc_df_ggplot$Pos<-"Downstream"
  p1_nuc<-ggplot(downstream_nuc_df_ggplot, aes(x=reorder(Nuc_downstream,Editing, median),y=Editing))+geom_boxplot(outlier.shape = NA)+geom_jitter(position=position_jitter(0.2), color="blue")+ggtitle("Base after A")+xlab("")+theme_classic()+ylab("Editing Level")
  
  Perfect_ds_t<-Perfect_ds_t[-1,]
  upstream_nuc<-mclapply(rownames(Perfect_ds_t),mc.cores = 8, function(z){
    df<-Perfect_ds_t[z,]
    pos_A<-df$A_pos
    nuc_up<-substr(perfect_ds_fw_strand, start =pos_A-1, stop = pos_A ) 
    df<-cbind(df,nuc_up)
    colnames(df)<-c("Editing","A_position","Nuc_downstream")
    return(df)
  })
  upstream_nuc_df_ggplot<-do.call(rbind,upstream_nuc)
  upstream_nuc_df_ggplot$ADAR<-ADAR_ID
  upstream_nuc_df_ggplot$Pos<-"Upstream"
  p2_nuc<-ggplot(upstream_nuc_df_ggplot, aes(x=reorder(Nuc_downstream,Editing, median),y=Editing))+geom_boxplot(outlier.shape = NA)+geom_jitter(position=position_jitter(0.2), color="blue")+ggtitle("Base before A")+xlab("")+ylab("Editing Level")+theme_classic()
  Pref_down_Upst<-rbind(upstream_nuc_df_ggplot,downstream_nuc_df_ggplot)
  
  
  Pref_down_Upst$library<-"mNG"
  return(Pref_down_Upst)
  #grid.arrange(p2_nuc,p1_nuc, ncol=2) 
  
}


ADAR_seq_pref<-mNG_all_ADARs_function_seq_pref(df_A_G_POS_editing,z)

output<-list(ADAR_seq_pref,df_A_G_POS_editing)

return(output)
}

#ADAR2

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R2_001.fastq.gz"

# Sample up to 1e7 reads from each mate file.
# The seed is reset immediately before EACH yield() so that both draws start from the same
# RNG state. Seeding before the FastqSampler() constructors only (as before) let the second
# yield() continue from the state left by the first one, which can desynchronise Read 1 and
# Read 2 whenever a file holds more reads than the sample size; the analysis pairs mates by
# row index and therefore needs both samples to contain the same records in the same order.
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)

# Alternatively, if you prefer to work with the entire dataset rather than a sample, you can use the following lines to load the complete data files.
# B2_R1 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R1_001.fastq.gz")
# B2_R2 = readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS/657_ADARs_mNG_library_ADAR1_KO_1_658_ADARs_B2_C_LRS_S64_R2_001.fastq.gz")

# Run the mNG analysis for ADAR2, replicate 1
ADAR2_Rep1_mNG<-mNG_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR2")

# Define file paths
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS_S73_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS/657_ADARs_mNG_library_ADAR1_KO_10_658_ADARs_B2_L_LRS_S73_R2_001.fastq.gz"

# Sample up to 1e7 reads from each mate file.
# The seed is reset immediately before EACH yield() so that both draws start from the same
# RNG state; the analysis pairs Read 1 and Read 2 by row index and therefore needs both
# samples to contain the same records in the same order.
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)

# Run the mNG analysis for ADAR2, replicate 2
ADAR2_Rep2_mNG<-mNG_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR2")

# ---- ADAR2: agreement between the two replicates ----
# Reshape each replicate's per-barcode editing table (second element of the analysis
# output) to long format, one row per barcode x adenosine position, and build a key
# (barcode + sample/control flag + position) shared by both replicates.
ADAR2_1_ggplot<-pivot_longer(ADAR2_Rep1_mNG[[2]], cols = starts_with("A"), values_to = "AtoI_Editing", names_to= "A_postion" )
ADAR2_1_ggplot$ID_ggplot<-paste0(ADAR2_1_ggplot$`Barcode ID`,ADAR2_1_ggplot$sample.ctrl,ADAR2_1_ggplot$A_postion)
ADAR2_1_ggplot_df<-as.data.frame(ADAR2_1_ggplot)
# Select the two needed columns by name instead of by position (previously [,6:7])
ADAR2_1_ggplot_df<-ADAR2_1_ggplot_df[,c("AtoI_Editing","ID_ggplot")]
colnames(ADAR2_1_ggplot_df)<-c("ADAR2p_B2_Rep1_AtoI_Editing","ID_ggplot")

ADAR2_2_ggplot<-pivot_longer(ADAR2_Rep2_mNG[[2]], cols = starts_with("A"),values_to = "AtoI_Editing", names_to= "A_postion" )
ADAR2_2_ggplot$ID_ggplot<-paste0(ADAR2_2_ggplot$`Barcode ID`,ADAR2_2_ggplot$sample.ctrl,ADAR2_2_ggplot$A_postion)
ADAR2_2_ggplot_df<-as.data.frame(ADAR2_2_ggplot)
ADAR2_2_ggplot_df<-ADAR2_2_ggplot_df[,c("AtoI_Editing","ID_ggplot")]
colnames(ADAR2_2_ggplot_df)<-c("ADAR2p_B2_Rep2_AtoI_Editing","ID_ggplot")

# Keep only barcode/position combinations present in both replicates
ADAR2_plasmid_B2_repetitions<-merge(ADAR2_1_ggplot_df,ADAR2_2_ggplot_df, by.x="ID_ggplot",by.y="ID_ggplot"  )
ADAR2_plasmid_B2_repetitions<-as.data.frame(ADAR2_plasmid_B2_repetitions)

# Correlation between replicates; computed before plotting so that the p-value shown on
# the plot comes from the data instead of being a hard-coded string
Correlation_Rep<-cor.test(ADAR2_plasmid_B2_repetitions$ADAR2p_B2_Rep1_AtoI_Editing,ADAR2_plasmid_B2_repetitions$ADAR2p_B2_Rep2_AtoI_Editing)
# Same text as before when the p-value is below 2.2e-16; otherwise report the actual value
Correlation_Rep_p_label<-if (isTRUE(Correlation_Rep$p.value < 2.2e-16)) {
  "p-value < 2.2e-16 "
} else {
  paste("p-value =", signif(Correlation_Rep$p.value, 3))
}

p3<-ggplot(ADAR2_plasmid_B2_repetitions, aes(x=ADAR2p_B2_Rep1_AtoI_Editing,y=ADAR2p_B2_Rep2_AtoI_Editing))+geom_point(size = 0.1, color="deepskyblue4")+ geom_abline(intercept = 0, slope = 1, color="black")+xlim(0,100)+ylim(0,100)+ ggtitle("ADAR2")+theme_linedraw()
p3<-p3+theme_classic()+theme(strip.background = element_blank(),
                             strip.text.x = element_blank(),
                             legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=7),plot.title = element_text(size=9,face = "bold"),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 7),axis.text.y = element_text(colour="black",size=7))
p3<-p3+ annotate(geom="text", x=13, y=70, label=Correlation_Rep_p_label, color="black")
p3<-p3+ annotate(geom="text", x=12, y=60, label=paste("r=",round(Correlation_Rep$estimate,4)), color="black")+xlab("Overexpressed ADAR rep1")+ylab("Overexpressed ADAR rep2")
plot(p3)


#ADAR1

# Define file paths (ADAR1, replicate 1)
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS_S65_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS/657_ADARs_mNG_library_ADAR1_KO_2_658_ADARs_B2_D_LRS_S65_R2_001.fastq.gz"

# Load and sample the Fastq files.
# The random draw is performed by yield(), not by the FastqSampler() constructor,
# so the seed is reset immediately before EACH yield(). This makes R1 and R2 start
# from the same RNG state and return the same subset of read pairs, which the
# downstream pairing of reads by their row index relies on.
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)
# Release the file connections held by the samplers
close(f1)
close(f2)
ADAR1_Rep1_mNG<-mNG_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR1")

# Define file paths (ADAR1, replicate 2)
fastq_file1 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS_S74_R1_001.fastq.gz"
fastq_file2 <- "/LongRead_ADARs_B2_mNG_lib_KO_cells/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS/657_ADARs_mNG_library_ADAR1_KO_11_658_ADARs_B2_M_LRS_S74_R2_001.fastq.gz"

# Same seeding discipline as above: one set.seed() right before each yield()
f1 <- FastqSampler(fastq_file1, n = 1e7)
f2 <- FastqSampler(fastq_file2, n = 1e7)
set.seed(123L)
B2_R1 <- yield(f1)
set.seed(123L)
B2_R2 <- yield(f2)
close(f1)
close(f2)

ADAR1_Rep2_mNG<-mNG_library_A_to_G_analysis(B2_R1,B2_R2,"ADAR1")

# Replicate 1: reshape the per-barcode editing table to long format (one row per
# adenosine position) and key each row with a concatenated ID.
ADAR1_1_ggplot<-pivot_longer(ADAR1_Rep1_mNG[[2]], cols = starts_with("A"), values_to = "AtoI_Editing", names_to= "A_postion" )
ADAR1_1_ggplot$ID_ggplot<-paste0(ADAR1_1_ggplot$`Barcode ID`,ADAR1_1_ggplot$sample.ctrl,ADAR1_1_ggplot$A_postion)
ADAR1_1_ggplot_df<-as.data.frame(ADAR1_1_ggplot)
# Columns 6:7 are selected by position and renamed just below
# (presumably the editing value and the ID key - confirm against the mNG table layout).
ADAR1_1_ggplot_df<-ADAR1_1_ggplot_df[,6:7]
colnames(ADAR1_1_ggplot_df)<-c("ADAR1p_B2_Rep1_AtoI_Editing","ID_ggplot")

# Replicate 2: same long format and ID key
ADAR1_2_ggplot<-pivot_longer(ADAR1_Rep2_mNG[[2]], cols = starts_with("A"),values_to = "AtoI_Editing", names_to= "A_postion" )
ADAR1_2_ggplot$ID_ggplot<-paste0(ADAR1_2_ggplot$`Barcode ID`,ADAR1_2_ggplot$sample.ctrl,ADAR1_2_ggplot$A_postion)
ADAR1_2_ggplot_df<-as.data.frame(ADAR1_2_ggplot)
ADAR1_2_ggplot_df<-ADAR1_2_ggplot_df[,6:7]
colnames(ADAR1_2_ggplot_df)<-c("ADAR1p_B2_Rep2_AtoI_Editing","ID_ggplot")

# Keep only the positions measured in both replicates
ADAR1_plasmid_B2_repetitions<-merge(ADAR1_1_ggplot_df,ADAR1_2_ggplot_df, by.x="ID_ggplot",by.y="ID_ggplot"  )
ADAR1_plasmid_B2_repetitions<-as.data.frame(ADAR1_plasmid_B2_repetitions)

# Run the correlation test BEFORE building the annotations, so that the p-value
# printed on the plot is the one actually computed instead of a hard-coded string.
Correlation_Rep<-cor.test(ADAR1_plasmid_B2_repetitions$ADAR1p_B2_Rep1_AtoI_Editing,ADAR1_plasmid_B2_repetitions$ADAR1p_B2_Rep2_AtoI_Editing)
Correlation_Rep_p_label<-if (isTRUE(Correlation_Rep$p.value < 2.2e-16)) {
  "p-value < 2.2e-16 "
} else {
  paste("p-value =", signif(Correlation_Rep$p.value, 3))
}

# Scatter plot of replicate 1 vs replicate 2 with the identity line.
# theme_classic() is a complete theme, so no other complete theme is added before it.
p3<-ggplot(ADAR1_plasmid_B2_repetitions, aes(x=ADAR1p_B2_Rep1_AtoI_Editing,y=ADAR1p_B2_Rep2_AtoI_Editing))+
  geom_point(size = 0.1, color="deepskyblue4")+
  geom_abline(intercept = 0, slope = 1, color="black")+
  xlim(0,100)+ylim(0,100)+
  ggtitle("ADAR1")
p3<-p3+theme_classic()+theme(strip.background = element_blank(),
                             strip.text.x = element_blank(),
                             legend.title=element_blank(),legend.position="bottom",legend.text = element_text(size=7),plot.title = element_text(size=9,face = "bold"),panel.border = element_rect(color = "black",fill = NA,size = 0.5),axis.text.x = element_text(colour="black",size=7,angle = 90, vjust = 0.5, hjust=1),axis.title = element_text(face = "bold",size = 7),axis.text.y = element_text(colour="black",size=7))
p3<-p3+ annotate(geom="text", x=13, y=70, label=Correlation_Rep_p_label, color="black")
p3<-p3+ annotate(geom="text", x=12, y=60, label=paste("r=",round(Correlation_Rep$estimate,4)), color="black")+xlab("Overexpressed ADAR rep1")+ylab("Overexpressed ADAR rep2")
plot(p3)

# Sequence preference (example for ADAR2)

# Stack the B2 and mNG sequence-preference tables. The mNG table is first
# restricted/reordered to the B2 column layout so that rbind() lines columns up.
All_ADARs_seq_pref_mNG_df <- ADAR2_Rep1_mNG[[1]][, colnames(ADAR2_Rep1_B2[[1]])]
All_ADARs_seq_pref_mNG_B2_df <- rbind(ADAR2_Rep1_B2[[1]], All_ADARs_seq_pref_mNG_df)

# Derive the neighbouring nucleotide for each row.
# NOTE: at this stage the object named "_downs" holds the Pos == "Upstream" rows
# and "_ups" holds the Pos == "Downstream" rows; both objects are re-derived by
# Pos further below, so the final tables carry the expected content.
All_ADARs_seq_pref_mNG_B2_df_downs <- All_ADARs_seq_pref_mNG_B2_df[
  All_ADARs_seq_pref_mNG_B2_df$Pos == "Upstream",
]
# Split Nuc_downstream into single characters and drop the 2nd column
All_ADARs_seq_pref_mNG_B2_df_downs$Nucleotide <- do.call(
  rbind,
  str_split(All_ADARs_seq_pref_mNG_B2_df_downs$Nuc_downstream, "")
)[, -2]
All_ADARs_seq_pref_mNG_B2_df_ups <- All_ADARs_seq_pref_mNG_B2_df[
  All_ADARs_seq_pref_mNG_B2_df$Pos == "Downstream",
]
# Split Nuc_downstream into single characters and drop the 1st column
All_ADARs_seq_pref_mNG_B2_df_ups$Nucleotide <- do.call(
  rbind,
  str_split(All_ADARs_seq_pref_mNG_B2_df_ups$Nuc_downstream, "")
)[, -1]

# Recombine both subsets and fix the nucleotide order used on the x axis
All_ADARs_seq_pref_mNG_B2_df_ups_down <- rbind(
  All_ADARs_seq_pref_mNG_B2_df_downs,
  All_ADARs_seq_pref_mNG_B2_df_ups
)
All_ADARs_seq_pref_mNG_B2_df_ups_down$Nucleotide <- factor(
  All_ADARs_seq_pref_mNG_B2_df_ups_down$Nucleotide,
  levels = c("G", "C", "T", "A")
)

# Upstream (5') panel
All_ADARs_seq_pref_mNG_B2_df_ups <- All_ADARs_seq_pref_mNG_B2_df_ups_down[
  All_ADARs_seq_pref_mNG_B2_df_ups_down$Pos == "Upstream",
]
p.1_ups <- ggplot(All_ADARs_seq_pref_mNG_B2_df_ups, aes(x = Nucleotide, y = Editing, color = Nucleotide)) +
  geom_boxplot(outlier.size = 0.5, position = position_dodge(0.9)) +
  xlab("") +
  ylab("Editing %") +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    strip.text.y = element_blank(),
    strip.text.x = element_text(size = 8),
    legend.title = element_blank(),
    legend.position = "none",
    legend.text = element_text(size = 8),
    plot.title = element_text(size = 8, face = "bold"),
    panel.border = element_rect(color = "black", fill = NA, size = 0.5),
    axis.text.x = element_text(colour = "black", size = 8, angle = 0, vjust = 0.5, hjust = 0.5),
    axis.title = element_text(face = "bold", size = 10),
    axis.text.y = element_text(colour = "black", size = 8)
  ) +
  ggtitle("5'") +
  scale_color_brewer(palette = "Set1")

# Downstream (3') panel
All_ADARs_seq_pref_mNG_B2_df_downs <- All_ADARs_seq_pref_mNG_B2_df_ups_down[
  All_ADARs_seq_pref_mNG_B2_df_ups_down$Pos == "Downstream",
]
All_ADARs_seq_pref_mNG_B2_df_downs$Nucleotide <- factor(
  All_ADARs_seq_pref_mNG_B2_df_downs$Nucleotide,
  levels = c("G", "C", "T", "A")
)
p.1_downs <- ggplot(All_ADARs_seq_pref_mNG_B2_df_downs, aes(x = Nucleotide, y = Editing, color = Nucleotide)) +
  geom_boxplot(outlier.size = 0.5, position = position_dodge(0.9)) +
  xlab("") +
  ylab("") +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    strip.text.y = element_blank(),
    strip.text.x = element_text(size = 8),
    legend.title = element_blank(),
    legend.position = "none",
    legend.text = element_text(size = 8),
    plot.title = element_text(size = 8, face = "bold"),
    panel.border = element_rect(color = "black", fill = NA, size = 0.5),
    axis.text.x = element_text(colour = "black", size = 8, angle = 0, vjust = 0.5, hjust = 0.5),
    axis.title = element_text(face = "bold", size = 10),
    axis.text.y = element_text(colour = "black", size = 8)
  ) +
  ggtitle("3'") +
  scale_color_brewer(palette = "Set1")

# Show the two panels side by side
grid.arrange(p.1_ups, p.1_downs, ncol = 2)


############################## Periodicity: ADAR2 Variable Arm (Bottom Arm) ##########################

# Load the paired-end reads (R1 and R2) of the B2 variable-arm sample
B2_R1 <- readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7_S90_R1_001.fastq.gz")
B2_R2 <- readFastq("/LongRead_ADARs_B2_mNG_lib_KO_cells/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7/658_660_ADARs_B2_Variable_arm_G_LRS_ADAR1_mut_KO_cells_LRS_B2_lib_7_S90_R2_001.fastq.gz")

# Name of ADAR data to be analyzed
z <- "ADAR2"

# One row per R1 read; number_read stores the read index within the file and is
# the key used later to pair each R1 read with its R2 mate.
B2_R1_sequence <- as.data.frame(sread(B2_R1))
Total_Number_of_Reads_R1 <- nrow(B2_R1_sequence)
B2_R1_sequence$number_read <- 1:Total_Number_of_Reads_R1

# Keep the reads that contain the "CCTTCGAA" anchor and strip everything up to
# (and including) its last occurrence.
B2_R1_sequence <- B2_R1_sequence[grepl("CCTTCGAA", B2_R1_sequence$x), ]
B2_R1_sequence$B2_sequence <- gsub(".*CCTTCGAA", "", B2_R1_sequence$x)

# Number and percentage of reads that begin properly (i.e. carry the anchor)
number_reads_beggining_properly <- nrow(B2_R1_sequence)
Percentage_reads_beggining_properly <- (number_reads_beggining_properly / Total_Number_of_Reads_R1) * 100

# Rename the columns, require at least 127 nt after the anchor and the presence
# of the constant "TCCCTCACTACCCTCAACCCA" stretch.
colnames(B2_R1_sequence) <- c("non_processed_R1", "number_read", "B2_constructR1")
B2_R1_sequence_1 <- B2_R1_sequence[nchar(B2_R1_sequence$B2_constructR1) >= 127, ]
B2_R1_sequence <- B2_R1_sequence_1[grepl("TCCCTCACTACCCTCAACCCA", B2_R1_sequence_1$B2_constructR1), ]

# Extract the barcode (text before the first "TCCC") and then the variable-arm
# sequence (text after the constant stretch). The barcode must be taken first,
# because the second step overwrites B2_constructR1.
B2_R1_sequence$barcode <- gsub("\\TCCC.*", "", B2_R1_sequence$B2_constructR1)
B2_R1_sequence$B2_constructR1 <- gsub(".*TCCCTCACTACCCTCAACCCA", "", B2_R1_sequence$B2_constructR1)

# Load the barcode design table and keep the B2 library entries only.
## This table can be found in Uzonyi et al.(2021): Table S2. Sequences of the B2 and mNG oligo library pools, related to STAR Methods.
Barcodes_csv <- read.csv(
  "/DATA_HELA_ADAR2_over_ADAR1_knockdown_B2_libr/Table_S2_Twist_library_sequence_plans.csv",
  header = TRUE,
  sep = ","
)
Barcodes_csv <- Barcodes_csv[Barcodes_csv$B2.mNG == "B2", ]
# 'site' is stored as text
Barcodes_csv$site <- as.character(Barcodes_csv$site)

# Annotate every R1 read with its design record (inner join on 'barcode';
# reads whose barcode is not in the table are dropped).
B2_R1_sequence_Barcode_ID <- merge(B2_R1_sequence, Barcodes_csv, by = "barcode")

# R2 reads as a data frame, indexed by their position in the file
B2_R2_sequence <- as.data.frame(sread(B2_R2))
Total_Number_of_Reads_R2 <- nrow(B2_R2_sequence)
rownames(B2_R2_sequence) <- 1:Total_Number_of_Reads_R2
B2_R2_sequence$number_read <- 1:Total_Number_of_Reads_R2

# Reverse complement of the R2 reads, with the same per-read index
B2_R2_sequence_reverse_comp <- as.data.frame(sread(reverseComplement(B2_R2)))
B2_R2_sequence_reverse_comp$number_read <- 1:nrow(B2_R2_sequence_reverse_comp)

# Keep the reads containing "GGCGCGC"; B2_constructR2 is the text located
# before the first occurrence of that motif.
B2_R2_sequence_reverse_comp <- B2_R2_sequence_reverse_comp[
  grepl("GGCGCGC", B2_R2_sequence_reverse_comp$x),
]
B2_R2_sequence_reverse_comp$B2_constructR2 <- gsub(
  "\\GGCGCGC.*", "", B2_R2_sequence_reverse_comp$x
)

# Pair R1 and R2 by read index (inner join: only reads that passed both filters)
B2_df_R1_R2 <- merge(
  B2_R1_sequence_Barcode_ID,
  B2_R2_sequence_reverse_comp,
  by = "number_read",
  all = FALSE
)

# Designed construct length, taken from the 'seq' column of the barcode table
B2_df_R1_R2$Expected_length <- nchar(B2_df_R1_R2$seq)

# R1 contributes the first 98 nt: keep reads with more than 98 nt and truncate
B2_df_R1_R2 <- B2_df_R1_R2[nchar(B2_df_R1_R2$B2_constructR1) > 98, ]
B2_df_R1_R2$B2_constructR1 <- substr(B2_df_R1_R2$B2_constructR1, start = 1, stop = 98)

rownames(B2_df_R1_R2) <- 1:nrow(B2_df_R1_R2)

# Number of nucleotides that R2 has to contribute to reach the designed length
B2_df_R1_R2$length_required_R2 <- B2_df_R1_R2$Expected_length - 98

# R2: keep reads with more than 60 nt and retain their last 60 nt
B2_df_R1_R2 <- B2_df_R1_R2[nchar(B2_df_R1_R2$B2_constructR2) > 60, ]
B2_df_R1_R2$B2_constructR2 <- str_sub(B2_df_R1_R2$B2_constructR2, start = -60)

# Group the reads by the number of R2 nucleotides they need
B2_df_R1_R2_list <- split(B2_df_R1_R2, f = B2_df_R1_R2$length_required_R2)

# For each group, take the required number of nucleotides from the 3' end of R2
# and append them to the truncated R1 to rebuild the whole construct.
Whole_B2_df_constructs_list <- mclapply(
  names(B2_df_R1_R2_list),
  function(required_len) {
    reads <- B2_df_R1_R2_list[[required_len]]
    reads$B2_constructR2 <- str_sub(reads$B2_constructR2, start = -as.numeric(required_len))
    reads$whole_construct <- paste0(reads$B2_constructR1, reads$B2_constructR2)
    reads
  },
  mc.cores = 8
)

Whole_B2_df_constructs_df <- do.call(rbind, Whole_B2_df_constructs_list)

# Keep the mismatch-carrying constructs and the perfect double-stranded reference.
# %in% is used instead of a chain of `==`: it never returns NA, so rows with a
# missing 'desc' are dropped instead of being turned into all-NA rows.
Whole_B2_df_constructs_df<-Whole_B2_df_constructs_df[Whole_B2_df_constructs_df$desc %in% c("mismatch1","mismatch2","mismatch3","mismatch4","perfect_ds"),]
# One data frame of reads per barcode
B2_df_R1_R2_list<-split(Whole_B2_df_constructs_df,f=Whole_B2_df_constructs_df$barcode)

# Alignment check on T positions: for every barcode, keep only the reads that
# carry a "T" at every position where the designed sequence ('seq') has a "T".
# Input : list of per-barcode data frames (columns used: seq, whole_construct).
# Output: list of the same data frames restricted to the reads passing the check.
cured_aligment_whole_contruct<-mclapply(B2_df_R1_R2_list, mc.cores = 8,function(z){
  # Row names R1..Rn are used at the end to map the surviving reads back to 'z'
  rownames(z)<-paste0("R",1:nrow(z))
  # Designed sequence of this barcode (assumes a single unique 'seq' per barcode - TODO confirm)
  design_seq_const<-unique(z$seq)
  # 1-based positions of every "T" in the designed sequence
  T_positions<-unlist(gregexpr('T', design_seq_const))
  # One row per read, one column (V1, V2, ...) per nucleotide of the rebuilt construct
  Nuc_matrix<-do.call(rbind,str_split(z$whole_construct,""))
  Nuc_matrix<-as.data.frame(Nuc_matrix)
  rownames(Nuc_matrix)<-rownames(z)
  # Nucleotides observed in each read at the designed T positions
  T_pos_analized<-Nuc_matrix[,paste0("V",T_positions)]
  rownames(T_pos_analized)<-rownames(z)
  # Concatenate the observed nucleotides of each read into a single string
  T_pos_analized$Combined_cols <- do.call(paste, c(T_pos_analized[,1:ncol(T_pos_analized)], sep = ""))
  # Expected string: one "T" per checked position (ncol - 1 because Combined_cols was just added)
  T_string_compared<-paste0(rep("T",ncol(T_pos_analized)-1), collapse = "")
  # Keep the reads whose concatenated string contains the all-"T" string
  T_pos_analized<-T_pos_analized[grepl(T_string_compared,T_pos_analized$Combined_cols),]
  Final_df<-z[rownames(T_pos_analized),]
  return(Final_df)
})

# Alignment check on C positions: same filter as the T-position check, applied to
# its output. Only the reads that carry a "C" at every position where the designed
# sequence has a "C" are kept. The local variables keep their "T_" names although
# they refer to C positions here.
Cured_aligment_whole_contruct_C<-mclapply(cured_aligment_whole_contruct, mc.cores = 8,function(z){
  # Designed sequence of this barcode (assumes a single unique 'seq' per barcode - TODO confirm)
  design_seq_const<-unique(z$seq)
  # 1-based positions of every "C" in the designed sequence
  T_positions<-unlist(gregexpr('C', design_seq_const))
  # One row per read, one column per nucleotide of the rebuilt construct
  Nuc_matrix<-do.call(rbind,str_split(z$whole_construct,""))
  Nuc_matrix<-as.data.frame(Nuc_matrix)
  rownames(Nuc_matrix)<-rownames(z)
  # Strip the "V" prefix so that the columns are named by their position
  colnames(Nuc_matrix)<-gsub("V","",colnames(Nuc_matrix))
  # Nucleotides observed in each read at the designed C positions
  T_pos_analized<-Nuc_matrix[,as.character(T_positions)]
  rownames(T_pos_analized)<-rownames(z)
  # Concatenate the observed nucleotides of each read into a single string
  T_pos_analized$Combined_cols <- do.call(paste, c(T_pos_analized[,colnames(T_pos_analized)], sep = ""))
  # Expected string: one "C" per checked position (ncol - 1 because Combined_cols was just added)
  T_string_compared<-paste0(rep("C",ncol(T_pos_analized)-1), collapse = "")
  # Keep the reads whose concatenated string contains the all-"C" string
  T_pos_analized<-T_pos_analized[grepl(T_string_compared,T_pos_analized$Combined_cols),]
  Final_df<-z[rownames(T_pos_analized),]
  return(Final_df)
})
# Stack the filtered reads of all barcodes into one data frame
B2_df_R1_R2_final<-do.call(rbind,Cured_aligment_whole_contruct_C)

# Regroup the filtered reads by barcode
B2_df_R1_R2_list <- split(B2_df_R1_R2_final, f = B2_df_R1_R2_final$barcode)

# Per barcode: count the letters observed at every position of the rebuilt
# constructs (one row per position) and keep columns 1:4 and 15:16 of the
# transposed consensus matrix.
concensus_matrix_per_barcode <- mclapply(
  B2_df_R1_R2_list,
  function(barcode_reads) {
    construct_set <- DNAStringSet(barcode_reads$whole_construct, use.names = TRUE)
    as.data.frame(t(consensusMatrix(construct_set)))[, c(1:4, 15:16)]
  },
  mc.cores = 8
)

# Per barcode: add the position index and the A-to-G rate, (G/(A+G))*100
concensus_matrix_per_barcode_A_to_G <- mclapply(
  concensus_matrix_per_barcode,
  function(counts) {
    counts$position <- 1:nrow(counts)
    counts$A_to_G_rate <- counts$G / (counts$A + counts$G) * 100
    counts
  },
  mc.cores = 8
)

# Adenosine positions on the "Arm perfect double-stranded reporter"
A_positions_B2 <- c(
  14, 16, 18, 33, 50, 51, 55, 62, 64, 67, 79, 80,
  84, 86, 87, 88, 110, 119, 122, 123, 124, 135, 138
)

# Per barcode: pick the editing level at the target positions and lay it out as
# a single row (columns A14, A16, ... plus the barcode in "Barcode ID").
concensus_matrix_per_barcode_A_to_G_target_pos <- mclapply(
  names(concensus_matrix_per_barcode_A_to_G),
  function(barcode) {
    barcode_label <- names(concensus_matrix_per_barcode_A_to_G[barcode])
    rates_at_targets <- concensus_matrix_per_barcode_A_to_G[[barcode]][A_positions_B2, "A_to_G_rate"]
    one_row <- as.data.frame(t(as.data.frame(rates_at_targets)))
    one_row$Barcode_names <- barcode_label
    colnames(one_row) <- c(paste0("A", A_positions_B2), "Barcode ID")
    rownames(one_row) <- barcode_label
    one_row
  },
  mc.cores = 8
)

df_A_G_POS_editing <- do.call(rbind, concensus_matrix_per_barcode_A_to_G_target_pos)

# Attach the design annotation and reorder the columns: the 23 editing columns
# first, then "Barcode ID", then the four annotation columns.
Disruption_df <- merge(
  df_A_G_POS_editing,
  Barcodes_csv[, c("barcode", "desc", "site", "B2.mNG", "sample.ctrl")],
  by.x = "Barcode ID",
  by.y = "barcode"
)
df_A_G_POS_editing <- Disruption_df[, c(2:24, 1, 25:28)]

# Retrieve the mismatch-carrying constructs (desc mismatch1..4, sample rows only)
# and the perfect double-stranded reference rows.
Disruption_df <- df_A_G_POS_editing[
  df_A_G_POS_editing$desc == "mismatch1" |
    df_A_G_POS_editing$desc == "mismatch2" |
    df_A_G_POS_editing$desc == "mismatch3" |
    df_A_G_POS_editing$desc == "mismatch4",
]
Disruption_df <- Disruption_df[Disruption_df$sample.ctrl == "sample", ]
Perfect_ds <- df_A_G_POS_editing[df_A_G_POS_editing$desc == "perfect_ds", ]

# Delta editing: editing of each disrupted construct minus the editing of the
# FIRST perfect_ds row, at the 23 adenosine positions (columns 1:23); the
# annotation columns (24:28) of the disrupted construct are carried along.
Delta_editing <- mclapply(
  rownames(Disruption_df),
  function(construct_row) {
    cbind(
      Disruption_df[construct_row, 1:23] - Perfect_ds[1, 1:23],
      Disruption_df[construct_row, 24:28]
    )
  },
  mc.cores = 8
)
Delta_editing_df <- do.call(rbind, Delta_editing)

# Order the constructs by the numeric position of the disruption
Delta_editing_df<-Delta_editing_df[order(as.numeric(Delta_editing_df$site)), ]

# Grouping key: disruption site + sample/ctrl + kind of mismatch
Delta_editing_df$Mistmatch_names<-paste0(Delta_editing_df$site,Delta_editing_df$sample.ctrl,Delta_editing_df$desc )

Delta_editing_list<-split(Delta_editing_df, f=Delta_editing_df$Mistmatch_names)

# Calculate distance from the disruption.
# For each group, the delta editing of its first row is reported in long format:
# one row per adenosine position with the distance (position - site - 1) to the
# disruption and the kind of mismatch.
Delta_editing_mistmatch<-mclapply(Delta_editing_list, mc.cores = 8, function(x){
  Editing_levels<-as.numeric(x[1,1:23])
  Distance_from_disruption<- as.numeric(gsub("A","",names(x[,1:23])))-as.numeric(x[1,"site"])-1
  # Use the 'desc' of the first row only, consistently with Editing_levels and
  # 'site' above: repeating the whole x$desc column would silently recycle (and
  # duplicate) the rows whenever a group holds more than one construct.
  Distance_from_disruption_df<-data.frame(Distance_from_mistmatch=Distance_from_disruption,Editing_level_A_to_I=Editing_levels, Mismatch_kind=rep(x$desc[1], length(Editing_levels)))
  return(Distance_from_disruption_df)
})
Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch)

# Split the data based on the mismatch kind to adjust the proper distance
Delta_editing_mistmatch_df_1_3_list<-split(Delta_editing_mistmatch_df,f=Delta_editing_mistmatch_df$Mismatch_kind)
# Select the groups by NAME rather than by position: positional indexing would
# silently pick another mismatch kind if one of the kinds were absent from the data.
stopifnot(all(c("mismatch1","mismatch3") %in% names(Delta_editing_mistmatch_df_1_3_list)))
Delta_editing_mistmatch_1<-Delta_editing_mistmatch_df_1_3_list[["mismatch1"]]
Delta_editing_mistmatch_3<-Delta_editing_mistmatch_df_1_3_list[["mismatch3"]]
# The 3-nt mismatch distances are shifted by one more position
Delta_editing_mistmatch_3$Distance_from_mistmatch<-Delta_editing_mistmatch_3$Distance_from_mistmatch-1
Delta_editing_mistmatch_df<-rbind(Delta_editing_mistmatch_1,Delta_editing_mistmatch_3)
# Drop the positions without an editing measurement
Delta_editing_mistmatch_df<-Delta_editing_mistmatch_df[!is.na(Delta_editing_mistmatch_df$Editing_level_A_to_I),]

# Create a mapping for mismatch labels
mismatch.labs <- c("Mismatch 1 nucleotide", "Mismatch 3 nucleotides")
names(mismatch.labs) <- c("mismatch1", "mismatch3")

# Split the data by mismatch kind
Delta_editing_mistmatch_df_list<-split(Delta_editing_mistmatch_df,f=Delta_editing_mistmatch_df$Mismatch_kind)

# Apply LOESS smoothing (span = 0.05) to each subset: the fitted delta editing
# as a function of the distance from the mismatch is stored in 'smoothed5'.
Delta_editing_mistmatch_df_list_LOESS<- mclapply(Delta_editing_mistmatch_df_list, mc.cores = 8, function(z){
  loessMod50 <- loess(Editing_level_A_to_I ~ Distance_from_mistmatch, data=z, span=0.05)
  z$smoothed5 <- predict(loessMod50)
  return(z)
})
Delta_editing_mistmatch_df<-do.call(rbind,Delta_editing_mistmatch_df_list_LOESS)

# Keep the 3-nt mismatch constructs
ADAR1_ADAR2_3nuc_mis_list_df <- Delta_editing_mistmatch_df[
  Delta_editing_mistmatch_df$Mismatch_kind == "mismatch3",
]

# Group the measurements by their distance from the mismatch
ADAR2nuc_mis_ADAR1 <- split(
  ADAR1_ADAR2_3nuc_mis_list_df,
  f = ADAR1_ADAR2_3nuc_mis_list_df$Distance_from_mistmatch
)

# For every distance, append the quantiles of the delta editing (columns Q_0,
# Q_0.25, Q_0.5, Q_0.75 and Q_1) to each row of the group.
ADAR1_3nuc_mis_ADAR2_list <- mclapply(
  ADAR2nuc_mis_ADAR1,
  function(distance_group) {
    quartile_probs <- c(0, 0.25, 0.5, 0.75, 1)
    distance_group <- as.data.frame(distance_group)
    quartiles <- data.frame(t(quantile(distance_group$Editing_level_A_to_I, probs = quartile_probs)))
    colnames(quartiles) <- paste0("Q_", quartile_probs)
    cbind(distance_group, quartiles)
  },
  mc.cores = 8
)

ADAR2_3nuc_mis_Var_Arm_list_df <- do.call(rbind, ADAR1_3nuc_mis_ADAR2_list)
ADAR2_3nuc_mis_Var_Arm_list_df$Editing_level_A_to_I <- as.numeric(
  ADAR2_3nuc_mis_Var_Arm_list_df$Editing_level_A_to_I
)
# x positions of the dashed vertical guide lines
vertical.lines <- c(-26, 30, -2, -13)

# Summary plot for the 3nt-mismatch carrying constructs: LOESS-smoothed delta
# editing (blue line) with the Q_0.25-Q_0.75 band as a ribbon.
p.1 <- ggplot(ADAR2_3nuc_mis_Var_Arm_list_df, aes(Distance_from_mistmatch, smoothed5)) +
  geom_line(size = 0.75, colour = "Blue") +
  geom_ribbon(aes(ymin = Q_0.25, ymax = Q_0.75), alpha = 0.3) +
  xlab("Distance from mistmatch") +
  ylab("Delta editing") +
  theme_classic() +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    legend.title = element_blank(),
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(colour = "black", size = 11, angle = 90, vjust = 0.5, hjust = 1),
    axis.title = element_text(face = "bold", size = 12),
    axis.text.y = element_text(colour = "black", size = 11)
  ) +
  geom_vline(size = 0.5, xintercept = vertical.lines, color = "#666699", alpha = 0.6, linetype = "dashed") +
  ggtitle("B2 Variable arm: mismatch3 ") + #+facet_grid(~ADAR)
  scale_x_continuous(n.breaks = 12) +
  scale_y_continuous(n.breaks = 8, limits = c(-80, 15))
plot(p.1)