# Authors:     [Anonymized]
# Affiliation: [Anonymized]
# Description: R script to analyse healthcare complaints
# Data format: Healthcare Complaints Analysis Tool (HCAT)
# License:     Creative Commons Attribution-ShareAlike 4.0

################################################################################
# LOAD LIBRARIES AND DATA
################################################################################

library(tidyverse)
library(psych)
library(gdata)
library(vcd)
library(forcats)
library(magrittr)

# load the data (csv) file
data_raw <- read.csv("5 Data.csv")

################################################################################
# DESCRIPTIVES: TRUSTS
################################################################################

nrow(data_raw) # total complaints

# number of trusts, with mean and SD
temp <- data_raw %>%
  group_by(Trust.ID) %>%
  count()

describe(temp$n)

# complaints per 1k admission
describe(data_raw$Complaint.frequency)

# wordcount descriptives
sum(data_raw$Wordcount)
describe(data_raw$Wordcount)

################################################################################
# DESCRIPTIVES: COMPLAINTS
################################################################################

# function to calculate frequencies and percent for categorical variables
freq.and.pct <- function(df, variable) {
  eval(substitute(df %>% count(variable))) %>%
    mutate(pct = round(((n / nrow(
      df
    )) * 100), digits = 0))
}

# calculate frequencies and percentages
freq.and.pct(data_raw, Region)
freq.and.pct(data_raw, Complainant)
freq.and.pct(data_raw, Gender)
freq.and.pct(data_raw, Harm)

################################################################################
# CREATE DATA FRAME OF PROBLEMS
################################################################################

# change from 'wide' to 'long' format; with 'Problem sequence' variable
data <- reshape(
  data_raw,
  varying = 9:32,
  sep = ".",
  direction = "long",
  timevar = "Sequence",
  idvar = "ID"
) 

# order factors
data <- data %>%
  mutate(
    Region = factor(
      Region, level = c(
        "South",
        "Midlands",
        "North")),
    Gender = factor(
      Gender, level = c(
        "Female", 
        "Male", 
        "NA")),
    Complainant = factor(
      Complainant, level = c(
        "Patient", 
        "Family", 
        "Other")),
    Harm = ordered(
      Harm, level = c(
        "None", 
        "Minimal", 
        "Minor", 
        "Moderate", 
        "Major", 
        "Catastrophic")),
    Problem = factor(
      Problem, level = c(
        "Quality",
        "Safety",
        "Communication",
        "Listening",
        "Rights",
        "Environment",
        "Institution",
        "Other")),
    Severity = ordered(
      Severity, level = c(
        "Low", 
        "Medium", 
        "High")),
    Stage = factor(
      Stage, level = c(
        "Admission",
        "Examination",
        "Ward",
        "Procedure",
        "Discharge",
        "Other", 
        "Multiple")), 
    Staff = factor(
      Staff, level = c(
        "Administration",
        "Nursing",
        "Medical",
        "Other",
        "Multiple"))
  )

# remove rows that have no problem (problem = NA)
data <- data[is.na(data$Problem) == FALSE, ]

## create 'domain' variable
data$Domain <- data$Problem %>%
  fct_recode(
    "Clinical"     = "Quality",
    "Clinical"     = "Safety",
    "Relationship" = "Communication",
    "Relationship" = "Listening",
    "Relationship" = "Rights",
    "Management"   = "Environment",
    "Management"   = "Institution",
    "Other"        = "Other"
  )

# order domain factor
data$Domain <- factor(data$Domain,
                      levels = c("Clinical",
                                 "Relationship",
                                 "Management",
                                 "Other"))

################################################################################
# DESCRIPTIVES: PROBLEMS
################################################################################


count(data, ID) %$% sum(n) # total problems
count(data, ID) %$% mean(n) # mean problems
count(data, ID) %$% sd(n) # standard deviation

# calculating frequencies and percentages
freq.and.pct(data, Problem)
freq.and.pct(data, Domain)
freq.and.pct(data, Severity)
freq.and.pct(data, Stage)
freq.and.pct(data, Staff)

# correlation between harm and clinical severity
data %>%
  filter(Domain == "Clinical") %$%
  cor.test(as.numeric(Harm),
           as.numeric(Severity),
           method = "spearm",
           exact = FALSE)

# correlation between harm and relationship severity
data %>%
  filter(Domain == "Relationship") %$%
  cor.test(as.numeric(Harm),
           as.numeric(Severity),
           method = "spearm",
           exact = FALSE)

# correlation between harm and management severity
data %>%
  filter(Domain == "Management") %$%
  cor.test(as.numeric(Harm),
           as.numeric(Severity),
           method = "spearm",
           exact = FALSE)

################################################################################
# DESCRIPTIVES: REGIONAL DIFFERENCES
################################################################################


# does complaint frequency vary by region?
# (note: admission data missing for three trusts)
temp <- data %>%
  select(Trust.ID, Region, Complaint.frequency) %>%
  unique()

# plot differences
ggplot(temp) +
  aes(x = Region, y = Complaint.frequency) +
  geom_boxplot()

# test differences
results_anova <- aov(Complaint.frequency ~ Region, data = temp)
summary(results_anova)

# order complaint-level variables
data_raw$Region <- ordered(data_raw$Region,
                           levels = c("South", "Midlands", "North"))
data_raw$Complainant <- ordered(data_raw$Complainant,
                                levels = c("Patient", "Family", "Other"))
data_raw$Harm <- ordered(
  data_raw$Harm,
  levels = c("None", "Minimal", "Minor", "Moderate", "Major", "Catastrophic")
)

# cross tabulation for Region*Gender (complaint-level)
crosstab <-
  table(data_raw$Region, data_raw$Gender) # create crosstab
t(crosstab) # print crosstab
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals

# cross tabulation for Region*Complainant (complaint-level)
crosstab <-
  table(data_raw$Region, data_raw$Complainant) # create crosstab
t(crosstab) # print crosstab
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals

# cross tabulation for Region*Problem (problem-level)
crosstab <-
  table(data$Region, drop.levels(data$Problem)) # create crosstab
t(crosstab) # print crosstab
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals

# cross tabulation for Region*Harm (complaint-level)
crosstab <- table(data_raw$Region, data_raw$Harm) # create crosstab
t(crosstab) # print crosstab
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals

# calculating percent major and catastrophic harm in each region
total <- as.data.frame(t(crosstab)) %>%
  group_by(Var2) %>%
  tally(Freq)

high <- as.data.frame(t(crosstab)) %>%
  filter(Var1 == "Major" | Var1 == "Catastrophic") %>%
  group_by(Var2) %>%
  tally(Freq)

round((high$n / total$n) * 100, digits = 3)

################################################################################
# FIGURE 3: MOSAIC PLOT OF STAGE BY HARM
################################################################################

# add spaces into levels (other, multiple) to prevent text in figure overlapping
data2 <- data
levels(data2$Stage)[levels(data2$Stage) == "Other"] <- "Other  "
levels(data2$Stage)[levels(data2$Stage) == "Multiple"] <-
  "     Multiple"

# prepare plot
crosstab <- table(data2$Stage, data2$Harm) # create crosstab
table_proportions <- round(prop.table(crosstab) * 100) # calculate pct
cell_values <- c(crosstab) # capture values
rows <- dimnames(crosstab)[[1]] # capture row names
columns <- dimnames(crosstab)[[2]] # capture column names
variables <- c("Stage", "Harm") # give variables names
dimensions <- dim(crosstab)

table_values <-
  structure(
    c(cell_values),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text <-
  structure(
    c(paste(
      table_proportions, "%", " (", cell_values, ")", sep = ""
    )),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text[table_values <= 20] <- "" # removes text if n < 20
table_text[6:7, ] <-   "" # removes text in rows 6 & 7 (too narrow)

# plot
mosaic(
  table_values,
  pop = FALSE,
  gp = shading_Friendly2(h = c(0, 200), c = c(60, 0)),
  keep_aspect_ratio = FALSE,
  spacing = spacing_equal(0.4),
  rot_labels = c(top = 0, left = 0),
  just_labels = c(top = "center", left = "right"),
  margin = c(
    top = 6,
    left = 8,
    bottom = 4,
    right = 1.5
  ),
  offset_varnames = c(top = 1, left = 4.5),
  direction = "v"
)
labeling_cells(text = table_text,
               clip_cells = FALSE)(table_values)

# tables and statistics
t(crosstab) # print crosstab
t(table_proportions) # print cell percent
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals

################################################################################
# FIGURE 4: MOSAIC PLOT OF STAGE BY PROBLEM (ONLY MAJOR & CATASTROPHIC HARM)
################################################################################

# create subset with only major and catastrophic harm
df <- data %>%
  filter(Harm == "Catastrophic" | Harm == "Major") %>%
  filter(Stage != "Other" & Stage != "Multiple") %>%
  filter(Problem != "Other") %>%
  drop.levels(df, reorder = FALSE)

# descriptives for stage in subset
freq.and.pct(df, Stage)

# prepare plot
crosstab <- table(df$Stage, df$Problem) # create crosstab
table_proportions <- round(prop.table(crosstab) * 100) # calculate pct
cell_values <- c(crosstab) # capture values
rows <- dimnames(crosstab)[[1]] # capture row names
columns <- dimnames(crosstab)[[2]] # capture column names
variables <- c("Stage", "Problem") # give variables names
dimensions <- dim(crosstab)

table_values <-
  structure(
    c(cell_values),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text <-
  structure(
    c(paste(
      table_proportions, "%", " (", cell_values, ")", sep = ""
    )),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text[table_values <= 2] <- "" # removes text if n < 2

# plot
mosaic(
  table_values,
  pop = FALSE,
  gp = shading_Friendly2(h = c(0, 200), c = c(60, 0)),
  keep_aspect_ratio = FALSE,
  spacing = spacing_equal(0.4),
  rot_labels = c(top = 0, left = 0),
  just_labels = c(top = "center", left = "right"),
  margin = c(
    top = 6,
    left = 8,
    bottom = 4,
    right = 1.5
  ),
  offset_varnames = c(top = 1, left = 4.5),
  direction = "v"
)
labeling_cells(text = table_text,
               clip_cells = FALSE)(table_values)

# tables and statistics
t(crosstab) # print crosstab
t(table_proportions) # print cell percent
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals

################################################################################
# FIGURE 5: MOSAIC PLOT OF STAGE BY NEAR-MISSES
################################################################################

# filter for high severity with only minor, minimal or no harm
df <- data %>%
  filter(Harm != "Catastrophic" &
           Harm != "Major" & Harm != "Moderate") %>%
  filter(Stage != "Other" & Stage != "Multiple") %>%
  filter(Problem != "Other") %>%
  filter(Severity == "High") %>%
  drop.levels(df, reorder = FALSE)

nrow(df)

# descriptives
freq.and.pct(df, Stage)

# prepare plot
crosstab <- table(df$Stage, df$Problem) # create crosstab
table_proportions <- round(prop.table(crosstab) * 100) # calculate pct
cell_values <- c(crosstab) # capture values
rows <- dimnames(crosstab)[[1]] # capture row names
columns <- dimnames(crosstab)[[2]] # capture column names
variables <- c("Stage", "Problem") # give variables names
dimensions <- dim(crosstab)

table_values <-
  structure(
    c(cell_values),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text <-
  structure(
    c(paste(
      table_proportions, "%", " (", cell_values, ")", sep = ""
    )),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text[table_values <= 1] <- "" # removes text if n <= 1

# plot
mosaic(
  table_values,
  pop = FALSE,
  gp = shading_Friendly2(h = c(0, 200), c = c(60, 0)),
  keep_aspect_ratio = FALSE,
  spacing = spacing_equal(0.4),
  rot_labels = c(top = 0, left = 0),
  just_labels = c(top = "center", left = "right"),
  margin = c(
    top = 6,
    left = 8,
    bottom = 4,
    right = 1.5
  ),
  offset_varnames = c(top = 1, left = 4.5),
  direction = "v"
)
labeling_cells(text = table_text,
               clip_cells = FALSE)(table_values)

# tables and statistics
t(crosstab) # print crosstab
t(table_proportions) # print cell percent
# too many small values for chi square test

################################################################################
# BLINDSPOT: SYSTEMIC PROBLEMS AND HARM
################################################################################

freq.and.pct(data, Sequence)

# create data frame
df_reg <- data_raw
df_reg$Problem.count <- select(data_raw,
                               Problem.1,
                               Problem.2,
                               Problem.3,
                               Problem.4,
                               Problem.5,
                               Problem.6) %>%
  apply(1, function(x)
    length(unique(x))) %>%
  -1 # to discount NA values

df_reg$Staff.count <- select(data_raw,
                             Staff.1,
                             Staff.2,
                             Staff.3,
                             Staff.4,
                             Staff.5,
                             Staff.6) %>%
  apply(1, function(x)
    length(unique(x))) %>%
  -1 # to discount NA values

df_reg$Stage.count <- select(data_raw,
                             Stage.1,
                             Stage.2,
                             Stage.3,
                             Stage.4,
                             Stage.5,
                             Stage.6) %>%
  apply(1, function(x)
    length(unique(x))) %>%
  -1 # to discount NA values

df_reg <-
  select(df_reg, Harm, Problem.count, Staff.count, Stage.count)

df_reg$Harm <- ordered(
  df_reg$Harm,
  levels = c("None",
             "Minimal",
             "Minor",
             "Moderate",
             "Major",
             "Catastrophic")
)

library(MASS, pos = "package:base")
model.polr <- polr(Harm ~ Problem.count + Stage.count + Staff.count,
                   data = df_reg,
                   Hess = TRUE)
summary(model.polr)

# to create p values
(model.polr.table <- coef(summary(model.polr))) # store the table
p <- pnorm(abs(model.polr.table[, "t value"]), lower.tail = FALSE) * 2 # calc p
(model.polr.table <- cbind(model.polr.table, "p value" = p)) # bind p to table
exp(cbind(OR = coef(model.polr), # odds ratios
          confint(model.polr, level = 0.95))) # confidence intervals))

################################################################################
# FIGURE 6: MOSAIC PLOT OF HARM BY NUMBER OF PROBLEMS
################################################################################

# create new data frame
df_comp <- df_reg
df_comp$Problem.count <- factor(df_comp$Problem.count)

df_comp$Problem.count <- fct_collapse(
  df_comp$Problem.count,
  "One" = "1",
  "Two" = "2",
  "Three+" = c("3", "4", "5")
)

df_comp$Problem.count <- factor(df_comp$Problem.count,
                                levels = c("One", "Two", "Three+"))

# descriptives
freq.and.pct(df_comp, Problem.count)

# prepare plot
crosstab <-
  table(df_comp$Problem.count, df_comp$Harm) # create crosstab
table_proportions <- round(prop.table(crosstab) * 100) # calculate pct
cell_values <- c(crosstab) # capture values
rows <- dimnames(crosstab)[[1]] # capture row names
columns <- dimnames(crosstab)[[2]] # capture column names
variables <- c("Number of problems", "Harm") # give variables names
dimensions <- dim(crosstab)

table_values <-
  structure(
    c(cell_values),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text <-
  structure(
    c(paste(
      table_proportions, "%", " (", cell_values, ")", sep = ""
    )),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text[table_values <= 24] <- "" # removes text below 24

# plot
mosaic(
  table_values,
  pop = FALSE,
  gp = shading_Friendly2(h = c(0, 200), c = c(60, 0)),
  # color adjust
  keep_aspect_ratio = FALSE,
  spacing = spacing_equal(0.4),
  rot_labels = c(top = 0, left = 0),
  just_labels = c(top = "center", left = "right"),
  margin = c(
    top = 6,
    left = 8,
    bottom = 4,
    right = 1.5
  ),
  offset_varnames = c(top = 1, left = 4.5),
  direction = "v"
)
labeling_cells(text = table_text,
               clip_cells = FALSE)(table_values)

# tables and statistics
t(crosstab) # print crosstab
t(table_proportions) # print cell percent
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals

################################################################################
# FIGURE 7: MOSAIC PLOT OF PROBLEM BY ORDER REPORTED
################################################################################

df <- data %>%
  filter(Problem != "Other") %>%
  drop.levels(df, reorder = FALSE)

df$Sequence <- as.factor(df$Sequence)
df$Sequence <- fct_collapse(
  df$Sequence,
  "First" = "1",
  "Second" = "2",
  "Third+" = c("3", "4", "5")
)

df$Sequence <- factor(df$Sequence,
                      levels = c("First", "Second", "Third+"))

# descriptives
freq.and.pct(df, Sequence)


# prepare plot
crosstab <- table(df$Sequence, df$Problem) # create crosstab
table_proportions <-
  round(prop.table(crosstab) * 100) # calculate pct
cell_values <- c(crosstab) # capture values
rows <- dimnames(crosstab)[[1]] # capture row names
columns <- dimnames(crosstab)[[2]] # capture column names
variables <- c("Order reported", "Problem") # give variables names
dimensions <- dim(crosstab)

table_values <-
  structure(
    c(cell_values),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

table_text <-
  structure(
    c(paste(
      table_proportions, "%", " (", cell_values, ")", sep = ""
    )),
    .Dim = as.integer(dimensions),
    .Dimnames = structure(list(rows, columns),
                          .Names = c(variables)),
    class = "table"
  )

# plot
mosaic(
  table_values,
  pop = FALSE,
  gp = shading_Friendly2(h = c(0, 200), c = c(60, 0)),
  # color adjust
  keep_aspect_ratio = FALSE,
  spacing = spacing_equal(0.4),
  rot_labels = c(top = 0, left = 0),
  just_labels = c(top = "center", left = "right"),
  margin = c(
    top = 6,
    left = 8,
    bottom = 4,
    right = 1.5
  ),
  offset_varnames = c(top = 1, left = 4.5),
  direction = "v"
)
labeling_cells(text = table_text,
               clip_cells = FALSE)(table_values)

# tables and statistics
t(crosstab) # print crosstab
t(table_proportions) # print cell percent
chisq <- chisq.test(crosstab, correct = FALSE)
chisq
t(chisq$expected) # expected values
t(chisq$residuals) # residuals