Load Packages

#install.packages("lubridate")
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Part 1. Data Clean & Organization

## 1.1 Read Data
data2012 <- read.csv('Copy of Trauma%20Scene%20Times%20Raw%20Data (1).xlsx - 2012.csv')
data2013 <- read.csv('Copy of Trauma%20Scene%20Times%20Raw%20Data (1).xlsx - 2013.csv')
data2014 <- read.csv('Copy of Trauma%20Scene%20Times%20Raw%20Data (1).xlsx - 2014.csv')
data2015 <- read.csv('Copy of Trauma%20Scene%20Times%20Raw%20Data (1).xlsx - 2015.csv')
data2016 <- read.csv('Copy of Trauma%20Scene%20Times%20Raw%20Data (1).xlsx - 2016.csv')

## 1.2 Combain 2012 - 2016 data sets
# Rename columns in data2015
names(data2015)[names(data2015) == "Missing.Diagnosis"] <- "Pt...Diagnosis"
names(data2015)[names(data2015) == "Calc...Transport.Mins"] <- "Calc...Transfer.Mins"
# Rename columns in data2016 if necessary
names(data2016)[names(data2016) == "Calc...Transport.Mins"] <- "Calc...Transfer.Mins"
# Now, try to bind the data frames again
data <- rbind(data2012, data2013, data2014, data2015, data2016)

## 1.3 Select the columns that we are interesting and rename
select_data <- data.frame(
  Date = data$Date,
  Weight = data$Pt...Weight,
  MOI = data$X6.Categories,
  #Twelve_cats = data$X12.Categories,
  Arrival = data$Time.Unit.Arrival,
  Depart = data$Time.Unit.Depart,
  Time_period = data$Calc...Scene.Mins,
  Response_priority = data$Resp..Priority,
  Transportation_priority = data$Trans..Priority
)

## 1.4 Remove NA Missing Values
cleaned_data <- na.omit(select_data)
colSums(is.na(cleaned_data))
##                    Date                  Weight                     MOI 
##                       0                       0                       0 
##                 Arrival                  Depart             Time_period 
##                       0                       0                       0 
##       Response_priority Transportation_priority 
##                       0                       0
## 1.5 Add new Year, Month and Date columns
library(lubridate)
cleaned_data$Year <- year(mdy(cleaned_data$Date))
cleaned_data$Month <- as.numeric(format(as.Date(cleaned_data$Date, format="%m/%d/%Y"), "%m"))
cleaned_data$Day <- as.numeric(format(as.Date(cleaned_data$Date, format="%m/%d/%Y"), "%d"))

## 1.6 Clean Six
cleaned_data$MOI <- tolower(cleaned_data$MOI)
#cleaned_data$Twelve_cats[cleaned_data$Twelve_cats == "Fall - Other"] <- "Other"

## 1.7 Add new columns tenmins 1 means under ten mins, and 0 means over ten mins Y
cleaned_data$Under_10_mins <- ifelse(cleaned_data$Time_period <= 10, 1, 0)

## 1.8 Convert arrive and depart time to mins
convert_time <- function(time_str) {
  time <- strptime(time_str, format="%I:%M:%S %p")
  return(as.numeric(format(time, "%H"))*60 + as.numeric(format(time, "%M")) + as.numeric(format(time, "%S"))/60)
}

cleaned_data$Arrival_min <- sapply(cleaned_data$Arrival, convert_time)
cleaned_data$Depart_min <- sapply(cleaned_data$Depart, convert_time)


## 1.9 create new column of 10-mins policy issued
cleaned_data$Date <- as.Date(cleaned_data$Date, format="%m/%d/%Y")
cleaned_data$Intervention <- ifelse(cleaned_data$Date < as.Date("2014-12-18"), "A_pre", "B_post")

# 1.10 New columns of division of the day 
# Function to categorize time into part of the day
get_time_of_day <- function(minutes) {
  hour <- minutes / 60  # Convert minutes to hours
  
  if (hour >= 5 && hour < 12) {
    return('Morning')
  } else if (hour >= 12 && hour < 17) {
    return('Afternoon')
  } else if (hour >= 17 && hour < 21) {
    return('Evening')
  } else {
    return('Night')
  }
}

# Apply the function to Arrival_min and Depart_min
cleaned_data$Arrival_Time_of_Day <- sapply(cleaned_data$Arrival_min, get_time_of_day)
cleaned_data$Depart_Time_of_Day <- sapply(cleaned_data$Depart_min, get_time_of_day)
table(cleaned_data$Arrival_Time_of_Day, cleaned_data$Depart_Time_of_Day)
##            
##             Afternoon Evening Morning Night
##   Afternoon      1089      51       0     0
##   Evening           0    1191       0    54
##   Morning          32       0     881     0
##   Night             0       0      21  1638
# 1.11 Drop the errors of time_period
# Replace values in Time_period with NA when they are equal to 1395.00
cleaned_data$Time_period[cleaned_data$Time_period == 1395.00] <- NA
cleaned_data <- na.omit(cleaned_data)
# Check if there are any negative values in Time_period
has_negatives <- any(cleaned_data$Time_period < 0)
# Print the result
print(has_negatives)
## [1] FALSE
summary(cleaned_data$Time_period)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.88   11.00   12.46   15.00  199.00
## 1.12 category to factor 
cleaned_data$MOI <- as.factor(cleaned_data$MOI)
#cleaned_data$Twelve_cats <- as.factor(cleaned_data$Twelve_cats)
cleaned_data$Intervention <- as.factor(cleaned_data$Intervention)
cleaned_data$Response_priority <- as.factor(cleaned_data$Response_priority)
cleaned_data$Transportation_priority <- as.factor(cleaned_data$Transportation_priority)
cleaned_data$Under_10_mins <- factor(cleaned_data$Under_10_mins, levels = c(0, 1))
cleaned_data$Arrival_Time_of_Day <- as.factor(cleaned_data$Arrival_Time_of_Day)
cleaned_data$Depart_Time_of_Day <- as.factor(cleaned_data$Depart_Time_of_Day)
colSums(is.na(cleaned_data))
##                    Date                  Weight                     MOI 
##                       0                       0                       0 
##                 Arrival                  Depart             Time_period 
##                       0                       0                       0 
##       Response_priority Transportation_priority                    Year 
##                       0                       0                       0 
##                   Month                     Day           Under_10_mins 
##                       0                       0                       0 
##             Arrival_min              Depart_min            Intervention 
##                       0                       0                       0 
##     Arrival_Time_of_Day      Depart_Time_of_Day 
##                       0                       0
## 1.13 Save the final data set
#write.csv(cleaned_data, file = "final_data.csv", row.names = FALSE)

Part 2. EDA

2.1 Plot the histogram for each numeric varaibles

# For numeric
hist(cleaned_data$Weight, main="Distribution of Weight", xlab="Weight (lb)", ylab="Frequency")

# For Category
bp <- function(categorical_data) {
  # Check if the input is a factor or character, if not, return an error message
  if (!is.factor(categorical_data) && !is.character(categorical_data)) {
    stop("The input data must be a categorical (factor or character) variable.")
  }

  # Calculate counts and percentages
  data_summary <- as.data.frame(table(categorical_data)) %>%
    rename(Category = categorical_data) %>%
    mutate(Percentage = Freq / sum(Freq) * 100)

  # Create the plot
  ggplot(data_summary, aes(x = Category, y = Freq, fill = Category)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = paste0(Freq, " (", round(Percentage, 1), "%)")),
              position = position_stack(vjust = 0.5), size = 3) +
    labs(title = paste("Bar Plot of Year 2012 to 2016"),#,#deparse(substitute(categorical_data))),
         x = "Year",
         y = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Adjust text angle for readability
}

bp2 <- function(categorical_dataY, categorical_data2) {
  # Check if the inputs are categorical
  if (!is.factor(categorical_dataY) && !is.character(categorical_dataY)) {
    stop("categorical_dataY must be a categorical (factor or character) variable.")
  }
  if (!is.factor(categorical_data2) && !is.character(categorical_data2)) {
    stop("categorical_data2 must be a categorical (factor or character) variable.")
  }

  # Calculate counts and percentages
  data_summary <- as.data.frame(table(categorical_dataY, categorical_data2)) %>%
    rename(Category = categorical_dataY, Subcategory = categorical_data2) %>%
    group_by(Category) %>%
    mutate(Percentage = Freq / sum(Freq) * 100)
  
  # Create the plot
  ggplot(data_summary, aes(x = Category, y = Freq, fill = Subcategory)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = paste0(Freq, " (", round(Percentage, 1), "%)")),
              position = position_stack(vjust = 0.5), size = 3) +
    labs(title = paste("Bar Plot of Intervention vs. Transportation Priority"),#, deparse(substitute(categorical_dataY)), "vs", deparse(substitute(categorical_data2))),
         x = "Intervention",
         y = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

heatmap_for_two_categories <- function(cat_data1, cat_data2) {
  # Check if both inputs are categorical (factor or character)
  if (!is.factor(cat_data1) && !is.character(cat_data1)) {
    stop("cat_data1 should be a factor or a character vector.")
  } else {
    cat_data1 <- as.factor(cat_data1)
  }
  if (!is.factor(cat_data2) && !is.character(cat_data2)) {
    stop("cat_data2 should be a factor or a character vector.")
  } else {
    cat_data2 <- as.factor(cat_data2)
  }

  # Create a data frame of counts and calculate percentages
  data_summary <- data.frame(cat_data1, cat_data2) %>%
    count(cat_data1, cat_data2) %>%
    mutate(percentage = n / sum(n) * 100)

  # Create the heatmap
  ggplot(data_summary, aes(x = cat_data1, y = cat_data2, fill = n)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "red") +  # Adjust to shades of red
    labs(title = "Heatmap of Category Combinations",
         x = deparse(substitute(cat_data1)),
         y = deparse(substitute(cat_data2)),
         fill = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 7),  # Adjust text angle and size
          axis.text.y = element_text(size = 7)) +
    geom_text(aes(label = paste0(n, "\n(", round(percentage, 1), "%)")), size = 3) # Add count and percentage labels
}

# Usage Example
bp(cleaned_data$Under_10_mins)

bp(cleaned_data$Intervention)

bp(cleaned_data$MOI)

bp(cleaned_data$Response_priority)