|
|
@@ -1,20 +1,18 @@
|
|
|
+# Extract fouls along with contextual information.
|
|
|
+# Save for later processing.
|
|
|
library(tidyverse)
|
|
|
|
|
|
-# use tidyverse csv reader
|
|
|
-season <- read_csv("data/2017-18_pbp.csv")
|
|
|
+# load all play-by-play data files into a single data frame
|
|
|
+# with the source filename as an ID column
|
|
|
+datafiles <- list.files("data", pattern = "_pbp\\.csv$", full.names = FALSE)
|
|
|
|
|
|
-## for later:
|
|
|
-## load in all data files and create a single variable
|
|
|
-## with the filename as an ID column
|
|
|
-#datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
|
|
|
-#
|
|
|
-#seasons <- purrr:map_df(datafiles,
|
|
|
-# ~read_csv(paste0("data/", .x)),
|
|
|
-# .id = "filename")
|
|
|
+seasons <- purrr::map_df(purrr::set_names(datafiles),
|
|
|
+ ~read_csv(paste0("data/", .x)),
|
|
|
+ .id = "filename")
|
|
|
|
|
|
# construct a new dataframe which is a subset of the original
|
|
|
# also ensure that SCOREMARGIN is a number and create an integer gameID
|
|
|
-season_subset <- season %>%
|
|
|
+season_subset <- seasons %>%
|
|
|
dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
|
|
|
SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
|
|
|
mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
|
|
|
@@ -84,40 +82,5 @@ allfouls <- mutate(allfouls,
|
|
|
!is.na(VISITORDESCRIPTION) ~ "VISITOR"))
|
|
|
|
|
|
# save the derived "all fouls" dataframe
|
|
|
-write_csv(allfouls, 'data/2017-18_foulsonly.csv')
|
|
|
+write_csv(allfouls, 'data/foulsonly.csv')
|
|
|
|
|
|
-## Plots
|
|
|
-# histogram of fouls as a function of corrected score margin
|
|
|
-png('figures/foul_histogram-all.png')
|
|
|
-ggplot(allfouls, aes(x=SCOREMARGIN_CORR)) +
|
|
|
- geom_histogram(binwidth=1, fill="black") +
|
|
|
- theme_bw() +
|
|
|
- scale_y_log10() +
|
|
|
- xlab("Score Margin") + ylab("N Fouls")
|
|
|
-
|
|
|
-dev.off()
|
|
|
-
|
|
|
-# histogram of fouls as a function of corrected score margin,
|
|
|
-# ignoring overtime and the final minute of regular play
|
|
|
-earlyfouls <- filter(allfouls,
|
|
|
- PERIOD <= 4,
|
|
|
- !(PERIOD == 4 & PCTIMESTRING < "00:01:00"))
|
|
|
-
|
|
|
-png('figures/foul_histogram-regular_nofinalmin.png')
|
|
|
-ggplot(earlyfouls, aes(x=SCOREMARGIN_CORR)) +
|
|
|
- geom_histogram(binwidth=1, fill="red", alpha=0.5) +
|
|
|
- theme_bw() +
|
|
|
- scale_y_log10() +
|
|
|
- xlab("Score Margin") + ylab("N Fouls") +
|
|
|
- geom_histogram(data=allfouls, binwidth=1, fill="green", alpha=0.5)
|
|
|
-dev.off()
|
|
|
-
|
|
|
-# hexbin plots of fouls as a function of total score and corrected score
|
|
|
-# margin, separated by home and away teams
|
|
|
-png('figures/fouls_totalscore-hexbin.png', height=600, width=1200)
|
|
|
-ggplot(allfouls, aes(SCOREMARGIN_CORR, TOTALSCORE)) +
|
|
|
- geom_hex() +
|
|
|
- scale_fill_viridis_c() +
|
|
|
- theme_bw() +
|
|
|
- facet_wrap(vars(FOULTEAM))
|
|
|
-dev.off()
|