Browse Source

separate foul table munging from plot generation

George C. Privon 6 years ago
parent
commit
97b6215763
2 changed files with 50 additions and 47 deletions
  1. 40 0
      code/foul_analysis.R
  2. 10 47
      code/gen_allfouls.R

+ 40 - 0
code/foul_analysis.R

@@ -0,0 +1,40 @@
+# Analysis of foul data: read the derived fouls table and write summary figures
+library(tidyverse)
+
+allfouls <- read_csv("data/foulsonly.csv")
+
+## Plots (each figure is written to a PNG file under figures/)
+# histogram of all fouls vs. corrected score margin (binwidth 1, log10 counts)
+png('figures/foul_histogram-all.png')
+ggplot(allfouls, aes(x=SCOREMARGIN_CORR)) +
+    geom_histogram(binwidth=1, fill="black") +
+    theme_bw() +
+    scale_y_log10() +
+    xlab("Score Margin") + ylab("N Fouls")
+
+dev.off()
+
+# histogram of fouls vs. corrected score margin, ignoring overtime and the
+# final minute of regular play (red), with all fouls overlaid (green)
+earlyfouls <- filter(allfouls,
+                     PERIOD <= 4,
+                     !(PERIOD == 4 & PCTIMESTRING < "00:01:00"))  # NOTE(review): depends on PCTIMESTRING format; confirm
+
+png('figures/foul_histogram-regular_nofinalmin.png')
+ggplot(earlyfouls, aes(x=SCOREMARGIN_CORR)) +
+    geom_histogram(binwidth=1, fill="red", alpha=0.5) +
+    theme_bw() +
+    scale_y_log10() +
+    xlab("Score Margin") + ylab("N Fouls") +
+    geom_histogram(data=allfouls, binwidth=1, fill="green", alpha=0.5)
+dev.off()
+
+# hexbin plot of fouls over corrected score margin (x) and total score (y),
+# with one facet per FOULTEAM value
+png('figures/fouls_totalscore-hexbin.png', height=600, width=1200)
+ggplot(allfouls, aes(SCOREMARGIN_CORR, TOTALSCORE)) +
+    geom_hex() +
+    scale_fill_viridis_c() +
+    theme_bw() +
+    facet_wrap(vars(FOULTEAM))
+dev.off()

+ 10 - 47
code/single_season.R → code/gen_allfouls.R

@@ -1,20 +1,18 @@
+# Extract fouls along with contextual information.
+# Save for later processing.
 library(tidyverse)
 
-# use tidyverse csv reader
-season <- read_csv("data/2017-18_pbp.csv")
+# load in all data files and create a single variable
+# with the filename as an ID column
+# `pattern` is a regular expression, not a glob: escape the dot, anchor the end
+datafiles <- list.files("data", pattern = "_pbp\\.csv$", full.names = FALSE)
 
-## for later:
-## load in all data files and create a single variable
-## with the filename as an ID column
-#datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
-#
-#seasons <- purrr:map_df(datafiles,
-#                        ~read_csv(paste0("data/", .x)),
-#                        .id = "filename")
+# read every file into one data frame; the input is named with itself so that
+# .id stores the filename (for an unnamed vector it would store the position)
+seasons <- purrr::map_df(setNames(datafiles, datafiles),
+                         ~read_csv(paste0("data/", .x)),
+                         .id = "filename")
 
 # construct a new dataframe which is a subset of the original
 # also ensure that SCOREMARGIN is a number and create an integer gameID
-season_subset <- season %>%
+season_subset <- seasons %>%
     dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
                   SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
     mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
@@ -84,40 +82,5 @@ allfouls <- mutate(allfouls,
                                         !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
 
 # save the derived "all fouls" dataframe
-write_csv(allfouls, 'data/2017-18_foulsonly.csv')
+write_csv(allfouls, 'data/foulsonly.csv')
 
-## Plots
-# histogram of fouls as a function of corrected score margin
-png('figures/foul_histogram-all.png')
-ggplot(allfouls, aes(x=SCOREMARGIN_CORR)) +
-    geom_histogram(binwidth=1, fill="black") +
-    theme_bw() +
-    scale_y_log10() +
-    xlab("Score Margin") + ylab("N Fouls")
-
-dev.off()
-
-# histogram of fouls as a function of corrected score margin,
-# ignoring overtime and the final minute of regular play
-earlyfouls <- filter(allfouls,
-                     PERIOD <= 4,
-                     !(PERIOD == 4 & PCTIMESTRING < "00:01:00"))
-
-png('figures/foul_histogram-regular_nofinalmin.png')
-ggplot(earlyfouls, aes(x=SCOREMARGIN_CORR)) +
-    geom_histogram(binwidth=1, fill="red", alpha=0.5) +
-    theme_bw() +
-    scale_y_log10() +
-    xlab("Score Margin") + ylab("N Fouls") +
-    geom_histogram(data=allfouls, binwidth=1, fill="green", alpha=0.5)
-dev.off()
-
-# hexbin plots of fouls as a function of total score and corrected score
-# margin, separated by home and away teams
-png('figures/fouls_totalscore-hexbin.png', height=600, width=1200)
-ggplot(allfouls, aes(SCOREMARGIN_CORR, TOTALSCORE)) +
-    geom_hex() +
-    scale_fill_viridis_c() +
-    theme_bw() +
-    facet_wrap(vars(FOULTEAM))
-dev.off()