|
@@ -0,0 +1,123 @@
|
|
|
|
|
+library(tidyverse)
|
|
|
|
|
+
|
|
|
|
|
+# use tidyverse csv reader
|
|
|
|
|
+season <- read_csv("data/2017-18_pbp.csv")
|
|
|
|
|
+
|
|
|
|
|
+## for later:
|
|
|
|
|
+## load in all data files and create a single variable
|
|
|
|
|
+## with the filename as an ID column
|
|
|
|
|
+#datafiles <- list.files("data", full.names = TRUE)
|
|
|
|
|
+#
|
|
|
|
|
+#seasons <- purrr:map_df(datafiles,
|
|
|
|
|
+# ~read_csv(.x),
|
|
|
|
|
+# .id = "filename")
|
|
|
|
|
+
|
|
|
|
|
+# construct a new ID column which is a combination of EVENTNUM and GAME_ID
|
|
|
|
|
+# and only retain a subset of the original columns
|
|
|
|
|
+season_subset <- season %>%
|
|
|
|
|
+ dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
|
|
|
|
|
+ SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
|
|
|
|
|
+ mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
|
|
|
|
|
+ mutate(GAME_ID_INT = as.integer(GAME_ID))
|
|
|
|
|
+
|
|
|
|
|
+# find the home team foul entries
|
|
|
|
|
+homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE)
|
|
|
|
|
+# and the visitor fouls (not doing pre-NA checking is faster)
|
|
|
|
|
+visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE)
|
|
|
|
|
+
|
|
|
|
|
+# combine foul indices and get unique entries and create for only fouls
|
|
|
|
|
+allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
|
|
|
|
|
+
|
|
|
|
|
+# now go back and search for the game score.
|
|
|
|
|
+
|
|
|
|
|
+# find the closest previous event number with a score margin entry (can be the
|
|
|
|
|
+# same event number) and return the score margin
|
|
|
|
|
+find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
|
|
|
|
|
+ score_margin <- filter(season_subset,
|
|
|
|
|
+ GAME_ID_INT == gameid,
|
|
|
|
|
+ EVENTNUM <= eventnum,
|
|
|
|
|
+ !is.na(SCOREMARGIN)) %>%
|
|
|
|
|
+ arrange(EVENTNUM) %>%
|
|
|
|
|
+ pull(colname)
|
|
|
|
|
+ # if no score margin found, there is no score in the game
|
|
|
|
|
+ # return 0 (though this actually returns NA)
|
|
|
|
|
+ tail_scoremargin <- ifelse(!is.null(score_margin),
|
|
|
|
|
+ yes=tail(score_margin, 1),
|
|
|
|
|
+ no=0)
|
|
|
|
|
+ return(tail_scoremargin)
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+# associate score margins with the fouls
|
|
|
|
|
+scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
|
|
|
|
|
+ .y = allfouls$EVENTNUM,
|
|
|
|
|
+ ~find_score_margin(gameid= .x,
|
|
|
|
|
+ eventnum = .y,
|
|
|
|
|
+ colname="SCOREMARGIN"))
|
|
|
|
|
+# associate scores with the fouls
|
|
|
|
|
+scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
|
|
|
|
|
+ .y = allfouls$EVENTNUM,
|
|
|
|
|
+ ~find_score_margin(gameid = .x,
|
|
|
|
|
+ eventnum = .y,
|
|
|
|
|
+ colname ="SCORE"))
|
|
|
|
|
+
|
|
|
|
|
+allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
|
|
|
|
|
+allfouls$SCORE <- replace_na(scores, "0-0")
|
|
|
|
|
+
|
|
|
|
|
+# corrected score margin is for consistently plotting the number of fouls
|
|
|
|
|
+# when either the home or visiting team is ahead
|
|
|
|
|
+allfouls <- mutate(allfouls,
|
|
|
|
|
+ SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
|
|
|
|
|
+ is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
|
|
|
|
|
+# add a total score column to the fouls data frame
|
|
|
|
|
+sum_score <- function(scorestr) {
|
|
|
|
|
+ parts <- str_split(scorestr, '-', simplify=TRUE)
|
|
|
|
|
+ totalscore <- as.integer(parts[1]) + as.integer(parts[2])
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+allfouls <- mutate(allfouls,
|
|
|
|
|
+ TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
|
|
|
|
|
+ ~sum_score(scorestr = .x)))
|
|
|
|
|
+
|
|
|
|
|
+# add a column specifying whether the foul was on home or visitor
|
|
|
|
|
+allfouls <- mutate(allfouls,
|
|
|
|
|
+ FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
|
|
|
|
|
+ !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
|
|
|
|
|
+
|
|
|
|
|
+# save the derived "all fouls" dataframe
|
|
|
|
|
+write_csv(allfouls, 'data/2017-18_foulsonly.csv')
|
|
|
|
|
+
|
|
|
|
|
+## Plots
|
|
|
|
|
+# histogram of fouls as a function of corrected score margin
|
|
|
|
|
+png('figures/foul_histogram-all.png')
|
|
|
|
|
+ggplot(allfouls, aes(x=SCOREMARGIN_CORR)) +
|
|
|
|
|
+ geom_histogram(binwidth=1, fill="black") +
|
|
|
|
|
+ theme_bw() +
|
|
|
|
|
+ scale_y_log10() +
|
|
|
|
|
+ xlab("Score Margin") + ylab("N Fouls")
|
|
|
|
|
+
|
|
|
|
|
+dev.off()
|
|
|
|
|
+
|
|
|
|
|
+# histogram of fouls as a function of corrected score margin,
|
|
|
|
|
+# ignoring overtime and the final minute of regular play
|
|
|
|
|
+earlyfouls <- filter(allfouls,
|
|
|
|
|
+ PERIOD <= 4,
|
|
|
|
|
+ !(PERIOD == 4 & PCTIMESTRING < "00:01:00"))
|
|
|
|
|
+
|
|
|
|
|
+png('figures/foul_histogram-regular_nofinalmin.png')
|
|
|
|
|
+ggplot(earlyfouls, aes(x=SCOREMARGIN_CORR)) +
|
|
|
|
|
+ geom_histogram(binwidth=1, fill="red", alpha=0.5) +
|
|
|
|
|
+ theme_bw() +
|
|
|
|
|
+ scale_y_log10() +
|
|
|
|
|
+ xlab("Score Margin") + ylab("N Fouls") +
|
|
|
|
|
+ geom_histogram(data=allfouls, binwidth=1, fill="green", alpha=0.5)
|
|
|
|
|
+dev.off()
|
|
|
|
|
+
|
|
|
|
|
+# hexbin plots of fouls as a function of total score and corrected score
|
|
|
|
|
+# margin, separated by home and away teams
|
|
|
|
|
+png('figures/fouls_totalscore-hexbin.png', height=600, width=1200)
|
|
|
|
|
+ggplot(allfouls, aes(SCOREMARGIN_CORR, TOTALSCORE)) +
|
|
|
|
|
+ geom_hex() +
|
|
|
|
|
+ scale_fill_viridis_c() +
|
|
|
|
|
+ theme_bw() +
|
|
|
|
|
+ facet_wrap(vars(FOULTEAM))
|
|
|
|
|
+dev.off()
|