|
|
@@ -2,38 +2,11 @@
|
|
|
# Save for later processing.
|
|
|
library(tidyverse)
|
|
|
|
|
|
-# load in all data files and create a single variable
|
|
|
-# with the filename as an ID column
|
|
|
-datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
|
|
|
-
|
|
|
-seasons <- purrr::map_df(datafiles,
|
|
|
- ~read_csv(paste0("data/", .x)),
|
|
|
- .id = "filename")
|
|
|
-
|
|
|
-# construct a new dataframe which is a subset of the original
|
|
|
-# also ensure that SCOREMARGIN is a number and create an integer gameID
|
|
|
-season_subset <- seasons %>%
|
|
|
- dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
|
|
|
- SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD,
|
|
|
- PLAYER1_TEAM_ABBREVIATION, PLAYER1_TEAM_CITY,
|
|
|
- PLAYER1_TEAM_NICKNAME) %>%
|
|
|
- mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
|
|
|
- mutate(GAME_ID_INT = as.integer(GAME_ID))
|
|
|
-
|
|
|
-# find the home team foul entries
|
|
|
-homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE)
|
|
|
-# and the visitor fouls (not doing pre-NA checking is faster)
|
|
|
-visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE)
|
|
|
-
|
|
|
-# combine foul indices and get unique entries and create for only fouls
|
|
|
-allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
|
|
|
-
|
|
|
-# now go back and search for the game score.
|
|
|
-
|
|
|
+## define some useful functions for later
|
|
|
# find the closest previous event number with a score margin entry (can be the
|
|
|
# same event number) and return the score margin
|
|
|
-find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
|
|
|
- score_margin <- filter(season_subset,
|
|
|
+find_score_margin <- function(seasondata, gameid, eventnum, colname = "SCOREMARGIN"){
|
|
|
+ score_margin <- filter(seasondata,
|
|
|
GAME_ID_INT == gameid,
|
|
|
EVENTNUM <= eventnum,
|
|
|
!is.na(SCOREMARGIN)) %>%
|
|
|
@@ -47,42 +20,85 @@ find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
|
|
|
return(tail_scoremargin)
|
|
|
}
|
|
|
|
|
|
-# associate score margins with the fouls
|
|
|
-scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
|
|
|
- .y = allfouls$EVENTNUM,
|
|
|
- ~find_score_margin(gameid= .x,
|
|
|
- eventnum = .y,
|
|
|
- colname="SCOREMARGIN"))
|
|
|
-# associate scores with the fouls
|
|
|
-scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
|
|
|
- .y = allfouls$EVENTNUM,
|
|
|
- ~find_score_margin(gameid = .x,
|
|
|
- eventnum = .y,
|
|
|
- colname ="SCORE"))
|
|
|
-
|
|
|
-allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
|
|
|
-allfouls$SCORE <- replace_na(scores, "0-0")
|
|
|
-
|
|
|
-# corrected score margin is for consistently plotting the number of fouls
|
|
|
-# when either the home or visiting team is ahead
|
|
|
-allfouls <- mutate(allfouls,
|
|
|
- SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
|
|
|
- is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
|
|
|
# add a total score column to the fouls data frame
|
|
|
sum_score <- function(scorestr) {
|
|
|
parts <- str_split(scorestr, '-', simplify=TRUE)
|
|
|
totalscore <- as.integer(parts[1]) + as.integer(parts[2])
|
|
|
}
|
|
|
|
|
|
-allfouls <- mutate(allfouls,
|
|
|
- TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
|
|
|
- ~sum_score(scorestr = .x)))
|
|
|
+# function which reduces the season data into only the foul information
|
|
|
+# needed for analysis
|
|
|
+reduce_season <- function(filename) {
|
|
|
+
|
|
|
+ seasonID = strsplit(filename, '_pbp.csv')[[1]][1]
|
|
|
+ # load a single-season of data
|
|
|
+ season <- read_csv(paste0("data/", filename))
|
|
|
+ # construct a new dataframe which is a subset of the original
|
|
|
+ # also ensure that SCOREMARGIN is a number and create an integer gameID
|
|
|
+ # SCOREMARGIN is Home-Away score
|
|
|
+ # the foul was committed by PLAYER1 against PLAYER2
|
|
|
+ season_subset <- season %>%
|
|
|
+ dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
|
|
|
+ SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
|
|
|
+ mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
|
|
|
+ mutate(GAME_ID_INT = as.integer(GAME_ID))
|
|
|
|
|
|
-# add a column specifying whether the foul was on home or visitor
|
|
|
-allfouls <- mutate(allfouls,
|
|
|
- FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
|
|
|
- !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
|
|
|
+ # clear original season data from memory
|
|
|
+ rm(season)
|
|
|
|
|
|
-# save the derived "all fouls" dataframe
|
|
|
-write_csv(allfouls, 'data/foulsonly.csv')
|
|
|
+ # find the home team foul entries
|
|
|
+ homefouls <- grep("FOUL",
|
|
|
+ x = season_subset$HOMEDESCRIPTION,
|
|
|
+ value=FALSE)
|
|
|
+ # and the visitor fouls
|
|
|
+ visitorfouls <- grep("FOUL",
|
|
|
+ x = season_subset$VISITORDESCRIPTION,
|
|
|
+ value=FALSE)
|
|
|
+
|
|
|
+ # combine foul indices and get unique entries and create for only fouls
|
|
|
+ allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
|
|
|
+
|
|
|
+ # now go back and search for the game score.
|
|
|
+
|
|
|
+ # associate score margins with the fouls
|
|
|
+ scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
|
|
|
+ .y = allfouls$EVENTNUM,
|
|
|
+ ~find_score_margin(season_subset,
|
|
|
+ gameid= .x,
|
|
|
+ eventnum = .y,
|
|
|
+ colname="SCOREMARGIN"))
|
|
|
+ # associate scores with the fouls
|
|
|
+ scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
|
|
|
+ .y = allfouls$EVENTNUM,
|
|
|
+ ~find_score_margin(season_subset,
|
|
|
+ gameid = .x,
|
|
|
+ eventnum = .y,
|
|
|
+ colname ="SCORE"))
|
|
|
+
|
|
|
+ allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
|
|
|
+ allfouls$SCORE <- replace_na(scores, "0-0")
|
|
|
+
|
|
|
+ # corrected score margin is for consistently plotting the number of fouls
|
|
|
+ # when either the home or visiting team is ahead
|
|
|
+ allfouls <- mutate(allfouls,
|
|
|
+ SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
|
|
|
+ is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
|
|
|
+ allfouls <- mutate(allfouls,
|
|
|
+ TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
|
|
|
+ ~sum_score(scorestr = .x)))
|
|
|
+
|
|
|
+ # add a column specifying whether the foul was on home or visitor
|
|
|
+ allfouls <- mutate(allfouls,
|
|
|
+ FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
|
|
|
+ !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
|
|
|
+
|
|
|
+ # save the derived "all fouls" dataframe
|
|
|
+ write_csv(allfouls, paste0('data/', paste0(seasonID, '-allfouls.csv')))
|
|
|
+}
|
|
|
+
|
|
|
+# load in all data files and create a single variable
|
|
|
+# with the filename as an ID column
|
|
|
+datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
|
|
|
|
|
|
+purrr::map_df(datafiles,
|
|
|
+ ~reduce_season(.x))
|