# Extract fouls along with contextual information. # Save for later processing. library(tidyverse) ## define some useful functions for later # find the closest previous event number with a score margin entry (can be the # same event number) and return the score margin find_score_margin <- function(seasondata, gameid, eventnum, colname = "SCOREMARGIN"){ score_margin <- filter(seasondata, GAME_ID_INT == gameid, EVENTNUM <= eventnum, !is.na(SCOREMARGIN)) %>% arrange(EVENTNUM) %>% pull(colname) # if no score margin found, there is no score in the game # return 0 (though this actually returns NA) tail_scoremargin <- ifelse(!is.null(score_margin), yes=tail(score_margin, 1), no=0) return(tail_scoremargin) } # add a total score column to the fouls data frame sum_score <- function(scorestr) { parts <- str_split(scorestr, '-', simplify=TRUE) totalscore <- as.integer(parts[1]) + as.integer(parts[2]) } # function which reduces the season data into only the foul information # needed for analysis reduce_season <- function(filename) { seasonID = strsplit(filename, '_pbp.csv')[[1]][1] # load a single-season of data season <- read_csv(paste0("data/", filename)) # construct a new dataframe which is a subset of the original # also ensure that SCOREMARGIN is a number and create an integer gameID # SCOREMARGIN is Home-Away score # the foul was committed by PLAYER1 against PLAYER2 season_subset <- season %>% dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION, SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>% mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>% mutate(GAME_ID_INT = as.integer(GAME_ID)) # clear original season data from memory rm(season) # find the home team foul entries homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE) # and the visitor fouls visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE) # combine foul indices and get unique entries and create for only fouls allfouls <- season_subset[unique(c(homefouls, visitorfouls)),] # now go back and search for the game score. # associate score margins with the fouls scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT, .y = allfouls$EVENTNUM, ~find_score_margin(season_subset, gameid= .x, eventnum = .y, colname="SCOREMARGIN")) # associate scores with the fouls scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT, .y = allfouls$EVENTNUM, ~find_score_margin(season_subset, gameid = .x, eventnum = .y, colname ="SCORE")) allfouls$SCOREMARGIN <- replace_na(scoremargins, 0) allfouls$SCORE <- replace_na(scores, "0-0") # corrected score margin is for consistently plotting the number of fouls # when either the home or visiting team is ahead allfouls <- mutate(allfouls, SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN, is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN)) allfouls <- mutate(allfouls, TOTALSCORE = purrr::map_int(.x = allfouls$SCORE, ~sum_score(scorestr = .x))) # add a column specifying whether the foul was on home or visitor allfouls <- mutate(allfouls, FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME", !is.na(VISITORDESCRIPTION) ~ "VISITOR")) # save the derived "all fouls" dataframe write_csv(allfouls, paste0('data/', paste0(seasonID, '-allfouls.csv'))) } # load in all data files and create a single variable # with the filename as an ID column datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE) purrr::map_df(datafiles, ~reduce_season(.x))