library(tidyverse) # use tidyverse csv reader season <- read_csv("data/2017-18_pbp.csv") ## for later: ## load in all data files and create a single variable ## with the filename as an ID column #datafiles <- list.files("data", full.names = TRUE) # #seasons <- purrr:map_df(datafiles, # ~read_csv(.x), # .id = "filename") # construct a new ID column which is a combination of EVENTNUM and GAME_ID # and only retain a subset of the original columns season_subset <- season %>% dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION, SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>% mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>% mutate(GAME_ID_INT = as.integer(GAME_ID)) # find the home team foul entries homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE) # and the visitor fouls (not doing pre-NA checking is faster) visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE) # combine foul indices and get unique entries and create for only fouls allfouls <- season_subset[unique(c(homefouls, visitorfouls)),] # now go back and search for the game score. # find the closest previous event number with a score margin entry (can be the # same event number) and return the score margin find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){ score_margin <- filter(season_subset, GAME_ID_INT == gameid, EVENTNUM <= eventnum, !is.na(SCOREMARGIN)) %>% arrange(EVENTNUM) %>% pull(colname) # if no score margin found, there is no score in the game # return 0 (though this actually returns NA) tail_scoremargin <- ifelse(!is.null(score_margin), yes=tail(score_margin, 1), no=0) return(tail_scoremargin) } # associate score margins with the fouls scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT, .y = allfouls$EVENTNUM, ~find_score_margin(gameid= .x, eventnum = .y, colname="SCOREMARGIN")) # associate scores with the fouls scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT, .y = allfouls$EVENTNUM, ~find_score_margin(gameid = .x, eventnum = .y, colname ="SCORE")) allfouls$SCOREMARGIN <- replace_na(scoremargins, 0) allfouls$SCORE <- replace_na(scores, "0-0") # corrected score margin is for consistently plotting the number of fouls # when either the home or visiting team is ahead allfouls <- mutate(allfouls, SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN, is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN)) # add a total score column to the fouls data frame sum_score <- function(scorestr) { parts <- str_split(scorestr, '-', simplify=TRUE) totalscore <- as.integer(parts[1]) + as.integer(parts[2]) } allfouls <- mutate(allfouls, TOTALSCORE = purrr::map_int(.x = allfouls$SCORE, ~sum_score(scorestr = .x))) # add a column specifying whether the foul was on home or visitor allfouls <- mutate(allfouls, FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME", !is.na(VISITORDESCRIPTION) ~ "VISITOR")) # save the derived "all fouls" dataframe write_csv(allfouls, 'data/2017-18_foulsonly.csv') ## Plots # histogram of fouls as a function of corrected score margin png('figures/foul_histogram-all.png') ggplot(allfouls, aes(x=SCOREMARGIN_CORR)) + geom_histogram(binwidth=1, fill="black") + theme_bw() + scale_y_log10() + xlab("Score Margin") + ylab("N Fouls") dev.off() # histogram of fouls as a function of corrected score margin, # ignoring overtime and the final minute of regular play earlyfouls <- filter(allfouls, PERIOD <= 4, !(PERIOD == 4 & PCTIMESTRING < "00:01:00")) png('figures/foul_histogram-regular_nofinalmin.png') ggplot(earlyfouls, aes(x=SCOREMARGIN_CORR)) + geom_histogram(binwidth=1, fill="red", alpha=0.5) + theme_bw() + scale_y_log10() + xlab("Score Margin") + ylab("N Fouls") + geom_histogram(data=allfouls, binwidth=1, fill="green", alpha=0.5) dev.off() # hexbin plots of fouls as a function of total score and corrected score # margin, separated by home and away teams png('figures/fouls_totalscore-hexbin.png', height=600, width=1200) ggplot(allfouls, aes(SCOREMARGIN_CORR, TOTALSCORE)) + geom_hex() + scale_fill_viridis_c() + theme_bw() + facet_wrap(vars(FOULTEAM)) dev.off()