| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
- library(tidyverse)
- # use tidyverse csv reader
- season <- read_csv("data/2017-18_pbp.csv")
- ## for later:
- ## load in all data files and create a single variable
- ## with the filename as an ID column
- #datafiles <- list.files("data", full.names = TRUE)
- #
- #seasons <- purrr:map_df(datafiles,
- # ~read_csv(.x),
- # .id = "filename")
- # construct a new ID column which is a combination of EVENTNUM and GAME_ID
- # and only retain a subset of the original columns
- season_subset <- season %>%
- dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
- SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
- mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
- mutate(GAME_ID_INT = as.integer(GAME_ID))
- # find the home team foul entries
- homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE)
- # and the visitor fouls (not doing pre-NA checking is faster)
- visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE)
- # combine foul indices and get unique entries and create for only fouls
- allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
- # now go back and search for the game score.
- # find the closest previous event number with a score margin entry (can be the
- # same event number) and return the score margin
- find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
- score_margin <- filter(season_subset,
- GAME_ID_INT == gameid,
- EVENTNUM <= eventnum,
- !is.na(SCOREMARGIN)) %>%
- arrange(EVENTNUM) %>%
- pull(colname)
- # if no score margin found, there is no score in the game
- # return 0 (though this actually returns NA)
- tail_scoremargin <- ifelse(!is.null(score_margin),
- yes=tail(score_margin, 1),
- no=0)
- return(tail_scoremargin)
- }
- # associate score margins with the fouls
- scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
- .y = allfouls$EVENTNUM,
- ~find_score_margin(gameid= .x,
- eventnum = .y,
- colname="SCOREMARGIN"))
- # associate scores with the fouls
- scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
- .y = allfouls$EVENTNUM,
- ~find_score_margin(gameid = .x,
- eventnum = .y,
- colname ="SCORE"))
- allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
- allfouls$SCORE <- replace_na(scores, "0-0")
- # corrected score margin is for consistently plotting the number of fouls
- # when either the home or visiting team is ahead
- allfouls <- mutate(allfouls,
- SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
- is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
- # add a total score column to the fouls data frame
- sum_score <- function(scorestr) {
- parts <- str_split(scorestr, '-', simplify=TRUE)
- totalscore <- as.integer(parts[1]) + as.integer(parts[2])
- }
- allfouls <- mutate(allfouls,
- TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
- ~sum_score(scorestr = .x)))
- # add a column specifying whether the foul was on home or visitor
- allfouls <- mutate(allfouls,
- FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
- !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
- # save the derived "all fouls" dataframe
- write_csv(allfouls, 'data/2017-18_foulsonly.csv')
- ## Plots
- # histogram of fouls as a function of corrected score margin
- png('figures/foul_histogram-all.png')
- ggplot(allfouls, aes(x=SCOREMARGIN_CORR)) +
- geom_histogram(binwidth=1, fill="black") +
- theme_bw() +
- scale_y_log10() +
- xlab("Score Margin") + ylab("N Fouls")
- dev.off()
- # histogram of fouls as a function of corrected score margin,
- # ignoring overtime and the final minute of regular play
- earlyfouls <- filter(allfouls,
- PERIOD <= 4,
- !(PERIOD == 4 & PCTIMESTRING < "00:01:00"))
- png('figures/foul_histogram-regular_nofinalmin.png')
- ggplot(earlyfouls, aes(x=SCOREMARGIN_CORR)) +
- geom_histogram(binwidth=1, fill="red", alpha=0.5) +
- theme_bw() +
- scale_y_log10() +
- xlab("Score Margin") + ylab("N Fouls") +
- geom_histogram(data=allfouls, binwidth=1, fill="green", alpha=0.5)
- dev.off()
- # hexbin plots of fouls as a function of total score and corrected score
- # margin, separated by home and away teams
- png('figures/fouls_totalscore-hexbin.png', height=600, width=1200)
- ggplot(allfouls, aes(SCOREMARGIN_CORR, TOTALSCORE)) +
- geom_hex() +
- scale_fill_viridis_c() +
- theme_bw() +
- facet_wrap(vars(FOULTEAM))
- dev.off()
|