gen_allfouls.R 3.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. # Extract fouls along with contextual information.
  2. # Save for later processing.
  3. library(tidyverse)
  # Load every play-by-play CSV found in data/ and stack them into one data
  # frame. The file-name vector is named by itself so that the `.id` column
  # ("filename") holds the originating file name; with an unnamed input it
  # would only hold the position of the file in the list.
  # The pattern is a regular expression (not a glob): it matches names that
  # end in "_pbp.csv".
  datafiles <- list.files("data", pattern = "_pbp\\.csv$", full.names = FALSE)
  if (length(datafiles) == 0) {
    stop("no '*_pbp.csv' files found in 'data'", call. = FALSE)
  }
  seasons <- purrr::map_df(
    purrr::set_names(datafiles),
    ~ read_csv(file.path("data", .x)),
    .id = "filename"
  )
  # Working copy of the play-by-play data restricted to the columns used
  # below. SCOREMARGIN is coerced to numeric (anything that does not parse as
  # a number becomes NA) and GAME_ID_INT holds GAME_ID converted to integer.
  season_subset <- dplyr::select(
    seasons,
    GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
    SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD
  ) %>%
    mutate(
      SCOREMARGIN = as.numeric(SCOREMARGIN),
      GAME_ID_INT = as.integer(GAME_ID)
    )
  # Row positions whose home-team description contains "FOUL".
  homefouls <- grep("FOUL", season_subset[["HOMEDESCRIPTION"]])
  # Row positions whose visitor description contains "FOUL". NA descriptions
  # never match, so they are not filtered out beforehand.
  visitorfouls <- grep("FOUL", season_subset[["VISITORDESCRIPTION"]])
  # Keep every matching row exactly once: home matches first, then any
  # visitor matches that were not already selected.
  allfouls <- season_subset[union(homefouls, visitorfouls), ]
  # Look up the score state in effect at a given event of a game.
  #
  # Searches the global `season_subset` for rows of game `gameid` whose
  # EVENTNUM is <= `eventnum` and whose SCOREMARGIN is not NA, and returns the
  # value of column `colname` from the row with the largest EVENTNUM (this can
  # be the event itself).
  #
  # Arguments:
  #   gameid   - game identifier, compared against GAME_ID_INT.
  #   eventnum - event number to search backwards from (inclusive).
  #   colname  - name (string) of the column to return; "SCOREMARGIN" by
  #              default.
  #
  # Returns a length-one vector of the column's type. When no qualifying row
  # exists (nothing has been scored yet) the result is an NA of that type, so
  # callers can substitute their own default.
  find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN") {
    stopifnot(colname %in% names(season_subset))

    # which() discards comparisons that are NA as well as those that are FALSE.
    candidates <- which(
      season_subset$GAME_ID_INT == gameid &
        season_subset$EVENTNUM <= eventnum &
        !is.na(season_subset$SCOREMARGIN)
    )
    values <- season_subset[[colname]][candidates]

    if (length(values) == 0) {
      # Indexing with NA yields a missing value of the column's own type.
      return(values[NA_integer_])
    }

    # order() is stable, so rows that tie on EVENTNUM keep their file order
    # and the last position is the most recent qualifying event.
    latest <- order(season_subset$EVENTNUM[candidates])[length(candidates)]
    values[[latest]]
  }
  # For every foul, look up the score margin in effect at that event.
  scoremargins <- purrr::map2_dbl(
    allfouls$GAME_ID_INT,
    allfouls$EVENTNUM,
    function(game, event) {
      find_score_margin(gameid = game, eventnum = event, colname = "SCOREMARGIN")
    }
  )
  # Same lookup, this time returning the score string.
  scores <- purrr::map2_chr(
    allfouls$GAME_ID_INT,
    allfouls$EVENTNUM,
    function(game, event) {
      find_score_margin(gameid = game, eventnum = event, colname = "SCORE")
    }
  )
  # Lookups that found nothing come back as NA; fill those with a zero margin
  # and a "0-0" score.
  allfouls[["SCOREMARGIN"]] <- replace_na(scoremargins, 0)
  allfouls[["SCORE"]] <- replace_na(scores, "0-0")
  # SCOREMARGIN_CORR keeps the margin as-is when the home description is
  # missing and flips its sign when the visitor description is missing, so
  # fouls can be plotted consistently whichever team is ahead. Rows where both
  # descriptions are present match neither rule and get NA.
  allfouls <- allfouls %>%
    mutate(
      SCOREMARGIN_CORR = case_when(
        is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
        is.na(VISITORDESCRIPTION) ~ -SCOREMARGIN
      )
    )
  # Helper for the total-score column: add up the two numbers in a score
  # string.
  #
  # Arguments:
  #   scorestr - character vector of scores whose two parts are separated by
  #              "-", e.g. "10-5".
  #
  # Returns an integer vector with one total per element of `scorestr`; an
  # element is NA when the score is NA or one of its parts is missing.
  sum_score <- function(scorestr) {
    parts <- strsplit(as.character(scorestr), "-", fixed = TRUE)
    vapply(
      parts,
      function(p) as.integer(p[1]) + as.integer(p[2]),
      integer(1),
      USE.NAMES = FALSE
    )
  }
  # TOTALSCORE: combined points of both teams, computed from each foul's
  # SCORE string.
  allfouls <- allfouls %>%
    mutate(TOTALSCORE = purrr::map_int(SCORE, sum_score))
  # FOULTEAM labels each foul "HOME" when a home description is present,
  # otherwise "VISITOR" when a visitor description is present. The home rule
  # is checked first, so rows carrying both descriptions are labelled "HOME".
  allfouls <- allfouls %>%
    mutate(
      FOULTEAM = case_when(
        !is.na(HOMEDESCRIPTION) ~ "HOME",
        !is.na(VISITORDESCRIPTION) ~ "VISITOR"
      )
    )
  # Persist the fouls-only data frame for later processing.
  allfouls %>%
    write_csv("data/foulsonly.csv")