gen_allfouls.R 3.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # Extract fouls along with contextual information.
  2. # Save for later processing.
  3. library(tidyverse)
  4. # load in all data files and create a single variable
  5. # with the filename as an ID column
  6. datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
  7. seasons <- purrr::map_df(datafiles,
  8. ~read_csv(paste0("data/", .x)),
  9. .id = "filename")
  10. # construct a new dataframe which is a subset of the original
  11. # also ensure that SCOREMARGIN is a number and create an integer gameID
  12. season_subset <- seasons %>%
  13. dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
  14. SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD,
  15. PLAYER1_TEAM_ABBREVIATION, PLAYER1_TEAM_CITY,
  16. PLAYER1_TEAM_NICKNAME) %>%
  17. mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
  18. mutate(GAME_ID_INT = as.integer(GAME_ID))
  19. # find the home team foul entries
  20. homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE)
  21. # and the visitor fouls (not doing pre-NA checking is faster)
  22. visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE)
  23. # combine foul indices and get unique entries and create for only fouls
  24. allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
  25. # now go back and search for the game score.
  26. # find the closest previous event number with a score margin entry (can be the
  27. # same event number) and return the score margin
  28. find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
  29. score_margin <- filter(season_subset,
  30. GAME_ID_INT == gameid,
  31. EVENTNUM <= eventnum,
  32. !is.na(SCOREMARGIN)) %>%
  33. arrange(EVENTNUM) %>%
  34. pull(colname)
  35. # if no score margin found, there is no score in the game
  36. # return 0 (though this actually returns NA)
  37. tail_scoremargin <- ifelse(!is.null(score_margin),
  38. yes=tail(score_margin, 1),
  39. no=0)
  40. return(tail_scoremargin)
  41. }
  42. # associate score margins with the fouls
  43. scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
  44. .y = allfouls$EVENTNUM,
  45. ~find_score_margin(gameid= .x,
  46. eventnum = .y,
  47. colname="SCOREMARGIN"))
  48. # associate scores with the fouls
  49. scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
  50. .y = allfouls$EVENTNUM,
  51. ~find_score_margin(gameid = .x,
  52. eventnum = .y,
  53. colname ="SCORE"))
  54. allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
  55. allfouls$SCORE <- replace_na(scores, "0-0")
  56. # corrected score margin is for consistently plotting the number of fouls
  57. # when either the home or visiting team is ahead
  58. allfouls <- mutate(allfouls,
  59. SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
  60. is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
  61. # add a total score column to the fouls data frame
  62. sum_score <- function(scorestr) {
  63. parts <- str_split(scorestr, '-', simplify=TRUE)
  64. totalscore <- as.integer(parts[1]) + as.integer(parts[2])
  65. }
  66. allfouls <- mutate(allfouls,
  67. TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
  68. ~sum_score(scorestr = .x)))
  69. # add a column specifying whether the foul was on home or visitor
  70. allfouls <- mutate(allfouls,
  71. FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
  72. !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
  73. # save the derived "all fouls" dataframe
  74. write_csv(allfouls, 'data/foulsonly.csv')