gen_allfouls.R 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. # Extract fouls along with contextual information.
  2. # Save for later processing.
  3. library(tidyverse)
  4. ## define some useful functions for later
  5. # find the closest previous event number with a score margin entry (can be the
  6. # same event number) and return the score margin
  7. find_score_margin <- function(seasondata, gameid, eventnum, colname = "SCOREMARGIN"){
  8. score_margin <- filter(seasondata,
  9. GAME_ID_INT == gameid,
  10. EVENTNUM <= eventnum,
  11. !is.na(SCOREMARGIN)) %>%
  12. arrange(EVENTNUM) %>%
  13. pull(colname)
  14. # if no score margin found, there is no score in the game
  15. # return 0 (though this actually returns NA)
  16. tail_scoremargin <- ifelse(!is.null(score_margin),
  17. yes=tail(score_margin, 1),
  18. no=0)
  19. return(tail_scoremargin)
  20. }
  21. # add a total score column to the fouls data frame
  22. sum_score <- function(scorestr) {
  23. parts <- str_split(scorestr, '-', simplify=TRUE)
  24. totalscore <- as.integer(parts[1]) + as.integer(parts[2])
  25. }
  26. # function which reduces the season data into only the foul information
  27. # needed for analysis
  28. reduce_season <- function(filename) {
  29. seasonID = strsplit(filename, '_pbp.csv')[[1]][1]
  30. # load a single-season of data
  31. season <- read_csv(paste0("data/", filename))
  32. # construct a new dataframe which is a subset of the original
  33. # also ensure that SCOREMARGIN is a number and create an integer gameID
  34. # SCOREMARGIN is Home-Away score
  35. # the foul was committed by PLAYER1 against PLAYER2
  36. season_subset <- season %>%
  37. dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
  38. SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
  39. mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
  40. mutate(GAME_ID_INT = as.integer(GAME_ID))
  41. # clear original season data from memory
  42. rm(season)
  43. # find the home team foul entries
  44. homefouls <- grep("FOUL",
  45. x = season_subset$HOMEDESCRIPTION,
  46. value=FALSE)
  47. # and the visitor fouls
  48. visitorfouls <- grep("FOUL",
  49. x = season_subset$VISITORDESCRIPTION,
  50. value=FALSE)
  51. # combine foul indices and get unique entries and create for only fouls
  52. allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
  53. # now go back and search for the game score.
  54. # associate score margins with the fouls
  55. scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
  56. .y = allfouls$EVENTNUM,
  57. ~find_score_margin(season_subset,
  58. gameid= .x,
  59. eventnum = .y,
  60. colname="SCOREMARGIN"))
  61. # associate scores with the fouls
  62. scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
  63. .y = allfouls$EVENTNUM,
  64. ~find_score_margin(season_subset,
  65. gameid = .x,
  66. eventnum = .y,
  67. colname ="SCORE"))
  68. allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
  69. allfouls$SCORE <- replace_na(scores, "0-0")
  70. # corrected score margin is for consistently plotting the number of fouls
  71. # when either the home or visiting team is ahead
  72. allfouls <- mutate(allfouls,
  73. SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
  74. is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
  75. allfouls <- mutate(allfouls,
  76. TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
  77. ~sum_score(scorestr = .x)))
  78. # add a column specifying whether the foul was on home or visitor
  79. allfouls <- mutate(allfouls,
  80. FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
  81. !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
  82. # save the derived "all fouls" dataframe
  83. write_csv(allfouls, paste0('data/', paste0(seasonID, '-allfouls.csv')))
  84. }
  85. # load in all data files and create a single variable
  86. # with the filename as an ID column
  87. datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
  88. purrr::map_df(datafiles,
  89. ~reduce_season(.x))