single_season.R 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. library(tidyverse)
  2. # use tidyverse csv reader
  3. season <- read_csv("data/2017-18_pbp.csv")
  4. ## for later:
  5. ## load in all data files and create a single variable
  6. ## with the filename as an ID column
  7. #datafiles <- list.files("data", full.names = TRUE)
  8. #
  9. #seasons <- purrr:map_df(datafiles,
  10. # ~read_csv(.x),
  11. # .id = "filename")
  12. # construct a new ID column which is a combination of EVENTNUM and GAME_ID
  13. # and only retain a subset of the original columns
  14. season_subset <- season %>%
  15. dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
  16. SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
  17. mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
  18. mutate(GAME_ID_INT = as.integer(GAME_ID))
  19. # find the home team foul entries
  20. homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE)
  21. # and the visitor fouls (not doing pre-NA checking is faster)
  22. visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE)
  23. # combine foul indices and get unique entries and create for only fouls
  24. allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
  25. # now go back and search for the game score.
  26. # find the closest previous event number with a score margin entry (can be the
  27. # same event number) and return the score margin
  28. find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
  29. score_margin <- filter(season_subset,
  30. GAME_ID_INT == gameid,
  31. EVENTNUM <= eventnum,
  32. !is.na(SCOREMARGIN)) %>%
  33. arrange(EVENTNUM) %>%
  34. pull(colname)
  35. # if no score margin found, there is no score in the game
  36. # return 0 (though this actually returns NA)
  37. tail_scoremargin <- ifelse(!is.null(score_margin),
  38. yes=tail(score_margin, 1),
  39. no=0)
  40. return(tail_scoremargin)
  41. }
  42. # associate score margins with the fouls
  43. scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
  44. .y = allfouls$EVENTNUM,
  45. ~find_score_margin(gameid= .x,
  46. eventnum = .y,
  47. colname="SCOREMARGIN"))
  48. # associate scores with the fouls
  49. scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
  50. .y = allfouls$EVENTNUM,
  51. ~find_score_margin(gameid = .x,
  52. eventnum = .y,
  53. colname ="SCORE"))
  54. allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
  55. allfouls$SCORE <- replace_na(scores, "0-0")
  56. # corrected score margin is for consistently plotting the number of fouls
  57. # when either the home or visiting team is ahead
  58. allfouls <- mutate(allfouls,
  59. SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
  60. is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
  61. # add a total score column to the fouls data frame
  62. sum_score <- function(scorestr) {
  63. parts <- str_split(scorestr, '-', simplify=TRUE)
  64. totalscore <- as.integer(parts[1]) + as.integer(parts[2])
  65. }
  66. allfouls <- mutate(allfouls,
  67. TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
  68. ~sum_score(scorestr = .x)))
  69. # add a column specifying whether the foul was on home or visitor
  70. allfouls <- mutate(allfouls,
  71. FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
  72. !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
  73. # save the derived "all fouls" dataframe
  74. write_csv(allfouls, 'data/2017-18_foulsonly.csv')
  75. ## Plots
  76. # histogram of fouls as a function of corrected score margin
  77. png('figures/foul_histogram-all.png')
  78. ggplot(allfouls, aes(x=SCOREMARGIN_CORR)) +
  79. geom_histogram(binwidth=1, fill="black") +
  80. theme_bw() +
  81. scale_y_log10() +
  82. xlab("Score Margin") + ylab("N Fouls")
  83. dev.off()
  84. # histogram of fouls as a function of corrected score margin,
  85. # ignoring overtime and the final minute of regular play
  86. earlyfouls <- filter(allfouls,
  87. PERIOD <= 4,
  88. !(PERIOD == 4 & PCTIMESTRING < "00:01:00"))
  89. png('figures/foul_histogram-regular_nofinalmin.png')
  90. ggplot(earlyfouls, aes(x=SCOREMARGIN_CORR)) +
  91. geom_histogram(binwidth=1, fill="red", alpha=0.5) +
  92. theme_bw() +
  93. scale_y_log10() +
  94. xlab("Score Margin") + ylab("N Fouls") +
  95. geom_histogram(data=allfouls, binwidth=1, fill="green", alpha=0.5)
  96. dev.off()
  97. # hexbin plots of fouls as a function of total score and corrected score
  98. # margin, separated by home and away teams
  99. png('figures/fouls_totalscore-hexbin.png', height=600, width=1200)
  100. ggplot(allfouls, aes(SCOREMARGIN_CORR, TOTALSCORE)) +
  101. geom_hex() +
  102. scale_fill_viridis_c() +
  103. theme_bw() +
  104. facet_wrap(vars(FOULTEAM))
  105. dev.off()