Quellcode durchsuchen

move back to processing individual seasons, map reduction over individual seasons

George C. Privon vor 6 Jahren
Ursprung
Commit
e05f23966c
1 geänderte Dateien mit 76 neuen und 60 gelöschten Zeilen
  1. 76 60
      code/gen_allfouls.R

+ 76 - 60
code/gen_allfouls.R

@@ -2,38 +2,11 @@
 # Save for later processing.
 library(tidyverse)
 
-# load in all data files and create a single variable
-# with the filename as an ID column
-datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
-
-seasons <- purrr::map_df(datafiles,
-                        ~read_csv(paste0("data/", .x)),
-                        .id = "filename")
-
-# construct a new dataframe which is a subset of the original
-# also ensure that SCOREMARGIN is a number and create an integer gameID
-season_subset <- seasons %>%
-    dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
-                  SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD,
-                  PLAYER1_TEAM_ABBREVIATION, PLAYER1_TEAM_CITY,
-                  PLAYER1_TEAM_NICKNAME) %>%
-    mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
-    mutate(GAME_ID_INT = as.integer(GAME_ID))
-
-# find the home team foul entries
-homefouls <- grep("FOUL", x = season_subset$HOMEDESCRIPTION, value=FALSE)
-# and the visitor fouls (not doing pre-NA checking is faster)
-visitorfouls <- grep("FOUL", x = season_subset$VISITORDESCRIPTION, value=FALSE)
-
-# combine foul indices and get unique entries and create for only fouls
-allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
-
-# now go back and search for the game score.
-
+## define some useful functions for later
 # find the closest previous event number with a score margin entry (can be the
 # same event number) and return the score margin
-find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
-    score_margin <- filter(season_subset,
+find_score_margin <- function(seasondata, gameid, eventnum, colname = "SCOREMARGIN"){
+    score_margin <- filter(seasondata,
                            GAME_ID_INT == gameid,
                            EVENTNUM <= eventnum,
                            !is.na(SCOREMARGIN)) %>%
@@ -47,42 +20,85 @@ find_score_margin <- function(gameid, eventnum, colname = "SCOREMARGIN"){
     return(tail_scoremargin)
 }
 
-# associate score margins with the fouls
-scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
-                                .y = allfouls$EVENTNUM,
-                                ~find_score_margin(gameid= .x,
-                                                   eventnum = .y,
-                                                   colname="SCOREMARGIN"))
-# associate scores with the fouls
-scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
-                          .y = allfouls$EVENTNUM,
-                          ~find_score_margin(gameid = .x,
-                                             eventnum = .y,
-                                             colname ="SCORE"))
-
-allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
-allfouls$SCORE <- replace_na(scores, "0-0")
-
-# corrected score margin is for consistently plotting the number of fouls
-# when either the home or visiting team is ahead
-allfouls <- mutate(allfouls,
-                   SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
-                                                is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
 # add a total score column to the fouls data frame
 sum_score <- function(scorestr) {
     parts <- str_split(scorestr, '-', simplify=TRUE)
     totalscore <- as.integer(parts[1]) + as.integer(parts[2])
 }
 
-allfouls <- mutate(allfouls,
-                   TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
-                                               ~sum_score(scorestr = .x)))
+# function which reduces the season data into only the foul information
+# needed for analysis
+reduce_season <- function(filename) {
+
+    seasonID = strsplit(filename, '_pbp.csv')[[1]][1]
+    # load a single-season of data
+    season <- read_csv(paste0("data/", filename))
+    # construct a new dataframe which is a subset of the original
+    # also ensure that SCOREMARGIN is a number and create an integer gameID
+    # SCOREMARGIN is Home-Away score
+    # the foul was committed by PLAYER1 against PLAYER2
+    season_subset <- season %>%
+        dplyr::select(GAME_ID, EVENTNUM, HOMEDESCRIPTION, VISITORDESCRIPTION,
+                      SCORE, SCOREMARGIN, PCTIMESTRING, PERIOD) %>%
+        mutate(SCOREMARGIN = as.numeric(SCOREMARGIN)) %>%
+        mutate(GAME_ID_INT = as.integer(GAME_ID))
 
-# add a column specifying whether the foul was on home or visitor
-allfouls <- mutate(allfouls,
-                   FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
-                                        !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
+    # clear original season data from memory
+    rm(season)
 
-# save the derived "all fouls" dataframe
-write_csv(allfouls, 'data/foulsonly.csv')
+    # find the home team foul entries
+    homefouls <- grep("FOUL",
+                      x = season_subset$HOMEDESCRIPTION,
+                      value=FALSE)
+    # and the visitor fouls
+    visitorfouls <- grep("FOUL",
+                         x = season_subset$VISITORDESCRIPTION,
+                         value=FALSE)
+
+    # combine foul indices and get unique entries and create for only fouls
+    allfouls <- season_subset[unique(c(homefouls, visitorfouls)),]
+
+    # now go back and search for the game score.
+
+    # associate score margins with the fouls
+    scoremargins <- purrr::map2_dbl(.x = allfouls$GAME_ID_INT,
+                                    .y = allfouls$EVENTNUM,
+                                    ~find_score_margin(season_subset,
+                                                       gameid= .x,
+                                                       eventnum = .y,
+                                                       colname="SCOREMARGIN"))
+    # associate scores with the fouls
+    scores <- purrr::map2_chr(.x = allfouls$GAME_ID_INT,
+                              .y = allfouls$EVENTNUM,
+                              ~find_score_margin(season_subset,
+                                                 gameid = .x,
+                                                 eventnum = .y,
+                                                 colname ="SCORE"))
+
+    allfouls$SCOREMARGIN <- replace_na(scoremargins, 0)
+    allfouls$SCORE <- replace_na(scores, "0-0")
+
+    # corrected score margin is for consistently plotting the number of fouls
+    # when either the home or visiting team is ahead
+    allfouls <- mutate(allfouls,
+                       SCOREMARGIN_CORR = case_when(is.na(HOMEDESCRIPTION) ~ SCOREMARGIN,
+                                                    is.na(VISITORDESCRIPTION) ~ -1*SCOREMARGIN))
+    allfouls <- mutate(allfouls,
+                       TOTALSCORE = purrr::map_int(.x = allfouls$SCORE,
+                                                   ~sum_score(scorestr = .x)))
+
+    # add a column specifying whether the foul was on home or visitor
+    allfouls <- mutate(allfouls,
+                       FOULTEAM = case_when(!is.na(HOMEDESCRIPTION) ~ "HOME",
+                                            !is.na(VISITORDESCRIPTION) ~ "VISITOR"))
+
+    # save the derived "all fouls" dataframe
+    write_csv(allfouls, paste0('data/', paste0(seasonID, '-allfouls.csv')))
+}
+
+# load in all data files and create a single variable
+# with the filename as an ID column
+datafiles <- list.files("data", pattern="*_pbp.csv", full.names = FALSE)
 
+purrr::map_df(datafiles,
+              ~reduce_season(.x))