Updating Missing Ids file with 2022 and 2023 rookie data

mcarman8 commented 12 months ago

Thanks to @jak3sch for the help.

This update updates the ids for 2022 and 2023 rookies that were not properly being identified by the missing_ids.csv file. All data was already covered in the nflverse ecosystem, just had not made it into the player_id repository.

mcarman8 commented 12 months ago

For posterity's sake, here is the code used to create the update that was inserted to the end of the file:

library(nflreadr)
library(tidyverse)
library(dplyr)

#Get all rookies from 2022 and 2023 with MFL_ids
ff_playerids <- nflreadr::load_ff_playerids() %>%
  dplyr::filter(draft_year >= 2022)

#Get all Rookies from 2022 and their gsis_ids
rosters_weekly_2022 <- nflreadr::load_rosters_weekly(seasons = 2022) %>%
  dplyr::filter(rookie_year >= 2022) %>%
  dplyr::distinct(gsis_id, .keep_all = TRUE)

#Get all rookies from 2023 and their gsis_ids
rosters_weekly_2023 <- nflreadr::load_rosters_weekly(seasons = 2023) %>%
  dplyr::filter(rookie_year >= 2022) %>%
  dplyr::distinct(gsis_id, .keep_all = TRUE)

#Get pfr_ids out of pfr dataset
pfr_ids <- nflreadr::load_snap_counts(seasons = 2022:2023) %>%
  dplyr::mutate(player = nflreadr::clean_player_names(player)) %>%
  dplyr::group_by(pfr_player_id, position) %>%
  dplyr::arrange(season, week) %>%
  dplyr::filter(dplyr::row_number() == 1) %>%
  dplyr::filter(
    # filter some ids manually
    !pfr_player_id %in% c(
      "BrowMi0a", # executive
      "JohnBr20", # LB drafted in 2006
      "AlexMa00" # DB drafted in 2014
    )
  ) %>%
  dplyr::mutate(sync_helper = paste0(season, position, gsub(" ", "", player))) %>%
  dplyr::ungroup() %>%
  dplyr::select(player, pfr_player_id, position, sync_helper) %>%
  dplyr::distinct()

gsis_to_sync <- nflreadr::load_rosters_weekly(2022:2023) %>%
#  dplyr::filter(is.na(draft_number)) %>%
  dplyr::mutate(player = nflreadr::clean_player_names(full_name)) %>%
  dplyr::group_by(gsis_id, depth_chart_position) %>%
  dplyr::arrange(season, week) %>%
  dplyr::filter(dplyr::row_number() == 1) %>%
  dplyr::mutate(sync_helper = paste0(season, depth_chart_position, gsub(" ", "", player))) %>%
  dplyr::ungroup() %>%
  dplyr::select(player, gsis_id, sync_helper) %>%
  dplyr::filter(!is.na(gsis_id))

combined_ids <- gsis_to_sync %>%
  dplyr::left_join(
    pfr_ids %>%
      dplyr::select(pfr_player_id, sync_helper),
    by = "sync_helper",
    relationship = "many-to-many"
  ) %>%
  dplyr::filter(!is.na(pfr_player_id))

gsis_to_pfr_ids <- combined_ids %>%
  dplyr::select(gsis_id,pfr_player_id) %>%
  dplyr::rename(pfr_id = pfr_player_id)  %>%
  dplyr::distinct()

# check for duplicates
test <- combined_ids %>%
  dplyr::group_by(sync_helper) %>%
  dplyr::summarise(count = dplyr::n())

#join to fill columns
joined <- ff_playerids %>%
  dplyr::left_join(rosters_weekly_2022, by = join_by(name==full_name, position==position)) %>%
  dplyr::left_join(rosters_weekly_2023, by = join_by(name==full_name, position==position)) 

#This produces 0 results, so we know we didn't get any gsis_ids that were wrong, everything either matches or fills in data
check_joined <- joined %>%
  dplyr::filter(gsis_id.x != gsis_id.y)

output_to_csv <- joined %>%
  dplyr::mutate(
    gsis_id = case_when(!is.na(gsis_id.x) ~ gsis_id.x,
                        !is.na(gsis_id.y) ~ gsis_id.y,
                        .default = gsis_id
                      ),
    sleeper_id = case_when(!is.na(sleeper_id.x) ~ sleeper_id.x,
                           !is.na(sleeper_id.y) ~ sleeper_id.y,
                           .default = sleeper_id),
    pff_id = case_when(!is.na(pff_id.x) ~ pff_id.x,
                       !is.na(pff_id.y) ~ pff_id.y,
                       .default = pff_id),
    espn_id = case_when(!is.na(espn_id.x) ~ espn_id.x,
                       !is.na(espn_id.y) ~ espn_id.y,
                       .default = espn_id),
    yahoo_id = case_when(!is.na(yahoo_id.x) ~ yahoo_id.x,
                       !is.na(yahoo_id.y) ~ yahoo_id.y,
                       .default = yahoo_id),
    ras_id = NA,
    otc_id = NA
  ) %>% 
  dplyr::filter(!is.na(sleeper_id) | !is.na(pfr_id) | !is.na(gsis_id) | !is.na(pff_id) | !is.na(yahoo_id) | !is.na(ras_id) | !is.na(otc_id)) %>%
  dplyr::select(mfl_id,sleeper_id,gsis_id,pff_id,espn_id,yahoo_id,ras_id,otc_id) %>%
  #This last part is to join in 
  dplyr::left_join(gsis_to_pfr_ids, by = join_by(gsis_id)) %>%
  dplyr::arrange(mfl_id) %>%
  dplyr::mutate(count_col = paste0((838+row_number()))) %>%
  #To reorder the columns
  dplyr::select(count_col,mfl_id,sleeper_id,pfr_id,gsis_id,pff_id,espn_id,yahoo_id,ras_id,otc_id)

write.csv(output_to_csv, "new_missing_ids.csv", row.names=FALSE)

tanho63 commented 12 months ago

Thanks!

Noticed a duplicate:

read.csv("https://github.com/mcarman8/data/raw/updating-missing-ids/files/missing_ids.csv") |> 
  dplyr::select(-X) |> 
  janitor::get_dupes(mfl_id)
#>   mfl_id dupe_count sleeper_id   pfr_id    gsis_id pff_id espn_id yahoo_id ras_id otc_id
#> 1  15928          2       8494 HousJa01 00-0037310     NA 4240608       NA     NA     NA
#> 2  15928          2       8494 HousJa01 00-0037310     NA 4240608       NA     NA     NA

mcarman8 commented 11 months ago

Fixed the duplicate. I didn't think to check, but James Houston changed listed position from LB in 2022 to DE in 2023, so that was why he appeared twice.

Edited the code used to generate the new file to represent this change.

tanho63 commented 11 months ago

Thanks!

dynastyprocess / data

Updating Missing Ids file with 2022 and 2023 rookie data #50