Closed mcarman8 closed 11 months ago
For posterity's sake, here is the code used to create the update that was inserted to the end of the file:
library(nflreadr)
library(tidyverse)
library(dplyr)
#Get all rookies from 2022 and 2023 with MFL_ids
ff_playerids <- nflreadr::load_ff_playerids() %>%
dplyr::filter(draft_year >= 2022)
#Get all Rookies from 2022 and their gsis_ids
rosters_weekly_2022 <- nflreadr::load_rosters_weekly(seasons = 2022) %>%
dplyr::filter(rookie_year >= 2022) %>%
dplyr::distinct(gsis_id, .keep_all = TRUE)
#Get all rookies from 2023 and their gsis_ids
rosters_weekly_2023 <- nflreadr::load_rosters_weekly(seasons = 2023) %>%
dplyr::filter(rookie_year >= 2022) %>%
dplyr::distinct(gsis_id, .keep_all = TRUE)
#Get pfr_ids out of pfr dataset
pfr_ids <- nflreadr::load_snap_counts(seasons = 2022:2023) %>%
dplyr::mutate(player = nflreadr::clean_player_names(player)) %>%
dplyr::group_by(pfr_player_id, position) %>%
dplyr::arrange(season, week) %>%
dplyr::filter(dplyr::row_number() == 1) %>%
dplyr::filter(
# filter some ids manually
!pfr_player_id %in% c(
"BrowMi0a", # executive
"JohnBr20", # LB drafted in 2006
"AlexMa00" # DB drafted in 2014
)
) %>%
dplyr::mutate(sync_helper = paste0(season, position, gsub(" ", "", player))) %>%
dplyr::ungroup() %>%
dplyr::select(player, pfr_player_id, position, sync_helper) %>%
dplyr::distinct()
gsis_to_sync <- nflreadr::load_rosters_weekly(2022:2023) %>%
# dplyr::filter(is.na(draft_number)) %>%
dplyr::mutate(player = nflreadr::clean_player_names(full_name)) %>%
dplyr::group_by(gsis_id, depth_chart_position) %>%
dplyr::arrange(season, week) %>%
dplyr::filter(dplyr::row_number() == 1) %>%
dplyr::mutate(sync_helper = paste0(season, depth_chart_position, gsub(" ", "", player))) %>%
dplyr::ungroup() %>%
dplyr::select(player, gsis_id, sync_helper) %>%
dplyr::filter(!is.na(gsis_id))
combined_ids <- gsis_to_sync %>%
dplyr::left_join(
pfr_ids %>%
dplyr::select(pfr_player_id, sync_helper),
by = "sync_helper",
relationship = "many-to-many"
) %>%
dplyr::filter(!is.na(pfr_player_id))
gsis_to_pfr_ids <- combined_ids %>%
dplyr::select(gsis_id,pfr_player_id) %>%
dplyr::rename(pfr_id = pfr_player_id) %>%
dplyr::distinct()
# check for duplicates
test <- combined_ids %>%
dplyr::group_by(sync_helper) %>%
dplyr::summarise(count = dplyr::n())
#join to fill columns
joined <- ff_playerids %>%
dplyr::left_join(rosters_weekly_2022, by = join_by(name==full_name, position==position)) %>%
dplyr::left_join(rosters_weekly_2023, by = join_by(name==full_name, position==position))
#This produces 0 results, so we know we didn't get any gsis_ids that were wrong, everything either matches or fills in data
check_joined <- joined %>%
dplyr::filter(gsis_id.x != gsis_id.y)
output_to_csv <- joined %>%
dplyr::mutate(
gsis_id = case_when(!is.na(gsis_id.x) ~ gsis_id.x,
!is.na(gsis_id.y) ~ gsis_id.y,
.default = gsis_id
),
sleeper_id = case_when(!is.na(sleeper_id.x) ~ sleeper_id.x,
!is.na(sleeper_id.y) ~ sleeper_id.y,
.default = sleeper_id),
pff_id = case_when(!is.na(pff_id.x) ~ pff_id.x,
!is.na(pff_id.y) ~ pff_id.y,
.default = pff_id),
espn_id = case_when(!is.na(espn_id.x) ~ espn_id.x,
!is.na(espn_id.y) ~ espn_id.y,
.default = espn_id),
yahoo_id = case_when(!is.na(yahoo_id.x) ~ yahoo_id.x,
!is.na(yahoo_id.y) ~ yahoo_id.y,
.default = yahoo_id),
ras_id = NA,
otc_id = NA
) %>%
dplyr::filter(!is.na(sleeper_id) | !is.na(pfr_id) | !is.na(gsis_id) | !is.na(pff_id) | !is.na(yahoo_id) | !is.na(ras_id) | !is.na(otc_id)) %>%
dplyr::select(mfl_id,sleeper_id,gsis_id,pff_id,espn_id,yahoo_id,ras_id,otc_id) %>%
#This last part is to join in
dplyr::left_join(gsis_to_pfr_ids, by = join_by(gsis_id)) %>%
dplyr::arrange(mfl_id) %>%
dplyr::mutate(count_col = paste0((838+row_number()))) %>%
#To reorder the columns
dplyr::select(count_col,mfl_id,sleeper_id,pfr_id,gsis_id,pff_id,espn_id,yahoo_id,ras_id,otc_id)
write.csv(output_to_csv, "new_missing_ids.csv", row.names=FALSE)
Thanks!
Noticed a duplicate:
read.csv("https://github.com/mcarman8/data/raw/updating-missing-ids/files/missing_ids.csv") |>
dplyr::select(-X) |>
janitor::get_dupes(mfl_id)
#> mfl_id dupe_count sleeper_id pfr_id gsis_id pff_id espn_id yahoo_id ras_id otc_id
#> 1 15928 2 8494 HousJa01 00-0037310 NA 4240608 NA NA NA
#> 2 15928 2 8494 HousJa01 00-0037310 NA 4240608 NA NA NA
Fixed the duplicate. I didn't think to check, but James Houston changed listed position from LB in 2022 to DE in 2023, so that was why he appeared twice.
Edited the code used to generate the new file to represent this change.
Thanks!
Thanks to @jak3sch for the help.
This update updates the ids for 2022 and 2023 rookies that were not properly being identified by the missing_ids.csv file. All data was already covered in the nflverse ecosystem, just had not made it into the player_id repository.