iangow / notes

1 stars 0 forks source link

Check out DERA code #5

Open iangow opened 2 hours ago

iangow commented 2 hours ago
library(tidyverse)
library(farr)
library(DBI)

load_parquet <- function (conn, table, schema = "", 
                          data_dir = Sys.getenv("DATA_DIR")) {
  file_path <- file.path(data_dir, schema, paste0(table, ".parquet"))
  df_sql <- paste0("SELECT * FROM read_parquet('", file_path, 
                   "', filename = true)")
  dplyr::tbl(conn, dplyr::sql(df_sql))
}

db <- dbConnect(duckdb::duckdb())

sub <- load_parquet(db, "sub_*", "dera")
tag <- load_parquet(db, "tag_*", "dera")
num <- load_parquet(db, "num_*", "dera")

common_tags <- 
  sub |>
  left_join(num, by = "adsh") |>
  filter(is.na(coreg),
         form %in% c("10-K", "10-K/A"),
         str_detect(version, "us-gaap")) |>
  inner_join(tag, by = join_by(tag, version)) |>
  mutate(file_year = as.integer(regexp_extract(filename, "[0-9]{4}"))) |>
  filter(between(file_year, 2012, 2018)) |>
  distinct(file_year, tag) |>
  group_by(tag) |>
  summarize(num_years = n(), .groups = "drop") |>
  filter(num_years == max(num_years, na.rm = TRUE)) |>
  compute() |>
  system_time()

sub_num |> count(version)

sub_num <- sub_num[sub_num$form == "10-K" | sub_num$form == "10-K/A",]
sub_num <- sub_num[grepl("us-gaap", sub_num$version),]
sub_num_tag <- merge(sub_num, tag[,c("tag","datatype")], all = F)
iangow commented 2 hours ago

Also this ...


for (year in c(2012:2018)) {
  dir <- "/XBRL/sub/"
  sub <- do.call(rbind,
                 lapply(list.files(paste(dir,year,sep=""),full.names=T),
                        read.table,
                        sep="\t",
                        header=T,
                        fill=T,
                        quote=""))
  dir <- "/XBRL/num/"
  num <- do.call(rbind,
                 lapply(list.files(paste(dir,year,sep=""),full.names=T),
                        read.table,
                        sep="\t",
                        header=T,
                        fill=T,
                        quote=""))
  dir <- "/XBRL/tag/"
  tag <- do.call(rbind,
                 lapply(list.files(paste(dir,year,sep=""),full.names=T),
                        read.table,
                        sep="\t",
                        header=T,
                        fill=T,
                        quote=""))
  sub_num <- merge(sub, num, by = "adsh", all.x = TRUE, all.y = FALSE)
  sub_num <- sub_num[sub_num$coreg == "",]
  sub_num <- sub_num[sub_num$form == "10-K" | sub_num$form == "10-K/A",]
  sub_num <- sub_num[grepl("us-gaap", sub_num$version),]
  sub_num_tag <- merge(sub_num, tag[,c("tag","datatype")], all = F)
  if (year==2012) {
    common_tax <- unique(sub_num_tag$tag)
  } else {
    common_tax <- common_tax[which(common_tax %in%
                                     unique(sub_num_tag$tag))]
  }
}
write.table(common_tax, "common_standard_tags.csv", sep = ",", col.names =
              F, row.names = F)