Check out DERA code - Githubissues

library(tidyverse)
library(farr)
library(DBI)

load_parquet <- function (conn, table, schema = "", 
                          data_dir = Sys.getenv("DATA_DIR")) {
  file_path <- file.path(data_dir, schema, paste0(table, ".parquet"))
  df_sql <- paste0("SELECT * FROM read_parquet('", file_path, 
                   "', filename = true)")
  dplyr::tbl(conn, dplyr::sql(df_sql))
}

db <- dbConnect(duckdb::duckdb())

sub <- load_parquet(db, "sub_*", "dera")
tag <- load_parquet(db, "tag_*", "dera")
num <- load_parquet(db, "num_*", "dera")

common_tags <- 
  sub |>
  left_join(num, by = "adsh") |>
  filter(is.na(coreg),
         form %in% c("10-K", "10-K/A"),
         str_detect(version, "us-gaap")) |>
  inner_join(tag, by = join_by(tag, version)) |>
  mutate(file_year = as.integer(regexp_extract(filename, "[0-9]{4}"))) |>
  filter(between(file_year, 2012, 2018)) |>
  distinct(file_year, tag) |>
  group_by(tag) |>
  summarize(num_years = n(), .groups = "drop") |>
  filter(num_years == max(num_years, na.rm = TRUE)) |>
  compute() |>
  system_time()

sub_num |> count(version)

sub_num <- sub_num[sub_num$form == "10-K" | sub_num$form == "10-K/A",]
sub_num <- sub_num[grepl("us-gaap", sub_num$version),]
sub_num_tag <- merge(sub_num, tag[,c("tag","datatype")], all = F)

for (year in c(2012:2018)) { dir <- "/XBRL/sub/" sub <- do.call(rbind, lapply(list.files(paste(dir,year,sep=""),full.names=T), read.table, sep="\t", header=T, fill=T, quote="")) dir <- "/XBRL/num/" num <- do.call(rbind, lapply(list.files(paste(dir,year,sep=""),full.names=T), read.table, sep="\t", header=T, fill=T, quote="")) dir <- "/XBRL/tag/" tag <- do.call(rbind, lapply(list.files(paste(dir,year,sep=""),full.names=T), read.table, sep="\t", header=T, fill=T, quote="")) sub_num <- merge(sub, num, by = "adsh", all.x = TRUE, all.y = FALSE) sub_num <- sub_num[sub_num$coreg == "",] sub_num <- sub_num[sub_num$form == "10-K" | sub_num$form == "10-K/A",] sub_num <- sub_num[grepl("us-gaap", sub_num$version),] sub_num_tag <- merge(sub_num, tag[,c("tag","datatype")], all = F) if (year==2012) { common_tax <- unique(sub_num_tag$tag) } else { common_tax <- common_tax[which(common_tax %in% unique(sub_num_tag$tag))] } } write.table(common_tax, "common_standard_tags.csv", sep = ",", col.names = F, row.names = F)

iangow / notes

Check out DERA code #5