mccgr / edgar

Code to manage data related to SEC EDGAR
31 stars 15 forks source link

Examine cases where filings have been deleted #78

Closed iangow closed 4 years ago

iangow commented 4 years ago

@bdcallen

I think we just want to estimate the magnitude of the problem at this stage (so I haven't assigned this to you). This issue was raised in #76. I'm surprised the rate of filings lost is as high as it is.

library(dplyr, warn.conflicts = FALSE)
library(DBI)

pg <- dbConnect(RPostgres::Postgres())

rs <- dbExecute(pg, "SET search_path TO edgar")

cusip_cik_old <- tbl(pg, "cusip_cik_old")
filings <- tbl(pg, "filings")

still_here <-
  filings %>%
  semi_join(cusip_cik_old, by="file_name") 

still_here %>%
  count(form_type)
#> # Source:   lazy query [?? x 2]
#> # Database: postgres [igow@10.101.13.99:5432/crsp]
#>   form_type n      
#>   <chr>     <int64>
#> 1 SC 13D    102532 
#> 2 SC 13D/A  213530 
#> 3 SC 13G    352023 
#> 4 SC 13G/A  631392 
#> 5 SC TO-I        3 
#> 6 SC TO-I/A      8 
#> 7 SC TO-T       17 
#> 8 SC TO-T/A    290 
#> 9 TH             1

n_still_here <-
  still_here %>%
  count() %>%
  pull()

# So form_type is overw

lost_filings <-
  cusip_cik_old %>% 
  anti_join(filings, by="file_name") %>%
  compute()

lost_filings %>%
  count()
#> # Source:   lazy query [?? x 1]
#> # Database: postgres [igow@10.101.13.99:5432/crsp]
#>   n      
#>   <int64>
#> 1 39980

n_lost <- 
  lost_filings %>%
  count() %>%
  pull()

sprintf("About %3.2f%% of filings have been lost!", 100*n_lost/(n_lost+n_still_here))
#> [1] "About 2.98% of filings have been lost!"

Created on 2020-04-13 by the reprex package (v0.3.0)

iangow commented 4 years ago

It seems many fewer filings are missing from other form_type values. I think it might certain form types that are getting deleted (it's hard to say, as we don't have the data on the ones deleted).

Though a lot depends on when we ran the code to update the tables and when the filings were deleted from the SEC index files.

library(dplyr, warn.conflicts = FALSE)
library(DBI)

pg <- dbConnect(RPostgres::Postgres())

rs <- dbExecute(pg, "SET search_path TO edgar")

accession_numbers <- tbl(pg, "accession_numbers")
filings <- tbl(pg, "filings")

still_here <-
  filings %>%
  semi_join(accession_numbers <- tbl(pg, "accession_numbers"), by="file_name") 

still_here %>%
  count(form_type)
#> # Source:   lazy query [?? x 2]
#> # Database: postgres [igow@10.101.13.99:5432/crsp]
#>    form_type n      
#>    <chr>     <int64>
#>  1 1             37 
#>  2 10-12B       666 
#>  3 10-12B/A    1547 
#>  4 10-12G      2413 
#>  5 10-12G/A    4336 
#>  6 10-C        1303 
#>  7 10-C/A        25 
#>  8 10-D       76247 
#>  9 10-D/A      2487 
#> 10 10-K      190730 
#> # … with more rows

n_still_here <-
  still_here %>%
  count() %>%
  pull()

# So form_type is overw

lost_filings <-
  accession_numbers %>% 
  anti_join(filings, by="file_name") %>%
  compute()

lost_filings %>%
  count()
#> # Source:   lazy query [?? x 1]
#> # Database: postgres [igow@10.101.13.99:5432/crsp]
#>   n      
#>   <int64>
#> 1 106046

n_lost <- 
  lost_filings %>%
  count() %>%
  pull()

sprintf("About %3.2f%% of filings have been lost!", 100*n_lost/(n_lost+n_still_here))
#> [1] "About 0.54% of filings have been lost!"

Created on 2020-04-13 by the reprex package (v0.3.0)

Interesting stats:

> filings %>% count(form_type) %>% arrange(desc(n))
# Source:     lazy query [?? x 2]
# Database:   postgres [igow@10.101.13.99:5432/crsp]
# Ordered by: desc(n)
   form_type n      
   <chr>     <int64>
 1 4         7257303
 2 8-K       1628494
 3 SC 13G/A   693342
 4 3          651407
 5 10-Q       591786
 6 497        439090
 7 6-K        408301
 8 SC 13G     375016
 9 424B2      307175
10 424B3      285454
# … with more rows
iangow commented 4 years ago

Let's focus the scope of this to that part relevant to #76. I think the solution for #76 is to semi_join against filings:

library(dplyr, warn.conflicts = FALSE)
library(DBI)

Sys.setenv(PGHOST = "10.101.13.99", PGDATABASE = "crsp")

pg <- dbConnect(RPostgres::Postgres())
rs <- dbExecute(pg, "SET search_path TO edgar")

cusip_cik_old <- tbl(pg, "cusip_cik_old")
filings <- tbl(pg, "filings")

cusip_cik_old %>% count()
#> # Source:   lazy query [?? x 1]
#> # Database: postgres [igow@10.101.13.99:5432/crsp]
#>   n      
#>   <int64>
#> 1 1386850

cusip_cik_old %>%
    semi_join(filings, by="file_name") %>%
    count()
#> # Source:   lazy query [?? x 1]
#> # Database: postgres [igow@10.101.13.99:5432/crsp]
#>   n      
#>   <int64>
#> 1 1346870

Created on 2020-04-22 by the reprex package (v0.3.0)