Closed iangow closed 4 years ago
@iangow
library(dplyr, warn.conflicts = FALSE)
library(DBI)
library(stringdist)
pg <- dbConnect(RPostgres::Postgres())
cusip_cik <- tbl(pg, sql("SELECT * FROM edgar.cusip_cik_test"))
bad_cusips <-
cusip_cik %>%
filter(nchar(cusip)==9L) %>%
mutate(bad_cusip = case_when(! right(cusip, 1L) %~% '[0-9]' ~ TRUE,
as.integer(right(cusip, 1L)) != check_digit ~ TRUE,
TRUE ~ FALSE)) %>%
filter(bad_cusip) %>%
select(-bad_cusip) %>%
compute()
bad_cusips %>%
select(cusip, check_digit, cik, file_name)
#> # Source: lazy query [?? x 4]
#> # Database: postgres [bdcallen@/var/run/postgresql:5432/crsp]
#> cusip check_digit cik file_name
#> <chr> <int> <int> <chr>
#> 1 384632100 5 709136 edgar/data/709136/0000891554-99-001283.txt
#> 2 635838015 4 922868 edgar/data/936294/0000930661-99-002192.txt
#> 3 532187107 1 28626 edgar/data/1014478/0000950115-99-001231.txt
#> 4 644461073 1 1012616 edgar/data/1012616/0000944209-99-001640.txt
#> 5 1011B1017 0 916802 edgar/data/916802/0001068800-99-000405.txt
#> 6 000628811 8 859621 edgar/data/1060207/0000950168-99-003002.txt
#> 7 30731H108 5 1064015 edgar/data/1000619/0000950116-99-002344.txt
#> 8 209432107 3 880323 edgar/data/1080910/0000903423-99-000448.txt
#> 9 747461100 1 790228 edgar/data/790228/0001038838-99-000228.txt
#> 10 01300H105 7 874761 edgar/data/903454/0001005150-00-000215.txt
#> # … with more rows
cusip_cik <- cusip_cik %>% collect()
bad_cusips <- bad_cusips %>% collect()
bad_cusip_cik_dist <- bad_cusips %>% distinct(file_name, cusip, cik)
bad_cusip_cik_dist %>% rename(bad_cusip = cusip) %>% inner_join(cusip_cik) %>%
filter(nchar(cusip) == 9 & substr(cusip, 9, 9) == as.character(check_digit)) %>%
select(file_name, cik, bad_cusip, cusip, check_digit)
#> Joining, by = c("file_name", "cik")
#> # A tibble: 0 x 5
#> # … with 5 variables: file_name <chr>, cik <int>, bad_cusip <chr>, cusip <chr>,
#> # check_digit <int>
bad_v_good <- bad_cusips %>% distinct(cik, cusip) %>% rename(bad_cusip = cusip) %>%
inner_join(cusip_cik) %>% filter(nchar(cusip) == 9 & substr(cusip, 9, 9) == as.character(check_digit)) %>%
distinct(cik, bad_cusip, cusip, check_digit)
#> Joining, by = "cik"
bad_cusips %>% distinct(cik, cusip) %>% count()
#> # A tibble: 1 x 1
#> n
#> <int>
#> 1 3106
bad_v_good %>% distinct(cik, bad_cusip) %>% count()
#> # A tibble: 1 x 1
#> n
#> <int>
#> 1 2883
bad_v_good$cusip_dist_dl <- stringdist(bad_v_good$bad_cusip, bad_v_good$cusip, method = 'dl')
bad_v_good %>% filter(cusip_dist_dl == 1)
#> # A tibble: 1,300 x 5
#> bad_cusip cik cusip check_digit cusip_dist_dl
#> <chr> <int> <chr> <int> <dbl>
#> 1 384632100 709136 384632105 5 1
#> 2 635838015 922868 635838105 5 1
#> 3 532187107 28626 532187101 1 1
#> 4 209432107 880323 204932107 7 1
#> 5 747461100 790228 847461100 0 1
#> 6 878553108 1059920 878553106 6 1
#> 7 G3291A100 1061322 G3921A100 0 1
#> 8 530129C05 1085776 530129105 5 1
#> 9 103188106 1073349 101388106 6 1
#> 10 698148104 1022080 598148104 4 1
#> # … with 1,290 more rows
one_typo <- bad_v_good %>% filter(cusip_dist_dl == 1) %>% distinct(cik, bad_cusip)
one_typo %>% distinct(bad_cusip, cik) %>% count()
#> # A tibble: 1 x 1
#> n
#> <int>
#> 1 1261
bad_v_good %>% anti_join(one_typo) %>% filter(cusip_dist_dl == 2)
#> Joining, by = c("bad_cusip", "cik")
#> # A tibble: 669 x 5
#> bad_cusip cik cusip check_digit cusip_dist_dl
#> <chr> <int> <chr> <int> <dbl>
#> 1 644461073 1012616 064446107 7 2
#> 2 1011B1017 916802 10011B101 1 2
#> 3 30731H108 1064015 03073H108 8 2
#> 4 01300H105 874761 00130H105 5 2
#> 5 140501110 17313 140501107 7 2
#> 6 777171019 1025558 577717101 1 2
#> 7 747456P10 917126 74756P105 5 2
#> 8 029014010 1003265 290140102 2 2
#> 9 041651021 919010 404165102 2 2
#> 10 29425910G 879354 294259403 3 2
#> # … with 659 more rows
two_typo <- bad_v_good %>% anti_join(one_typo) %>% filter(cusip_dist_dl == 2) %>% distinct(cik, bad_cusip)
#> Joining, by = c("bad_cusip", "cik")
two_typo %>% distinct(bad_cusip, cik) %>% count()
#> # A tibble: 1 x 1
#> n
#> <int>
#> 1 661
Created on 2020-07-02 by the reprex package (v0.3.0)
So in the above, I have had a closer look at the bad 9-digit cusips, mostly with regard to joining these cases by cik to other cusips with the same cik, to compare them to the good 9-digit cusips. As you can see with the join I did for bad_cusip_cik_dist
, there were no matches when I did the joining by file_name as well, which I found surprising (perhaps I have made a mistake somewhere). I then did a join by only cik, meaning I ended up comparing bad cusips to cusips mapped to the same cik across all filings for that cik.
You can see from my reprex that I calculated, in the column cusip_dist_dl
, an edit distance between the bad 9-digit cusip and the good 9-digit cusip, a distance called the Damerau–Levenshtein distance, which is closely related to the Levenshtein, using the stringdist
package (the Levenshtein distance can usually just be calculated using adist
, out of interest. I have been experimenting a lot with the functions lately). The former differs from the latter in that it includes transposition of two characters, as well as deletions, insertions and character substitutions (so for instance, 635838015
and 635838105
have a DL distance of 1). I then looked at the cases which had a DL distance of 1 (in one_typo
, and there were 1261 distinct (bad_)cusip, cik pairs. Then I anti_joined these cases on bad_v_good
and had a look at the remaining cases for which the DL distance was 2, and there were a further 661 cases here. So around 60% of cases are within 1 or 2 typos from a good 9-digit cusip which already exists in the dataset.
Also, in the comparison of the distinct pairs in bad_cusips
versus bad_v_good
, the former had 3106 pairs, and the latter 2883. One can infer from this that there are 223 bad 9-digit cusips which do not map to any good 9-digit cusips from any filing for the given cik.
@bdcallen As I've said before, you need to synthesize the information and put your conclusion at the top.
The question was: What's going on here? Are these bad in the original filings?
So around 60% of cases are within 1 or 2 typos from a good 9-digit cusip which already exists in the dataset.
So, I think the answer to the second question is "yes". If so, we should eliminate these CUSIPs in the Python code. For now, simply eliminate them from cusip_cik_test
(see #93, which is the issue that committed code should be linked to).
What's going on here? Are these bad in the original filings?
Created on 2020-04-22 by the reprex package (v0.3.0)