mccgr / edgar

Code to manage data related to SEC EDGAR
31 stars 15 forks source link

Multiple Reporting Owners? #41

Closed bdcallen closed 5 years ago

bdcallen commented 5 years ago

@iangow If one looks at this filing, one can see that some filings have multiple reporting owners. I'm wondering what we should do with these cases. Should we make another table for these, or should we just make multiple rows for the headers associated with these filings?

bdcallen commented 5 years ago

@iangow So I have written a function in forms_345_xml_functions.R

# Extract the details of every `reportingOwner` node under `xml_root` into a
# data frame with one row per reporting owner.
#
# Arguments:
#   xml_root   Parsed XML node that is searched for `reportingOwner` children.
#   file_name  Value stored in the `file_name` column of every row.
#   document   Value stored in the `document` column of every row.
#
# Returns a data frame whose columns are exactly `rep_own_col_names` (in that
# order). `seq` is the 1-based position of the owner within the filing; the
# four `is*` columns hold whatever `string_to_boolean` returns for each value
# (presumably logical -- that helper is defined elsewhere); every other column
# is character, with blank/whitespace-only strings replaced by NA. When the
# filing has no `reportingOwner` node a zero-row frame with the same columns
# is returned.
get_rep_owner_details_df <- function(xml_root, file_name, document) {

    rep_own_col_names <- c('file_name', 'document', 'seq', 'rptOwnerCik', 'rptOwnerCcc', 'rptOwnerName', 'rptOwnerStreet1',
                           'rptOwnerStreet2', 'rptOwnerCity', 'rptOwnerState', 'rptOwnerZipCode',
                           'rptOwnerStateDescription', 'rptOwnerGoodAddress', 'isDirector', 'isOfficer',
                           'isTenPercentOwner', 'isOther', 'officerTitle', 'otherText')

    rep_owner_nodes <- getNodeSet(xml_root, 'reportingOwner')

    if (length(rep_owner_nodes) == 0) {

        # No reporting owners: hand back an empty frame with the full column set.
        df_empty <- data.frame(matrix(nrow = 0, ncol = length(rep_own_col_names)), stringsAsFactors = FALSE)
        colnames(df_empty) <- rep_own_col_names

        return(df_empty)

    }

    # Build one single-row frame per owner (id + address + relationship fields
    # side by side) and stack them in one call. This avoids appending inside a
    # loop and avoids seeding bind_rows() with an empty all-logical frame,
    # which newer dplyr versions may refuse to combine with character columns.
    owner_parts <- lapply(rep_owner_nodes, function(node) {
        bind_cols(
            xmlToDataFrame(getNodeSet(node, 'reportingOwnerId'), stringsAsFactors = FALSE),
            xmlToDataFrame(getNodeSet(node, 'reportingOwnerAddress'), stringsAsFactors = FALSE),
            xmlToDataFrame(getNodeSet(node, 'reportingOwnerRelationship'), stringsAsFactors = FALSE)
        )
    })

    df_rep_owner <- as.data.frame(bind_rows(owner_parts), stringsAsFactors = FALSE)

    df_rep_owner$file_name <- file_name
    df_rep_owner$document <- document
    df_rep_owner$seq <- seq_len(nrow(df_rep_owner))

    # Fields that no owner in this filing supplied still need a column.
    for (column in setdiff(rep_own_col_names, colnames(df_rep_owner))) {
        df_rep_owner[[column]] <- NA_character_
    }

    # Normalise every non-sequence column to character and turn strings that
    # are empty or whitespace-only into NA.
    text_cols <- setdiff(rep_own_col_names, 'seq')
    df_rep_owner[text_cols] <- lapply(df_rep_owner[text_cols], function(values) {
        values <- as.character(values)
        values[grepl("^[ \t\n\r]*$", values)] <- NA
        values
    })

    # Relationship flags are converted element by element with the project
    # helper string_to_boolean.
    logical_cols <- c('isDirector', 'isOfficer', 'isTenPercentOwner', 'isOther')

    for (column in logical_cols) {
        df_rep_owner[[column]] <- do.call("c", lapply(df_rep_owner[[column]], string_to_boolean))
    }

    # Drop any unexpected extra fields and fix the column order.
    df_rep_owner[, rep_own_col_names, drop = FALSE]

}

which extracts the reporting owner details from the xml documents, and puts them into dataframes. I have also adjusted process_345_filing

# Parse one Form 3/4/5 XML filing and append its contents to the edgar.forms345_*
# tables, reporting which steps succeeded.
#
# Arguments:
#   file_name  Filing identifier passed through to every extraction helper.
#   document   Document name passed through to every extraction helper.
#   form_type  Form type; forwarded to the two transaction-table helpers and
#              recorded in the result.
#
# Returns a one-row data frame with `file_name`, `document`, `form_type`, a
# `got_*` flag per extraction step (TRUE when the helper ran without error) and
# a `wrote_*` flag per table (TRUE when the extraction succeeded and the write
# step finished without error, including the case where there were no rows to
# write). Errors in individual steps are reported via message() and never
# abort the whole filing.
#
# Side effects: opens a PostgreSQL connection via dbConnect(PostgreSQL()) and
# closes it on exit, even if the function stops early.
process_345_filing <- function(file_name, document, form_type) {

    pg <- dbConnect(PostgreSQL())
    on.exit(dbDisconnect(pg), add = TRUE)

    # Evaluate `expr`, capturing success/failure instead of propagating errors.
    attempt <- function(expr) {
        tryCatch(
            list(ok = TRUE, value = expr),
            error = function(e) {
                message("Error processing ", file_name, ": ", conditionMessage(e))
                list(ok = FALSE, value = NULL)
            }
        )
    }

    xml <- attempt(get_xml_root(file_name, document))

    # Run one extraction helper on the parsed XML; nothing can be extracted
    # when the XML itself could not be read.
    extract <- function(getter, ...) {
        if (!xml$ok) {
            return(list(ok = FALSE, value = NULL))
        }
        attempt(getter(xml$value, file_name, document, ...))
    }

    # Append an extracted data frame to edgar.<table_name>; returns the
    # `wrote_*` flag for that table.
    write_table <- function(step, table_name) {
        if (!step$ok) {
            return(FALSE)
        }
        tryCatch({
            if (nrow(step$value) > 0) {
                dbWriteTable(pg, c("edgar", table_name), step$value, append = TRUE, row.names = FALSE)
            }
            TRUE
        }, error = function(e) {
            message("Error writing ", table_name, " for ", file_name, ": ", conditionMessage(e))
            FALSE
        })
    }

    header <- extract(get_header)
    rep_own <- extract(get_rep_owner_details_df)
    table1 <- extract(get_nonDerivative_df, form_type)
    table2 <- extract(get_derivative_df, form_type)
    footnotes <- extract(get_footnotes)
    footnote_indices <- extract(get_full_footnote_indices)
    signatures <- extract(get_signature_df)

    wrote_header <- write_table(header, "forms345_header")
    # The reporting-owner table must receive the reporting-owner rows (the
    # earlier version wrote the header frame here by mistake).
    wrote_rep_own <- write_table(rep_own, "forms345_reporting_owners")
    wrote_table1 <- write_table(table1, "forms345_table1")
    wrote_table2 <- write_table(table2, "forms345_table2")
    wrote_footnotes <- write_table(footnotes, "forms345_footnotes")
    wrote_footnote_indices <- write_table(footnote_indices, "forms345_footnote_indices")
    wrote_signatures <- write_table(signatures, "forms345_signatures")

    data.frame(file_name = file_name, document = document, form_type = form_type, got_xml = xml$ok,
               got_header = header$ok, got_rep_own = rep_own$ok, got_table1 = table1$ok,
               got_table2 = table2$ok, got_footnotes = footnotes$ok, got_footnote_indices = footnote_indices$ok,
               got_signatures = signatures$ok, wrote_header = wrote_header, wrote_rep_own = wrote_rep_own,
               wrote_table1 = wrote_table1, wrote_table2 = wrote_table2, wrote_footnotes = wrote_footnotes,
               wrote_footnote_indices = wrote_footnote_indices, wrote_signatures = wrote_signatures,
               stringsAsFactors = FALSE)

}

so that the reporting owner details are written to a new table edgar.forms345_reporting_owners.

I thus think we can probably close this issue.

iangow commented 5 years ago

@bdcallen I try to avoid pre-Tidyverse functions such as data.frame precisely because you need nonsense like stringsAsFactors. There’s usually a Tidyverse variant that does a better job. Also lapply with a small function often produces cleaner (and faster) code than for-loops.

Sent with GitHawk

iangow commented 5 years ago

@bdcallen

Where are we on all this Form 3/4/5 stuff? I think it would be very helpful to document this suite of code files, tables, etc, in a separate forms345.md file that is linked to from the readme.md file.

bdcallen commented 5 years ago

@iangow Agree about making forms345.md, as I said in a previous email.

This issue can be closed now. My code handles the issue raised here by putting the information for all reportingOwner nodes into the table edgar.forms345_reporting_owners, so filings with an arbitrary number of reporting owners are handled.