Closed bdcallen closed 5 years ago
@iangow So I have written a function in forms_345_xml_functions.R
#' Extract reporting-owner details from a Form 3/4/5 XML document
#'
#' Builds one row per `reportingOwner` node found under `xml_root`, combining
#' the fields of its `reportingOwnerId`, `reportingOwnerAddress` and
#' `reportingOwnerRelationship` children.
#'
#' @param xml_root Node of the parsed XML document that is searched for
#'   `reportingOwner` children.
#' @param file_name Value stored in the `file_name` column of every row.
#' @param document Value stored in the `document` column of every row.
#' @return A data frame with the 19 columns listed in `rep_own_col_names`,
#'   in that order. `seq` is the 1-based position of the owner within the
#'   document; `isDirector`, `isOfficer`, `isTenPercentOwner` and `isOther`
#'   hold whatever `string_to_boolean()` returns; all remaining columns are
#'   character, with blank/whitespace-only values replaced by `NA`. Fields
#'   absent from the XML are `NA`. A zero-row data frame is returned when
#'   there are no `reportingOwner` nodes.
get_rep_owner_details_df <- function(xml_root, file_name, document) {
  rep_own_col_names <- c(
    "file_name", "document", "seq", "rptOwnerCik", "rptOwnerCcc",
    "rptOwnerName", "rptOwnerStreet1", "rptOwnerStreet2", "rptOwnerCity",
    "rptOwnerState", "rptOwnerZipCode", "rptOwnerStateDescription",
    "rptOwnerGoodAddress", "isDirector", "isOfficer", "isTenPercentOwner",
    "isOther", "officerTitle", "otherText"
  )

  # Zero-row template: guarantees every expected column exists in the result
  # even when a filing omits some of the optional fields.
  template <- data.frame(
    matrix(nrow = 0, ncol = length(rep_own_col_names)),
    stringsAsFactors = FALSE
  )
  colnames(template) <- rep_own_col_names

  rep_owner_nodes <- getNodeSet(xml_root, "reportingOwner")
  if (length(rep_owner_nodes) == 0) {
    return(template)
  }

  # Build one single-owner data frame per node, then bind them all at once
  # instead of growing the result inside a loop.
  owner_rows <- lapply(rep_owner_nodes, function(node) {
    owner_id <- xmlToDataFrame(
      getNodeSet(node, "reportingOwnerId"),
      stringsAsFactors = FALSE
    )
    owner_address <- xmlToDataFrame(
      getNodeSet(node, "reportingOwnerAddress"),
      stringsAsFactors = FALSE
    )
    owner_relationship <- xmlToDataFrame(
      getNodeSet(node, "reportingOwnerRelationship"),
      stringsAsFactors = FALSE
    )
    bind_cols(owner_id, bind_cols(owner_address, owner_relationship))
  })
  df_rep_owner <- bind_rows(c(list(template), owner_rows))

  df_rep_owner$file_name <- file_name
  df_rep_owner$document <- document
  # Owners are numbered in document order.
  df_rep_owner$seq <- seq_len(nrow(df_rep_owner))

  # Normalise every non-sequence column to character and treat empty or
  # whitespace-only strings as missing.
  text_cols <- setdiff(colnames(df_rep_owner), "seq")
  df_rep_owner[text_cols] <- lapply(df_rep_owner[text_cols], function(values) {
    values <- as.character(values)
    values[grepl("^[ \t\n\r]*$", values)] <- NA
    values
  })

  # string_to_boolean() is defined elsewhere; it is applied element by
  # element, exactly as before, because it is not known to be vectorised.
  logical_cols <- c("isDirector", "isOfficer", "isTenPercentOwner", "isOther")
  for (column in logical_cols) {
    df_rep_owner[[column]] <- do.call(
      "c",
      lapply(df_rep_owner[[column]], string_to_boolean)
    )
  }

  # Drop any unexpected extra fields and fix the column order.
  df_rep_owner[, rep_own_col_names]
}
which extracts the reporting owner details from the xml documents, and puts them into dataframes. I have also adjusted process_345_filing
#' Parse one Form 3/4/5 filing and append its contents to the edgar tables
#'
#' Reads the XML for `file_name`/`document`, extracts the header, reporting
#' owners, non-derivative table, derivative table, footnotes, footnote
#' indices and signatures, and appends each non-empty result to its
#' `edgar.forms345_*` table. Every stage is attempted independently, so a
#' failure in one stage does not prevent the others from running.
#'
#' @param file_name Filing identifier passed to the extraction helpers and
#'   recorded in the returned status row.
#' @param document Document identifier passed to the extraction helpers and
#'   recorded in the returned status row.
#' @param form_type Form type passed to the table extractors and recorded in
#'   the returned status row.
#' @return A one-row data frame with `file_name`, `document`, `form_type`,
#'   a `got_*` flag per extraction stage (`TRUE` when the stage ran without
#'   error) and a `wrote_*` flag per table (`TRUE` when extraction succeeded
#'   and the write stage raised no error; this includes the case where there
#'   were zero rows and nothing needed writing).
process_345_filing <- function(file_name, document, form_type) {
  pg <- dbConnect(PostgreSQL())
  # Release the connection on every exit path, including unexpected errors.
  on.exit(dbDisconnect(pg), add = TRUE)

  failed <- list(ok = FALSE, value = NULL)

  # Evaluate `expr`, reporting (not raising) any error. Success is tracked
  # separately from the value so a stage that legitimately returns NULL is
  # not mistaken for a failure.
  attempt <- function(expr) {
    tryCatch(
      list(ok = TRUE, value = expr),
      error = function(e) {
        message("Error processing ", file_name, ": ", conditionMessage(e))
        failed
      }
    )
  }

  xml <- attempt(get_xml_root(file_name, document))
  xml_root <- xml$value

  # Extraction needs the parsed XML; without it every stage counts as failed.
  extract <- function(expr) {
    if (!xml$ok) {
      return(failed)
    }
    attempt(expr)
  }

  header <- extract(get_header(xml_root, file_name, document))
  rep_own <- extract(
    get_rep_owner_details_df(xml_root, file_name, document)
  )
  table1 <- extract(
    get_nonDerivative_df(xml_root, file_name, document, form_type)
  )
  table2 <- extract(
    get_derivative_df(xml_root, file_name, document, form_type)
  )
  footnotes <- extract(get_footnotes(xml_root, file_name, document))
  footnote_indices <- extract(
    get_full_footnote_indices(xml_root, file_name, document)
  )
  signatures <- extract(get_signature_df(xml_root, file_name, document))

  # Append the rows of a successful extraction to edgar.<table_name>.
  # Returns FALSE when extraction failed or the write raised an error.
  write_rows <- function(result, table_name) {
    if (!result$ok) {
      return(FALSE)
    }
    attempt({
      if (nrow(result$value) > 0) {
        dbWriteTable(
          pg, c("edgar", table_name), result$value,
          append = TRUE, row.names = FALSE
        )
      }
      TRUE
    })$ok
  }

  wrote_header <- write_rows(header, "forms345_header")
  # Previously this stage wrote `header` into the reporting-owners table.
  wrote_rep_own <- write_rows(rep_own, "forms345_reporting_owners")
  wrote_table1 <- write_rows(table1, "forms345_table1")
  wrote_table2 <- write_rows(table2, "forms345_table2")
  wrote_footnotes <- write_rows(footnotes, "forms345_footnotes")
  wrote_footnote_indices <- write_rows(
    footnote_indices, "forms345_footnote_indices"
  )
  wrote_signatures <- write_rows(signatures, "forms345_signatures")

  data.frame(
    file_name = file_name,
    document = document,
    form_type = form_type,
    got_xml = xml$ok,
    got_header = header$ok,
    got_rep_own = rep_own$ok,
    got_table1 = table1$ok,
    got_table2 = table2$ok,
    got_footnotes = footnotes$ok,
    got_footnote_indices = footnote_indices$ok,
    got_signatures = signatures$ok,
    wrote_header = wrote_header,
    wrote_rep_own = wrote_rep_own,
    wrote_table1 = wrote_table1,
    wrote_table2 = wrote_table2,
    wrote_footnotes = wrote_footnotes,
    wrote_footnote_indices = wrote_footnote_indices,
    wrote_signatures = wrote_signatures,
    stringsAsFactors = FALSE
  )
}
so that the reporting owner details are written to a new table, edgar.forms345_reporting_owners.
I thus think we can probably close this issue.
@bdcallen I try to avoid pre-Tidyverse functions such as data.frame
precisely because you need nonsense like stringsAsFactors
. There’s usually a Tidyverse variant that does a better job. Also lapply
with a small function often produces cleaner (and faster) code than for-loops.
Sent with GitHawk
@bdcallen
Where are we on all this Form 3/4/5 stuff? I think it would be very helpful to document this suite of code files, tables, etc, in a separate forms345.md
file that is linked to from the readme.md
file.
@iangow Agree about making forms345.md
, as I said in a previous email.
This issue can be closed now. My code handles the issue raised here by putting the information for all reportingOwner
nodes into the table edgar.forms345_reporting_owners
, thus handling cases with an arbitrary number of reporting owners.
@iangow If one looks at this filing, one can see that some filings have multiple reporting owners. I'm wondering what we should do with these cases. Should we make another table for these, or should we just make multiple rows for the headers associated with these filings?