mccgr / edgar

Code to manage data related to SEC EDGAR
31 stars 15 forks source link

Create function to scrape insider CIK XML documents #25

Closed bdcallen closed 5 years ago

bdcallen commented 5 years ago

Assuming that in #23, all the relevant filings can be found in XML format, we want to write a function that scrapes the data from pages like this

bdcallen commented 5 years ago

@iangow I just did this

library(RCurl)
fileURL <- "https://www.sec.gov/Archives/edgar/data/821026/000120919118029790/doc4.xml"
xData <- getURL(fileURL)
data <- xmlParse(xData)
xml_data <- xmlToList(data)
xml_data

and got

$schemaVersion
[1] "X0306"

$documentType
[1] "4"

$periodOfReport
[1] "2018-05-11"

$notSubjectToSection16
[1] "0"

$issuer
$issuer$issuerCik
[1] "0000821026"

$issuer$issuerName
[1] "Andersons, Inc."

$issuer$issuerTradingSymbol
[1] "ANDE"

$reportingOwner
$reportingOwner$reportingOwnerId
$reportingOwner$reportingOwnerId$rptOwnerCik
[1] "0001200124"

$reportingOwner$reportingOwnerId$rptOwnerName
[1] "KING ROBERT J JR"

$reportingOwner$reportingOwnerAddress
$reportingOwner$reportingOwnerAddress$rptOwnerStreet1
[1] "1947 BRIARFIELD BLVD."

$reportingOwner$reportingOwnerAddress$rptOwnerStreet2
NULL

$reportingOwner$reportingOwnerAddress$rptOwnerCity
[1] "MAUMEE"

$reportingOwner$reportingOwnerAddress$rptOwnerState
[1] "OH"

$reportingOwner$reportingOwnerAddress$rptOwnerZipCode
[1] "43537"

$reportingOwner$reportingOwnerAddress$rptOwnerStateDescription
NULL

$reportingOwner$reportingOwnerRelationship
$reportingOwner$reportingOwnerRelationship$isDirector
[1] "1"

$reportingOwner$reportingOwnerRelationship$isOfficer
[1] "0"

$reportingOwner$reportingOwnerRelationship$isTenPercentOwner
[1] "0"

$reportingOwner$reportingOwnerRelationship$isOther
[1] "0"

$nonDerivativeTable
$nonDerivativeTable$nonDerivativeTransaction
$nonDerivativeTable$nonDerivativeTransaction$securityTitle
$nonDerivativeTable$nonDerivativeTransaction$securityTitle$value
[1] "COMMON STOCK"

$nonDerivativeTable$nonDerivativeTransaction$transactionDate
$nonDerivativeTable$nonDerivativeTransaction$transactionDate$value
[1] "2018-05-11"

$nonDerivativeTable$nonDerivativeTransaction$deemedExecutionDate
NULL

$nonDerivativeTable$nonDerivativeTransaction$transactionCoding
$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionFormType
[1] "4"

$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionCode
[1] "J"

$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$equitySwapInvolved
[1] "0"

$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$footnoteId
  id 
"F1" 

$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness
$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness$value
NULL

$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares$value
[1] "1.9"

$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare$value
[1] "32.65"

$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode$value
[1] "A"

$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts
$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction
$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction$value
[1] "9314.9"

$nonDerivativeTable$nonDerivativeTransaction$ownershipNature
$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership
$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership$value
[1] "D"

$nonDerivativeTable$nonDerivativeTransaction
$nonDerivativeTable$nonDerivativeTransaction$securityTitle
$nonDerivativeTable$nonDerivativeTransaction$securityTitle$value
[1] "COMMON STOCK"

$nonDerivativeTable$nonDerivativeTransaction$transactionDate
$nonDerivativeTable$nonDerivativeTransaction$transactionDate$value
[1] "2018-05-11"

$nonDerivativeTable$nonDerivativeTransaction$deemedExecutionDate
NULL

$nonDerivativeTable$nonDerivativeTransaction$transactionCoding
$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionFormType
[1] "4"

$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionCode
[1] "A"

$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$equitySwapInvolved
[1] "0"

$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness
$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness$value
NULL

$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares$value
[1] "2286"

$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare$value
[1] "0.00"

$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode
$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode$value
[1] "A"

$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts
$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction
$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction$value
[1] "11600.9"

$nonDerivativeTable$nonDerivativeTransaction$ownershipNature
$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership
$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership$value
[1] "D"

$nonDerivativeTable$nonDerivativeHolding
$nonDerivativeTable$nonDerivativeHolding$securityTitle
$nonDerivativeTable$nonDerivativeHolding$securityTitle$value
[1] "COMMON STOCK"

$nonDerivativeTable$nonDerivativeHolding$postTransactionAmounts
$nonDerivativeTable$nonDerivativeHolding$postTransactionAmounts$sharesOwnedFollowingTransaction
$nonDerivativeTable$nonDerivativeHolding$postTransactionAmounts$sharesOwnedFollowingTransaction$value
[1] "18970"

$nonDerivativeTable$nonDerivativeHolding$ownershipNature
$nonDerivativeTable$nonDerivativeHolding$ownershipNature$directOrIndirectOwnership
$nonDerivativeTable$nonDerivativeHolding$ownershipNature$directOrIndirectOwnership$value
[1] "I"

$nonDerivativeTable$nonDerivativeHolding$ownershipNature$natureOfOwnership
$nonDerivativeTable$nonDerivativeHolding$ownershipNature$natureOfOwnership$value
[1] "Robert J. King Jr. Trust u/a dated  Dec. 20, 2013"

$footnotes
$footnotes$footnote
$footnotes$footnote$text
[1] "Shares in lieu of cash dividend."

$footnotes$footnote$.attrs
  id 
"F1" 

$remarks
NULL

$ownerSignature
$ownerSignature$signatureName
[1] "Robert J. King, by Melissa Trippel, Limited Power of Attorney"

$ownerSignature$signatureDate
[1] "2018-05-14"

It seems there are a lot of nodes that have subnodes, and that the depth of the subnodes is not always the same between different nodes. I'm wondering if all these entries should be put in the one dataframe, or in several. By the way, I tried xmlToDataFrame(data), and it didn't work, which I suspect is due to the different depths of subnodes.

iangow commented 5 years ago

Note that your code requires library(XML). I believe xml2 is a newer alternative.

library(xml2)
fileURL <- "https://www.sec.gov/Archives/edgar/data/821026/000120919118029790/doc4.xml"
xData <- read_xml(fileURL)
xml_data <- as_list(xData)
xml_data
#> $ownershipDocument
#> $ownershipDocument$schemaVersion
#> $ownershipDocument$schemaVersion[[1]]
#> [1] "X0306"
#> 
#> 
#> $ownershipDocument$documentType
#> $ownershipDocument$documentType[[1]]
#> [1] "4"
#> 
#> 
#> $ownershipDocument$periodOfReport
#> $ownershipDocument$periodOfReport[[1]]
#> [1] "2018-05-11"
#> 
#> 
#> $ownershipDocument$notSubjectToSection16
#> $ownershipDocument$notSubjectToSection16[[1]]
#> [1] "0"
#> 
#> 
#> $ownershipDocument$issuer
#> $ownershipDocument$issuer$issuerCik
#> $ownershipDocument$issuer$issuerCik[[1]]
#> [1] "0000821026"
#> 
#> 
#> $ownershipDocument$issuer$issuerName
#> $ownershipDocument$issuer$issuerName[[1]]
#> [1] "Andersons, Inc."
#> 
#> 
#> $ownershipDocument$issuer$issuerTradingSymbol
#> $ownershipDocument$issuer$issuerTradingSymbol[[1]]
#> [1] "ANDE"
#> 
#> 
#> 
#> $ownershipDocument$reportingOwner
#> $ownershipDocument$reportingOwner$reportingOwnerId
#> $ownershipDocument$reportingOwner$reportingOwnerId$rptOwnerCik
#> $ownershipDocument$reportingOwner$reportingOwnerId$rptOwnerCik[[1]]
#> [1] "0001200124"
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerId$rptOwnerName
#> $ownershipDocument$reportingOwner$reportingOwnerId$rptOwnerName[[1]]
#> [1] "KING ROBERT J JR"
#> 
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerAddress
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerStreet1
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerStreet1[[1]]
#> [1] "1947 BRIARFIELD BLVD."
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerStreet2
#> list()
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerCity
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerCity[[1]]
#> [1] "MAUMEE"
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerState
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerState[[1]]
#> [1] "OH"
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerZipCode
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerZipCode[[1]]
#> [1] "43537"
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerAddress$rptOwnerStateDescription
#> list()
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isDirector
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isDirector[[1]]
#> [1] "1"
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isOfficer
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isOfficer[[1]]
#> [1] "0"
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isTenPercentOwner
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isTenPercentOwner[[1]]
#> [1] "0"
#> 
#> 
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isOther
#> $ownershipDocument$reportingOwner$reportingOwnerRelationship$isOther[[1]]
#> [1] "0"
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$securityTitle
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$securityTitle$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$securityTitle$value[[1]]
#> [1] "COMMON STOCK"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionDate
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionDate$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionDate$value[[1]]
#> [1] "2018-05-11"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$deemedExecutionDate
#> list()
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionFormType
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionFormType[[1]]
#> [1] "4"
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionCode
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionCode[[1]]
#> [1] "J"
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$equitySwapInvolved
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$equitySwapInvolved[[1]]
#> [1] "0"
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$footnoteId
#> list()
#> attr(,"id")
#> [1] "F1"
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness$value
#> list()
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares$value[[1]]
#> [1] "1.9"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare$value[[1]]
#> [1] "32.65"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode$value[[1]]
#> [1] "A"
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction$value[[1]]
#> [1] "9314.9"
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership$value[[1]]
#> [1] "D"
#> 
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$securityTitle
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$securityTitle$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$securityTitle$value[[1]]
#> [1] "COMMON STOCK"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionDate
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionDate$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionDate$value[[1]]
#> [1] "2018-05-11"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$deemedExecutionDate
#> list()
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionFormType
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionFormType[[1]]
#> [1] "4"
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionCode
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$transactionCode[[1]]
#> [1] "A"
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$equitySwapInvolved
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionCoding$equitySwapInvolved[[1]]
#> [1] "0"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionTimeliness$value
#> list()
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionShares$value[[1]]
#> [1] "2286"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionPricePerShare$value[[1]]
#> [1] "0.00"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$transactionAmounts$transactionAcquiredDisposedCode$value[[1]]
#> [1] "A"
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$postTransactionAmounts$sharesOwnedFollowingTransaction$value[[1]]
#> [1] "11600.9"
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeTransaction$ownershipNature$directOrIndirectOwnership$value[[1]]
#> [1] "D"
#> 
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$securityTitle
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$securityTitle$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$securityTitle$value[[1]]
#> [1] "COMMON STOCK"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$postTransactionAmounts
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$postTransactionAmounts$sharesOwnedFollowingTransaction
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$postTransactionAmounts$sharesOwnedFollowingTransaction$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$postTransactionAmounts$sharesOwnedFollowingTransaction$value[[1]]
#> [1] "18970"
#> 
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$ownershipNature
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$ownershipNature$directOrIndirectOwnership
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$ownershipNature$directOrIndirectOwnership$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$ownershipNature$directOrIndirectOwnership$value[[1]]
#> [1] "I"
#> 
#> 
#> 
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$ownershipNature$natureOfOwnership
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$ownershipNature$natureOfOwnership$value
#> $ownershipDocument$nonDerivativeTable$nonDerivativeHolding$ownershipNature$natureOfOwnership$value[[1]]
#> [1] "Robert J. King Jr. Trust u/a dated  Dec. 20, 2013"
#> 
#> 
#> 
#> 
#> 
#> 
#> $ownershipDocument$footnotes
#> $ownershipDocument$footnotes$footnote
#> $ownershipDocument$footnotes$footnote[[1]]
#> [1] "Shares in lieu of cash dividend."
#> 
#> attr(,"id")
#> [1] "F1"
#> 
#> 
#> $ownershipDocument$remarks
#> list()
#> 
#> $ownershipDocument$ownerSignature
#> $ownershipDocument$ownerSignature$signatureName
#> $ownershipDocument$ownerSignature$signatureName[[1]]
#> [1] "Robert J. King, by Melissa Trippel, Limited Power of Attorney"
#> 
#> 
#> $ownershipDocument$ownerSignature$signatureDate
#> $ownershipDocument$ownerSignature$signatureDate[[1]]
#> [1] "2018-05-14"

Created on 2018-10-02 by the reprex package (v0.2.1)

iangow commented 5 years ago

PostgreSQL can store XML documents, but the functionality is limited. Perhaps JSON would be a better format. At table with the partial URL from filing_docs and the JSON equivalent of the XML might be quite workable. One can easily extract values from JSON using functions in PostgreSQL.

library(XML)
library(rjson)
library(RCurl)
#> Loading required package: bitops

xml_url_to_json <- function(url) {

  fileURL <- file.path("https://www.sec.gov/Archives", url)
  xml_parse <- xmlParse(getURL(fileURL))
  xml_root <- xmlRoot(xml_parse)
  xml_list <- xmlToList(xml_root,addAttributes = T, simplify = F)
  xml_rjson <- toJSON(xml_list)
  xml_rjson
}

temp <- xml_url_to_json("edgar/data/821026/000120919118029790/doc4.xml")
cat(temp)
#> {"schemaVersion":"X0306","documentType":"4","periodOfReport":"2018-05-11","notSubjectToSection16":"0","issuer":{"issuerCik":"0000821026","issuerName":"Andersons, Inc.","issuerTradingSymbol":"ANDE"},"reportingOwner":{"reportingOwnerId":{"rptOwnerCik":"0001200124","rptOwnerName":"KING ROBERT J JR"},"reportingOwnerAddress":{"rptOwnerStreet1":"1947 BRIARFIELD BLVD.","rptOwnerStreet2":null,"rptOwnerCity":"MAUMEE","rptOwnerState":"OH","rptOwnerZipCode":"43537","rptOwnerStateDescription":null},"reportingOwnerRelationship":{"isDirector":"1","isOfficer":"0","isTenPercentOwner":"0","isOther":"0"}},"nonDerivativeTable":{"nonDerivativeTransaction":{"securityTitle":{"value":"COMMON STOCK"},"transactionDate":{"value":"2018-05-11"},"deemedExecutionDate":null,"transactionCoding":{"transactionFormType":"4","transactionCode":"J","equitySwapInvolved":"0","footnoteId":{"id":"F1"}},"transactionTimeliness":{"value":null},"transactionAmounts":{"transactionShares":{"value":"1.9"},"transactionPricePerShare":{"value":"32.65"},"transactionAcquiredDisposedCode":{"value":"A"}},"postTransactionAmounts":{"sharesOwnedFollowingTransaction":{"value":"9314.9"}},"ownershipNature":{"directOrIndirectOwnership":{"value":"D"}}},"nonDerivativeTransaction":{"securityTitle":{"value":"COMMON STOCK"},"transactionDate":{"value":"2018-05-11"},"deemedExecutionDate":null,"transactionCoding":{"transactionFormType":"4","transactionCode":"A","equitySwapInvolved":"0"},"transactionTimeliness":{"value":null},"transactionAmounts":{"transactionShares":{"value":"2286"},"transactionPricePerShare":{"value":"0.00"},"transactionAcquiredDisposedCode":{"value":"A"}},"postTransactionAmounts":{"sharesOwnedFollowingTransaction":{"value":"11600.9"}},"ownershipNature":{"directOrIndirectOwnership":{"value":"D"}}},"nonDerivativeHolding":{"securityTitle":{"value":"COMMON STOCK"},"postTransactionAmounts":{"sharesOwnedFollowingTransaction":{"value":"18970"}},"ownershipNature":{"directOrIndirectOwnership":{"value":"I"},"natureOfOwnership":{"value":"Robert J. King Jr. Trust u/a dated  Dec. 20, 2013"}}}},"footnotes":{"footnote":{"text":"Shares in lieu of cash dividend.",".attrs":{"id":"F1"}}},"remarks":null,"ownerSignature":{"signatureName":"Robert J. King, by Melissa Trippel, Limited Power of Attorney","signatureDate":"2018-05-14"}}

Created on 2018-10-02 by the reprex package (v0.2.1)

bdcallen commented 5 years ago

@iangow I just tried this approach

get_info_from_node <- function(node) {

    subnode_names <- names(node)
    names <- c()
    values <- c()

    for(name in subnode_names) {

        subnames <- names(node[[name]])
        if(is.null(subnames)) {

            names <- c(names, name)

            if(is.null(node[[name]])) {
            values <- c(values, NA)
            } else {
                values <- c(values, node[[name]])
            }

        } else {

            subtree_df <- get_info_from_node(node[[name]])
            names <- c(names, paste(name, subtree_df$names, sep = "_"))
            values <- c(values, subtree_df$values)

        }

    }

    df <- data.frame(names, values, stringsAsFactors = FALSE)
    return(df)

}

xml_to_df <- function(data) {

    raw_df <- get_info_from_node(data)
    ncols = dim(raw_df)[1]
    data_df <- data.frame(matrix(nrow = 0, ncol = ncols))
    colnames(data_df) <- raw_df$names
    data_df[1, ] = raw_df$values

    return(data_df)

}

Admittedly, for now, the column names are too long, but at least this code works in principle. It basically converts the xml data lists to a dataframe.

Using get_info_from_node on xml_data I got

> get_info_from_node(xml_data)
                                                                                                      names                                                        values
1                                                                                             schemaVersion                                                         X0306
2                                                                                              documentType                                                             4
3                                                                                            periodOfReport                                                    2018-05-11
4                                                                                     notSubjectToSection16                                                             0
5                                                                                          issuer_issuerCik                                                    0000821026
6                                                                                         issuer_issuerName                                               Andersons, Inc.
7                                                                                issuer_issuerTradingSymbol                                                          ANDE
8                                                               reportingOwner_reportingOwnerId_rptOwnerCik                                                    0001200124
9                                                              reportingOwner_reportingOwnerId_rptOwnerName                                              KING ROBERT J JR
10                                                     reportingOwner_reportingOwnerAddress_rptOwnerStreet1                                         1947 BRIARFIELD BLVD.
11                                                     reportingOwner_reportingOwnerAddress_rptOwnerStreet2                                                          <NA>
12                                                        reportingOwner_reportingOwnerAddress_rptOwnerCity                                                        MAUMEE
13                                                       reportingOwner_reportingOwnerAddress_rptOwnerState                                                            OH
14                                                     reportingOwner_reportingOwnerAddress_rptOwnerZipCode                                                         43537
15                                            reportingOwner_reportingOwnerAddress_rptOwnerStateDescription                                                          <NA>
16                                                     reportingOwner_reportingOwnerRelationship_isDirector                                                             1
17                                                      reportingOwner_reportingOwnerRelationship_isOfficer                                                             0
18                                              reportingOwner_reportingOwnerRelationship_isTenPercentOwner                                                             0
19                                                        reportingOwner_reportingOwnerRelationship_isOther                                                             0
20                                          nonDerivativeTable_nonDerivativeTransaction_securityTitle_value                                                  COMMON STOCK
21                                        nonDerivativeTable_nonDerivativeTransaction_transactionDate_value                                                    2018-05-11
22                                          nonDerivativeTable_nonDerivativeTransaction_deemedExecutionDate                                                          <NA>
23                        nonDerivativeTable_nonDerivativeTransaction_transactionCoding_transactionFormType                                                             4
24                            nonDerivativeTable_nonDerivativeTransaction_transactionCoding_transactionCode                                                             J
25                         nonDerivativeTable_nonDerivativeTransaction_transactionCoding_equitySwapInvolved                                                             0
26                              nonDerivativeTable_nonDerivativeTransaction_transactionCoding_footnoteId_id                                                            F1
27                                  nonDerivativeTable_nonDerivativeTransaction_transactionTimeliness_value                                                          <NA>
28                   nonDerivativeTable_nonDerivativeTransaction_transactionAmounts_transactionShares_value                                                           1.9
29            nonDerivativeTable_nonDerivativeTransaction_transactionAmounts_transactionPricePerShare_value                                                         32.65
30     nonDerivativeTable_nonDerivativeTransaction_transactionAmounts_transactionAcquiredDisposedCode_value                                                             A
31 nonDerivativeTable_nonDerivativeTransaction_postTransactionAmounts_sharesOwnedFollowingTransaction_value                                                        9314.9
32              nonDerivativeTable_nonDerivativeTransaction_ownershipNature_directOrIndirectOwnership_value                                                             D
33                                          nonDerivativeTable_nonDerivativeTransaction_securityTitle_value                                                  COMMON STOCK
34                                        nonDerivativeTable_nonDerivativeTransaction_transactionDate_value                                                    2018-05-11
35                                          nonDerivativeTable_nonDerivativeTransaction_deemedExecutionDate                                                          <NA>
36                        nonDerivativeTable_nonDerivativeTransaction_transactionCoding_transactionFormType                                                             4
37                            nonDerivativeTable_nonDerivativeTransaction_transactionCoding_transactionCode                                                             J
38                         nonDerivativeTable_nonDerivativeTransaction_transactionCoding_equitySwapInvolved                                                             0
39                              nonDerivativeTable_nonDerivativeTransaction_transactionCoding_footnoteId_id                                                            F1
40                                  nonDerivativeTable_nonDerivativeTransaction_transactionTimeliness_value                                                          <NA>
41                   nonDerivativeTable_nonDerivativeTransaction_transactionAmounts_transactionShares_value                                                           1.9
42            nonDerivativeTable_nonDerivativeTransaction_transactionAmounts_transactionPricePerShare_value                                                         32.65
43     nonDerivativeTable_nonDerivativeTransaction_transactionAmounts_transactionAcquiredDisposedCode_value                                                             A
44 nonDerivativeTable_nonDerivativeTransaction_postTransactionAmounts_sharesOwnedFollowingTransaction_value                                                        9314.9
45              nonDerivativeTable_nonDerivativeTransaction_ownershipNature_directOrIndirectOwnership_value                                                             D
46                                              nonDerivativeTable_nonDerivativeHolding_securityTitle_value                                                  COMMON STOCK
47     nonDerivativeTable_nonDerivativeHolding_postTransactionAmounts_sharesOwnedFollowingTransaction_value                                                         18970
48                  nonDerivativeTable_nonDerivativeHolding_ownershipNature_directOrIndirectOwnership_value                                                             I
49                          nonDerivativeTable_nonDerivativeHolding_ownershipNature_natureOfOwnership_value             Robert J. King Jr. Trust u/a dated  Dec. 20, 2013
50                                                                                  footnotes_footnote_text                              Shares in lieu of cash dividend.
51                                                                             footnotes_footnote_.attrs_id                                                            F1
52                                                                                                  remarks                                                          <NA>
53                                                                             ownerSignature_signatureName Robert J. King, by Melissa Trippel, Limited Power of Attorney
54                                                                             ownerSignature_signatureDate                                                    2018-05-14

and getting the values of the first few columns of xml_to_df acting on xml_data works (showing the full dataframe is rather cumbersome right now),

> eg$remarks
[1] NA
> eg$nonDerivativeTable_nonDerivativeTransaction_securityTitle_value
[1] "COMMON STOCK"
> eg$documentType
[1] "4"
iangow commented 5 years ago

It might be worth understanding how the fields in your data frame above map to those in the TFN data (see #31).

bdcallen commented 5 years ago

@iangow Just an aside for now, but defining json_to_df as a very minor modification of `xml_to_df from before,

json_to_df <- function(data_json) {

    data_list <- fromJSON(data_json)
    raw_df <- get_info_from_node(data_list)
    ncols = dim(raw_df)[1]
    data_df <- data.frame(matrix(nrow = 0, ncol = ncols))
    colnames(data_df) <- raw_df$names
    data_df[1, ] = raw_df$values

    return(data_df)

}

we can convert a file in JSON format to the type of dataframe I initially made. I just tested this function on temp, where again

temp <- xml_url_to_json("edgar/data/821026/000120919118029790/doc4.xml")

and it produced the same dataframe that xml_to_df produces on the original XML file.

bdcallen commented 5 years ago

@iangow I just been testing my function process_345_filing for the last couple of hours or so. I've tested it successfully on a few individual cases. I just tried to test it on a sample of 1000 xml documents, I've had a problem with the connections to the database growing out of hand, even though process_345_filing has a dbDisconnect(pg) line after doing dbConnect and writing to the database, which lead to me having to shut it down. In that small test, I managed to get data for around 130 filings into the database. If you would like to have a look, the tables are:

iangow commented 5 years ago

The table forms345_footnotes should use the name footnote_index not index so that the link with edgar.forms345_footnote_indices is clear (or use index in both places).

iangow commented 5 years ago

@bdcallen Please investigate these cases below. Perhaps these two tables should be simply one table (the result of the join below).

library(dplyr, warn.conflicts = FALSE)
library(DBI)
Sys.setenv(PGHOST = "10.101.13.99", PGDATABASE = "crsp")
pg <- dbConnect(RPostgres::Postgres())
rs <- dbExecute(pg, "SET search_path TO edgar")
forms345_header <- tbl(pg, "forms345_header")
forms345_table1 <- tbl(pg, "forms345_table1")
forms345_table2 <- tbl(pg, "forms345_table2")
forms345_footnotes <- tbl(pg, "forms345_footnotes")
forms345_footnote_indices <- tbl(pg, "forms345_footnote_indices")

forms345_footnotes %>%
    anti_join(
        forms345_footnote_indices %>%
            rename(index = footnote_index)) %>% 
    count()
#> Joining, by = c("file_name", "document", "index")
#> # Source:   lazy query [?? x 1]
#> # Database: postgres [igow@10.101.13.99:5432/crsp]
#>   n              
#>   <S3: integer64>
#> 1 40

Created on 2018-11-16 by the reprex package (v0.2.1)