I am trying to use this code in windows. I have run first the first program "get_filings.R" but I am getting some errors. Can you include instructions for building the database? Please see the errors 👍
> # Create functions to get filings ----
> getSECIndexFile <- function(year, quarter) {
+
+ # Download the zipped index file from the SEC website
+ tf <- tempfile(fileext = ".gz")
+ result <- try(curl_download(
+ url=paste("https://www.sec.gov/Archives/edgar/full-index/",
+ year,"/QTR", quarter, "/company.gz",sep=""),
+ destfile=tf))
+
+ # If we didn't encounter and error downloading the file, parse it
+ # and return as a R data frame
+ if (!inherits(result, "try-error")) {
+
+ # Parse the downloaded file and return the extracted data as a data frame
+ temp <-
+ read_fwf(tf, fwf_cols(company_name = c(1,62),
+ form_type = c(63,74),
+ cik = c(75,86),
+ date_filed = c(87,98),
+ file_name = c(99,150)),
+ col_types = "ccicc", skip=10,
+ locale = locale(encoding = "macintosh")) %>%
+ mutate(date_filed = as.Date(date_filed))
+ return(temp)
+ } else {
+ return(NULL)
+ }
+ }
>
> addIndexFileToDatabase <- function(data) {
+ if (is.null(data)) return(NULL)
+ library(RPostgreSQL)
+ pg <- dbConnect(PostgreSQL())
+
+ rs <- dbWriteTable(pg, c("edgar", "filings"), data,
+ append=TRUE, row.names=FALSE)
+
+ rs <- dbGetQuery(pg, "ALTER TABLE edgar.filings OWNER TO edgar")
+ rs <- dbGetQuery(pg, "GRANT SELECT ON TABLE edgar.filings TO edgar_access")
+ comment <- 'CREATED USING get_filings_full.R/get_filings_incremental.R IN iangow-public/edgar'
+ db_comment <- paste0("COMMENT ON TABLE edgar.filings IS '",
+ comment, " ON ", Sys.time() , "'; ")
+ dbGetQuery(pg, db_comment)
+ dbDisconnect(pg)
+ return(rs)
+ }
>
> deleteIndexDataFomDatabase <- function(pg, year, quarter) {
+ if(dbExistsTable(pg, c("edgar", "filings"))) {
+ dbGetQuery(pg, paste(
+ "DELETE
+ FROM edgar.filings
+ WHERE extract(quarter FROM date_filed)=", quarter,
+ " AND extract(year FROM date_filed)=", year))
+ }
+ }
>
> # Function to delete and then enter updated data for a given year and quarter
> updateData <- function(pg, year, quarter) {
+
+ cat("Updating data for ", year, "Q", quarter, "...\n", sep="")
+ try({
+ deleteIndexDataFomDatabase(pg, year, quarter)
+ dbExecute(pg, paste0("DELETE FROM index_last_modified WHERE year=", year,
+ " AND quarter=", quarter))
+ addIndexFileToDatabase(getSECIndexFile(year, quarter))
+ dbExecute(pg, paste0("INSERT INTO index_last_modified ",
+ "SELECT * FROM index_last_modified_new WHERE year=", year,
+ " AND quarter=", quarter))
+ return(TRUE)
+ },
+ return(FALSE))
+ }
>
> # Function to last modified data from EDGAR ----
> getLastUpdate <- function(year, quarter) {
+
+ url <- paste0("https://www.sec.gov/Archives/edgar/full-index/",
+ year, "/QTR", quarter, "/")
+
+ # Scrape the html table from the website for the given year and quarter
+ read_html(url) %>%
+ html_nodes("table") %>%
+ .[[1]] %>%
+ html_table() %>%
+ filter(Name == "company.gz") %>%
+ select(`Last Modified`) %>%
+ mdy_hms(tz = "America/New_York")
+ }
>
> # Create table with last_modified ----
> now <- now(tz = 'America/New_York')
> current_year <- as.integer(year(now))
> current_qtr <- quarter(now)
> year <- 1993L:current_year
> quarter <- 1:4L
>
> index_last_modified_scraped <-
+ crossing(year, quarter) %>%
+ filter(year < current_year |
+ (year == current_year & quarter <= current_qtr)) %>%
+ rowwise() %>%
+ mutate(last_modified = getLastUpdate(year, quarter))
Warning message:
In new.env(parent = emptyenv(), hash = FALSE) :
closing unused connection 3 (https://www.sec.gov/Archives/edgar/full-index/2016/QTR1/master.idx)
>
> # Push results to database ----
> pg <- dbConnect(PostgreSQL())
Error in postgresqlNewConnection(drv, ...) :
RS-DBI driver: (could not connect maitrehec7@localhost:5432 on dbname "maitrehec7": could not connect to server: Connection refused (0x0000274D/10061)
Is the server running on host "localhost" (127.0.0.1) and accepting
TCP/IP connections on port 5432?
could not connect to server: Connection refused (0x0000274D/10061)
Is the server running on host "localhost" (::1) and accepting
TCP/IP connections on port 5432?
)
>
> rs <- dbExecute(pg, "SET search_path TO edgar")
Error in dbExecute(pg, "SET search_path TO edgar") :
object 'pg' not found
>
> rs <- dbWriteTable(pg, "index_last_modified_new", index_last_modified_scraped,
+ row.names = FALSE, overwrite = TRUE)
Error in dbWriteTable(pg, "index_last_modified_new", index_last_modified_scraped, :
object 'pg' not found
>
> # Compare new data with old to identify needed index files ----
> if (dbExistsTable(pg, c("edgar", "index_last_modified"))) {
+ index_last_modified_new <- tbl(pg, "index_last_modified_new")
+ index_last_modified <- tbl(pg, "index_last_modified")
+
+ # Use force_tz to ensure the correct times in EDT.
+ # Database stores times in UTC
+
+ to_update <-
+ index_last_modified_new %>%
+ left_join(index_last_modified,
+ by = c("year", "quarter"),
+ suffix = c("_new", "_old")) %>%
+ filter(is.na(last_modified_old) |
+ last_modified_new > last_modified_old) %>%
+ collect()
+ } else {
+ index_last_modified_scraped %>%
+ mutate(last_modified = as.POSIXct(NA)) %>%
+ dbWriteTable(pg, "index_last_modified", .,
+ row.names = FALSE, overwrite = TRUE)
+
+ to_update <-
+ index_last_modified_scraped
+ }
Error in dbExistsTable(pg, c("edgar", "index_last_modified")) :
object 'pg' not found
>
> #
> # If to_update has a non-trivial number of observations/rows (ie. at least 1), update the data
> #
> if(nrow(to_update) > 0) {
+ to_update <-
+ to_update %>%
+ rowwise() %>%
+ mutate(updated = updateData(pg, year, quarter))
+ }
Error in nrow(to_update) : object 'to_update' not found
>
> rs <- dbExecute(pg, "ALTER TABLE filings OWNER TO edgar")
Error in dbExecute(pg, "ALTER TABLE filings OWNER TO edgar") :
object 'pg' not found
> rs <- dbExecute(pg, "GRANT SELECT ON filings TO edgar_access")
Error in dbExecute(pg, "GRANT SELECT ON filings TO edgar_access") :
object 'pg' not found
> rs <- dbExecute(pg, "ALTER TABLE index_last_modified OWNER TO edgar")
Error in dbExecute(pg, "ALTER TABLE index_last_modified OWNER TO edgar") :
object 'pg' not found
> rs <- dbExecute(pg, "GRANT SELECT ON index_last_modified TO edgar_access")
Error in dbExecute(pg, "GRANT SELECT ON index_last_modified TO edgar_access") :
object 'pg' not found
> rs <- dbExecute(pg, "DROP TABLE IF EXISTS index_last_modified_new")
Error in dbExecute(pg, "DROP TABLE IF EXISTS index_last_modified_new") :
object 'pg' not found
> comment <- 'CREATED USING get_filings.R IN iangow-public/edgar'
> db_comment <- paste0("COMMENT ON TABLE filings IS '",
+ comment, " ON ", Sys.time() , "'")
> rs <- dbExecute(pg, db_comment)
Error in dbExecute(pg, db_comment) : object 'pg' not found
>
> rs <- dbDisconnect(pg)
Error in dbDisconnect(pg) : object 'pg' not found
Thanks iangow.
now that ia have installed postgreSQL. what is the next step? Can you help? I will follow the training to learn postgresql. But I need a quick help to be able to use the script.
Many thanks
I am trying to use this code in windows. I have run first the first program "get_filings.R" but I am getting some errors. Can you include instructions for building the database? Please see the errors 👍