cloudyr / limer

A LimeSurvey R Client
MIT License
67 stars 37 forks source link

R textConnection #48

Open AdamZ2 opened 5 years ago

AdamZ2 commented 5 years ago

I am trying to use the folowing code to download some forms from (https://www.sec.gov/Archives/edgar/full-index/ ). I am getting this error : Error in fread(textConnection(master), sep = "|") : input= must be a single character string containing a file name, a system command containing at least one space, a URL starting 'http[s]://', 'ftp[s]://' or 'file://', or, the input data itself containing at least one \n or \r

this the code start_year <- 2016 start_QTR <- 1

end_year <- 2016 end_QTR <- 4 setwd("D:/Requests/SC13D") require(data.table) require(Hmisc) require(data.table)

########################################### ####### construct SEC master file ######## ########################################### qtr.master.file <- function(year, QTR) { require(data.table) name <- paste0("https://www.sec.gov/Archives/edgar/full-index/", year,"/QTR",QTR,"/master.idx") print(sprintf("Downloading master file for quarter %d of year %s...", QTR, year)) master <- readLines(url(name)) master <- master[grep("SC 13(D|G)", master)] master <- gsub("#", "", master) # R does not treat a comment sign well master_table <- fread(textConnection(master), sep = "|") rm(master) colnames(master_table) <- c("cik", "name", "type", "date", "link") master_table <- as.data.table(master_table) master_table[, link := paste0("https://www.sec.gov/Archives/", link)] master_table[, file := gsub(".*/", "", link)] closeAllConnections() return(master_table) } ###########################################

download all files into temp dir

########################################### dwnld.files <- function(master) { require(RCurl) dir.create("temp_dir") master <- as.data.frame(master) master <- master[!duplicated(master$file),] for(j in 1:length(master$file)) { file <- NA file_url <- as.character(master$link[j]) file_name <- paste0("./temp_dir/",master$file[j]) try(file <- getURL(file_url)) write(file, file_name) } } ########################################### ####### put all forms in SQdatabase ####### ########################################### put.files.in.sql <- function(dbname) { library(DBI) library(RSQLite) together <- function(x) { return(paste(x, collapse = "\n")) }

con = dbConnect(SQLite(), dbname=dbname) dbSendQuery(conn=con, "CREATE TABLE compsubm (FILENAME TEXT, COMLSUBFILE TEXT)") path <- paste0("./temp_dir/") files <- list.files(path) n <- length(files) step <- 500 for(i in 1:(n %/% step + 1)) { start <- 1 + (i-1)step end <- i(step) ind <- start:min(end,n) objects <- lapply(paste0(path,files[ind]), readLines) clean <- lapply(objects, together) data <- NULL data$FILENAME <- files[ind] data <- as.data.frame(data) data$COMLSUBFILE <- unlist(clean) dbWriteTable(conn=con, name = "compsubm", data, append = T) } dbDisconnect(con) unlink("temp_dir", recursive = T) } get_dates <- function(start_year, start_QTR, end_year, end_QTR) { require(data.table) all_dates <- data.table(year = rep(1993:2050, 4)) setkey(all_dates,year) all_dates[, QTR := 1:.N, by = year] all_dates <- as.data.frame(all_dates)

x <- paste0(all_dates$year, all_dates$QTR) >= paste0(start_year, start_QTR) & paste0(all_dates$year, all_dates$QTR) <= paste0(end_year, end_QTR) return(all_dates[x,]) }

dates <- get_dates(start_year, start_QTR, end_year, endQTR) for(i in 1:length(dates$QTR)) { print(Sys.time()) master <- qtr.master.file(dates$year[i], dates$QTR[i]) write.csv(master, paste0("./Master/master", dates$year[i], dates$QTR[i], ".csv"), row.names = F) print("Dowloading files, it takes up to 4 hours") dwnld.files(master) print("Putting all files into SQL & cleaning") put.files.in.sql(paste0("./Forms/",dates$year[i],dates$QTR[i], ".sqlite")) }

Jennyyanhuadong commented 3 years ago

I find that your code is similar to https://github.com/volkovacodes/Block_Codes/blob/master/1.%20download_forms.R The code there uses csv, and then separate. I tried the code, and it works. write(master, "tmp.csv") master_table <- fread("tmp.csv", sep = "|")