jwijffels / ETLUtils

Utilities for easily loading big data from relational databases directly into ffdf objects in R.
https://github.com/jwijffels/ETLUtils/wiki
20 stars 7 forks source link

Please add an option to read.odbc.ffdf for windows authentication to be used #2

Closed phowerter closed 8 years ago

phowerter commented 8 years ago

Hello,

We like to use Windows Authentication in the place of a DSN because they are easier to maintain. Can you implement this into the function read.odbc.ffdf? I created the following workaround that you can use for a reference:

read.odbc.ffdf2 function (query = NULL, connection = NULL, odbcQuery.args = list(), sqlGetResults.args = list(), x = NULL, nrows = -1, first.rows = 1000000, next.rows = NULL, levels = NULL, appendLevels = TRUE, asffdf_args = list(), BATCHBYTES = getOption("ffbatchbytes"), VERBOSE = FALSE, colClasses = NULL, transFUN = NULL, ...) { odbcinfo <- list() odbcinfo$channel <- do.call("odbcDriverConnect", list(connection)) ##PH made a change to the arguments and function on.exit(try(RODBC::odbcClose(odbcinfo$channel), silent = TRUE)) append <- !is.null(x) if (append && !inherits(x, "ffdf")) { stop("only ffdf objects can be used for appending (and skipping the first.row chunk)") } if (VERBOSE) { read.time <- 0 write.time <- 0 } nrows <- as.integer(nrows) N <- 0L if (!append) { if (VERBOSE) { cat("read.odbc.ffdf ", N + 1L, "..", sep = "") read.start <- proc.time()[3] } if (is.null(colClasses)) { colClasses <- NA } if (is.null(first.rows)) { if (is.null(next.rows)) { first.rows <- 1000L } else { first.rows <- as.integer(next.rows) } } else { first.rows <- as.integer(first.rows) } if (nrows >= 0L && nrows < first.rows) { first.rows <- nrows } if (first.rows == 1) { stop("first.rows must not be 1") } odbcQuery.args$channel <- odbcinfo$channel odbcQuery.args$query <- query odbcinfo$resultset <- do.call("odbcQuery", odbcQuery.args) sqlGetResults.args$channel <- odbcQuery.args$channel sqlGetResults.args$max <- first.rows if (is.null(transFUN)) { dat <- ETLUtils:::convertCharacterToFactor(do.call("sqlGetResults", sqlGetResults.args)) ##PH added ETLUtils::: } else { dat <- ETLUtils:::convertCharacterToFactor(transFUN(do.call("sqlGetResults", sqlGetResults.args), ...)) ##PH added ETLUtils::: } n <- nrow(dat) N <- n if (!is.null(levels)) { cnam <- colnames(dat) lnam <- names(levels) if (!is.list(levels) || is.null(lnam) || any(is.na(match(lnam, cnam)))) { stop("levels must be a list with names matching column names of the first data.frame read") } for (i in lnam) { dat[[i]] <- recodeLevels(dat[[i]], levels[[i]]) } } if (VERBOSE) { write.start <- proc.time()[3] read.time <- read.time + (write.start - read.start) cat(N, " (", n, ") odbc-read=", round(write.start - read.start, 3), "sec", sep = "") } if (nrow(dat) == 0) { if (VERBOSE) { cat(" query returned 0 records\n", sep = "") cat(" odbc-read=", round(read.time, 3), "sec TOTAL=", round(read.time, 3), "sec\n", sep = "") } return(dat) } x <- do.call("as.ffdf", c(list(dat), asffdf_args)) colClasses <- repnam(colClasses, colnames(x), default = NA) i.fix <- seq.int(length.out = ncol(dat))[!is.na(match(colClasses, "ordered"))] for (i in i.fix) { virtual(x[[i]])$ramclass <- c("ordered", "factor") } if (VERBOSE) { write.stop <- proc.time()[3] write.time <- write.time + (write.stop - write.start) cat(" ffdf-write=", round(write.stop - write.start, 3), "sec\n", sep = "") } } if (append || N == first.rows) { k <- ncol(x) col.names <- colnames(x) colClasses <- sapply(seq.int(length.out = ncol(x)), function(i) ETLUtils:::colClass.default(x[[i]])) ###PH added another ETLUtils::: if (is.null(next.rows)) { recordsize <- sum(.rambytes[vmode(x)]) next.rows <- BATCHBYTES%/%recordsize if (next.rows < 1L) { next.rows <- 1L warning("single record does not fit into BATCHBYTES") } } else { next.rows <- as.integer(next.rows) } next.nrows <- next.rows appendLevels <- repnam(appendLevels, col.names, default = TRUE) if (any(appendLevels)) { i.fac <- seq.int(length.out = k) if (append == TRUE) { i.fac <- i.fac[appendLevels & sapply(i.fac, function(i) is.factor(x[[i]]))] } else { i.fac <- i.fac[appendLevels & sapply(i.fac, function(i) is.factor(dat[[i]]))] } } if (append == TRUE) { odbcQuery.args$channel <- odbcinfo$channel odbcQuery.args$query <- query odbcinfo$resultset <- do.call("odbcQuery", odbcQuery.args) } while (TRUE) { if (nrows >= 0L && N + next.rows > nrows) { next.nrows <- nrows - N } if (next.nrows < 1L) { break } if (VERBOSE) { cat("read.odbc.ffdf ", N + 1L, "..", sep = "") read.start <- proc.time()[3] } sqlGetResults.args$channel <- odbcQuery.args$channel sqlGetResults.args$max <- next.nrows if (is.null(transFUN)) { dat <- ETLUtils:::convertCharacterToFactor(do.call("sqlGetResults", sqlGetResults.args)) ##PH added ETLUtils::: } else { dat <- ETLUtils:::convertCharacterToFactor(transFUN(do.call("sqlGetResults", sqlGetResults.args), ...)) ##PH added ETLUtils::: } n <- nrow(dat) N <- N + n if (VERBOSE) { write.start <- proc.time()[3] read.time <- read.time + (write.start - read.start) cat(N, " (", n, ") odbc-read=", round(write.start - read.start, 3), "sec", sep = "") } if (n < 1L) { if (VERBOSE) cat("\n") break } if (any(appendLevels)) for (i in i.fac) { lev <- unique(c(levels(x[[i]]), levels(dat[[i]]))) levels(x[[i]]) <- lev dat[[i]] <- recodeLevels(dat[[i]], lev) } nff <- nrow(x) nrow(x) <- nff + n i <- hi(nff + 1L, nff + n) x[i, ] <- dat if (VERBOSE) { write.stop <- proc.time()[3] write.time <- write.time + (write.stop - write.start) cat(" ffdf-write=", round(write.stop - write.start, 3), "sec\n", sep = "") } if (n < next.nrows) { break } } } if (VERBOSE) { cat(" odbc-read=", round(read.time, 3), "sec ffdf-write=", round(write.time, 3), "sec TOTAL=", round(read.time + write.time, 3), "sec\n", sep = "") } return(x) }

jwijffels commented 8 years ago

thanks for the suggest, I've added an argument called odbcDriverConnect.args so that you can call read.odbc.ffdf(odbcDriverConnect.args = list(connection = "myconnection")) please test

phowerter commented 8 years ago

I just tested the odbcDriverConnect.args and it worked great for me. Thanks! This was my quick test, and the results:

read.odbc.ffdf("select * from qai.dbo.secmstrx where id = 'toc'", odbcDriverConnect.args = list(connection = cnet ))

ffdf (all open) dim=c(1,18), dimorder=c(1,2) row.names=NULL ffdf virtual mapping PhysicalName VirtualVmode PhysicalVmode AsIs VirtualIsMatrix PhysicalIsMatrix PhysicalElementNo PhysicalFirstCol PhysicalLastCol PhysicalIsOpen SecCode SecCode integer integer FALSE FALSE FALSE 1 1 1 TRUE Id Id integer integer FALSE FALSE FALSE 2 1 1 TRUE Type Type integer integer FALSE FALSE FALSE 3 1 1 TRUE IdBase IdBase logical logical FALSE FALSE FALSE 4 1 1 TRUE IdSeq IdSeq integer integer FALSE FALSE FALSE 5 1 1 TRUE Cusip Cusip integer integer FALSE FALSE FALSE 6 1 1 TRUE PrevCusip PrevCusip logical logical FALSE FALSE FALSE 7 1 1 TRUE Sedol Sedol integer integer FALSE FALSE FALSE 8 1 1 TRUE PrevSedol PrevSedol logical logical FALSE FALSE FALSE 9 1 1 TRUE Sedol2 Sedol2 integer integer FALSE FALSE FALSE 10 1 1 TRUE PrevSedol2 PrevSedol2 integer integer FALSE FALSE FALSE 11 1 1 TRUE Isin Isin integer integer FALSE FALSE FALSE 12 1 1 TRUE Name Name integer integer FALSE FALSE FALSE 13 1 1 TRUE Country Country integer integer FALSE FALSE FALSE 14 1 1 TRUE Vendors1 Vendors1 double double FALSE FALSE FALSE 15 1 1 TRUE Vendors2 Vendors2 integer integer FALSE FALSE FALSE 16 1 1 TRUE Vendors3 Vendors3 integer integer FALSE FALSE FALSE 17 1 1 TRUE Vendors4 Vendors4 double double FALSE FALSE FALSE 18 1 1 TRUE ffdf data SecCode Id Type_ IdBase IdSeq Cusip PrevCusip 1 75538 TOC 1 NA 0 88490310 NA
Sedol PrevSedol2 Isin Name Country Vendors1 Vendors2 1 212606 : 246585 CA8849031056 THOMSON REUTERS CORP COM CAN 3.641425e+15 0
Vendors3 Vendors4 1 0 -9.223372e+18