pola-rs / r-polars

Bring polars to R
https://pola-rs.github.io/r-polars/
Other
398 stars 35 forks source link

Issue with reading from long urls #1046

Closed DyfanJones closed 3 weeks ago

DyfanJones commented 1 month ago

Hi all,

It looks like there is an issue in reading from urls that are too long.

library(s3fs)
library(polars)

# NOTE: Create temporary file
tmp_file <- tempfile()

pl$DataFrame(
  col1 = sample(0:100, 200, replace = T),
  col2 = sample(c(letters, LETTERS), 200, replace = T)
)$write_csv(tmp_file)

# Will use minio for demo purposes
s3_file_system(profile_name = "minio", signature_version = "s3v4")

bucket <- s3_path("test-bucket")
dstpth <- s3_path(bucket, "test/demo.csv")

s3_bucket_create(bucket)
#> [1] "s3://test-bucket"
s3_file_upload(path = tmp_file, new_path = dstpth, overwrite = T)
#> [1] "s3://test-bucket/test/demo.csv"

obj <- s3fs::s3_file_url(dstpth)

pl$read_csv(obj)
#> Warning in download.file(url = actual_url, destfile = tmp_file): URL
#> http://127.0.0.1:9000/test-bucket/test/demo.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin%2F20240416%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240416T134113Z&X-Amz-Expires=3600&X-Amz-Signature=997b021a91547c844fa948989b6a4ef16fada444e19e62c781420fe92aad198f&X-Amz-SignedHeaders=host:
#> cannot open destfile
#> '/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//RtmpYpd5jt/http...127.0.0.1.9000.test.bucket.test.demo.csv.X.Amz.Algorithm.AWS4.HMAC.SHA256.X.Amz.Credential.minioadmin.2F20240416.2Fus.east.1.2Fs3.2Faws4_request.X.Amz.Date.20240416T134113Z.X.Amz.Expires.3600.X.Amz.Signature.997b021a91547c844fa948989b6a4ef16fada444e19e62c781420fe92aad198f.X.Amz.SignedHeaders.host',
#> reason 'File name too long'
#> Warning in download.file(url = actual_url, destfile = tmp_file): download had
#> nonzero exit status
#> tmp file placed in 
#>  /var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//RtmpYpd5jt/http...127.0.0.1.9000.test.bucket.test.demo.csv.X.Amz.Algorithm.AWS4.HMAC.SHA256.X.Amz.Credential.minioadmin.2F20240416.2Fus.east.1.2Fs3.2Faws4_request.X.Amz.Date.20240416T134113Z.X.Amz.Expires.3600.X.Amz.Signature.997b021a91547c844fa948989b6a4ef16fada444e19e62c781420fe92aad198f.X.Amz.SignedHeaders.host
#> Error: Execution halted with the following contexts
#>    0: In R: in pl$read_csv():
#>    0: During function call [base::tryCatch(base::withCallingHandlers({
#>           NULL
#>           base::saveRDS(base::do.call(base::do.call, base::c(base::readRDS("/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmpa6BQxU/callr-fun-f17f59614721"), 
#>               base::list(envir = .GlobalEnv, quote = TRUE)), envir = .GlobalEnv, 
#>               quote = TRUE), file = "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmpa6BQxU/callr-res-f17f184f55f", 
#>               compress = FALSE)
#>           base::flush(base::stdout())
#>           base::flush(base::stderr())
#>           NULL
#>           base::invisible()
#>       }, error = function(e) {
#>           {
#>               callr_data <- base::as.environment("tools:callr")$`__callr_data__`
#>               err <- callr_data$err
#>               if (FALSE) {
#>                   base::assign(".Traceback", base::.traceback(4), envir = callr_data)
#>                   utils::dump.frames("__callr_dump__")
#>                   base::assign(".Last.dump", .GlobalEnv$`__callr_dump__`, 
#>                       envir = callr_data)
#>                   base::rm("__callr_dump__", envir = .GlobalEnv)
#>               }
#>               e <- err$process_call(e)
#>               e2 <- err$new_error("error in callr subprocess")
#>               class <- base::class
#>               class(e2) <- base::c("callr_remote_error", class(e2))
#>               e2 <- err$add_trace_back(e2)
#>               cut <- base::which(e2$trace$scope == "global")[1]
#>               if (!base::is.na(cut)) {
#>                   e2$trace <- e2$trace[-(1:cut), ]
#>               }
#>               base::saveRDS(base::list("error", e2, e), file = base::paste0("/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmpa6BQxU/callr-res-f17f184f55f", 
#>                   ".error"))
#>           }
#>       }, interrupt = function(e) {
#>           {
#>               callr_data <- base::as.environment("tools:callr")$`__callr_data__`
#>               err <- callr_data$err
#>               if (FALSE) {
#>                   base::assign(".Traceback", base::.traceback(4), envir = callr_data)
#>                   utils::dump.frames("__callr_dump__")
#>                   base::assign(".Last.dump", .GlobalEnv$`__callr_dump__`, 
#>                       envir = callr_data)
#>                   base::rm("__callr_dump__", envir = .GlobalEnv)
#>               }
#>               e <- err$process_call(e)
#>               e2 <- err$new_error("error in callr subprocess")
#>               class <- base::class
#>               class(e2) <- base::c("callr_remote_error", class(e2))
#>               e2 <- err$add_trace_back(e2)
#>               cut <- base::which(e2$trace$scope == "global")[1]
#>               if (!base::is.na(cut)) {
#>                   e2$trace <- e2$trace[-(1:cut), ]
#>               }
#>               base::saveRDS(base::list("error", e2, e), file = base::paste0("/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmpa6BQxU/callr-res-f17f184f55f", 
#>                   ".error"))
#>           }
#>       }, callr_message = function(e) {
#>           base::try(base::signalCondition(e))
#>       }), error = function(e) {
#>           NULL
#>           if (FALSE) {
#>               base::try(base::stop(e))
#>           }
#>           else {
#>               base::invisible()
#>           }
#>       }, interrupt = function(e) {
#>           NULL
#>           if (FALSE) {
#>               e
#>           }
#>           else {
#>               base::invisible()
#>           }
#>       })]
#>    1: Encountered the following error in Rust-Polars:
#>          File name too long (os error 63): ...97b021a91547c844fa948989b6a4ef16fada444e19e62c781420fe92aad198f.X.Amz.SignedHeaders.host

Created on 2024-04-16 with reprex v2.1.0

DyfanJones commented 1 month ago

It looks like the issue is coming from:

tmp_file = paste0(tempdir(), "/", make.names(actual_url))
download.file(url = actual_url, destfile = tmp_file)

This causes download.file fail. A possible solution is to either hash the actual_url or use environments to cache the temp file name. For example:

cache_temp_file <- new.env(parent = new.env())
check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {
  # do nothing let path fail on rust side
  if (is.na(path)) {
    return(NULL)
  }
  if (!file.exists(path)) {
    con = NULL

    # check if possible to open url connection
    assumed_schemas = c("", "https://", "http://", "ftp://")
    for (i_schema in assumed_schemas) {
      if (!is.null(con)) break
      actual_url = paste0(i_schema, path)
      suppressWarnings(
        tryCatch(
          {
            con = url(actual_url, open = "rt")
          },
          error = function(e) {}
        )
      )
    }

    # try download file if valid url
    if (!is.null(con)) {
      close(con)
      if (is.null(cache_temp_file[[actual_url]])) 
        cache_temp_file[[actual_url]] <- tempfile()
      if (isFALSE(reuse_downloaded) || isFALSE(file.exists(cache_temp_file[[actual_url]]))) {
        download.file(url = actual_url, destfile = cache_temp_file[[actual_url]])
        message(paste("tmp file placed in \n", cache_temp_file[[actual_url]]))
      }

      path = cache_temp_file[[actual_url]] # redirect path to tmp downloaded file
    } else {
      if (raise_error) {
        stop("failed to locate file at path/url: ", path)
      }
      # do nothing let path fail on rust side
      path = NULL
    }
  }

  path
}
DyfanJones commented 1 month ago

Happy to raise PR if this approach seem ok with you guys :)

etiennebacher commented 1 month ago

Hi @DyfanJones, thanks for the report and the proposed fix.

This looks good to me but I'm wondering if using a simple list instead of an environment would do the trick? I can't try myself because your reprex requires some S3 credentials that I don't have. Also, I'd like to have a test for this, do you think it would be possible to add one without any dependencies?

DyfanJones commented 1 month ago

Sadly a simple list won't work, as R doesn't really do stuff by reference apart from environments. Here is a small example:

demo_list <- list()
demo_list_fn <- function(x) {
  if (is.null(demo_list[[x]])) demo_list[[x]] <- tempfile()
}

demo_env <- new.env(parent = new.env())
demo_env_fn <- function(x) {
  if (is.null(demo_env[[x]])) demo_env[[x]] <- tempfile()
}

x <- "helloworld"
demo_list_fn(x)
demo_env_fn(x)

demo_list
#> list()
demo_env[[x]]
#> [1] "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//RtmpYkGFjV/file2e7042d07df"

Created on 2024-04-17 with reprex v2.1.0

For unit testing we can just do a mock test as the only thing we want to test is the caching of tempfile locations :)

DyfanJones commented 1 month ago

Saying that there are aways around it:

demo_list <- list()
demo_list_fn <- function(x) {
  if (is.null(demo_list[[x]])) demo_list[[x]] <- tempfile()
}

demo_env <- new.env(parent = new.env())
demo_env_fn <- function(x) {
  if (is.null(demo_env[[x]])) demo_env[[x]] <- tempfile()
}

demo_list_fn_2 <- function(x) {
  if (is.null(demo_list[[x]])) demo_list[[x]] <- tempfile()
  assign("demo_list", demo_list, envir = parent.frame())
}

demo_list_fn_3 <- function(x) {
  if (is.null(demo_list[[x]])) demo_list[[x]] <<- tempfile()
}

x <- "helloworld"
demo_list_fn(x)
demo_env_fn(x)

demo_list
#> list()
demo_env[[x]]
#> [1] "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmp1J1nrV/file45ec63b5092f"

demo_list_fn_2(x)
demo_list
#> $helloworld
#> [1] "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmp1J1nrV/file45ec26e1fcd"

demo_list_fn_3("Goodbye world")
demo_list
#> $helloworld
#> [1] "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmp1J1nrV/file45ec26e1fcd"
#> 
#> $`Goodbye world`
#> [1] "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//Rtmp1J1nrV/file45ecbe9ccfa"

Created on 2024-04-17 with reprex v2.1.0

DyfanJones commented 1 month ago

The problem with <<- is that the package environment will be locked. Soooo it will error. Another option is to use function environments for example

cache_file <- function() {
  cache <- list()
  temp_file <- function(x) {
    if (is.null(cache[[x]])) cache[[x]] <<- tempfile()
    return(cache[[x]])
  }
  return(temp_file)
}

## initialise in package `.onLoad`
cache <- cache_file()

cache("helloworld")
#> [1] "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//RtmpXgXxke/file16c13e313726"
cache("helloworld")
#> [1] "/var/folders/sp/scbzkbwx6hbchmylsx0y52k80000gn/T//RtmpXgXxke/file16c13e313726"

Created on 2024-04-17 with reprex v2.1.0

DyfanJones commented 1 month ago

So it will look something like:

check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {
  # do nothing let path fail on rust side
  if (is.na(path)) {
    return(NULL)
  }
  if (!file.exists(path)) {
    con = NULL

    # check if possible to open url connection
    assumed_schemas = c("", "https://", "http://", "ftp://")
    for (i_schema in assumed_schemas) {
      if (!is.null(con)) break
      actual_url = paste0(i_schema, path)
      suppressWarnings(
        tryCatch(
          {
            con = url(actual_url, open = "rt")
          },
          error = function(e) {}
        )
      )
    }

    # try download file if valid url
    if (!is.null(con)) {
      close(con)
      if (isFALSE(reuse_downloaded) || isFALSE(file.exists(cache(actual_url)))) {
        download.file(url = actual_url, destfile = cache(actual_url))
        message(paste("tmp file placed in \n", cache(actual_url)))
      }

      path = cache(actual_url) # redirect path to tmp downloaded file
    } else {
      if (raise_error) {
        stop("failed to locate file at path/url: ", path)
      }
      # do nothing let path fail on rust side
      path = NULL
    }
  }

  path
}
etiennebacher commented 1 month ago

I think it's fine to use an environment as in your first case. Sorry for the extra work, it was mostly out of curiosity. Do you want to make a PR?

DyfanJones commented 1 month ago

No worries :) happy to give as many options to get the best option :) Happy to raise a PR.

eitsupi commented 1 month ago

Thanks for your contribution!

Tip: Do not use stop() inside this package, use something like the following includes Err_plain() and unwrap() instead.

https://github.com/pola-rs/r-polars/blob/9ea6ec58dce49b4cb38d5cd3aa78d7668b376e6d/R/as_polars.R#L77-L92

DyfanJones commented 1 month ago

Happy to fix stop replace with Err_plain() and unwrap(). @eitsupi Do you want the PR to fix the stop() for the file: https://github.com/pola-rs/r-polars/blob/main/R/io_csv.R

eitsupi commented 1 month ago

Yes, stop() is still in various places and has not been completely replaced (#568), PRs are welcome!