saurfang / spark-sas7bdat

Splittable SAS (.sas7bdat) Input Format for Hadoop and Spark SQL
http://spark-packages.org/package/saurfang/spark-sas7bdat
Apache License 2.0

Can't load directory with sas7bdat files #72

Open mrcmoresi opened 2 years ago

mrcmoresi commented 2 years ago

I'm trying to load a set of files that all share the same schema (they are all dumps of the same SAS table). The expected result is a single DataFrame that is the concatenation of all the SAS files.

- Apache Spark 3.1.2
- Scala 2.12
- com.epam:parso:2.0.14
- saurfang:spark-sas7bdat:3.0.0-s_2.12

Code

%scala
spark.read
  .format("com.github.saurfang.sas.spark")
  .option("inferSchema", "true")
  .load("dbfs:/mnt/tables/")

I get this error when I run the command:

FileNotFoundException: /tables is a directory not a file.
    at shaded.databricks.org.apache.hadoop.fs.azure.NativeAzureFileSystem.open(NativeAzureFileSystem.java:2946)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.$anonfun$open$2(DatabricksFileSystemV2.scala:577)
    at com.databricks.s3a.S3AExceptionUtils$.convertAWSExceptionToJavaIOException(DatabricksStreamUtils.scala:66)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.$anonfun$open$1(DatabricksFileSystemV2.scala:575)
    at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:395)
    at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:484)
    at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:504)
    at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:266)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
    at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:261)
    at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:258)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.withAttributionContext(DatabricksFileSystemV2.scala:510)
    at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:305)
    at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:297)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.withAttributionTags(DatabricksFileSystemV2.scala:510)
    at com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:479)
    at com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:404)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.recordOperationWithResultTags(DatabricksFileSystemV2.scala:510)
    at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:395)
    at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:367)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.recordOperation(DatabricksFileSystemV2.scala:510)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.open(DatabricksFileSystemV2.scala:575)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.open(DatabricksFileSystemV2.scala:582)
    at com.databricks.backend.daemon.data.client.DatabricksFileSystem.open(DatabricksFileSystem.scala:88)
    at com.github.saurfang.sas.spark.SasRelation.inferSchema(SasRelation.scala:181)
    at com.github.saurfang.sas.spark.SasRelation.<init>(SasRelation.scala:73)
    at com.github.saurfang.sas.spark.SasRelation$.apply(SasRelation.scala:45)
    at com.github.saurfang.sas.spark.DefaultSource.createRelation(DefaultSource.scala:209)
    at com.github.saurfang.sas.spark.DefaultSource.createRelation(DefaultSource.scala:42)
    at com.github.saurfang.sas.spark.DefaultSource.createRelation(DefaultSource.scala:27)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:390)
    at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:444)
    at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:400)
    at scala.Option.getOrElse(Option.scala:189)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:400)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:287)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-2176066653455135:1)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$$iw$$iw$$iw$$iw$$iw.<init>(command-2176066653455135:45)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$$iw$$iw$$iw$$iw.<init>(command-2176066653455135:47)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$$iw$$iw$$iw.<init>(command-2176066653455135:49)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$$iw$$iw.<init>(command-2176066653455135:51)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$$iw.<init>(command-2176066653455135:53)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read.<init>(command-2176066653455135:55)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$.<init>(command-2176066653455135:59)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$read$.<clinit>(command-2176066653455135)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$eval$.$print$lzycompute(<notebook>:7)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$eval$.$print(<notebook>:6)
    at $lineb38616653dd84b71ae8efc3b452953aa27.$eval.$print(<notebook>)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:745)
    at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1021)
    at scala.tools.nsc.interpreter.IMain.$anonfun$interpret$1(IMain.scala:574)
    at scala.reflect.internal.util.ScalaClassLoader.asContext(ScalaClassLoader.scala:41)
    at scala.reflect.internal.util.ScalaClassLoader.asContext$(ScalaClassLoader.scala:37)
    at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:41)
    at scala.tools.nsc.interpreter.IMain.loadAndRunReq$1(IMain.scala:573)
    at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:600)
    at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:570)
    at com.databricks.backend.daemon.driver.DriverILoop.execute(DriverILoop.scala:219)
    at com.databricks.backend.daemon.driver.ScalaDriverLocal.$anonfun$repl$1(ScalaDriverLocal.scala:235)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at com.databricks.backend.daemon.driver.DriverLocal$TrapExitInternal$.trapExit(DriverLocal.scala:887)
    at com.databricks.backend.daemon.driver.DriverLocal$TrapExit$.apply(DriverLocal.scala:840)
    at com.databricks.backend.daemon.driver.ScalaDriverLocal.repl(ScalaDriverLocal.scala:235)
    at com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$11(DriverLocal.scala:526)
    at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:266)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
    at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:261)
    at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:258)
    at com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:50)
    at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:305)
    at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:297)
    at com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:50)
    at com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:503)
    at com.databricks.backend.daemon.driver.DriverWrapper.$anonfun$tryExecutingCommand$1(DriverWrapper.scala:611)
    at scala.util.Try$.apply(Try.scala:213)
    at com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:603)
    at com.databricks.backend.daemon.driver.DriverWrapper.executeCommandAndGetError(DriverWrapper.scala:522)
    at com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:557)
    at com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:427)
    at com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:370)
    at com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:221)
    at java.lang.Thread.run(Thread.java:748)
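
Judging from the trace, SasRelation.inferSchema (SasRelation.scala:181) opens the supplied path directly via FileSystem.open, which fails when that path is a directory rather than a single file. A minimal sketch of the same failure mode, assuming a notebook where spark is in scope (the exact exception wording depends on the underlying filesystem):

%scala
// Sketch only: reproduces the failure mode, not the library's own code path.
import org.apache.hadoop.fs.{FileSystem, Path}

val dir = new Path("dbfs:/mnt/tables/") // the directory from the issue
val fs = dir.getFileSystem(spark.sparkContext.hadoopConfiguration)
fs.open(dir) // throws FileNotFoundException: ... is a directory not a file
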
CousinsP commented 2 years ago

Can't you loop through all the files and load them individually?

mrcmoresi commented 2 years ago

Yes, that was the workaround I used, but I was wondering if it was possible to load all of them at the same time.
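
For reference, a minimal sketch of that loop-and-union workaround, assuming a Databricks environment (so dbutils.fs.ls is available) and that every file shares the same columns so unionByName can align them:

%scala
// Sketch under the assumptions above; reading directories is not a feature
// of the package itself, so each file is loaded as its own DataFrame.
val df = dbutils.fs.ls("dbfs:/mnt/tables/")
  .map(_.path)
  .filter(_.endsWith(".sas7bdat"))   // keep only SAS datasets
  .map { p =>
    spark.read
      .format("com.github.saurfang.sas.spark")
      .option("inferSchema", "true")
      .load(p)                       // one DataFrame per file
  }
  .reduce(_ unionByName _)           // concatenate into a single DataFrame

unionByName matches columns by name rather than position, so the result does not depend on column ordering within each file; if the schemas can differ slightly, Spark 3.1+ also accepts unionByName(other, allowMissingColumns = true).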