18/09/14 13:40:27 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, ip-xx-xx-xxx-xx.xx, executor 1): java.lang.ArrayIndexOutOfBoundsException: 63
at org.apache.spark.unsafe.types.UTF8String.numBytesForFirstByte(UTF8String.java:190)
at org.apache.spark.unsafe.types.UTF8String.numChars(UTF8String.java:205)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:234)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:228)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Hi there, I am trying to sftp from a windows to aws and getting below error. I think its something to do with UTF-8 and UTF-16.
sftp details val df = spark.read. format("com.springml.spark.sftp"). option("host","HOSTNAME"). option("username","USERNAME"). option("password","PASSWORD"). option("fileType", "csv"). option("delimiter", ","). option("inferSchema", "true"). // option("encoding","UTF-16"). // option("decoding","UTF-16"). // option("charset", "ISO-8859-1"). // option("encoding", "utf-16"). // option("createDF", "true"). // option("path","/tmp/rdfile/"). load("/export/20180913_Details.csv")
18/09/14 13:40:27 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, ip-xx-xx-xxx-xx.xx, executor 1): java.lang.ArrayIndexOutOfBoundsException: 63 at org.apache.spark.unsafe.types.UTF8String.numBytesForFirstByte(UTF8String.java:190) at org.apache.spark.unsafe.types.UTF8String.numChars(UTF8String.java:205) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395) at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:234) at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:228) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) at org.apache.spark.scheduler.Task.run(Task.scala:108) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748)