Team - The driver fails when there are commas present in the source table data. I tried to set the escapeChar option, but it looks like the process always uses the default value instead of the supplied arguments. Can you please check?
Sample Data
Cust_ID Cust_Name
1 A,B
2 C, D
3 E, F
Using Spark version 1.6 along with this jar:
val nzoptions = Map("url" -> "jdbc:netezza://host:5480/db", "user" -> "uid", "password" -> "Pwd", "dbtable" -> "CUST", "numPartitions" -> "16", "delimiter" -> "|", "partitioncol" -> "CUST_ID" )
val logDataDf = sqlContext.read.format("com.ibm.spark.netezza").options(nzoptions).load()
logDataDf.saveAsParquetFile("hdfsPath")
Error Stack Trace
18/12/19 20:37:04 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.SparkException: Task failed while writing rows
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:269)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:148)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:148)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:247)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: (line 1) invalid char between encapsulated token and delimiter
at org.apache.commons.csv.Lexer.parseEncapsulatedToken(Lexer.java:275)
at org.apache.commons.csv.Lexer.nextToken(Lexer.java:152)
at org.apache.commons.csv.CSVParser.nextRecord(CSVParser.java:498)
at org.apache.commons.csv.CSVParser.getRecords(CSVParser.java:365)
at com.ibm.spark.netezza.NetezzaRecordParser.parse(NetezzaRecordParser.scala:43)
at com.ibm.spark.netezza.NetezzaDataReader.next(NetezzaDataReader.scala:136)
at com.ibm.spark.netezza.NetezzaRDD$$anon$1.getNext(NetezzaRDD.scala:77)
at com.ibm.spark.netezza.NetezzaRDD$$anon$1.hasNext(NetezzaRDD.scala:106)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer$$anonfun$writeRows$1.apply$mcV$sp(WriterContainer.scala:261)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer$$anonfun$writeRows$1.apply(WriterContainer.scala:260)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer$$anonfun$writeRows$1.apply(WriterContainer.scala:260)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1277)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:266)
... 8 more
Team - The driver fails when there are commas present in the source table data. I tried to set the escapeChar option, but it looks like the process always uses the default value instead of the supplied arguments. Can you please check?
Sample Data Cust_ID Cust_Name 1 A,B 2 C, D 3 E, F
Using Spark version 1.6 along with this jar: val nzoptions = Map("url" -> "jdbc:netezza://host:5480/db", "user" -> "uid", "password" -> "Pwd", "dbtable" -> "CUST", "numPartitions" -> "16", "delimiter" -> "|", "partitioncol" -> "CUST_ID" )
val logDataDf = sqlContext.read.format("com.ibm.spark.netezza").options(nzoptions).load()
logDataDf.saveAsParquetFile("hdfsPath")
Error Stack Trace 18/12/19 20:37:04 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): org.apache.spark.SparkException: Task failed while writing rows at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:269) at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:148) at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:148) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at org.apache.spark.scheduler.Task.run(Task.scala:89) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:247) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: java.io.IOException: (line 1) invalid char between encapsulated token and delimiter at org.apache.commons.csv.Lexer.parseEncapsulatedToken(Lexer.java:275) at org.apache.commons.csv.Lexer.nextToken(Lexer.java:152) at org.apache.commons.csv.CSVParser.nextRecord(CSVParser.java:498) at org.apache.commons.csv.CSVParser.getRecords(CSVParser.java:365) at com.ibm.spark.netezza.NetezzaRecordParser.parse(NetezzaRecordParser.scala:43) at com.ibm.spark.netezza.NetezzaDataReader.next(NetezzaDataReader.scala:136) at com.ibm.spark.netezza.NetezzaRDD$$anon$1.getNext(NetezzaRDD.scala:77) at com.ibm.spark.netezza.NetezzaRDD$$anon$1.hasNext(NetezzaRDD.scala:106) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) at org.apache.spark.sql.execution.datasources.DefaultWriterContainer$$anonfun$writeRows$1.apply$mcV$sp(WriterContainer.scala:261) at org.apache.spark.sql.execution.datasources.DefaultWriterContainer$$anonfun$writeRows$1.apply(WriterContainer.scala:260) at 
org.apache.spark.sql.execution.datasources.DefaultWriterContainer$$anonfun$writeRows$1.apply(WriterContainer.scala:260) at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1277) at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:266) ... 8 more