crealytics / spark-excel

A Spark plugin for reading and writing Excel files
Apache License 2.0
463 stars 147 forks source link

Unexpected formula parsing on string columns #476

Open surendra-mt opened 2 years ago

surendra-mt commented 2 years ago

Hi Team, We are facing an issue while writing pyspark df to xslx. One of our column (string type) has some string like this: = >85. When converting the df to xlsx, it is breaking because it tries to parse this formula resulting in error.

 shadeio.poi.ss.formula.FormulaParseException: Parse error near char 1 '>' in specified formula ' >85'. Expected cell ref or constant literal
    at shadeio.poi.ss.formula.FormulaParser.expected(FormulaParser.java:269)
    at shadeio.poi.ss.formula.FormulaParser.parseSimpleFactor(FormulaParser.java:1553)
    at shadeio.poi.ss.formula.FormulaParser.percentFactor(FormulaParser.java:1506)
    at shadeio.poi.ss.formula.FormulaParser.powerFactor(FormulaParser.java:1493)
    at shadeio.poi.ss.formula.FormulaParser.Term(FormulaParser.java:1867)
    at shadeio.poi.ss.formula.FormulaParser.additiveExpression(FormulaParser.java:1994)
    at shadeio.poi.ss.formula.FormulaParser.concatExpression(FormulaParser.java:1978)
    at shadeio.poi.ss.formula.FormulaParser.comparisonExpression(FormulaParser.java:1935)
    at shadeio.poi.ss.formula.FormulaParser.intersectionExpression(FormulaParser.java:1908)
    at shadeio.poi.ss.formula.FormulaParser.unionExpression(FormulaParser.java:1889)
    at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:2036)
    at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:170)
    at shadeio.poi.xssf.usermodel.XSSFCell.setFormula(XSSFCell.java:550)
    at shadeio.poi.xssf.usermodel.XSSFCell.setCellFormulaImpl(XSSFCell.java:526)
    at shadeio.poi.ss.usermodel.CellBase.setCellFormula(CellBase.java:132)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertCell(Model2XlsxConversions.scala:49)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$convertRow$4.apply(Model2XlsxConversions.scala:143)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$convertRow$4.apply(Model2XlsxConversions.scala:143)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertRow(Model2XlsxConversions.scala:143)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingSheet$1.apply(Model2XlsxConversions.scala:156)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingSheet$1.apply(Model2XlsxConversions.scala:156)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingSheet(Model2XlsxConversions.scala:156)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingWorkbook$4.apply(Model2XlsxConversions.scala:324)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingWorkbook$4.apply(Model2XlsxConversions.scala:321)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingWorkbook(Model2XlsxConversions.scala:321)
    at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$XlsxWorkbook.writeToExisting(Model2XlsxConversions.scala:421)
    at com.crealytics.spark.excel.ExcelFileSaver.writeToWorkbook$1(ExcelFileSaver.scala:40)
    at com.crealytics.spark.excel.ExcelFileSaver.save(ExcelFileSaver.scala:48)
    at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:60)
    at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
    at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
    at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
    at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
    at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
    at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
    at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
    at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
    at sun.reflect.GeneratedMethodAccessor249.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Sample code:

temp_data = [("= >85", "valid sting","valid string 2"),(">=85",""">85%
RACE updation in Veeva""",""">10.7 for HCP
> 8 for HCOs""")]

rdd = sc.parallelize(temp_data)

temp_df = rdd.toDF()

with tempfile.TemporaryDirectory() as directory:
    write_df_to_xlsx_format(temp_df,directory)

Expected Behavior

The above code should generate a xslx.

Possible Solution

Format parsing should be disable inside string.

Steps to Reproduce (for bugs)

Can run above sample code for the same.

Context

We use this plugin to covert spark df to xslx and then send the report as an email to client.

Your Environment

ENV: prod

quanghgx commented 2 years ago

Hi @surendra-mt Please help share the mentioned method write_df_to_xlsx_format. Or just part of it, so that we can reproduce the issue with your temp_data. Sincerely,

surendra-mt commented 2 years ago

Hi @quanghgx apologies. I thought I have included that in the main body. Here is the function code.

def write_df_to_xlsx_format(df, directory, sheetName="'Sheet1'!A1"):
    print("writing df to temp dir as xlsx file")
    df.coalesce(1).write.format("com.crealytics.spark.excel").option(
        "dataAddress", sheetName
    ).option("header", "true").option("dateFormat", "yy-mm-dd").option(
        "timestampFormat", "yyyy-MM-dd HH:mm"
    ).mode(
        "overwrite"
    ).save(
        directory + "/temp.xlsx"
    )
surendra-mt commented 2 years ago

Hi @quanghgx any update on this?

quanghgx commented 2 years ago

Hi @surendra-mt I am going to take a try with weekend and get back to you. Thanks for providing all needed detail.

surendra-mt commented 2 years ago

Hi @quanghgx sorry for bothering you. We have a pending ticket for this on prod. Let us know, if there is any update on this. Thanks.

quanghgx commented 2 years ago

I am so sorry for long delay from my side. Checking..

quanghgx commented 2 years ago

Hi @surendra-mt

Summary:


A. I can reproduce given issue with v1 implement and end up with same error message:

22/01/10 23:32:01 INFO DAGScheduler: Job 0 finished: toList at DataLocator.scala:104, took 0.473501 s
Exception in thread "main" org.apache.poi.ss.formula.FormulaParseException: Parse error near char 1 '>' in specified formula ' >85'. Expected cell ref or constant literal
    at org.apache.poi.ss.formula.FormulaParser.expected(FormulaParser.java:272)
FormulaParser.java:272
    at org.apache.poi.ss.formula.FormulaParser.parseSimpleFactor(FormulaParser.java:1544)
FormulaParser.java:1544
    at org.apache.poi.ss.formula.FormulaParser.percentFactor(FormulaParser.java:1497)
FormulaParser.java:1497
    at org.apache.poi.ss.formula.FormulaParser.powerFactor(FormulaParser.java:1484)
FormulaParser.java:1484
    at org.apache.poi.ss.formula.FormulaParser.Term(FormulaParser.java:1858)
FormulaParser.java:1858

B. Is it possible for you to use V2 implement?

Here are my full experimental snippet:

package com.crealytics.spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row
import java.util
import scala.collection.JavaConverters._
import org.apache.spark.sql.SaveMode

object AppEntry {
  def main(args: Array[String]) = {
    println("Hello, world")
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("Spark SQL basic example")
      .config("spark.ui.enabled", false)
      .getOrCreate()

    val schema = StructType(
      List(
        StructField("C1", StringType, true),
        StructField("C2", StringType, true),
        StructField("C3", StringType, true)
      )
    )

    val temp_data: util.List[Row] = List(
      Row("= >85", "valid sting", "valid string 2"),
      Row(">=85", """>85%RACE updation in Veeva""", """>10.7 for HCP> 8 for HCOs""")
    ).asJava

    val df = spark.createDataFrame(temp_data, schema)
    df.show()

    /* V2 implement, works*/
    // df.write
    //     .format("excel")
    //     .option("dataAddress", "'Sheet1'!A1")
    //     .mode(SaveMode.Append)
    //     .save("/home/quanghgx/Downloads/476_surendra-mt.xlsx")

    /* Same issue with surendra-mt*/
    df.coalesce(1).write.format("com.crealytics.spark.excel")
        .option("dataAddress", "'Sheet1'!A1")
        .option("header", "true")
        .option("dateFormat", "yy-mm-dd")
        .option("timestampFormat", "yyyy-MM-dd HH:mm")
        .mode("overwrite")
        .save("/home/quanghgx/Downloads//temp.xlsx")

    try {
      spark.close()
    } catch {
      case _: Exception => () // NOP
    }
  }
}

C. Output of spark-excel V2 implementation is a directory: And the data file contains correct value Screenshot from 2022-01-10 23-43-51

quanghgx commented 2 years ago

Hi @surendra-mt Do you have time to take another try with .format("excel")?

surendra-mt commented 2 years ago

Hi @quanghgx Sorry, I was busy with some other task. Will check this in this week. Thanks.

surendra-mt commented 2 years ago

Hi @quanghgx I tried with v2. These are the issues I am facing with it. So currently we are using spark 2.4.3 and v2 uses functionality that requires spark 3.2.1+ (for example: org.apache.spark.sql.connector.catalog.TableProvider).

Second, when I tried to build jar for spark version 2.4.3, I found out that even that won't work as we are on scala 2.11.

For both of the issues, I will ask code owner if it is possible to upgrade versions.

Thanks a lot for your help!

tufanrakshit commented 2 years ago

@quanghgx Hi I am facing the similar issue with spark 3.1.2 , scala 2.12 , java 11 , the query is simply reading data from snowflake and writtng back as a excel report , no fancy stuff or calculation , the v2 version also failed

regular version error

2022-05-29T02:01:46.514921848Z Exception in thread "main" shadeio.poi.ss.formula.FormulaParseException: Parse error near char 0 '' in specified formula ''. Expected cell ref or constant literal 2022-05-29T02:01:46.515185671Z at shadeio.poi.ss.formula.FormulaParser.expected(FormulaParser.java:272) 2022-05-29T02:01:46.515195984Z at shadeio.poi.ss.formula.FormulaParser.parseSimpleFactor(FormulaParser.java:1558) 2022-05-29T02:01:46.515200544Z at shadeio.poi.ss.formula.FormulaParser.percentFactor(FormulaParser.java:1511) 2022-05-29T02:01:46.515204460Z at shadeio.poi.ss.formula.FormulaParser.powerFactor(FormulaParser.java:1498) 2022-05-29T02:01:46.515208495Z at shadeio.poi.ss.formula.FormulaParser.Term(FormulaParser.java:1872) 2022-05-29T02:01:46.515212121Z at shadeio.poi.ss.formula.FormulaParser.additiveExpression(FormulaParser.java:1999) 2022-05-29T02:01:46.515215958Z at shadeio.poi.ss.formula.FormulaParser.concatExpression(FormulaParser.java:1983) 2022-05-29T02:01:46.515219717Z at shadeio.poi.ss.formula.FormulaParser.comparisonExpression(FormulaParser.java:1940) 2022-05-29T02:01:46.515223706Z at shadeio.poi.ss.formula.FormulaParser.intersectionExpression(FormulaParser.java:1913) 2022-05-29T02:01:46.515236277Z at shadeio.poi.ss.formula.FormulaParser.unionExpression(FormulaParser.java:1894) 2022-05-29T02:01:46.515243429Z at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:2041) 2022-05-29T02:01:46.515247654Z at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:173) 2022-05-29T02:01:46.515251274Z at shadeio.poi.xssf.usermodel.XSSFCell.setFormula(XSSFCell.java:480) 2022-05-29T02:01:46.515257312Z at shadeio.poi.xssf.usermodel.XSSFCell.setCellFormulaImpl(XSSFCell.java:460) 2022-05-29T02:01:46.515260998Z at shadeio.poi.ss.usermodel.CellBase.setCellFormula(CellBase.java:124) 2022-05-29T02:01:46.515264527Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertCell(Model2XlsxConversions.scala:49) 2022-05-29T02:01:46.515268517Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.$anonfun$convertRow$6(Model2XlsxConversions.scala:143) 2022-05-29T02:01:46.515307587Z at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) 2022-05-29T02:01:46.515312799Z at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) 2022-05-29T02:01:46.515317106Z at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) 2022-05-29T02:01:46.515327768Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertRow(Model2XlsxConversions.scala:143) 2022-05-29T02:01:46.515334415Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.$anonfun$writeToExistingSheet$2(Model2XlsxConversions.scala:156) 2022-05-29T02:01:46.515338305Z at scala.collection.immutable.List.foreach(List.scala:431) 2022-05-29T02:01:46.515341243Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingSheet(Model2XlsxConversions.scala:156) 2022-05-29T02:01:46.515344824Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.$anonfun$writeToExistingWorkbook$1(Model2XlsxConversions.scala:323) 2022-05-29T02:01:46.515347795Z at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36) 2022-05-29T02:01:46.515356669Z at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33) 2022-05-29T02:01:46.515363574Z at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38) 2022-05-29T02:01:46.515370705Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingWorkbook(Model2XlsxConversions.scala:320) 2022-05-29T02:01:46.515378848Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$XlsxWorkbook.writeToExisting(Model2XlsxConversions.scala:420) 2022-05-29T02:01:46.515400467Z at com.crealytics.spark.excel.ExcelFileSaver.writeToWorkbook$1(ExcelFileSaver.scala:37) 2022-05-29T02:01:46.515435730Z at com.crealytics.spark.excel.ExcelFileSaver.save(ExcelFileSaver.scala:45) 2022-05-29T02:01:46.515442388Z at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:54) 2022-05-29T02:01:46.515446291Z at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)