I keep getting the following error whenever I add the Marian stage to the pipeline (please see the code below).
Current Behavior
sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
opus_mt_en_fr download started this may take some time.
Approximate size to download 378.7 MB
[OK!]
java.util.NoSuchElementException: Param doSample does not exist.
Py4JJavaError Traceback (most recent call last)
File , line 23
14 pipeline = Pipeline() \
15 .setStages([
16 documentAssembler,
17 sentence,
18 marian
19 ])
21 data = spark.createDataFrame([["What is the capital of France? We should know this in french."]]).toDF("text")
---> 23 result = pipeline.fit(data).transform(data)
24 result.selectExpr("explode(translation.result) as result").show(truncate=False)
File /databricks/python_shell/dbruntime/MLWorkloadsInstrumentation/_pyspark.py:30, in _create_patch_function.<locals>.patched_method(self, *args, **kwargs)
28 call_succeeded = False
29 try:
---> 30 result = original_method(self, *args, **kwargs)
31 call_succeeded = True
32 return result
File /databricks/spark/python/pyspark/ml/base.py:262, in Transformer.transform(self, dataset, params)
260 return self.copy(params)._transform(dataset)
261 else:
--> 262 return self._transform(dataset)
263 else:
264 raise TypeError("Params must be a param map but got %s." % type(params))
File /databricks/spark/python/pyspark/ml/pipeline.py:304, in PipelineModel._transform(self, dataset)
302 def _transform(self, dataset: DataFrame) -> DataFrame:
303 for t in self.stages:
--> 304 dataset = t.transform(dataset)
305 return dataset
File /databricks/python_shell/dbruntime/MLWorkloadsInstrumentation/_pyspark.py:30, in _create_patch_function.<locals>.patched_method(self, *args, **kwargs)
28 call_succeeded = False
29 try:
---> 30 result = original_method(self, *args, **kwargs)
31 call_succeeded = True
32 return result
File /databricks/spark/python/pyspark/ml/base.py:262, in Transformer.transform(self, dataset, params)
260 return self.copy(params)._transform(dataset)
261 else:
--> 262 return self._transform(dataset)
263 else:
264 raise TypeError("Params must be a param map but got %s." % type(params))
File /databricks/spark/python/pyspark/ml/wrapper.py:397, in JavaTransformer._transform(self, dataset)
394 def _transform(self, dataset: DataFrame) -> DataFrame:
395 assert self._java_obj is not None
--> 397 self._transfer_params_to_java()
398 return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sparkSession)
File /databricks/spark/python/pyspark/ml/wrapper.py:171, in JavaParams._transfer_params_to_java(self)
169 for param in self.params:
170 if self.isSet(param):
--> 171 pair = self._make_java_param_pair(param, self._paramMap[param])
172 self._java_obj.set(pair)
173 if self.hasDefault(param):
File /databricks/spark/python/pyspark/ml/wrapper.py:158, in JavaParams._make_java_param_pair(self, param, value)
155 assert sc is not None and self._java_obj is not None
157 param = self._resolveParam(param)
--> 158 java_param = self._java_obj.getParam(param.name)
159 java_value = _py2java(sc, value)
160 return java_param.w(java_value)
File /databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355, in JavaMember.__call__(self, *args)
1349 command = proto.CALL_COMMAND_NAME +\
1350 self.command_header +\
1351 args_command +\
1352 proto.END_COMMAND_PART
1354 answer = self.gateway_client.send_command(command)
-> 1355 return_value = get_return_value(
1356 answer, self.gateway_client, self.target_id, self.name)
1358 for temp_arg in temp_args:
1359 if hasattr(temp_arg, "_detach"):
File /databricks/spark/python/pyspark/errors/exceptions/captured.py:188, in capture_sql_exception.<locals>.deco(*a, **kw)
186 def deco(*a: Any, **kw: Any) -> Any:
187 try:
--> 188 return f(*a, **kw)
189 except Py4JJavaError as e:
190 converted = convert_exception(e.java_exception)
File /databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o1880.getParam.
: java.util.NoSuchElementException: Param doSample does not exist.
at org.apache.spark.ml.param.Params.$anonfun$getParam$2(params.scala:705)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.ml.param.Params.getParam(params.scala:705)
at org.apache.spark.ml.param.Params.getParam$(params.scala:703)
at org.apache.spark.ml.PipelineStage.getParam(Pipeline.scala:41)
at sun.reflect.GeneratedMethodAccessor508.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:397)
at py4j.Gateway.invoke(Gateway.java:306)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:199)
at py4j.ClientServerConnection.run(ClientServerConnection.java:119)
at java.lang.Thread.run(Thread.java:750)
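For context on where this fails: PySpark's JavaParams._transfer_params_to_java (visible in the frames above) copies every explicitly set Python-side param onto the JVM object by name, so the exception means the Python MarianTransformer has doSample set while the Scala annotator behind it exposes no param of that name. A minimal workaround sketch, assuming the Python annotator exposes the param as marian.doSample and that no Python-side default value for it remains to be transferred:

# Workaround sketch (assumptions: the failing param is marian.doSample and it
# is only explicitly set, not also held as a Python-side default).
# isSet/clear are standard pyspark.ml.param.Params methods; clearing the
# param makes the transfer loop in wrapper.py skip it.
if marian.isSet(marian.doSample):
    marian.clear(marian.doSample)

result = pipeline.fit(data).transform(data)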
Expected Behavior
The pipeline should fit and transform successfully, producing the French translation of the input text.
Steps To Reproduce
import streamlit as st
import sparknlp
import os
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
sentence = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") \
.setInputCols("document") \
.setOutputCol("sentence")
marian = MarianTransformer.pretrained() \
.setInputCols("sentence") \
.setOutputCol("translation") \
.setMaxInputLength(30)
pipeline = Pipeline() \
.setStages([
documentAssembler,
sentence,
marian
])
data = spark.createDataFrame([["What is the capital of France? We should know this in french."]]).toDF("text")
result = pipeline.fit(data).transform(data)
result.selectExpr("explode(translation.result) as result").show(truncate=False)
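If it helps triage, here is a small diagnostic sketch that lists every param explicitly set on the Python annotator; per the transfer loop in the traceback, each of these names must also exist on the Scala side:

# Diagnostic sketch: only explicitly set params are transferred by name, so
# any name printed here without a Scala counterpart will reproduce the
# NoSuchElementException. params/isSet/getOrDefault are standard pyspark API.
for p in marian.params:
    if marian.isSet(p):
        print(p.name, "=", marian.getOrDefault(p))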
Spark NLP version and Apache Spark
Running this in Databricks notebook.
Spark NLP version 5.3.3
Apache Spark version: 3.5.0
Java Version 1.8.0_382
Scala Version 2.12.15
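For completeness, one way to read these versions programmatically from the notebook (a sketch; the Java and Scala lookups go through the private spark.sparkContext._jvm handle, which assumes a standard Spark session):

# Version report sketch; _jvm is a private py4j gateway attribute.
import sparknlp
print("Spark NLP:", sparknlp.version())
print("Apache Spark:", spark.version)
jvm = spark.sparkContext._jvm
print("Java:", jvm.java.lang.System.getProperty("java.version"))
print("Scala:", jvm.scala.util.Properties.versionNumberString())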
Is there an existing issue for this?
Who can help?
No response
What are you working on?
I am trying to set up a Spark NLP pipeline using the examples found in the documentation here: https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/seq2seq/marian_transformer/index.html
I keep getting this error whenever I add the Marian stage to the pipeline (see the code under Steps To Reproduce above).
Type of Spark Application
No response
Java Version
No response
Java Home Directory
No response
Setup and installation
PyPI, Maven
Operating System and Version
No response
Link to your project (if available)
No response
Additional Information
No response