Describe the bug
When I create a VerificationSuite containing at least 5 checks, one of which includes a hasDataType constraint, calling run() fails with the error "AttributeError: 'ScalaFunction1' object has no attribute 'hashCode'".
To Reproduce
import pydeequ
from pydeequ.checks import *
from pydeequ.verification import *
from pydeequ.suggestions import *
from pydeequ.analyzers import DataType
from pyspark.sql.functions import col
df = readFromDatabase(***)
pkRevID_check = Check(spark, CheckLevel.Error, "'PK_Review_ID' check")
id_check = Check(spark, CheckLevel.Error, "'ID' check")
type_check = Check(spark, CheckLevel.Error, "'Type' check")
typeID_check = Check(spark, CheckLevel.Error, "'Type_ID' is not null")
content_check = Check(spark, CheckLevel.Warning, "'Content' is not null")
url = Check(spark, CheckLevel.Error, "'Url' is not null")
Adding any number of checks to the VerificationSuite works fine as long as none of them contains a hasDataType constraint.
vs = VerificationSuite(spark) \
.onData(df) \
.addCheck(
pkRevID_check.isComplete('PK_Review_ID', "PK_Review_ID is not complete.")
.isNonNegative('PK_Review_ID',hint="PK_review_ID has negative values.")
.isUnique('PK_Review_ID', "PK_Review_ID is not unique.")
.addCheck(
id_check.isComplete('ID', "ID is not complete.")) \
.addCheck(
type_check.isComplete('Type', "Type is not complete")) \
.addCheck(
typeID_check.isComplete('Type_ID', "Type_ID is not complete")) \
.addCheck(
content_check.isComplete('Content', "Content is not complete")) \
.addCheck(
url.isComplete('Url', "Url is not complete")) \
.run()
Adding fewer than 5 checks, one of which contains a hasDataType constraint, works fine.
vs = VerificationSuite(spark) \
.onData(df) \
.addCheck(
pkRevID_check.isComplete('PK_Review_ID', "PK_Review_ID is not complete.")
.isNonNegative('PK_Review_ID',hint="PK_review_ID has negative values.")
.isUnique('PK_Review_ID', "PK_Review_ID is not unique.")
.hasDataType('PK_Review_ID', ConstrainableDataTypes.Integral, "PK_Review_ID elements aren't exclusivly ints")
) \
.addCheck(
id_check.isComplete('ID', "ID is not complete.")) \
.addCheck(
type_check.isComplete('Type', "Type is not complete")) \
.addCheck(
typeID_check.isComplete('Type_ID', "Type_ID is not complete")) \
.run()
Adding 5 or more checks, at least one of which contains a hasDataType constraint, raises the error.
vs = VerificationSuite(spark) \
.onData(df) \
.addCheck(
pkRevID_check.isComplete('PK_Review_ID', "PK_Review_ID is not complete.")
.isNonNegative('PK_Review_ID',hint="PK_review_ID has negative values.")
.isUnique('PK_Review_ID', "PK_Review_ID is not unique.")
.hasDataType('PK_Review_ID', ConstrainableDataTypes.Integral, "PK_Review_ID elements aren't exclusivly ints")
) \
.addCheck(
id_check.isComplete('ID', "ID is not complete.")) \
.addCheck(
type_check.isComplete('Type', "Type is not complete")) \
.addCheck(
typeID_check.isComplete('Type_ID', "Type_ID is not complete")) \
.addCheck(
content_check.isComplete('Content', "Content is not complete")) \
.run()
Expected behavior
VerificationSuite is built and checks are added.
Log
Py4JJavaError Traceback (most recent call last)
<[command-1389210201976180]()> in <module>
----> 1 onlyDataType = VerificationSuite(spark).onData(df) \
2 .addCheck(pkRevID_complete) \
3 .addCheck(pkRevID_nonNeg) \
4 .addCheck(pkRevID_unique) \
5 .addCheck(pkRevID_dataType) \
/databricks/python/lib/python3.8/site-packages/pydeequ/verification.py in run(self)
207 :return:a verificationResult object
208 """
--> 209 return VerificationResult(self._spark_session, self._VerificationRunBuilder.run())
210
211 def useRepository(self, repository):
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o1942.run.
: py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
File "/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 2442, in _call_proxy
return_value = getattr(self.pool[obj_id], method)(*params)
AttributeError: 'ScalaFunction1' object has no attribute 'hashCode'
at py4j.Protocol.getReturnValue(Protocol.java:476)
at py4j.reflection.PythonProxyHandler.invoke(PythonProxyHandler.java:108)
at com.sun.proxy.$Proxy82.hashCode(Unknown Source)
at scala.runtime.Statics.anyHash(Statics.java:122)
at scala.util.hashing.MurmurHash3.productHash(MurmurHash3.scala:68)
at scala.util.hashing.MurmurHash3$.productHash(MurmurHash3.scala:215)
at scala.runtime.ScalaRunTime$._hashCode(ScalaRunTime.scala:149)
at com.amazon.deequ.constraints.AnalysisBasedConstraint.hashCode(AnalysisBasedConstraint.scala:42)
at scala.runtime.Statics.anyHash(Statics.java:122)
at scala.util.hashing.MurmurHash3.listHash(MurmurHash3.scala:173)
at scala.util.hashing.MurmurHash3$.seqHash(MurmurHash3.scala:225)
at scala.collection.LinearSeqLike.hashCode(LinearSeqLike.scala:41)
at scala.collection.LinearSeqLike.hashCode$(LinearSeqLike.scala:41)
at scala.collection.immutable.List.hashCode(List.scala:89)
at scala.runtime.Statics.anyHash(Statics.java:122)
at scala.util.hashing.MurmurHash3.productHash(MurmurHash3.scala:68)
at scala.util.hashing.MurmurHash3$.productHash(MurmurHash3.scala:215)
at scala.runtime.ScalaRunTime$._hashCode(ScalaRunTime.scala:149)
at com.amazon.deequ.checks.Check.hashCode(Check.scala:61)
at scala.runtime.Statics.anyHash(Statics.java:122)
at scala.collection.immutable.HashMap.elemHashCode(HashMap.scala:87)
at scala.collection.immutable.HashMap.computeHash(HashMap.scala:96)
at scala.collection.immutable.HashMap.updated(HashMap.scala:62)
at scala.collection.immutable.Map$Map4.updated(Map.scala:227)
at scala.collection.immutable.Map$Map4.$plus(Map.scala:228)
at scala.collection.immutable.Map$Map4.$plus(Map.scala:200)
at scala.collection.mutable.MapBuilder.$plus$eq(MapBuilder.scala:32)
at scala.collection.mutable.MapBuilder.$plus$eq(MapBuilder.scala:28)
at scala.collection.TraversableOnce.$anonfun$toMap$1(TraversableOnce.scala:322)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableOnce.toMap(TraversableOnce.scala:321)
at scala.collection.TraversableOnce.toMap$(TraversableOnce.scala:319)
at scala.collection.AbstractTraversable.toMap(Traversable.scala:108)
at com.amazon.deequ.VerificationSuite.evaluate(VerificationSuite.scala:270)
at com.amazon.deequ.VerificationSuite.doVerificationRun(VerificationSuite.scala:132)
at com.amazon.deequ.VerificationRunBuilder.run(VerificationRunBuilder.scala:173)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Describe the bug: When I create a VerificationSuite containing at least 5 checks, one of which includes a hasDataType constraint, calling run() fails with the error "AttributeError: 'ScalaFunction1' object has no attribute 'hashCode'".
To Reproduce
Adding any number of checks to the VerificationSuite works fine as long as none of them contains a hasDataType constraint.
Adding fewer than 5 checks, one of which contains a hasDataType constraint, works fine.
Adding 5 or more checks, at least one of which contains a hasDataType constraint, raises the error.
Expected behavior: The VerificationSuite is built and checks are added.
Log