awslabs / python-deequ

Python API for Deequ
Apache License 2.0
713 stars 134 forks source link

'ScalaFunction1' object has no attribute 'hashCode' #91

Open jaronsinz opened 2 years ago

jaronsinz commented 2 years ago

Describe the bug When creating a VerificationSuite containing at least 5 checks and one of them being a hasDataType check, I get an error "AttributeError: 'ScalaFunction1' object has no attribute 'hashCode'".

To Reproduce

import pydeequ
from pydeequ.checks import *
from pydeequ.verification import *
from pydeequ.suggestions import *
from pydeequ.analyzers import DataType
from pyspark.sql.functions import col

df = readFromDatabase(***)

pkRevID_check = Check(spark, CheckLevel.Error, "'PK_Review_ID' check")
id_check = Check(spark, CheckLevel.Error, "'ID' check")
type_check = Check(spark, CheckLevel.Error, "'Type' check")
typeID_check = Check(spark, CheckLevel.Error, "'Type_ID' is not null")
content_check = Check(spark, CheckLevel.Warning, "'Content' is not null")
url = Check(spark, CheckLevel.Error, "'Url' is not null")

Adding any number of checks (none of them a hasDataType check) to the VerificationSuite works fine.

vs = VerificationSuite(spark) \
    .onData(df) \
    .addCheck(
        pkRevID_check.isComplete('PK_Review_ID', "PK_Review_ID is not complete.") 
        .isNonNegative('PK_Review_ID',hint="PK_review_ID has negative values.") 
        .isUnique('PK_Review_ID', "PK_Review_ID is not unique.")
         )  \
    .addCheck(
        id_check.isComplete('ID', "ID is not complete.")) \
    .addCheck(
        type_check.isComplete('Type', "Type is not complete")) \
    .addCheck(
        typeID_check.isComplete('Type_ID', "Type_ID is not complete")) \
    .addCheck(
        content_check.isComplete('Content', "Content is not complete")) \
    .addCheck(
        url.isComplete('Url', "Url is not complete")) \
    .run()

Adding fewer than 5 checks, one of them a hasDataType check, works fine.

vs = VerificationSuite(spark) \
    .onData(df) \
    .addCheck(
        pkRevID_check.isComplete('PK_Review_ID', "PK_Review_ID is not complete.") 
        .isNonNegative('PK_Review_ID',hint="PK_review_ID has negative values.") 
        .isUnique('PK_Review_ID', "PK_Review_ID is not unique.")  
        .hasDataType('PK_Review_ID', ConstrainableDataTypes.Integral, "PK_Review_ID elements aren't exclusively ints")
         )  \
    .addCheck(
        id_check.isComplete('ID', "ID is not complete.")) \
    .addCheck(
        type_check.isComplete('Type', "Type is not complete")) \
    .addCheck(
        typeID_check.isComplete('Type_ID', "Type_ID is not complete")) \
    .run()

Adding 5 or more checks, with at least 1 of them a hasDataType check, raises the error.

vs = VerificationSuite(spark) \
    .onData(df) \
    .addCheck(
        pkRevID_check.isComplete('PK_Review_ID', "PK_Review_ID is not complete.") 
        .isNonNegative('PK_Review_ID',hint="PK_review_ID has negative values.") 
        .isUnique('PK_Review_ID', "PK_Review_ID is not unique.")  
        .hasDataType('PK_Review_ID', ConstrainableDataTypes.Integral, "PK_Review_ID elements aren't exclusively ints")
         )  \
    .addCheck(
        id_check.isComplete('ID', "ID is not complete.")) \
    .addCheck(
        type_check.isComplete('Type', "Type is not complete")) \
    .addCheck(
        typeID_check.isComplete('Type_ID', "Type_ID is not complete")) \
    .addCheck(
        content_check.isComplete('Content', "Content is not complete")) \
    .run()

Expected behavior VerificationSuite is built and checks are added.

Log

Py4JJavaError                             Traceback (most recent call last)
<[command-1389210201976180]()> in <module>
----> 1 onlyDataType = VerificationSuite(spark).onData(df) \
      2 .addCheck(pkRevID_complete) \
      3 .addCheck(pkRevID_nonNeg) \
      4 .addCheck(pkRevID_unique) \
      5 .addCheck(pkRevID_dataType) \

/databricks/python/lib/python3.8/site-packages/pydeequ/verification.py in run(self)
    207         :return:a verificationResult object
    208         """
--> 209         return VerificationResult(self._spark_session, self._VerificationRunBuilder.run())
    210 
    211     def useRepository(self, repository):

/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1302 
   1303         answer = self.gateway_client.send_command(command)
-> 1304         return_value = get_return_value(
   1305             answer, self.gateway_client, self.target_id, self.name)
   1306 

/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
    115     def deco(*a, **kw):
    116         try:
--> 117             return f(*a, **kw)
    118         except py4j.protocol.Py4JJavaError as e:
    119             converted = convert_exception(e.java_exception)

/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
    328                     format(target_id, ".", name), value)

Py4JJavaError: An error occurred while calling o1942.run.
: py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 2442, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
AttributeError: 'ScalaFunction1' object has no attribute 'hashCode'

    at py4j.Protocol.getReturnValue(Protocol.java:476)
    at py4j.reflection.PythonProxyHandler.invoke(PythonProxyHandler.java:108)
    at com.sun.proxy.$Proxy82.hashCode(Unknown Source)
    at scala.runtime.Statics.anyHash(Statics.java:122)
    at scala.util.hashing.MurmurHash3.productHash(MurmurHash3.scala:68)
    at scala.util.hashing.MurmurHash3$.productHash(MurmurHash3.scala:215)
    at scala.runtime.ScalaRunTime$._hashCode(ScalaRunTime.scala:149)
    at com.amazon.deequ.constraints.AnalysisBasedConstraint.hashCode(AnalysisBasedConstraint.scala:42)
    at scala.runtime.Statics.anyHash(Statics.java:122)
    at scala.util.hashing.MurmurHash3.listHash(MurmurHash3.scala:173)
    at scala.util.hashing.MurmurHash3$.seqHash(MurmurHash3.scala:225)
    at scala.collection.LinearSeqLike.hashCode(LinearSeqLike.scala:41)
    at scala.collection.LinearSeqLike.hashCode$(LinearSeqLike.scala:41)
    at scala.collection.immutable.List.hashCode(List.scala:89)
    at scala.runtime.Statics.anyHash(Statics.java:122)
    at scala.util.hashing.MurmurHash3.productHash(MurmurHash3.scala:68)
    at scala.util.hashing.MurmurHash3$.productHash(MurmurHash3.scala:215)
    at scala.runtime.ScalaRunTime$._hashCode(ScalaRunTime.scala:149)
    at com.amazon.deequ.checks.Check.hashCode(Check.scala:61)
    at scala.runtime.Statics.anyHash(Statics.java:122)
    at scala.collection.immutable.HashMap.elemHashCode(HashMap.scala:87)
    at scala.collection.immutable.HashMap.computeHash(HashMap.scala:96)
    at scala.collection.immutable.HashMap.updated(HashMap.scala:62)
    at scala.collection.immutable.Map$Map4.updated(Map.scala:227)
    at scala.collection.immutable.Map$Map4.$plus(Map.scala:228)
    at scala.collection.immutable.Map$Map4.$plus(Map.scala:200)
    at scala.collection.mutable.MapBuilder.$plus$eq(MapBuilder.scala:32)
    at scala.collection.mutable.MapBuilder.$plus$eq(MapBuilder.scala:28)
    at scala.collection.TraversableOnce.$anonfun$toMap$1(TraversableOnce.scala:322)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at scala.collection.TraversableOnce.toMap(TraversableOnce.scala:321)
    at scala.collection.TraversableOnce.toMap$(TraversableOnce.scala:319)
    at scala.collection.AbstractTraversable.toMap(Traversable.scala:108)
    at com.amazon.deequ.VerificationSuite.evaluate(VerificationSuite.scala:270)
    at com.amazon.deequ.VerificationSuite.doVerificationRun(VerificationSuite.scala:132)
    at com.amazon.deequ.VerificationRunBuilder.run(VerificationRunBuilder.scala:173)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
    at py4j.Gateway.invoke(Gateway.java:295)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:251)
    at java.lang.Thread.run(Thread.java:748)
poolis commented 9 months ago

I am running into this issue as well.