This PR adds support for xxhash64 with struct, list, and map types.
The hardcoded expected values in the tests are generated using Spark.
For example, you can produce the same results as in testXXHash64ListOfStruct using the following code.
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
// Reproduction script (spark-shell): prints Spark's xxhash64 of a list-of-struct column so the
// values can be compared against the hardcoded expectations in testXXHash64ListOfStruct.

// Struct stored in each list element: one nullable field per primitive type.
val elementStruct = StructType(Seq(
  StructField("string", StringType, nullable = true),
  StructField("int", IntegerType, nullable = true),
  StructField("double", DoubleType, nullable = true),
  StructField("float", FloatType, nullable = true),
  StructField("bool", BooleanType, nullable = true)
))

// Single column "listofstruct": an array of the struct above whose elements may be null.
val listOfStructSchema = StructType(Seq(
  StructField("listofstruct", ArrayType(elementStruct, containsNull = true))
))

// Long string input; the literal must stay byte-identical because the hash depends on it.
val longText = "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
  "in the MD5 hash function. This string needed to be longer."

val inputRows = Seq(
  // Empty list.
  Row(List()),
  // One struct holding zero values.
  Row(List(Row("a", 0, 0.0, 0f, true))),
  // Two structs: escapes and non-ASCII characters in the string, and a null bool in the second.
  Row(List(
    Row("B\n", 100, 100.0, 100f, false),
    Row("dE\"\u0100\t\u0101 \ud720\ud721", -100, -100.0, -100f, null)
  )),
  // Long string, Int.MinValue, and double/float values built from explicit NaN bit patterns.
  Row(List(
    Row(
      longText,
      Int.MinValue,
      java.lang.Double.longBitsToDouble(0x7ff0000000000001L),
      java.lang.Float.intBitsToFloat(0xff800001),
      false
    )
  )),
  // Null string, Int.MaxValue, further NaN bit patterns, followed by an all-null struct.
  Row(List(
    Row(
      null,
      Int.MaxValue,
      java.lang.Double.longBitsToDouble(0x7fffffffffffffffL),
      java.lang.Float.intBitsToFloat(0xffffffff),
      true
    ),
    Row(null, null, null, null, null)
  )),
  // Null list.
  Row(null)
)

val frame = spark.createDataFrame(spark.sparkContext.parallelize(inputRows), listOfStructSchema)

// For every column, print "<column> => <comma-separated hash rows>".
for (columnName <- frame.columns) {
  val hashedRows = frame.select(xxhash64(col(columnName))).collect()
  println(s"$columnName => ${hashedRows.mkString(",")}")
}
Contributes to https://github.com/NVIDIA/spark-rapids-jni/issues/2574
This PR adds support for `xxhash64` with `struct`, `list`, and `map` types.
The hardcoded expected values in the tests are generated using Spark. For example, you can produce the same results as in `testXXHash64ListOfStruct` using the following code.
You will get the following output: