NVIDIA / spark-rapids-ml

Spark RAPIDS MLlib – accelerate Apache Spark MLlib with GPUs
https://nvidia.github.io/spark-rapids-ml/
Apache License 2.0
67 stars 30 forks source link

[Bug] Nightly test failed : test_sparse_nlp20news #568

Closed NvTimLiu closed 6 months ago

NvTimLiu commented 8 months ago

The nightly build/test spark-rapids-ml_nightly/307 FAILED in tests/test_logistic_regression.py::test_sparse_nlp20news.

Detailed log:

 ------------------------------ Captured log call -------------------------------
 WARNING  spark_rapids_ml.classification.LogisticRegression:classification.py:951 when standardization is True, spark rapids ml forces densifying sparse vectors to dense vectors for training.
 INFO     spark_rapids_ml.classification.LogisticRegression:core.py:905 Stage-level scheduling in spark-rapids-ml requires spark standalone or local-cluster mode
 INFO     spark_rapids_ml.classification.LogisticRegression:core.py:1008 Training spark-rapids-ml with 1 worker(s) ...
 INFO     spark_rapids_ml.classification.LogisticRegression:core.py:1012 Finished training
 ______________________ test_sparse_nlp20news[True-False] _______________________

 fit_intercept = False, standardization = True
 caplog = <_pytest.logging.LogCaptureFixture object at 0x7f9fffe6d130>

     @pytest.mark.parametrize("fit_intercept", [True, False])
     @pytest.mark.parametrize("standardization", [True, False])
     @pytest.mark.slow
     def test_sparse_nlp20news(
         fit_intercept: bool,
         standardization: bool,
         caplog: LogCaptureFixture,
     ) -> None:
         if version.parse(pyspark.__version__) < version.parse("3.4.0"):
             import logging

             err_msg = (
                 "pyspark < 3.4 is detected. Cannot import pyspark `unwrap_udt` function. "
             )
             "The test case will be skipped. Please install pyspark>=3.4."
             logging.info(err_msg)
             return

         tolerance = 0.001
         reg_param = 1e-2

         from pyspark.ml.feature import CountVectorizer, RegexTokenizer
         from sklearn.datasets import fetch_20newsgroups

         try:
             twenty_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)
         except:
             pytest.xfail(reason="Error fetching 20 newsgroup dataset")

         X = twenty_train.data
         y = twenty_train.target.tolist()

         conf: Dict[str, Any] = {
             # "spark.rapids.ml.uvm.enabled": True # Commenting this out can resolve a cudaMemSet error
         }  # enable memory management to run the test case on GPU with small memory (e.g. 2G)
         with CleanSparkSession(conf) as spark:
             data = [
                 Row(
                     label=y[i],
                     weight=1.0,
                     text=X[i],
                 )
                 for i in range(len(X))
             ]
             df = spark.createDataFrame(data)
             tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens")
             df = tokenizer.transform(df)

             cv = CountVectorizer(inputCol="tokens", outputCol="features")
             cv_model = cv.fit(df)
             df = cv_model.transform(df)

             df_train, df_test = df.randomSplit([0.8, 0.2])

             gpu_lr = LogisticRegression(
                 enable_sparse_data_optim=True,
                 verbose=6,
                 regParam=reg_param,
                 fitIntercept=fit_intercept,
                 standardization=standardization,
                 featuresCol="features",
                 labelCol="label",
             )

             cpu_lr = SparkLogisticRegression(
                 regParam=reg_param,
                 fitIntercept=fit_intercept,
                 standardization=standardization,
                 featuresCol="features",
                 labelCol="label",
             )

             gpu_model = gpu_lr.fit(df_train)

             cpu_model = cpu_lr.fit(df_train)
             cpu_objective = cpu_model.summary.objectiveHistory[-1]

             assert (
                 gpu_model.objective < cpu_objective
                 or abs(gpu_model.objective - cpu_objective) < tolerance
             )

             # assert "CUDA managed memory enabled." in caplog.text

             if standardization is True:
 >               compare_model(
                     gpu_model,
                     cpu_model,
                     df_train,
                     unit_tol=tolerance,
                     total_tol=tolerance,
                     accuracy_and_probability_only=True,
                 )

 tests/test_logistic_regression.py:1626: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 gpu_model = LogisticRegressionModel_1db9e11ae7a3
 cpu_model = LogisticRegressionModel: uid=LogisticRegression_6a32ce9f6514, numClasses=20, numFeatures=253555
 df_test = DataFrame[label: bigint, weight: double, text: string, tokens: array<string>, features: vector]
 unit_tol = 0.001, total_tol = 0.001, accuracy_and_probability_only = True

     def compare_model(
         gpu_model: LogisticRegressionModel,
         cpu_model: SparkLogisticRegressionModel,
         df_test: DataFrame,
         unit_tol: float = 1e-4,
         total_tol: float = 0.0,
         accuracy_and_probability_only: bool = False,
     ) -> Tuple[LogisticRegressionModel, SparkLogisticRegressionModel]:
         gpu_res = gpu_model.transform(df_test).collect()

         cpu_res = cpu_model.transform(df_test).collect()

         # compare accuracy
         gpu_pred = [row["prediction"] for row in gpu_res]
         cpu_pred = [row["prediction"] for row in cpu_res]
         ytest_true = [row["label"] for row in df_test.select(["label"]).collect()]
         from sklearn.metrics import accuracy_score

         gpu_acc = accuracy_score(ytest_true, gpu_pred)
         cpu_acc = accuracy_score(ytest_true, cpu_pred)
         assert gpu_acc >= cpu_acc or abs(gpu_acc - cpu_acc) < 1e-3

         # compare probability column
         gpu_prob = [row["probability"].toArray().tolist() for row in gpu_res]
         cpu_prob = [row["probability"].toArray().tolist() for row in cpu_res]
 >       assert array_equal(gpu_prob, cpu_prob, unit_tol, total_tol)
 E       assert False
 E        +  where False = array_equal([[0.9980090260505676, 0.0001370712270727381, 7.46113873901777e-05, 7.152637408580631e-05, 6.654122262261808e-05, 9.080...391286779195, 0.00011759638437069952, 0.0001317733112955466, 0.00012303491530474275, 0.00011962313146796077, ...], ...], [[0.9980836056532186, 0.00014482474935862202, 8.055246997223285e-05, 7.516768294603247e-05, 6.941883566501614e-05, 9.5...763216810625, 0.00013226416045172013, 0.0001494816117127533, 0.00013551280265340038, 0.00013556668647785996, ...], ...], 0.001, 0.001)

=========================== short test summary info ============================
 FAILED tests/test_logistic_regression.py::test_sparse_nlp20news[True-True] - assert False
  +  where False = array_equal([[0.9981017708778381, 9.364840661874041e-05, 8.274331776192412e-05, 7.307499618036672e-05, 7.410445687128231e-05, 9.34...702743703499436, 0.0005867596482858062, 0.0005200001178309321, 0.0005646763020195067, 0.0005598910502158105, ...], ...], [[0.9981481951071133, 8.559651669009758e-05, 7.91912906209109e-05, 6.951557133989944e-05, 6.767413359586998e-05, 9.365...6551604852984113, 0.000596236484533501, 0.0005253168769607861, 0.0005650071700412132, 0.0005810279635271993, ...], ...], 0.001, 0.001)
 FAILED tests/test_logistic_regression.py::test_sparse_nlp20news[True-False] - assert False
  +  where False = array_equal([[0.9980090260505676, 0.0001370712270727381, 7.46113873901777e-05, 7.152637408580631e-05, 6.654122262261808e-05, 9.080...391286779195, 0.00011759638437069952, 0.0001317733112955466, 0.00012303491530474275, 0.00011962313146796077, ...], ...], [[0.9980836056532186, 0.00014482474935862202, 8.055246997223285e-05, 7.516768294603247e-05, 6.941883566501614e-05, 9.5...763216810625, 0.00013226416045172013, 0.0001494816117127533, 0.00013551280265340038, 0.00013556668647785996, ...], ...], 0.001, 0.001)
 ========== 2 failed, 644 passed, 8120 warnings in 10439.68s (2:53:59) ==========
lijinf2 commented 8 months ago

This has been fixed. Please check. @NvTimLiu