[Closed] NvTimLiu closed this issue 6 months ago.
spark-rapids-ml_nightly/307 nightly build/test FAILED tests/test_logistic_regression.py::test_sparse_nlp20news
Detailed log:
------------------------------ Captured log call -------------------------------
WARNING  spark_rapids_ml.classification.LogisticRegression:classification.py:951 when standardization is True, spark rapids ml forces densifying sparse vectors to dense vectors for training.
INFO     spark_rapids_ml.classification.LogisticRegression:core.py:905 Stage-level scheduling in spark-rapids-ml requires spark standalone or local-cluster mode
INFO     spark_rapids_ml.classification.LogisticRegression:core.py:1008 Training spark-rapids-ml with 1 worker(s) ...
INFO     spark_rapids_ml.classification.LogisticRegression:core.py:1012 Finished training
______________________ test_sparse_nlp20news[True-False] _______________________

fit_intercept = False, standardization = True
caplog = <_pytest.logging.LogCaptureFixture object at 0x7f9fffe6d130>

    @pytest.mark.parametrize("fit_intercept", [True, False])
    @pytest.mark.parametrize("standardization", [True, False])
    @pytest.mark.slow
    def test_sparse_nlp20news(
        fit_intercept: bool,
        standardization: bool,
        caplog: LogCaptureFixture,
    ) -> None:
        if version.parse(pyspark.__version__) < version.parse("3.4.0"):
            import logging

            err_msg = (
                "pyspark < 3.4 is detected. Cannot import pyspark `unwrap_udt` function. "
                "The test case will be skipped. Please install pyspark>=3.4."
            )
            logging.info(err_msg)
            return

        tolerance = 0.001
        reg_param = 1e-2

        from pyspark.ml.feature import CountVectorizer, RegexTokenizer
        from sklearn.datasets import fetch_20newsgroups

        try:
            twenty_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)
        except:
            pytest.xfail(reason="Error fetching 20 newsgroup dataset")

        X = twenty_train.data
        y = twenty_train.target.tolist()

        conf: Dict[str, Any] = {
            # "spark.rapids.ml.uvm.enabled": True  # Commenting this out can resolve a cudaMemSet error
        }  # enable memory management to run the test case on GPU with small memory (e.g. 2G)

        with CleanSparkSession(conf) as spark:
            data = [
                Row(
                    label=y[i],
                    weight=1.0,
                    text=X[i],
                )
                for i in range(len(X))
            ]
            df = spark.createDataFrame(data)

            tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens")
            df = tokenizer.transform(df)

            cv = CountVectorizer(inputCol="tokens", outputCol="features")
            cv_model = cv.fit(df)
            df = cv_model.transform(df)

            df_train, df_test = df.randomSplit([0.8, 0.2])

            gpu_lr = LogisticRegression(
                enable_sparse_data_optim=True,
                verbose=6,
                regParam=reg_param,
                fitIntercept=fit_intercept,
                standardization=standardization,
                featuresCol="features",
                labelCol="label",
            )
            cpu_lr = SparkLogisticRegression(
                regParam=reg_param,
                fitIntercept=fit_intercept,
                standardization=standardization,
                featuresCol="features",
                labelCol="label",
            )
            gpu_model = gpu_lr.fit(df_train)
            cpu_model = cpu_lr.fit(df_train)

            cpu_objective = cpu_model.summary.objectiveHistory[-1]
            assert (
                gpu_model.objective < cpu_objective
                or abs(gpu_model.objective - cpu_objective) < tolerance
            )

            # assert "CUDA managed memory enabled." in caplog.text

            if standardization is True:
>               compare_model(
                    gpu_model,
                    cpu_model,
                    df_train,
                    unit_tol=tolerance,
                    total_tol=tolerance,
                    accuracy_and_probability_only=True,
                )

tests/test_logistic_regression.py:1626:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

gpu_model = LogisticRegressionModel_1db9e11ae7a3
cpu_model = LogisticRegressionModel: uid=LogisticRegression_6a32ce9f6514, numClasses=20, numFeatures=253555
df_test = DataFrame[label: bigint, weight: double, text: string, tokens: array<string>, features: vector]
unit_tol = 0.001, total_tol = 0.001, accuracy_and_probability_only = True

    def compare_model(
        gpu_model: LogisticRegressionModel,
        cpu_model: SparkLogisticRegressionModel,
        df_test: DataFrame,
        unit_tol: float = 1e-4,
        total_tol: float = 0.0,
        accuracy_and_probability_only: bool = False,
    ) -> Tuple[LogisticRegressionModel, SparkLogisticRegressionModel]:
        gpu_res = gpu_model.transform(df_test).collect()
        cpu_res = cpu_model.transform(df_test).collect()

        # compare accuracy
        gpu_pred = [row["prediction"] for row in gpu_res]
        cpu_pred = [row["prediction"] for row in cpu_res]
        ytest_true = [row["label"] for row in df_test.select(["label"]).collect()]

        from sklearn.metrics import accuracy_score

        gpu_acc = accuracy_score(ytest_true, gpu_pred)
        cpu_acc = accuracy_score(ytest_true, cpu_pred)
        assert gpu_acc >= cpu_acc or abs(gpu_acc - cpu_acc) < 1e-3

        # compare probability column
        gpu_prob = [row["probability"].toArray().tolist() for row in gpu_res]
        cpu_prob = [row["probability"].toArray().tolist() for row in cpu_res]
>       assert array_equal(gpu_prob, cpu_prob, unit_tol, total_tol)
E       assert False
E        +  where False = array_equal([[0.9980090260505676, 0.0001370712270727381, 7.46113873901777e-05, 7.152637408580631e-05, 6.654122262261808e-05, 9.080...391286779195, 0.00011759638437069952, 0.0001317733112955466, 0.00012303491530474275, 0.00011962313146796077, ...], ...], [[0.9980836056532186, 0.00014482474935862202, 8.055246997223285e-05, 7.516768294603247e-05, 6.941883566501614e-05, 9.5...763216810625, 0.00013226416045172013, 0.0001494816117127533, 0.00013551280265340038, 0.00013556668647785996, ...], ...], 0.001, 0.001)

=========================== short test summary info ============================
FAILED tests/test_logistic_regression.py::test_sparse_nlp20news[True-True] - assert False
  + where False = array_equal([[0.9981017708778381, 9.364840661874041e-05, 8.274331776192412e-05, 7.307499618036672e-05, 7.410445687128231e-05, 9.34...702743703499436, 0.0005867596482858062, 0.0005200001178309321, 0.0005646763020195067, 0.0005598910502158105, ...], ...], [[0.9981481951071133, 8.559651669009758e-05, 7.91912906209109e-05, 6.951557133989944e-05, 6.767413359586998e-05, 9.365...6551604852984113, 0.000596236484533501, 0.0005253168769607861, 0.0005650071700412132, 0.0005810279635271993, ...], ...], 0.001, 0.001)
FAILED tests/test_logistic_regression.py::test_sparse_nlp20news[True-False] - assert False
  + where False = array_equal([[0.9980090260505676, 0.0001370712270727381, 7.46113873901777e-05, 7.152637408580631e-05, 6.654122262261808e-05, 9.080...391286779195, 0.00011759638437069952, 0.0001317733112955466, 0.00012303491530474275, 0.00011962313146796077, ...], ...], [[0.9980836056532186, 0.00014482474935862202, 8.055246997223285e-05, 7.516768294603247e-05, 6.941883566501614e-05, 9.5...763216810625, 0.00013226416045172013, 0.0001494816117127533, 0.00013551280265340038, 0.00013556668647785996, ...], ...], 0.001, 0.001)
========== 2 failed, 644 passed, 8120 warnings in 10439.68s (2:53:59) ==========
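For context, the failing check is array_equal(gpu_prob, cpu_prob, unit_tol, total_tol) with both tolerances at 0.001, i.e. the GPU and CPU probability columns drift apart by more than the allowed margin. Below is a minimal sketch of such a tolerance comparison, assuming unit_tol is a per-element absolute tolerance and total_tol is the allowed fraction of elements that may exceed it; the project's actual array_equal helper in the test utilities may implement this differently, and approx_equal here is a hypothetical stand-in.

# Hypothetical helper, not the project's array_equal: compare two probability
# matrices elementwise, tolerating up to a total_tol fraction of entries whose
# absolute difference exceeds unit_tol.
from typing import Sequence

import numpy as np


def approx_equal(
    lhs: Sequence[Sequence[float]],
    rhs: Sequence[Sequence[float]],
    unit_tol: float = 1e-4,
    total_tol: float = 0.0,
) -> bool:
    a = np.asarray(lhs, dtype=np.float64)
    b = np.asarray(rhs, dtype=np.float64)
    if a.shape != b.shape:
        return False
    # Fraction of entries that differ by more than unit_tol.
    frac_violations = np.mean(np.abs(a - b) > unit_tol)
    return bool(frac_violations <= total_tol)


# Example mirroring the rows printed above: the leading probabilities differ by
# roughly 7e-5, which is within unit_tol=1e-3, so rows like these alone would pass.
print(approx_equal([[0.9980090, 0.0001371]], [[0.9980836, 0.0001448]], 1e-3, 1e-3))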
This has been fixed. Please check. @NvTimLiu
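To double-check, the two previously failing parametrizations can be re-run on their own. Here is a sketch using pytest's Python API; it assumes the working directory is the spark-rapids-ml test root, and since the test is marked @pytest.mark.slow, slow tests may need to be enabled per the project's pytest configuration.

# Sketch: re-run only the two previously failing test IDs via pytest's Python API.
# Assumes the current directory contains tests/test_logistic_regression.py and that
# slow-marked tests are enabled by the project's pytest configuration.
import pytest

exit_code = pytest.main(
    [
        "tests/test_logistic_regression.py::test_sparse_nlp20news[True-True]",
        "tests/test_logistic_regression.py::test_sparse_nlp20news[True-False]",
    ]
)
print("pytest exit code:", exit_code)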