Issue (status: Open) — opened by DrPepper8888, 3 months ago
"""Train a distributed XGBoost classifier on a PySpark DataFrame and keep the
top-N features ranked by XGBoost gain importance.

Assumes `spark_df` is an existing PySpark DataFrame containing the feature
columns and the binary target column.
"""
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
# NOTE: `pyspark.ml.xgboost` does not exist. The official Spark estimator
# ships with xgboost itself (xgboost >= 1.7).
from xgboost.spark import SparkXGBClassifier

# Feature column names (replace with the real ones) and the target column.
feature_cols = ['feature1', 'feature2', 'feature3']
label_col = 'target'

# Assemble all feature columns into a single vector column "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Distributed XGBoost binary classifier, evaluated with AUC.
xgb = SparkXGBClassifier(
    features_col="features",
    label_col=label_col,
    objective='binary:logistic',
    eval_metric='auc',
)

# Pipeline: vector assembly followed by the XGBoost classifier.
pipeline = Pipeline(stages=[assembler, xgb])

# Fit the full pipeline on the input DataFrame.
model = pipeline.fit(spark_df)

# The fitted classifier is the last pipeline stage; get_booster() returns
# the underlying native xgboost Booster (there is no XGBoostUtils helper).
booster = model.stages[-1].get_booster()

# Booster.get_score() keys are 'f0', 'f1', ... in VectorAssembler input
# order, so map them back to the original column names.
raw_importances = booster.get_score(importance_type='gain')
feature_importances = {
    feature_cols[int(key[1:])]: score
    for key, score in raw_importances.items()
}

# Rank features by importance, most important first.
sorted_importances = sorted(
    feature_importances.items(), key=lambda kv: kv[1], reverse=True
)

# Keep the top-N features (here: 5).
N = 5
selected_features = [name for name, _ in sorted_importances[:N]]

print("Selected features:", selected_features)

# Optionally build a DataFrame restricted to the selected feature columns.
selected_features_df = spark_df.select(*selected_features)