boostcampaitech7 / level2-competitiveds-recsys-06

level2-competitiveds-recsys-06 created by GitHub Classroom

[Model] LightGBM #55

Closed ChoonB closed 1 month ago

ChoonB commented 1 month ago

241016_KGY_V4_lightGBM_1 Test public score: 4596.8658

```python
columns = ['apt_idx', 'index', 'area_m2', 'contract_type', 'floor', 'built_year', 'latitude', 'longitude', 'deposit', '_type', 'grid_id', 'interest_rate', 'diff_interest_rate', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit']
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train['recent_deposit'] = X_train['recent_deposit'].fillna(39000)
```
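`recent_deposit` presumably holds the most recent observed deposit per apartment, with 39000 as a fallback where no prior contract exists; its actual construction is not shown in this thread, so the following is only a hypothetical sketch (the `train_df` name and the use of `contract_year_month` are assumptions):

```python
# Hypothetical construction of recent_deposit: last observed deposit per apt_idx.
# `train_df` and `contract_year_month` are assumed names; 39000 is the fallback
# constant used in this experiment for apartments with no prior contract.
last_deposit = (train_df.sort_values('contract_year_month')
                        .groupby('apt_idx')['deposit'].last())
X_train['recent_deposit'] = X_train['apt_idx'].map(last_deposit).fillna(39000)
```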

```python
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42
}

model = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(period=10)]
)
```

[1000] Train MAE: 3881.4096, Val MAE: 3923.5255


ChoonB commented 1 month ago

241016_KGY_V4(+date+subway)_lightGBM_2 / Test public score: 4836.6036

```python
columns = ['apt_idx', 'index', 'area_m2', 'contract_type', 'floor', 'built_year', 'latitude', 'longitude', 'deposit', '_type', 'grid_id', 'interest_rate', 'diff_interest_rate', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit', 'nearest_subway_distance', 'num_subway_within_1km', 'category_interchange_within_1km', 'num_subway_within_500m', 'category_interchange_within_500m', 'year', 'month', 'day', 'month_sin', 'month_cos']
```
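`month_sin`/`month_cos` are presumably a cyclical encoding of the contract month, so that December and January end up close together in feature space; a minimal sketch of how such columns are typically derived (the actual derivation is not shown in this thread):

```python
import numpy as np

# Cyclical encoding of the contract month: maps 1..12 onto the unit circle
# so month 12 and month 1 become neighbours instead of opposite extremes.
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
```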

```python
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42
}

model = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(period=100)]
)
```

Overall OOF MAE: 3918.8854


ChoonB commented 1 month ago
```python
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Feature columns used for training
feature_columns = [col for col in temp_train_df.columns if col not in ['deposit', '_type', 'index']]

# Prepare training data
X = temp_train_df[feature_columns]
y = temp_train_df['deposit']

# Impute missing values
X['recent_deposit'] = X['recent_deposit'].fillna(39000)

# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42
}

# Callback that prints train/validation MAE every `period` iterations
def print_evaluation(period=10):
    def callback(env):
        if (env.iteration + 1) % period == 0:
            train_mae = env.evaluation_result_list[0][2]
            val_mae = env.evaluation_result_list[1][2]
            print(f"[{env.iteration + 1}] Train MAE: {train_mae:.4f}, Val MAE: {val_mae:.4f}")
    return callback

# 5-fold cross-validation setup
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Arrays that accumulate out-of-fold and test predictions
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_df))

# Run cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"\nFold {fold}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dval],
        callbacks=[print_evaluation(period=100)]
    )

    # Predictions on the held-out fold
    oof_predictions[val_idx] = model.predict(X_val)

    # Average the test predictions across folds
    test_predictions += model.predict(test_df[feature_columns]) / n_folds

# MAE over all out-of-fold predictions
oof_mae = mean_absolute_error(y, oof_predictions)
print(f"\nOverall OOF MAE: {oof_mae:.4f}")

# Final predictions for the test set
y_pred = test_predictions

print("\nTraining completed.")
```
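For completeness, one way the averaged `y_pred` could be written out for submission (the `sample_submission.csv` name and the `deposit` column are assumptions about the competition format, not taken from this thread):

```python
import pandas as pd

# Hypothetical submission step; file name and column name are assumptions.
submission = pd.read_csv('sample_submission.csv')
submission['deposit'] = y_pred
submission.to_csv('submission.csv', index=False)
```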
ChoonB commented 1 month ago

241017_KGY_V4_LightGBM_Kfold_3_이자제거 (interest-rate features removed) / Test public score: 4018.9173

```python
columns_to_drop = ['area_m2_price', 'area_price', 'year_month', 'contract_ymd', 'original_index', 'contract_year_month', 'interest_rate', 'diff_interest_rate']
df = df.drop(columns=columns_to_drop, errors='ignore')
```


Overall OOF MAE: 4340.2073

Planned next steps: merge age and built_year into a single feature, drop contract_day, one-hot encode contract_type, predict a log-transformed deposit, and try predicting price per area instead of the raw deposit.
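Of these, the log-transformed target is the most mechanical change to the existing loop; a minimal sketch under the assumption that `np.log1p`/`np.expm1` would be used (not confirmed here), with the caveat that MAE reported on the log scale is not comparable to the raw-deposit MAE above:

```python
import numpy as np

# Train on log1p(deposit) and invert the transform at prediction time.
dtrain = lgb.Dataset(X_train, label=np.log1p(y_train))
dval = lgb.Dataset(X_val, label=np.log1p(y_val), reference=dtrain)
model = lgb.train(params, dtrain, num_boost_round=1000, valid_sets=[dtrain, dval])
y_pred = np.expm1(model.predict(test_df[feature_columns]))
```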

ChoonB commented 1 month ago

LightGBM (EDA/김건율/LightGBM/MODEL_LightGBM_2.ipynb)

k=5



- feature importance
![image](https://github.com/user-attachments/assets/20847a16-f562-495f-bd6d-bfc80eeadaf7)
ChoonB commented 1 month ago

Overall OOF MAE: 3896.4723

```python
columns = ['index', 'area_m2', 'floor', 'age', 'latitude', 'longitude', 'apt_idx', 'nearest_subway_distance', 'num_subway_within_1km', 'category_interchange_within_1km', 'deposit', '_type', 'grid_id', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit', 'contract_year_month', 'arima_deposit_index', 'contract_0', 'contract_1']
```
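`contract_0` and `contract_1` look like the one-hot encoding of `contract_type` planned in the previous comment; a minimal sketch of one way to produce them (how they were actually generated is an assumption):

```python
import pandas as pd

# One-hot encode contract_type into contract_0 / contract_1.
dummies = pd.get_dummies(df['contract_type'], prefix='contract').astype(int)
df = pd.concat([df.drop(columns=['contract_type']), dummies], axis=1)
```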

ChoonB commented 1 month ago

MODEL_LightGBM_5_area_deposit

```python
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
}
```

k=5 num=1000


* feature importance
![image](https://github.com/user-attachments/assets/58629753-be20-4eeb-802e-87ccbaba6f58)

```python
# Convert the predicted per-area price back to a deposit, then drop the helper column.
y_pred_df['deposit'] = y_pred_df['area_price'] * test_df['area'].values

y_pred_df.drop(columns=['area_price'], inplace=True)

y_pred_df
```
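This variant apparently predicts a per-area price and converts it back to a deposit at the end. The corresponding target preparation is not shown in the thread; a minimal sketch under the assumption that `area_price` is simply deposit divided by area:

```python
# Assumed target for the area_price experiment: price per unit area
# (deposit / area) instead of the raw deposit.
temp_train_df['area_price'] = temp_train_df['deposit'] / temp_train_df['area']
y = temp_train_df['area_price']
# Train exactly as in the K-fold loop above, then multiply the predictions
# by test_df['area'] as shown in the snippet before this one.
```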
ChoonB commented 1 month ago

MODEL_LightGBM_v7_infra


```python
columns = ['index', 'area_m2', 'floor', 'age', 'latitude', 'longitude', 'apt_idx','nearest_subway_distance', 'category_interchange_within_1km',
           'deposit', '_type', 'grid_id', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit', 
           'contract_year_month', 'contract_0', 'contract_1', 'infra_score']

params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
}

k=5
num=1000
```
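`infra_score` is presumably a composite of the nearby-infrastructure features; how it was built is not shown here, so the following is only a hypothetical sketch (the input columns and the equal weighting are assumptions):

```python
# Hypothetical infra_score: equal-weight sum of standardized counts of
# nearby subways, schools, and parks. Column choice and weights are assumed.
infra_cols = ['num_subway_within_1km', 'school_count', 'park_count']
z = (df[infra_cols] - df[infra_cols].mean()) / df[infra_cols].std()
df['infra_score'] = z.sum(axis=1)
```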
ChoonB commented 1 month ago

MODEL_LightGBM_v9

```python
columns = ['index', '_type',
    'contract_date_numeric',
    'area_m2',
    'floor', 'built_year', 'latitude', 'longitude', 'age',
    'contract_0', 'contract_1',
    'deposit',
    'apt_idx',
    'area',
    'grid_deposit',
    'apt_deposit_rank', 'apt_area_deposit_rank',
    'recent_deposit',
    'nearest_park_distance', 'nearest_park_idx', 'park_area',
    'nearest_school_distance', 'nearest_school_idx',
    'nearest_subway_distance', 'nearest_subway_idx',
    'park_count', 'school_count', 'subway_count']
```
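The `nearest_*_distance` / `nearest_*_idx` features can be computed with a haversine BallTree; the notebook's actual implementation is not shown in this thread, so this is a sketch for the subway case (the `subway_df` name is an assumption), using scikit-learn with coordinates in radians and the result converted to metres:

```python
import numpy as np
from sklearn.neighbors import BallTree

# Nearest subway station for every row, using haversine distance on a BallTree.
EARTH_RADIUS_M = 6_371_000
subway_rad = np.radians(subway_df[['latitude', 'longitude']].values)  # assumed frame
apt_rad = np.radians(df[['latitude', 'longitude']].values)

tree = BallTree(subway_rad, metric='haversine')
dist, idx = tree.query(apt_rad, k=1)  # dist is in radians on the unit sphere

df['nearest_subway_distance'] = dist[:, 0] * EARTH_RADIUS_M
df['nearest_subway_idx'] = idx[:, 0]
```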

```python
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'early_stopping_round': 2500
}

model = lgb.train(
    params,
    dtrain,
    num_boost_round=12000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(period=100)],
)
```
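With `early_stopping_round` set in `params` and `valid_sets` supplied, training stops once the validation MAE has not improved for 2,500 consecutive rounds (well before the 12,000-round cap if the model plateaus), and `model.best_iteration` records the best round. A small follow-up for prediction time:

```python
# Predict with the best iteration found by early stopping.
oof_predictions[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
test_predictions += model.predict(test_df[feature_columns],
                                  num_iteration=model.best_iteration) / n_folds
```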
ChoonB commented 1 month ago

MODEL_LightGBM_v9_2

```python
columns = ['index', '_type',
    'contract_date_numeric',
    'area_m2',
    'floor', 'built_year', 'latitude', 'longitude', 'age',
    'contract_0', 'contract_1',
    'deposit',
    'apt_idx',
    'area',
    'grid_deposit',
    'apt_deposit_rank', 'apt_area_deposit_rank',
    'recent_deposit',
    'nearest_park_distance', 'nearest_park_idx', 'park_area',
    'nearest_school_distance', 'nearest_school_idx',
    'nearest_subway_distance', 'nearest_subway_idx',
    'park_count', 'school_count', 'subway_count', 'subway_1']

model = lgb.train(
    best_params,
    dtrain,
    num_boost_round=12000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(100)]
)
```
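`best_params` presumably comes from a hyperparameter search; a minimal Optuna-style sketch under that assumption (the search space, trial count, and objective below are illustrative, not the ones actually used):

```python
import optuna

def objective(trial):
    trial_params = {
        'objective': 'regression',
        'metric': 'mae',
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': 5,
        'seed': 42,
    }
    booster = lgb.train(trial_params, dtrain, num_boost_round=2000,
                        valid_sets=[dval], callbacks=[lgb.early_stopping(100)])
    return booster.best_score['valid_0']['l1']  # MAE is reported under the 'l1' key

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
best_params = {**study.best_params, 'objective': 'regression', 'metric': 'mae', 'seed': 42}
```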