241016_KGY_V4(+date+subway)_lightGBM_2 / Test public score: 4836.6036
columns = ['apt_idx', 'index', 'area_m2', 'contract_type', 'floor', 'built_year', 'latitude', 'longitude', 'deposit', '_type', 'grid_id', 'interest_rate', 'diff_interest_rate', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit', 'nearest_subway_distance', 'num_subway_within_1km', 'category_interchange_within_1km', 'num_subway_within_500m', 'category_interchange_within_500m', 'year','month', 'day', 'month_sin', 'month_cos']
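The month_sin/month_cos columns point to a cyclical encoding of the contract month. A minimal sketch of how such features are typically derived (the `month` column and `df` name are assumptions, not taken from the shared code):

```python
import numpy as np

# Hypothetical: map month (1-12) onto the unit circle so December and January
# end up adjacent instead of 11 units apart.
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
```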
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42
}
model = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(period=100)]
)
Overall OOF MAE: 3918.8854
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Feature columns used for training
feature_columns = [col for col in temp_train_df.columns if col not in ['deposit', '_type', 'index']]

# Prepare training data (copy to avoid SettingWithCopy warnings when imputing)
X = temp_train_df[feature_columns].copy()
y = temp_train_df['deposit']

# Impute missing recent_deposit values with a fixed value
X['recent_deposit'] = X['recent_deposit'].fillna(39000)

# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42
}

# Callback that prints train/validation MAE every `period` iterations
def print_evaluation(period=10):
    def callback(env):
        if (env.iteration + 1) % period == 0:
            train_mae = env.evaluation_result_list[0][2]
            val_mae = env.evaluation_result_list[1][2]
            print(f"[{env.iteration + 1}] Train MAE: {train_mae:.4f}, Val MAE: {val_mae:.4f}")
    return callback

# 5-fold cross-validation setup
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Arrays for out-of-fold and test predictions
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_df))

# Run cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"\nFold {fold}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dval],
        callbacks=[print_evaluation(period=100)]
    )

    # Predictions on the validation fold
    oof_predictions[val_idx] = model.predict(X_val)

    # Predictions on the test set, averaged across folds
    test_predictions += model.predict(test_df[feature_columns]) / n_folds

# MAE over all out-of-fold predictions
oof_mae = mean_absolute_error(y, oof_predictions)
print(f"\nOverall OOF MAE: {oof_mae:.4f}")

# Final predictions for the test set
y_pred = test_predictions

print("\nTraining completed.")
241017_KGY_V4_LightGBM_Kfold_3_이자제거 (interest-rate features removed) / Test public score: 4018.9173
columns_to_drop = ['area_m2_price', 'area_price', 'year_month', 'contract_ymd', 'original_index', 'contract_year_month', 'interest_rate', 'diff_interest_rate']
df = df.drop(columns=columns_to_drop, errors='ignore')
Overall OOF MAE: 4340.2073
Planned next steps: merge age and built_year into a single feature, drop contract_day, one-hot encode contract_type, fit on a log-transformed deposit, and try estimating the per-area (per-pyeong) price instead of the raw deposit; a sketch of two of these ideas follows below.
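A minimal sketch of the log-transform and one-hot ideas, assuming the temp_train_df frame from the code above (everything else is an assumption, not the team's actual implementation):

```python
import numpy as np
import pandas as pd

# Hypothetical: train on log1p(deposit) and invert with expm1 at prediction time.
y_log = np.log1p(temp_train_df['deposit'])
# ... train LightGBM on y_log instead of deposit ...
# deposit_pred = np.expm1(model.predict(X_test))

# Hypothetical: one-hot encode contract_type, producing columns like contract_0, contract_1.
dummies = pd.get_dummies(temp_train_df['contract_type'], prefix='contract')
temp_train_df = pd.concat([temp_train_df.drop(columns=['contract_type']), dummies], axis=1)
```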
data set
: v4change
: v4 plus the 'arima_deposit_index' and contract_year_month columns (see the ARIMA sketch below)
: with ['area_m2_price', 'area_price', 'year_month', 'contract_ymd', 'original_index', 'interest_rate', 'diff_interest_rate'] dropped from v4
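The log does not show how arima_deposit_index is built. A minimal sketch of one way such an index could be produced, assuming a monthly mean-deposit series and statsmodels (both assumptions):

```python
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Hypothetical: monthly average deposit as a series indexed by contract_year_month.
monthly = (temp_train_df.groupby('contract_year_month')['deposit']
           .mean()
           .sort_index())

# Fit a simple ARIMA and use the smoothed / forecast values as a deposit index feature.
fit = ARIMA(monthly, order=(1, 1, 1)).fit()
index_in_sample = fit.predict(start=0, end=len(monthly) - 1)  # smoothed index for training months
index_forecast = fit.forecast(steps=6)                        # extend the index into the test period
```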
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42
}
model = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(period=100)]
)
k=5
- feature importance
![image](https://github.com/user-attachments/assets/20847a16-f562-495f-bd6d-bfc80eeadaf7)
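The importance plot above can be reproduced with LightGBM's built-in plotting helper; a minimal sketch (the exact call used is not shown in the log):

```python
import lightgbm as lgb
import matplotlib.pyplot as plt

# Plot gain-based feature importance for the trained booster.
lgb.plot_importance(model, importance_type='gain', max_num_features=20)
plt.tight_layout()
plt.show()
```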
Overall OOF MAE: 3896.4723
columns = ['index', 'area_m2', 'floor', 'age', 'latitude', 'longitude', 'apt_idx','nearest_subway_distance','num_subway_within_1km', 'category_interchange_within_1km', 'deposit', '_type', 'grid_id', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit', 'contract_year_month', 'arima_deposit_index', 'contract_0', 'contract_1']
columns = ['index', 'area', 'floor', 'age', 'latitude', 'longitude', 'apt_idx','nearest_subway_distance','num_subway_within_1km', 'category_interchange_within_1km',
'area_price', '_type', 'grid_id', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit', 'contract_year_month',
'arima_deposit_index', 'contract_0', 'contract_1']
params = { 'objective': 'regression', 'metric': 'mae', 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.05, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'seed': 42, }
k = 5, num_boost_round = 1000
* feature importance
![image](https://github.com/user-attachments/assets/58629753-be20-4eeb-802e-87ccbaba6f58)
```python
y_pred_df['deposit'] = y_pred_df['area_price'] * test_df['area'].values
y_pred_df.drop(columns=['area_price'], inplace=True)
y_pred_df
v4_arima['infra_score'] = (
    v4_arima['num_subway_within_1km'] * 0.419536 +
    v4_arima['nearest_high_school_within_1km'] * 0.166885 +
    v4_arima['nearest_middle_school_within_1km'] * 0.157445 +
    v4_arima['nearest_elementary_school_within_1km'] * 0.115233 +
    v4_arima['nearest_park_within_500.0m'] * 0.031857
)
```
```python
columns = ['index', 'area_m2', 'floor', 'age', 'latitude', 'longitude', 'apt_idx','nearest_subway_distance', 'category_interchange_within_1km',
'deposit', '_type', 'grid_id', 'apt_deposit_rank', 'apt_area_deposit_rank', 'recent_deposit',
'contract_year_month', 'contract_0', 'contract_1', 'infra_score']
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
}
```
k = 5, num_boost_round = 1000
columns = ['index','_type',
'contract_date_numeric',
'area_m2',
'floor', 'built_year', 'latitude', 'longitude','age',
'contract_0', 'contract_1',
'deposit',
'apt_idx',
'area',
'grid_deposit',
'apt_deposit_rank', 'apt_area_deposit_rank',
'recent_deposit',
'nearest_park_distance','nearest_park_idx', 'park_area',
'nearest_school_distance', 'nearest_school_idx',
'nearest_subway_distance', 'nearest_subway_idx',
'park_count', 'school_count', 'subway_count',]
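The nearest_*_distance, nearest_*_idx and *_count columns above suggest spatial joins against subway/school/park coordinates. A minimal sketch of one way to compute them with a haversine BallTree (the subway_df name, column layout, and the 1 km radius are assumptions):

```python
import numpy as np
from sklearn.neighbors import BallTree

# Hypothetical coordinate tables: apartments and subway stations, latitude/longitude in degrees.
apt_coords = np.radians(df[['latitude', 'longitude']].values)
subway_coords = np.radians(subway_df[['latitude', 'longitude']].values)

tree = BallTree(subway_coords, metric='haversine')

# Distance (meters) and index of the nearest station for every apartment.
dist, idx = tree.query(apt_coords, k=1)
df['nearest_subway_distance'] = dist[:, 0] * 6371000  # earth radius in meters
df['nearest_subway_idx'] = idx[:, 0]

# Number of stations within 1 km (radius expressed in radians).
df['subway_count'] = tree.query_radius(apt_coords, r=1000 / 6371000, count_only=True)
```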
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'early_stopping_round': 2500
}
model = lgb.train(
    params,
    dtrain,
    num_boost_round=12000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(period=100)],
)
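With 'early_stopping_round': 2500 in the params and num_boost_round=12000, training stops once the validation MAE has not improved for 2,500 rounds. The same behaviour can also be requested through LightGBM's callback API instead of the params dict; a minimal equivalent sketch:

```python
model = lgb.train(
    params,  # without 'early_stopping_round'
    dtrain,
    num_boost_round=12000,
    valid_sets=[dtrain, dval],
    callbacks=[
        print_evaluation(period=100),
        lgb.early_stopping(stopping_rounds=2500),  # stop when the dval MAE stalls for 2500 rounds
    ],
)
```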
Best parameters: {'num_leaves': 51, 'learning_rate': 0.05, 'feature_fraction': 0.8383624972221345, 'bagging_fraction': 0.9770444779337153, 'bagging_freq': 4, 'min_child_samples': 31, 'max_depth': 35}
Best MAE: 3824.69624841404
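The log does not say how these best parameters were found. A minimal Optuna-style sketch of a search over the same parameters (Optuna itself and the search ranges are assumptions, not confirmed by the log):

```python
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_absolute_error

def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'seed': 42,
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 50),
    }
    booster = lgb.train(params, dtrain, num_boost_round=1000, valid_sets=[dval],
                        callbacks=[lgb.early_stopping(stopping_rounds=100)])
    return mean_absolute_error(y_val, booster.predict(X_val))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params, study.best_value)
```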
columns = ['index','_type',
'contract_date_numeric',
'area_m2',
'floor', 'built_year', 'latitude', 'longitude','age',
'contract_0', 'contract_1',
'deposit',
'apt_idx',
'area',
'grid_deposit',
'apt_deposit_rank', 'apt_area_deposit_rank',
'recent_deposit',
'nearest_park_distance','nearest_park_idx', 'park_area',
'nearest_school_distance', 'nearest_school_idx',
'nearest_subway_distance', 'nearest_subway_idx',
'park_count', 'school_count', 'subway_count','subway_1']
model = lgb.train(
    best_params,
    dtrain,
    num_boost_round=12000,
    valid_sets=[dtrain, dval],
    callbacks=[print_evaluation(100)]
)
241016_KGY_V4_lightGBM_1 / Test public score: 4596.8658
[1000] Train MAE: 3881.4096, Val MAE: 3923.5255