Proj-Caliber / Job-Care

데이콘의 잡케어 추천 알고리즘 경진대회 작업공간입니다.
1 stars 5 forks source link

Feature engineering 결과 #11

Open qkrwjdduf159 opened 2 years ago

qkrwjdduf159 commented 2 years ago

Feature engineering

# Categorical code columns that D_H_L_mapping target-encodes by default,
# in the order they are processed.
_D_H_L_COLUMNS = (
    'person_prefer_d_1_l', 'person_prefer_d_1_n',
    'person_prefer_d_1_m', 'person_prefer_d_1_s',
    'person_prefer_d_2_l', 'person_prefer_d_2_n',
    'person_prefer_d_2_m', 'person_prefer_d_2_s',
    'person_prefer_d_3_l', 'person_prefer_d_3_n',
    'person_prefer_d_3_m', 'person_prefer_d_3_s',
    'contents_attribute_d_l', 'contents_attribute_d_m',
    'contents_attribute_d_n', 'contents_attribute_d_s',
    'person_prefer_h_1_l', 'person_prefer_h_1_m',
    'person_prefer_h_2_l', 'person_prefer_h_2_m',
    'person_prefer_h_3_l', 'person_prefer_h_3_m',
    'contents_attribute_h_l', 'contents_attribute_h_m',
    'contents_attribute_l_n', 'contents_attribute_l_s',
    'contents_attribute_l_m', 'contents_attribute_l_l',
)


def D_H_L_mapping(train=None, test=None, columns=None):
    """Target-mean encode categorical code columns of ``train`` and ``test``.

    For every column, each distinct value is replaced by the mean of
    ``train['target']`` over the training rows that carry that value. The
    same value -> mean lookup, computed from ``train`` only, is then applied
    to the matching column of ``test``.

    Both frames are modified in place and the same objects are returned.

    Args:
        train: DataFrame holding the columns to encode plus a ``'target'``
            column.
        test: DataFrame holding the same columns to encode.
        columns: Optional iterable of column names to encode. Defaults to
            the 28 ``person_prefer_*`` / ``contents_attribute_*`` columns
            listed in ``_D_H_L_COLUMNS``.

    Returns:
        The ``(train, test)`` pair, with the selected columns replaced by
        their encoded float values.

    Raises:
        ValueError: If ``train`` or ``test`` is not supplied.
        KeyError: If a requested column (or ``'target'``) is missing.

    Notes:
        * A ``test`` value that never occurs in the ``train`` column has no
          entry in the lookup and therefore becomes NaN.
        * NOTE(review): the ``train`` encoding is computed from the same
          rows it is written to (no out-of-fold split), so every row's own
          label contributes to its feature value. Validation scores measured
          on ``train`` may be optimistic; consider out-of-fold encoding.
    """
    if train is None or test is None:
        raise ValueError("D_H_L_mapping requires both train and test DataFrames")

    if columns is None:
        columns = _D_H_L_COLUMNS

    for column in columns:
        # Means are taken before the column is overwritten, so the lookup is
        # keyed by the original category codes.
        target_mean = train.groupby(column)['target'].mean()
        # Build the lookup once and reuse it for both frames.
        lookup = dict(zip(target_mean.index, target_mean.values))
        train[column] = train[column].map(lookup)
        test[column] = test[column].map(lookup)

    return train, test

결과

score가 0.687로 증가했습니다. 그리고 0이 11000개, 1이 33000개 정도 되는 것으로 보여서 threshold를 다시 0.35로 지정해서 모델을 돌려보고 있습니다. 그 이후에 stacking과 voting 중에 어떤 방식이 좋은지 확인해 보고 다시 issue에 올리도록 하겠습니다.

qkrwjdduf159 commented 2 years ago

Threshold를 0.35로 지정하면 결과가 0.69211까지 나왔습니다.