Open dohyeon-kim012 opened 3 years ago
1) in metrics.py line 35
from sklearn.utils.linear_assignment_ import linear_assignment ===> from scipy.optimize import linear_sum_assignment as linear_assignment
2) in metrics.py line 45
return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size ===> return sum([w[i, j] for i, j in zip(ind[0], ind[1])]) * 1.0 / y_pred.size
in DEC.py line 337
dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10], n_clusters=n_clusters, init=init) ===> dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 5], n_clusters=n_clusters, init=init)
in DEC.py line 125
class DEC(object): def __init__(self, dims, n_clusters=10, alpha=1.0, init='glorot_uniform'): ===> class DEC(object): def __init__(self, dims, n_clusters=5, alpha=1.0, init='glorot_uniform'):
in DEC.py line 327
elif args.dataset == 'crawling_data':
    update_interval = 30
    pretrain_epochs = 50
🥨결과 y: [3 3 2 1 5 1 5 5 5 2 1 5 4 4 4 4 3 5 4 2 3 4 4 3 1 3 3 4 4 5 4 3 4 3 3 3 4 2 5 4 2 3 1 5 1 4 2 2 4 4 1 5 3 3 5 4 5 3 5 3 5 4 1] y_pred: [3 2 2 2 5 2 5 4 4 4 5 1 4 4 5 1 5 4 4 4 1 1 1 5 1 1 1 4 4 1 1 1 1 1 4 4 1 1 1 1 1 4 1 1 4 1 1 1 4 1 1 1 1 1 1 1 1 4 1 1 1 1 1]
acc: 0.4725409836065574 clustering time: 2.8612470626831055
in DEC.py line 329, 285
pretrain_epochs = 20
parser.add_argument('--batch_size', default=128, type=int)
🥨결과 acc: 0.4847465769877492
in datasets.py
in DEC.py
update_interval = 70
pretrain_epochs = 100
init = VarianceScaling(scale=1. / 3., mode='fan_in', distribution='uniform')  # [-limit, limit], limit=sqrt(1./fan_in)
pretrain_optimizer = SGD(lr=1, momentum=0.9)
batch_size = 256
불러오는 데이터 nrows = 50000
word2vec(min_count=15)
🥨결과 y: [3 3 2 1 5 1 5 5 5 2 1 5 4 4 4 4 3 5 4 2 3 4 4 3 1 3 3 4 4 5 4 3 4 3 3 3 4 2 5 4 2 3 1 5 1 4 2 2 4 4] y_pred: [3 3 3 3 1 3 1 2 1 1 1 4 2 2 1 4 1 2 1 2 2 4 4 1 2 4 4 2 1 4 4 4 4 4 2 1 4 2 0 4 4 2 4 4 2 4 4 4 2 0]
acc: 0.49414950198240015 clustering time: 5.3788673877716064
in DEC.py line 28
def autoencoder(dims, act='relu', init='glorot_uniform'): ===> def autoencoder(dims, act='tanh', init='glorot_uniform'):
🥨결과 acc: 0.4926022628372498 === 큰 영향 없음
nrows = 50000
🥨결과 acc: 0.4931361175560712
def autoencoder(dims, act='selu', init='lecun_normal'):
🥨결과 acc: 0.49265274555297756
model = Word2Vec(sentences=tokenized_data, window=5, min_count=8, workers=4, sg=1)
def autoencoder(dims, act='selu', init='lecun_uniform'):
🥨결과 acc: 0.49279845335911066
class DEC(object): def __init__(self, dims, n_clusters=4, alpha=1.0, init='lecun_uniform'):
🥨결과 acc: 0.49260512324794586
self.autoencoder.compile(optimizer=optimizer, loss='binary_crossentropy')
🥨결과 acc: 0.4924117931367811
self.autoencoder.compile(optimizer=optimizer, loss='binary_crossentropy')
line 29 / def autoencoder(dims, act='softmax', init='glorot_uniform'):
def autoencoder(dims, act='softmax', init='glorot_uniform'):
line 143 / def pretrain(self, x, y=None, optimizer='adam', epochs=50, batch_size=256, save_dir='results/temp'): line 145 / self.autoencoder.compile(optimizer=optimizer, loss='categorical_crossentropy')
def pretrain(self, x, y=None, optimizer='adam', epochs=50, batch_size=256, save_dir='results/temp'):
self.autoencoder.compile(optimizer=optimizer, loss='categorical_crossentropy')
🥨결과 acc: 0.4924117931367811
if tag in ['Noun', 'Verb']:
acc: 0.46600688468158347
if tag in ['Noun']:
acc: 0.4633...
if tag in ['Noun', 'Adjective', 'Adverb', 'Verb']:
acc: 0.4731626754748142
if tag not in ['Josa', 'Eomi', 'Punctation']:  # note: 'Punctation' is likely a typo for the Okt tag 'Punctuation'
환경 변화 (GPU➡Local) 로 인한 코드 수정
1) in metrics.py line 35
2) in metrics.py line 45
3) DEC model dimension 수정
in DEC.py line 337
4) DEC model n_clusters 수정
in DEC.py line 125
5) dataset 관련 args 추가
in DEC.py line 327
🥨결과 y: [3 3 2 1 5 1 5 5 5 2 1 5 4 4 4 4 3 5 4 2 3 4 4 3 1 3 3 4 4 5 4 3 4 3 3 3 4 2 5 4 2 3 1 5 1 4 2 2 4 4 1 5 3 3 5 4 5 3 5 3 5 4 1] y_pred: [3 2 2 2 5 2 5 4 4 4 5 1 4 4 5 1 5 4 4 4 1 1 1 5 1 1 1 4 4 1 1 1 1 1 4 4 1 1 1 1 1 4 1 1 4 1 1 1 4 1 1 1 1 1 1 1 1 4 1 1 1 1 1]
acc: 0.4725409836065574 clustering time: 2.8612470626831055
6) 파라미터 조정
in DEC.py line 329, 285
🥨결과 acc: 0.4847465769877492
7) Word2Vec를 거치며 사라지는 idx 체크해서 y값에 적용
in datasets.py
8) 파라미터 조정
in DEC.py
🥨결과 y: [3 3 2 1 5 1 5 5 5 2 1 5 4 4 4 4 3 5 4 2 3 4 4 3 1 3 3 4 4 5 4 3 4 3 3 3 4 2 5 4 2 3 1 5 1 4 2 2 4 4] y_pred: [3 3 3 3 1 3 1 2 1 1 1 4 2 2 1 4 1 2 1 2 2 4 4 1 2 4 4 2 1 4 4 4 4 4 2 1 4 2 0 4 4 2 4 4 2 4 4 4 2 0]
acc: 0.49414950198240015 clustering time: 5.3788673877716064
9) 오토인코더의 활성화 함수 변경
in DEC.py line 28
🥨결과 acc: 0.4926022628372498 === 큰 영향 없음
10) 파라미터 조정
nrows = 50000
🥨결과 acc: 0.4931361175560712
11) 오토인코더의 활성화 함수, 초기값 변경
def autoencoder(dims, act='selu', init='lecun_normal'):
🥨결과 acc: 0.49265274555297756
12) word2Vec min_count 변경 / 오토인코더의 활성화 함수, 초기값 변경
🥨결과 acc: 0.49279845335911066
13) DEC 의 n_clusters , 초기값 변경
class DEC(object): def __init__(self, dims, n_clusters=4, alpha=1.0, init='lecun_uniform'):
🥨결과 acc: 0.49260512324794586
14) 오토인코더의 손실함수 변경
self.autoencoder.compile(optimizer=optimizer, loss='binary_crossentropy')
🥨결과 acc: 0.4924117931367811
15) 오토인코더의 손실함수, 초기값 변경 / dec 클래스의 pretrain 함수 파라미터 변경
line 29 /
def autoencoder(dims, act='softmax', init='glorot_uniform'):
line 143 /
def pretrain(self, x, y=None, optimizer='adam', epochs=50, batch_size=256, save_dir='results/temp'):
line 145 /self.autoencoder.compile(optimizer=optimizer, loss='categorical_crossentropy')
🥨결과 acc: 0.4924117931367811
16) 형태소 분석에서 선정할 품사 변경
if tag in ['Noun', 'Verb']:
acc: 0.46600688468158347
if tag in ['Noun']:
acc: 0.4633...
if tag in ['Noun', 'Adjective', 'Adverb', 'Verb']:
acc: 0.4731626754748142
if tag not in ['Josa', 'Eomi', 'Punctation']:  # note: 'Punctation' is likely a typo for the Okt tag 'Punctuation'
acc: 0.4731626754748142