class Bracket(list):
    # Marker type for a parenthesized group of tokens.
    def __init__(self, arr):
        list.__init__(self, arr)
def tokenize_for_word(word):
    # Split on whitespace, treating parentheses as separate tokens,
    # then fold each balanced "( ... )" span into a nested list.
    word = word.replace('(', ' ( ').replace(')', ' ) ')
    tokens = word.split()
    while True:
        # Index of the last '(' and of the first ')' after it.
        left = len(tokens) - 1 - tokens[::-1].index('(') if '(' in tokens else None
        start = left if left is not None else 0
        right = tokens[start:].index(')') + start if ')' in tokens[start:] else None
        if left is None and right is None:
            break
        elif left is None and right is not None:
            tokens[right:right+1] = []  # drop an unmatched ')'
        elif left is not None and right is None:
            tokens[left:left+1] = []    # drop an unmatched '('
        else:
            tokens[left:right+1] = [tokens[left+1:right]]  # nest the bracketed span
    set_bracket(tokens)
    return tokens
def set_bracket(tokens):
    # Recursively wrap nested lists in Bracket.
    for i in range(len(tokens)):
        if type(tokens[i]) is list:
            set_bracket(tokens[i])
            tokens[i] = Bracket(tokens[i])
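A quick sanity check of the tokenizer, using one of the sample strings that appear in the data further down (the nested list is a Bracket):

print(tokenize_for_word('GS수퍼 양천신은점 (대)'))
# -> ['GS수퍼', '양천신은점', ['대']]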
class Variation(list):
    def __init__(self, arr):
        list.__init__(self, arr)
def set_word_variation_dictionary(tokens, word_variation_dic):
    for token in tokens:
        if type(token) is Bracket:
            set_word_variation_dictionary(token, word_variation_dic)
        elif token not in word_variation_dic:
            word_variation_dic[token] = Variation([(token, 1)])
def link_tokens_to_word_variation_dic(tokens, word_variation_dic):
    for idx, token in enumerate(tokens):
        if type(token) is Bracket:
            link_tokens_to_word_variation_dic(token, word_variation_dic)
        else:
            tokens[idx] = word_variation_dic[token]
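A minimal sketch of how the dictionary and linking steps fit together, with made-up tokens:

word_variation_dic = {}
tokens = ['GS수퍼', '관악점']
set_word_variation_dictionary(tokens, word_variation_dic)
print(word_variation_dic)  # -> {'GS수퍼': [('GS수퍼', 1)], '관악점': [('관악점', 1)]}
link_tokens_to_word_variation_dic(tokens, word_variation_dic)
# tokens now holds the shared Variation objects instead of the raw strings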
variation_words = [
    [('수퍼', 1), ('슈퍼', 1)],
    [('CU', 1), ('씨유', 0.5), ('훼미리마트', 0.3)],
    [('GS', 1), ('지에스', 0.5)],
    [('25', 1), ('이십오', 0.1)],
    [('365', 1), ('삼육오', 0.5)]
]
def is_word_include_vword(word, vwords):
    # Return the first variation word contained in `word`, or False.
    for vword, _ in vwords:
        if vword in word:
            return vword
    return False
def get_word_variations(word, rate=1):
    variations = [(word, rate)]
    for vwords in variation_words:
        size = len(variations)
        for i in range(size):
            w, r = variations[i]
            _w = is_word_include_vword(w, vwords)
            if not _w:
                continue
            for vword, vrate in vwords:
                if vword == _w:
                    continue
                variations.append((w.replace(_w, vword), r * vrate))
    return variations
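Given the table above, a word that matches several groups fans out multiplicatively; for example:

print(get_word_variations('GS수퍼'))
# -> [('GS수퍼', 1), ('GS슈퍼', 1), ('지에스수퍼', 0.5), ('지에스슈퍼', 0.5)]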
variation_charactors = {
    'A': '에이', 'B': '비', 'C': '씨', 'D': '디', 'E': '이', 'F': '에프', 'G': '지',
    'H': '에이치', 'I': '아이', 'J': '제이', 'K': '케이', 'L': '엘', 'M': '엠', 'N': '엔',
    'O': '오', 'P': '피', 'Q': '큐', 'R': '알', 'S': '에스', 'T': '티', 'U': '유',
    'V': '브이', 'W': '더블유', 'X': '엑스', 'Y': '와이', 'Z': '지',
    '1': '일', '2': '이', '3': '삼', '4': '사', '5': '오',
    '6': '육', '7': '칠', '8': '팔', '9': '구', '0': '공'
}
def get_charactor_variations(word, rate=1):
    variations = [(word, rate)]
    for ch in variation_charactors:
        size = len(variations)
        for i in range(size):
            w, r = variations[i]
            if ch not in w:
                continue
            variations.append((w.replace(ch, variation_charactors[ch]), 0))
    return variations
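Character-level variants are added with rate 0, so they only survive through the minimum count added in make_validation_str_from_datas below; for example:

print(get_charactor_variations('7번가'))
# -> [('7번가', 1), ('칠번가', 0)]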
import itertools
class Permutation(list):
    def __init__(self, arr):
        list.__init__(self, arr)
def make_bracket_permutation(tokens):
    # Recurse into brackets first, replacing each with the Permutation of its orderings.
    for idx, token in enumerate(tokens):
        if type(token) is Bracket:
            tokens[idx] = make_bracket_permutation(token)
    ret = []
    perm = itertools.permutations(tokens)
    for p in perm:
        tmp = [Bracket(list(p))]
        # Expand any Permutation elements into one candidate per inner ordering.
        for i in range(len(tokens)):
            tmp2 = []
            for t in tmp:
                if type(t[i]) is Permutation:
                    for ti in t[i]:
                        ll = t[:]
                        ll[i:i+1] = [ti]
                        tmp2.append(Bracket(ll))
                else:
                    tmp2.append(t)
            tmp = tmp2
        ret += tmp
    return Permutation(ret)
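A small check of the expansion, with made-up tokens (bracket contents are permuted independently of the outer level):

for p in make_bracket_permutation(['a', Bracket(['b', 'c'])]):
    print(p)
# -> ['a', ['b', 'c']]
#    ['a', ['c', 'b']]
#    [['b', 'c'], 'a']
#    [['c', 'b'], 'a']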
def make_validation_str(perm):
    ret = [([], 1)]
    for p in perm:
        size = len(ret)
        if type(p) is Variation:
            tmp = []
            for i in range(size):
                for q in p:
                    tmp.append((ret[i][0] + q[0], ret[i][1] * q[1]))
            ret = tmp
        elif type(p) is Bracket:
            tmp = []
            qs = make_validation_str(p)  # expand the bracket once
            for i in range(size):
                for q in qs:
                    tmp.append((ret[i][0] + q[0], ret[i][1] * q[1]))
            ret += tmp  # note: `+=`, so the strings without the bracket part are kept too
    return ret
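A worked example of how the rates multiply across tokens, using made-up Variations already in the character-list form produced by elementalization below:

v1 = Variation([(['G', 'S'], 1), (['지', '에', '스'], 0.5)])
v2 = Variation([(['2', '5'], 1)])
print(make_validation_str([v1, v2]))
# -> [(['G', 'S', '2', '5'], 1), (['지', '에', '스', '2', '5'], 0.5)]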
import csv
def load_csv_data(csvfile_path):
    datas = list()
    labels = list()
    with open(csvfile_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            datas.append(row['mcnt'])
            labels.append(int(row['id']))
    return {'datas': datas, 'labels': labels}
def tokenizing_datas(datas):
    for idx, data in enumerate(datas):
        datas[idx] = tokenize_for_word(data)
def make_word_variation_dictionary(datas):
    word_variation_dic = dict()
    for tokens in datas:
        set_word_variation_dictionary(tokens, word_variation_dic)
    return word_variation_dic
def puff_word_variation(word_variation_dic):
    for key in word_variation_dic:
        variations = word_variation_dic[key]
        puff_variations = []
        for word, rate in variations:
            puff_variations += get_word_variations(word, rate)
        word_variation_dic[key] = Variation(puff_variations)
def puff_charactor_variation(word_variation_dic):
    for key in word_variation_dic:
        variations = word_variation_dic[key]
        puff_variations = []
        for word, rate in variations:
            puff_variations += get_charactor_variations(word, rate)
        word_variation_dic[key] = Variation(puff_variations)
def elementalization(word_variation_dic):
    for key in word_variation_dic:
        variations = word_variation_dic[key]
        for idx, variation in enumerate(variations):
            variations[idx] = (list(variation[0]), variation[1])  # split into characters
def element_to_dic(word_variation_dic):
    # Collect every character, assign each an integer id, and rewrite
    # the variation strings as id sequences.
    ele_set = set()
    for key in word_variation_dic:
        for variation in word_variation_dic[key]:
            ele_set |= set(variation[0])
    ele_set = list(ele_set)
    ele_dic = {w: i for i, w in enumerate(ele_set)}
    for key in word_variation_dic:
        for variation in word_variation_dic[key]:
            for idx, ele in enumerate(variation[0]):
                variation[0][idx] = ele_dic[ele]
    return ele_dic
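The two steps on a toy dictionary (the integer ids come from iterating a set, so the exact numbers can differ between runs):

dic = {'GS': Variation([('GS', 1)])}
elementalization(dic)
print(dic)  # -> {'GS': [(['G', 'S'], 1)]}
ele_dic = element_to_dic(dic)
print(ele_dic)  # e.g. {'S': 0, 'G': 1}
print(dic)  # e.g. {'GS': [([1, 0], 1)]}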
def link_datas_to_word_variation_dic(datas, word_variation_dic):
    for tokens in datas:
        link_tokens_to_word_variation_dic(tokens, word_variation_dic)
def make_permutation_from_datas(datas):
    for idx, tokens in enumerate(datas):
        perm = make_bracket_permutation(tokens)
        for i, p in enumerate(perm):
            perm[i] = list(p)
        datas[idx] = perm
def make_validation_str_from_datas(datas, make_cnt_each_data):
    for idx, perms in enumerate(datas):
        make_cnt = make_cnt_each_data / len(perms)
        tmp = []
        for perm in perms:
            ds = make_validation_str(perm)
            ratio_sum = 0
            for d in ds:
                ratio_sum += d[1]
            for d in ds:
                i, ratio = d
                c = int(make_cnt * (ratio / ratio_sum) + 3)
                tmp.append((i, c))
        datas[idx] = tmp
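For a feel for the counts: with make_cnt_each_data=100, a single permutation, and two candidate strings with rates 1 and 0.5, ratio_sum is 1.5, so the counts come out as int(100 * (1 / 1.5) + 3) = 69 and int(100 * (0.5 / 1.5) + 3) = 36; the + 3 guarantees every candidate (including the rate-0 character variants) appears at least a few times.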
def get_trainXY_datas(datas, labels, max_seq_length):
    trainX = []
    trainY = []
    for idx, data in enumerate(datas):
        label = labels[idx]
        for d in data:
            s, c = d
            for i in range(c):
                trainX.append(s[:max_seq_length] + [-1] * (max_seq_length - len(s)))
                trainY.append(label)
    return trainX, trainY
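A padding sanity check with made-up ids (-1 is the pad value; tf.one_hot maps it to an all-zero vector later):

X_demo, Y_demo = get_trainXY_datas([[([3, 1, 4], 2)]], [0], max_seq_length=5)
print(X_demo)  # -> [[3, 1, 4, -1, -1], [3, 1, 4, -1, -1]]
print(Y_demo)  # -> [0, 0]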
train_data = load_csv_data('train_data.csv')
train_data['datas'] = train_data['datas'][100:110]  # ['GS25 남영역점 ', '신즈통상 ', 'GS수퍼 관악점 ', 'GS수퍼 양천신은점 (대)', 'GS수퍼 양천신은점 ']
train_data['labels'] = train_data['labels'][100:110]  # [1040, 2214, 6282, 6317, 6318]
train_data['labels'] = [idx for idx in range(len(train_data['labels']))]
print(train_data['datas'][:10])
print(train_data['labels'][:10])
tokenizing_datas(train_data['datas'])
word_variation_dic = make_word_variation_dictionary(train_data['datas'])
puff_word_variation(word_variation_dic)
elementalization(word_variation_dic)
ele_dic = element_to_dic(word_variation_dic)
link_datas_to_word_variation_dic(train_data['datas'], word_variation_dic)
make_permutation_from_datas(train_data['datas'])
make_validation_str_from_datas(train_data['datas'], make_cnt_each_data=100)
max_seq_length = 100
char_dic_size = 100  # len(ele_dic)
learning_rate = 0.001
trainX, trainY = get_trainXY_datas(train_data['datas'], train_data['labels'], max_seq_length)
label_dic_size = max(trainY) + 1
print(len(trainX))
print(len(trainY))
import tensorflow as tf
isTraining = tf.placeholder(tf.bool)
X = tf.placeholder(tf.int32, [None, max_seq_length])
print('X.shape:', X.shape)
X_one_hot = tf.one_hot(X, char_dic_size)
print('X_one_hot.shape:', X_one_hot.shape)
X_img = tf.reshape(X_one_hot, [-1, 1, max_seq_length, char_dic_size])
print('X_img.shape:', X_img.shape)
Y = tf.placeholder(tf.int32, [None])
print('Y.shape:', Y.shape)
Y_one_hot = tf.one_hot(Y, label_dic_size)
print('Y_one_hot.shape:', Y_one_hot.shape)
conv1 = tf.layers.conv2d(inputs=X_img, filters=32, kernel_size=[1, 2], padding='SAME', activation=tf.nn.relu)
dropout1 = tf.layers.dropout(inputs=conv1, rate=0.7, training=isTraining)
print('conv1.shape:', conv1.shape)
print('dropout1.shape:', dropout1.shape)
conv2 = tf.layers.conv2d(inputs=dropout1, filters=64, kernel_size=[1, 2], padding='SAME', activation=tf.nn.relu)
dropout2 = tf.layers.dropout(inputs=conv2, rate=0.7, training=isTraining)
print('conv2.shape:', conv2.shape)
print('dropout2.shape:', dropout2.shape)
flat = tf.reshape(dropout2, [-1, 1 * max_seq_length * 64])
print('flat.shape:', flat.shape)
dense3 = tf.layers.dense(inputs=flat, units=625, activation=tf.nn.relu)
dropout3 = tf.layers.dropout(inputs=dense3, rate=0.5, training=isTraining)
print('dense3.shape:', dense3.shape)
print('dropout3.shape:', dropout3.shape)
logits = tf.layers.dense(inputs=dropout3, units=label_dic_size)
print('logits.shape:', logits.shape)
outputs = tf.argmax(logits, 1)
softmax = tf.nn.softmax(logits)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y_one_hot))
train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
print(set(trainY))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(200):
        loss, _ = sess.run([cost, train], feed_dict={X: trainX, Y: trainY, isTraining: True})
        if i % 10 == 0:
            result = sess.run(outputs, feed_dict={X: trainX, isTraining: False})
            correct = 0
            for j in range(len(trainY)):
                if int(trainY[j]) == int(result[j]):
                    correct += 1
            print(loss, correct / len(trainY), set(result))
import unicodedata
def check_hangul(ch):
    valid_type = ['Lo']
    if unicodedata.category(ch) in valid_type:
        unicode_names = unicodedata.name(ch).split()
        if 'HANGUL' in unicode_names and 'SYLLABLE' in unicode_names:
            return True
    return False
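A few spot checks (complete syllables pass; bare jamo and Latin letters do not):

print(check_hangul('한'))  # -> True, 'HANGUL SYLLABLE HAN', category 'Lo'
print(check_hangul('ㄱ'))  # -> False, 'HANGUL LETTER KIYEOK' has no 'SYLLABLE'
print(check_hangul('A'))   # -> False, category 'Lu'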
def conv_compatibility_jamo(ch):
    unicode_names = unicodedata.name(ch)
def conv_jaso(ch):
    if not check_hangul(ch):
        return [ch]
def str_to_jaso(s):
    ret = []
    for ch in s:
        ret += conv_jaso(ch)
    return ret