baleian / tensorflow


jaso separation #2

Closed baleian closed 7 years ago

baleian commented 7 years ago

import unicodedata

def check_hangul(ch):
    # True only for precomposed Hangul syllables: category 'Lo', name 'HANGUL SYLLABLE ...'
    if unicodedata.category(ch) != 'Lo':
        return False
    unicode_names = unicodedata.name(ch).split()
    return 'HANGUL' in unicode_names and 'SYLLABLE' in unicode_names

def conv_compatibility_jamo(ch):
    # Map a conjoining jamo (U+1100..) to its compatibility jamo (U+3131..)
    # by rewriting the Unicode name, e.g. 'HANGUL CHOSEONG HIEUH' -> 'HANGUL LETTER HIEUH'.
    unicode_names = unicodedata.name(ch)
    if unicode_names.find('CHOSEONG') >= 0:
        unicode_names = unicode_names.replace('CHOSEONG', 'LETTER')
    elif unicode_names.find('JUNGSEONG') >= 0:
        unicode_names = unicode_names.replace('JUNGSEONG', 'LETTER')
    elif unicode_names.find('JONGSEONG') >= 0:
        unicode_names = unicode_names.replace('JONGSEONG', 'LETTER')
    return unicodedata.lookup(unicode_names)
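
For reference, a quick check of the name rewriting (not from the original thread):

print(conv_compatibility_jamo('\u1112'))  # 'ㅎ': HANGUL CHOSEONG HIEUH -> HANGUL LETTER HIEUH (U+314E)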

def conv_jaso(ch):
    # Decompose one precomposed syllable into its compatibility jamo.
    if not check_hangul(ch):
        return [ch]

    jaso = []
    code = ord(ch) - 0xAC00  # offset into the Hangul Syllables block
    jong = code % 28
    jung = (code // 28) % 21
    cho = (code // 28) // 21

    jaso.append(conv_compatibility_jamo(chr(cho + 0x1100)))
    jaso.append(conv_compatibility_jamo(chr(jung + 0x1161)))
    if jong > 0:  # jong == 0 means no final consonant
        jaso.append(conv_compatibility_jamo(chr(jong + 0x11A7)))
    return jaso

def str_to_jaso(text):  # parameter renamed so it doesn't shadow the builtin str
    ret = []
    for ch in text:
        ret += conv_jaso(ch)
    return ret
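
A quick sanity check of the full decomposition:

print(str_to_jaso('한글'))  # ['ㅎ', 'ㅏ', 'ㄴ', 'ㄱ', 'ㅡ', 'ㄹ']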

baleian commented 7 years ago

class Bracket(list):
    def __init__(self, arr):
        list.__init__(self, arr)

def tokenize_for_word(word):
    word = word.replace('(', ' ( ').replace(')', ' ) ')
    tokens = word.split()

    while True:
        # innermost (right-most) '(' and its matching ')'
        left = len(tokens) - 1 - tokens[::-1].index('(') if '(' in tokens else None
        start = left if left is not None else 0
        right = tokens[start:].index(')') + start if ')' in tokens else None

        if left is None and right is None:
            break
        elif left is None and right is not None:
            tokens[right:right+1] = []  # drop an unmatched ')'
        elif left is not None and right is None:
            tokens[left:left+1] = []  # drop an unmatched '('
        else:
            tokens[left:right+1] = [tokens[left+1:right]]  # nest the bracketed span

    set_bracket(tokens)
    return tokens

def set_bracket(tokens):
    # Recursively wrap nested plain lists in Bracket.
    for i in range(len(tokens)):
        if type(tokens[i]) is list:
            set_bracket(tokens[i])
            tokens[i] = Bracket(tokens[i])
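
For example, with one of the store names from the data below, the bracketed part becomes a nested Bracket:

print(tokenize_for_word('GS수퍼 양천신은점 (대)'))
# ['GS수퍼', '양천신은점', ['대']] -- the inner list is a Bracket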

class Variation(list):
    def __init__(self, arr):
        list.__init__(self, arr)

def set_word_variation_dictionary(tokens, word_variation_dic):
    for token in tokens:
        if type(token) is Bracket:
            set_word_variation_dictionary(token, word_variation_dic)
        elif token not in word_variation_dic:
            word_variation_dic[token] = Variation([(token, 1)])

def link_tokens_to_word_variation_dic(tokens, word_variation_dic):
    for idx, token in enumerate(tokens):
        if type(token) is Bracket:
            link_tokens_to_word_variation_dic(token, word_variation_dic)
        else:
            tokens[idx] = word_variation_dic[token]

variation_words = [
    [('수퍼', 1), ('슈퍼', 1)],
    [('CU', 1), ('씨유', 0.5), ('훼미리마트', 0.3)],
    [('GS', 1), ('지에스', 0.5)],
    [('25', 1), ('이십오', 0.1)],
    [('365', 1), ('삼육오', 0.5)],
]

def is_word_include_vword(word, vwords):
    # Return the first variation word contained in `word`, else False.
    for vword, _ in vwords:
        if vword in word:
            return vword
    return False

def get_word_variations(word, rate=1):
    variations = [(word, rate)]
    for vwords in variation_words:
        size = len(variations)
        for i in range(size):
            w, r = variations[i]
            _w = is_word_include_vword(w, vwords)
            if not _w:
                continue
            for vword, vrate in vwords:
                if vword == _w:  # skip the matched word itself
                    continue
                variations.append((w.replace(_w, vword), r * vrate))
    return variations

variation_charactors = {
    'A': '에이', 'B': '비', 'C': '씨', 'D': '디', 'E': '이', 'F': '에프', 'G': '지',
    'H': '에이치', 'I': '아이', 'J': '제이', 'K': '케이', 'L': '엘', 'M': '엠', 'N': '엔',
    'O': '오', 'P': '피', 'Q': '큐', 'R': '알', 'S': '에스', 'T': '티', 'U': '유',
    'V': '브이', 'W': '더블유', 'X': '엑스', 'Y': '와이', 'Z': '지',
    '1': '일', '2': '이', '3': '삼', '4': '사', '5': '오',
    '6': '육', '7': '칠', '8': '팔', '9': '구', '0': '공',
}

def get_charactor_variations(word, rate=1):
    variations = [(word, rate)]
    for ch in variation_charactors:
        size = len(variations)
        for i in range(size):
            w, r = variations[i]
            if ch not in w:
                continue
            variations.append((w.replace(ch, variation_charactors[ch]), 0))
    return variations
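
As a worked example (not from the original), expanding 'GS25' chains the substitutions and multiplies the rates:

print(get_word_variations('GS25'))
# [('GS25', 1), ('지에스25', 0.5), ('GS이십오', 0.1), ('지에스이십오', 0.05)]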

import itertools

class Permutation(list):
    def __init__(self, arr):
        list.__init__(self, arr)

def make_bracket_permutation(tokens):
    # Expand nested Brackets first, bottom-up.
    for idx, token in enumerate(tokens):
        if type(token) is Bracket:
            tokens[idx] = make_bracket_permutation(token)

    ret = []
    perm = itertools.permutations(tokens)
    for p in perm:
        tmp = [Bracket(list(p))]
        for i in range(len(tokens)):
            tmp2 = []
            for t in tmp:
                if type(t[i]) is Permutation:
                    # splice each pre-expanded ordering of the nested Bracket into place
                    for ti in t[i]:
                        ll = t[:]
                        ll[i:i+1] = [ti]
                        tmp2.append(Bracket(ll))
                else:
                    tmp2.append(t)
            tmp = tmp2
        ret += tmp

    return Permutation(ret)
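
A small check of the expansion (not from the original):

print(make_bracket_permutation(tokenize_for_word('a b')))
# [['a', 'b'], ['b', 'a']] -- every ordering, as a Permutation of Brackets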

def make_validation_str(perm):
    ret = [([], 1)]
    for p in perm:
        size = len(ret)
        if type(p) is Variation:
            tmp = []
            for i in range(size):
                for q in p:
                    tmp.append((ret[i][0] + q[0], ret[i][1] * q[1]))
            ret = tmp
        elif type(p) is Bracket:
            # a Bracket's content is optional: ret += tmp keeps the versions without it
            tmp = []
            for i in range(size):
                qs = make_validation_str(p)
                for q in qs:
                    tmp.append((ret[i][0] + q[0], ret[i][1] * q[1]))
            ret += tmp
    return ret
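
To see how a Bracket stays optional, a minimal sketch assuming the variations are already elementalized into character lists:

v_a = Variation([(['a'], 1)])
v_b = Variation([(['b'], 0.5)])
print(make_validation_str([v_a, Bracket([v_b])]))
# [(['a'], 1), (['a', 'b'], 0.5)] -- with and without the bracketed part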

baleian commented 7 years ago

import csv

def load_csv_data(csvfile_path):
    datas = list()
    labels = list()
    with open(csvfile_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            datas.append(row['mcnt'])
            labels.append(int(row['id']))
    return {'datas': datas, 'labels': labels}

def tokenizing_datas(datas):
    for idx, data in enumerate(datas):
        datas[idx] = tokenize_for_word(data)

def make_word_variation_dictionary(datas):
    word_variation_dic = dict()
    for tokens in datas:
        set_word_variation_dictionary(tokens, word_variation_dic)
    return word_variation_dic

def puff_word_variation(word_variation_dic):
    for key in word_variation_dic:
        variations = word_variation_dic[key]
        puff_variations = []
        for word, rate in variations:
            puff_variations += get_word_variations(word, rate)
        word_variation_dic[key] = Variation(puff_variations)

def puff_charactor_variation(word_variation_dic):
    for key in word_variation_dic:
        variations = word_variation_dic[key]
        puff_variations = []
        for word, rate in variations:
            puff_variations += get_charactor_variations(word, rate)
        word_variation_dic[key] = Variation(puff_variations)

def elementalization(word_variation_dic):
    # Convert each variation string into a list of elements (pick one granularity).
    for key in word_variation_dic:
        variations = word_variation_dic[key]
        for idx, variation in enumerate(variations):
            # variations[idx] = ([variation[0]], variation[1])  # word level
            variations[idx] = (list(variation[0]), variation[1])  # character level
            # variations[idx] = (str_to_jaso(variation[0]), variation[1])  # jamo level

def element_to_dic(word_variation_dic):
    # Collect every element, assign integer ids, and rewrite variations in place.
    ele_set = set()
    for key in word_variation_dic:
        for variation in word_variation_dic[key]:
            ele_set |= set(variation[0])
    ele_set = list(ele_set)

    ele_dic = {w: i for i, w in enumerate(ele_set)}
    for key in word_variation_dic:
        for variation in word_variation_dic[key]:
            for idx, ele in enumerate(variation[0]):
                variation[0][idx] = ele_dic[ele]
    return ele_dic

def link_datas_to_word_variation_dic(datas, word_variation_dic):
    for tokens in datas:
        link_tokens_to_word_variation_dic(tokens, word_variation_dic)

def make_permutation_from_datas(datas):
    for idx, tokens in enumerate(datas):
        perm = make_bracket_permutation(tokens)
        for i, p in enumerate(perm):
            perm[i] = list(p)
        datas[idx] = perm

def make_validation_str_from_datas(datas, make_cnt_each_data):
    for idx, perms in enumerate(datas):
        make_cnt = make_cnt_each_data / len(perms)
        tmp = []
        for perm in perms:
            ds = make_validation_str(perm)
            ratio_sum = 0
            for d in ds:
                ratio_sum += d[1]
            for d in ds:
                i, ratio = d
                c = int(make_cnt * (ratio / ratio_sum) + 3)
                tmp.append((i, c))
        datas[idx] = tmp

def get_trainXY_datas(datas, labels, max_seq_length):
    trainX = []
    trainY = []

    for idx, data in enumerate(datas):
        label = labels[idx]

        for d in data:
            s, c = d
            for i in range(c):  # emit each validation string c times
                trainX.append(s[:max_seq_length] + [-1] * (max_seq_length - len(s)))
                trainY.append(label)

    return trainX, trainY
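
Note the -1 padding: tf.one_hot maps out-of-range indices to all-zero rows, so padded positions contribute nothing. A quick illustration of the padding expression:

s = [3, 7]
print(s[:5] + [-1] * (5 - len(s)))  # [3, 7, -1, -1, -1]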
baleian commented 7 years ago

Load Data

train_data = load_csv_data('train_data.csv')

train_data['datas'] = train_data['datas'][100:110]
# ['GS25 남영역점 ', '신즈통상 ', 'GS수퍼 관악점 ', 'GS수퍼 양천신은점 (대)', 'GS수퍼 양천신은점 ']
train_data['labels'] = train_data['labels'][100:110]
# [1040, 2214, 6282, 6317, 6318]
train_data['labels'] = [idx for idx in range(len(train_data['labels']))]  # re-index labels from 0
print(train_data['datas'][:10])
print(train_data['labels'][:10])

Tokenizing Data

tokenizing_datas(train_data['datas'])

Create Word Variation Dictionary

word_variation_dic = make_word_variation_dictionary(train_data['datas'])

Puff Word Variation

puff_word_variation(word_variation_dic)

Puff Charactor Variation

puff_charactor_variation(word_variation_dic)

Word Elementalization

elementalization(word_variation_dic)

Element To Dictionary

ele_dic = element_to_dic(word_variation_dic)

Link data and vocab

link_datas_to_word_variation_dic(train_data['datas'], word_variation_dic)

Make Permutation From Datas

make_permutation_from_datas(train_data['datas'])

Make Validation String From Datas (Make Validation String Count Each Data)

make_validation_str_from_datas(train_data['datas'], make_cnt_each_data=100)

baleian commented 7 years ago

hyper parameters

max_seq_length = 100
char_dic_size = 100  # len(ele_dic)
learning_rate = 0.001

trainX, trainY = get_trainXY_datas(train_data['datas'], train_data['labels'], max_seq_length)
print(len(trainX))
print(len(trainY))

label_dic_size = max(trainY) + 1  # needs trainY, so computed after get_trainXY_datas

baleian commented 7 years ago

import tensorflow as tf

isTraining = tf.placeholder(tf.bool)

X = tf.placeholder(tf.int32, [None, max_seq_length])
print('X.shape:', X.shape)
X_one_hot = tf.one_hot(X, char_dic_size)
print('X_one_hot.shape:', X_one_hot.shape)
X_img = tf.reshape(X_one_hot, [-1, 1, max_seq_length, char_dic_size])
print('X_img.shape:', X_img.shape)
Y = tf.placeholder(tf.int32, [None])
print('Y.shape:', Y.shape)
Y_one_hot = tf.one_hot(Y, label_dic_size)
print('Y_one_hot.shape:', Y_one_hot.shape)

conv1 = tf.layers.conv2d(inputs=X_img, filters=32, kernel_size=[1, 2], padding='SAME', activation=tf.nn.relu)
dropout1 = tf.layers.dropout(inputs=conv1, rate=0.7, training=isTraining)
print('conv1.shape:', conv1.shape)
print('dropout1.shape:', dropout1.shape)

conv2 = tf.layers.conv2d(inputs=dropout1, filters=64, kernel_size=[1, 2], padding='SAME', activation=tf.nn.relu)
dropout2 = tf.layers.dropout(inputs=conv2, rate=0.7, training=isTraining)
print('conv2.shape:', conv2.shape)
print('dropout2.shape:', dropout2.shape)

flat = tf.reshape(dropout2, [-1, 1 * max_seq_length * 64])  # flatten the 1 x max_seq_length x 64 feature map
print('flat.shape:', flat.shape)

dense3 = tf.layers.dense(inputs=flat, units=625, activation=tf.nn.relu)
dropout3 = tf.layers.dropout(inputs=dense3, rate=0.5, training=isTraining)
print('dense3.shape:', dense3.shape)
print('dropout3.shape:', dropout3.shape)

logits = tf.layers.dense(inputs=dropout3, units=label_dic_size)
print('logits.shape:', logits.shape)

outputs = tf.argmax(logits, 1)
softmax = tf.nn.softmax(logits)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y_one_hot))
train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

baleian commented 7 years ago

print(set(trainY))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    print(sess.run(cost, feed_dict={X: trainX, Y: trainY, isTraining: True}))

    for i in range(200):
        loss, _ = sess.run([cost, train], feed_dict={X: trainX, Y: trainY, isTraining: True})
        if i % 10 == 0:
            result = sess.run(outputs, feed_dict={X: trainX, isTraining: False})
            correct = 0  # count of correct predictions on the training set
            for j in range(len(trainY)):
                if int(trainY[j]) == int(result[j]):
                    correct += 1
            print(loss, correct / len(trainY), set(result))

    # testX/testY are assumed to be built the same way as trainX/trainY
    test_result, soft = sess.run([outputs, softmax], feed_dict={X: testX, isTraining: False})

    print('----------------')

    slist = [max(s) for s in soft]

    for i, d in enumerate(testY):
        print(str(d) + '-' + str(test_result[i]), ':', soft[i])

    print('----------------')