doxgxxn / data_TIL

toy project dumpster and daily review of everything

230913 / review - word2vec, konlpy twitter #22

doxgxxn commented 1 year ago

import os
import pymysql
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models.word2vec import Word2Vec

# konlpy's Twitter tagger for Korean morphological analysis
# (renamed to Okt in konlpy 0.4.5; the old name still works but is deprecated)
from konlpy.tag import Twitter
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-11"  # raw string keeps the backslashes literal
os.environ["JAVA_HOME"]  # echo to confirm the path was set
twitter = Twitter()
def dataprocessing(data):
    # POS-tag with stemming, then keep only nouns and adjectives
    tagged_review = twitter.pos(data, stem=True)
    stems = []
    for word, pos in tagged_review:
        if pos == 'Noun' or pos == 'Adjective':
            stems.append(word)
    return " ".join(stems)
dataprocessing("아버지가 방에 들어갔다 나왔다 아니고 배고프다 예쁘게 치킨 훔쳐")
con = pymysql.connect(user = "admin",
                      password= "Ehdrbs123$",
                      host = "database-01.canftvohs2xd.ap-northeast-2.rds.amazonaws.com",
                      port= 3306,
                      db='movie_db',
                      charset='utf8')
cursor = con.cursor()
sql = "select * from movie_tbl;"

movie_df = pd.read_sql(sql, con)
movie_df.head(3)
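pandas officially supports only SQLAlchemy connectables (and sqlite3) in read_sql, so passing the raw pymysql connection works but emits a UserWarning. A sketch of the SQLAlchemy route, with placeholders standing in for the credentials above:

from sqlalchemy import create_engine

# <password> and <rds-host> are placeholders for the pymysql.connect() arguments above
engine = create_engine("mysql+pymysql://admin:<password>@<rds-host>:3306/movie_db?charset=utf8")
movie_df = pd.read_sql("select * from movie_tbl;", engine)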
movie_df["synopsys_clear"] = np.nan
movie_df.head(2)
movie_df["synopsys_clear"] = movie_df["synopsys"].apply(dataprocessing)
movie_df.head(1)
movie_df["synopsys_clear_list"] = movie_df["synopsys_clear"].apply(lambda x: x.split())
movie_df.head(1)
movie_df["synopsys_clear_list"]
# Skip-gram Word2Vec over the tokenized synopses
word2vec = Word2Vec(movie_df["synopsys_clear_list"],
                    sg=1,            # skip-gram rather than CBOW
                    vector_size=80,  # embedding dimension
                    window=3,        # context window size
                    min_count=2,     # drop words seen fewer than twice
                    workers=10)
word2vec.wv.key_to_index["살인"]
word2vec.wv["액션"]
word2vec.wv.most_similar("액션")
word2vec_words = word2vec.wv.key_to_index.keys()
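The model only lives in memory here; gensim can persist and reload it, which avoids retraining on every run (file names are arbitrary):

word2vec.save("movie_word2vec.model")            # full model, training can continue later
word2vec = Word2Vec.load("movie_word2vec.model")
word2vec.wv.save("movie_vectors.kv")             # vectors only, smaller on disk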

for index in range(len(movie_df)):
    num = movie_df.loc[index, "num"]
    title = movie_df.loc[index, "title"]
    line = movie_df.loc[index, "synopsys_clear_list"]

    # Average the word vectors of all in-vocabulary tokens into a document vector
    doc2vec = None
    count = 0

    for word in line:
        if word in word2vec_words:
            count += 1

            if doc2vec is None:
                doc2vec = word2vec.wv[word]
            else:
                doc2vec = doc2vec + word2vec.wv[word]

    if doc2vec is None:
        # no in-vocabulary words: nothing to store for this row
        continue
    doc2vec = doc2vec / count

    # tobytes() replaces the deprecated tostring(); store the raw float32 bytes
    sql = "update movie_tbl set synopsys_vector=%s where num=%s"
    cursor.execute(sql, (doc2vec.tobytes(), num))
    con.commit()
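The accumulate-and-divide loop above is just a mean over the in-vocabulary word vectors; a compact equivalent (doc_vector is a hypothetical helper; KeyedVectors supports the in operator in gensim 4.x):

def doc_vector(tokens, wv):
    # mean of the word vectors the model knows; None when no token matches
    vecs = [wv[w] for w in tokens if w in wv]
    return np.mean(vecs, axis=0) if vecs else None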

sql = "select title, synopsys_vector from movie_tbl where num=1"

cursor.execute(sql)

rows = cursor.fetchall()

for row in rows:
    print(row[0], row[1])
    # np.frombuffer replaces the deprecated np.fromstring for raw bytes
    print(np.frombuffer(row[1], dtype="float32"))
    print("=" * 100)
sql = "select * from movie_tbl"
movie_df = pd.read_sql(sql, con)
movie_df
movie_df["synopsys_vector_numpy"] = movie_df["synopsys_vector"].apply(lambda x: np.fromstring(x, dtype="float32"))
movie_df["synopsys_vector_numpy"].head(2)
movie_df["synopsys_vector_numpy"].tolist()
np.array(movie_df["synopsys_vector_numpy"].tolist())
from sklearn.preprocessing import StandardScaler

# Standardize each embedding dimension before computing distances
scaler = StandardScaler()
movie_df["synopsys_vector_numpy_scale"] = scaler.fit_transform(np.array(movie_df["synopsys_vector_numpy"].tolist())).tolist()
movie_df.head(1)
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise distances between all scaled document vectors (smaller = more similar)
sim_score = euclidean_distances(movie_df["synopsys_vector_numpy_scale"].tolist(),
                                movie_df["synopsys_vector_numpy_scale"].tolist())
sim_df = pd.DataFrame(sim_score)
sim_df
sim_df.index = movie_df["title"]
sim_df.columns = movie_df["title"]

sim_df
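Euclidean distance on standardized vectors is one choice; cosine similarity is the more common metric for word2vec-style embeddings and needs no scaling. Note it flips the direction: higher means more similar, so a recommendation sort would be descending:

from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(np.array(movie_df["synopsys_vector_numpy"].tolist()))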
sim_df["오토라는 남자"].sort_values
result = pd.DataFrame(sim_df["오토라는 남자"].sort_values()[1:4]).reset_index().values.tolist()
result
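The same lookup generalizes to any title; a small helper along these lines (recommend is a hypothetical name, and the title must exist as a column in sim_df):

def recommend(title, k=3):
    # k nearest titles by euclidean distance, skipping the movie itself at rank 0
    return pd.DataFrame(sim_df[title].sort_values()[1:k + 1]).reset_index().values.tolist()

recommend("오토라는 남자")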
import json
# ensure_ascii=False keeps the Korean titles readable in the JSON string
json.dumps(result, ensure_ascii=False)
movie_df["synopsys_vector"][0]