doxgxxn opened 1 year ago
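The full pipeline: tokenize the Korean synopses with KoNLPy, train a skip-gram Word2Vec model, average the word vectors into a per-movie document vector stored in MySQL, and rank movies by Euclidean distance between the scaled vectors.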
```python
import os

import pymysql
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
# KoNLPy's Twitter tagger for Korean morphological analysis
# (renamed Okt in recent KoNLPy releases; Twitter remains as an alias).
from konlpy.tag import Twitter

# KoNLPy needs a JVM; use a raw string so the backslashes are not treated as escapes.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-11"

twitter = Twitter()

def dataprocessing(data):
    """Keep only the (stemmed) nouns and adjectives of a synopsis."""
    tagged_review = twitter.pos(data, stem=True)
    stems = [word for word, pos in tagged_review if pos in ("Noun", "Adjective")]
    return " ".join(stems)

# Tokenizer sanity check on an arbitrary Korean sentence.
dataprocessing("아버지가 방에 들어갔다 나왔다 아니고 배고프다 예쁘게 치킨 훔쳐")

# Connect to the MySQL instance that holds the movie table.
con = pymysql.connect(
    user="admin",
    password="Ehdrbs123$",
    host="database-01.canftvohs2xd.ap-northeast-2.rds.amazonaws.com",
    port=3306,
    db="movie_db",
    charset="utf8",
)
cursor = con.cursor()

# Load the movie table, clean each synopsis, and tokenize it.
movie_df = pd.read_sql("select * from movie_tbl;", con)
movie_df.head(3)

movie_df["synopsys_clear"] = movie_df["synopsys"].apply(dataprocessing)
movie_df["synopsys_clear_list"] = movie_df["synopsys_clear"].apply(lambda x: x.split())
movie_df.head(1)

# Train a skip-gram Word2Vec model (gensim 4.x API) on the token lists.
word2vec = Word2Vec(
    movie_df["synopsys_clear_list"],
    sg=1,
    vector_size=80,
    window=3,
    min_count=2,
    workers=10,
)

# Spot-check the vocabulary and the learned vectors.
word2vec.wv.key_to_index["살인"]   # vocabulary index of "murder"
word2vec.wv["액션"]                # vector for "action"
word2vec.wv.most_similar("액션")   # nearest neighbours of "action"

word2vec_words = word2vec.wv.key_to_index.keys()

# Build a document vector per synopsis by averaging the vectors of its
# in-vocabulary words, then store it in MySQL as raw float32 bytes.
for index in range(len(movie_df)):
    num = movie_df.loc[index, "num"]
    line = movie_df.loc[index, "synopsys_clear_list"]
    doc2vec = None
    count = 0
    for word in line:
        if word in word2vec_words:
            count += 1
            if doc2vec is None:
                doc2vec = word2vec.wv[word]
            else:
                doc2vec = doc2vec + word2vec.wv[word]
    if doc2vec is not None:
        doc2vec = doc2vec / count
        # tobytes() replaces the deprecated ndarray.tostring().
        sql = "update movie_tbl set synopsys_vector=%s where num=%s"
        cursor.execute(sql, (doc2vec.tobytes(), num))
        con.commit()

# Read one row back and decode the stored bytes into a float32 array.
cursor.execute("select title, synopsys_vector from movie_tbl where num=1")
for row in cursor.fetchall():
    print(row[0], row[1])
    # np.frombuffer() replaces the deprecated np.fromstring() for binary data.
    print(np.frombuffer(row[1], dtype="float32"))
    print("=" * 100)

# Reload the table, now including the stored vectors, and decode them all.
movie_df = pd.read_sql("select * from movie_tbl", con)
movie_df["synopsys_vector_numpy"] = movie_df["synopsys_vector"].apply(
    lambda x: np.frombuffer(x, dtype="float32")
)
movie_df["synopsys_vector_numpy"].head(2)

# Standardize the document vectors feature-wise before computing distances.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
movie_df["synopsys_vector_numpy_scale"] = scaler.fit_transform(
    np.array(movie_df["synopsys_vector_numpy"].tolist())
).tolist()
movie_df.head(1)

# Pairwise Euclidean distances between all movies (smaller = more similar).
from sklearn.metrics.pairwise import euclidean_distances
sim_score = euclidean_distances(
    movie_df["synopsys_vector_numpy_scale"].tolist(),
    movie_df["synopsys_vector_numpy_scale"].tolist(),
)
sim_df = pd.DataFrame(sim_score, index=movie_df["title"], columns=movie_df["title"])
sim_df

# Three closest movies to "오토라는 남자" ("A Man Called Otto");
# position 0 is the movie itself, so slice from 1. Note sort_values()
# needs the call parentheses.
result = (
    pd.DataFrame(sim_df["오토라는 남자"].sort_values()[1:4])
    .reset_index()
    .values.tolist()
)
result

import json
json.dumps(result, ensure_ascii=False)

movie_df["synopsys_vector"][0]
```
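As a side note, the per-document averaging loop above can be written more compactly. A minimal sketch, assuming the same gensim 4.x `word2vec` model and the tokenized `synopsys_clear_list` column (`doc_vector` is a hypothetical helper name, not part of the original code):

```python
import numpy as np

def doc_vector(tokens, wv):
    # Hypothetical helper: average the Word2Vec vectors of all
    # in-vocabulary tokens; returns None when nothing is in the vocabulary.
    vecs = [wv[w] for w in tokens if w in wv.key_to_index]
    return np.mean(vecs, axis=0) if vecs else None

# Usage sketch:
# doc_vector(movie_df.loc[0, "synopsys_clear_list"], word2vec.wv)
```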