Open Hyhyhyhyhyhyh opened 4 years ago
-- Result table for the word-frequency analysis below: one row per
-- (source db, table, column, word) with the word's occurrence count.
create table word_frequency(
id int auto_increment primary key,
-- row insertion time, defaulted by the server
datatime timestamp not null default current_timestamp,
-- provenance of the analyzed text
db_name varchar(40) not null,
table_name varchar(40) not null,
col_name varchar(40) not null,
word varchar(4000) not null,
-- number of occurrences of `word` in the analyzed column
frequency int not null
);
import re
import jieba
import jieba.posseg as pseg
from collections import Counter
# from itertools import chain
import pandas as pd
from sqlalchemy import create_engine
# from sqlalchemy.orm import sessionmaker, scoped_session
# 配置数据库连接
def db_connection(database, host='127.0.0.1', port=3306, user='', passwd='',
                  charset='utf8mb4'):
    """Create a SQLAlchemy engine for a MySQL database.

    The connection settings were previously hard-coded; they are now
    keyword parameters whose defaults reproduce the original values, so
    existing ``db_connection(name)`` calls behave exactly as before.

    Args:
        database: name of the MySQL schema to connect to.
        host: MySQL server address (default localhost).
        port: MySQL server port.
        user: login user name.
        passwd: login password.
        charset: connection character set (utf8mb4 handles full Unicode).

    Returns:
        A configured ``sqlalchemy.engine.Engine``.
    """
    engine = create_engine(
        f'mysql+mysqldb://{user}:{passwd}@{host}:{port}/{database}?charset={charset}',
        echo=True,        # log every emitted SQL statement
        max_overflow=0,   # never open connections beyond pool_size
        pool_size=5,      # connection pool size
        pool_timeout=30,  # seconds to wait for a free pooled connection
        pool_recycle=-1,  # -1: never force-recycle pooled connections
    )
    return engine
def analyze_word_fequency(database, table_name, col_name):
    """词频分析 — word-frequency analysis.

    Reads the text column ``col_name`` of ``table_name`` in ``database``,
    strips punctuation, segments the Chinese text with jieba, keeps only
    noun tokens, removes stopwords, and appends the 30 most frequent
    words to the ``word_frequency`` table in ``analysis_db``.

    Args:
        database: source database name.
        table_name: source table name.
        col_name: text column to analyze.

    Returns:
        None on success; the caught exception instance on failure
        (best-effort contract preserved from the original).
    """
    query_engine = db_connection(database)
    insert_engine = db_connection('analysis_db')
    # NOTE(review): enable_parallel is POSIX-only and raises
    # NotImplementedError on Windows — confirm the deployment platform.
    jieba.enable_parallel(4)
    try:
        # NOTE(review): identifiers cannot be bound as SQL parameters, so
        # this f-string is injectable — never pass untrusted table/column
        # names into this function.
        sql = f'select {col_name} from {table_name}'
        result = pd.read_sql(sql, con=query_engine)
        # Strip ASCII and full-width CJK punctuation plus whitespace.
        # Compile once instead of re-parsing the pattern per row.
        punct = re.compile(
            r"[!\"#$%&'()*+,-.:;<=>?@[\\\]^_`{|}~——!,。?、¥…():;【】《》‘’“”\s]+")
        data = [punct.sub("", r) for r in result[col_name]]
        # Segment with POS tags. Build all rows first and construct the
        # DataFrame once: DataFrame.append in a loop was O(n^2) and was
        # removed entirely in pandas >= 2.0.
        rows = [{'word': tok.word, 'flag': tok.flag}
                for line in data
                for tok in pseg.cut(line)]
        tokens = pd.DataFrame(rows, columns=['word', 'flag'])
        # Keep noun POS tags only (the original list contained 'ns' twice).
        noun_flags = {'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz'}
        n_words = tokens[tokens.flag.isin(noun_flags)]
        # Load stopwords into a set: O(1) membership instead of a list scan
        # per token.
        with open('./stopword.txt', encoding='UTF-8') as f:
            stopword = {line.strip() for line in f}
        words_new = [word for word in n_words['word'] if word not in stopword]
        common = Counter(words_new).most_common()
        df = pd.DataFrame(common, columns=['word', 'frequency'])
        df['db_name'] = database
        df['table_name'] = table_name
        df['col_name'] = col_name
        # Persist only the 30 most frequent words.
        df.head(30).to_sql('word_frequency', con=insert_engine,
                           if_exists='append', index=False)
    except Exception as e:
        # Best-effort: report the failure to the caller instead of raising,
        # matching the original contract.
        return e
    finally:
        query_engine.dispose()
        insert_engine.dispose()
V1.0
说明
分析指定数据库-表-列中的文本内容(中文),统计单词出现的频数,结果可以制作词云图
结果示例
Python代码