Hyhyhyhyhyhyh / data-analysis

Python data analysis

Word frequency analysis of database table fields #2

Open Hyhyhyhyhyhyh opened 4 years ago

Hyhyhyhyhyhyh commented 4 years ago

V1.0

Description

Analyzes the (Chinese) text content of a specified database / table / column, counts how often each word appears, and stores the results, which can then be used to build a word cloud.

Example results

+----+---------------------+----------+------------+----------+--------+-----------+
| id | datatime            | db_name  | table_name | col_name | word   | frequency |
+----+---------------------+----------+------------+----------+--------+-----------+
|  1 | 2020-01-10 10:24:03 | test     | testtab    | content  | 时     |        16 |
|  2 | 2020-01-10 10:24:03 | test     | testtab    | content  | 梦     |        10 |
|  3 | 2020-01-10 10:24:03 | test     | testtab    | content  | 雨     |         9 |
|  4 | 2020-01-10 10:24:03 | test     | testtab    | content  | 水     |         8 |
|  5 | 2020-01-10 10:24:03 | test     | testtab    | content  | 歌     |         7 |
|  6 | 2020-01-10 10:24:03 | test     | testtab    | content  | 花红   |         6 |
|  7 | 2020-01-10 10:24:03 | test     | testtab    | content  | 鱼     |         6 |
|  8 | 2020-01-10 10:24:03 | test     | testtab    | content  | 亭     |         6 |
|  9 | 2020-01-10 10:24:03 | test     | testtab    | content  | 湖面   |         6 |
| 10 | 2020-01-10 10:24:03 | test     | testtab    | content  | 竹     |         6 |
| 11 | 2020-01-10 10:24:03 | test     | testtab    | content  | 情     |         6 |
| 12 | 2020-01-10 10:24:03 | test     | testtab    | content  | 路     |         6 |
| 13 | 2020-01-10 10:24:03 | test     | testtab    | content  | 晚风   |         6 |
| 14 | 2020-01-10 10:24:03 | test     | testtab    | content  | 灯     |         6 |
| 15 | 2020-01-10 10:24:03 | test     | testtab    | content  | 时间   |         6 |
| 16 | 2020-01-10 10:24:03 | test     | testtab    | content  | 寒风   |         6 |
| 17 | 2020-01-10 10:24:03 | test     | testtab    | content  | 游人   |         6 |
| 18 | 2020-01-10 10:24:03 | test     | testtab    | content  | 流水   |         5 |
| 19 | 2020-01-10 10:24:03 | test     | testtab    | content  | 宁静   |         5 |
| 20 | 2020-01-10 10:24:03 | test     | testtab    | content  | 天空   |         5 |
| 21 | 2020-01-10 10:24:03 | test     | testtab    | content  | 笑语   |         5 |
| 22 | 2020-01-10 10:24:03 | test     | testtab    | content  | 夕阳   |         4 |
| 23 | 2020-01-10 10:24:03 | test     | testtab    | content  | 清风   |         4 |
| 24 | 2020-01-10 10:24:03 | test     | testtab    | content  | 秋风   |         4 |
| 25 | 2020-01-10 10:24:03 | test     | testtab    | content  | 西湖   |         4 |
| 26 | 2020-01-10 10:24:03 | test     | testtab    | content  | 感觉   |         4 |
| 27 | 2020-01-10 10:24:03 | test     | testtab    | content  | 美     |         4 |
| 28 | 2020-01-10 10:24:03 | test     | testtab    | content  | 伊人   |         4 |
| 29 | 2020-01-10 10:24:03 | test     | testtab    | content  | 燕子   |         4 |
| 30 | 2020-01-10 10:24:03 | test     | testtab    | content  | 红灯   |         4 |
+----+---------------------+----------+------------+----------+--------+-----------+

Python code

import re
import jieba
import jieba.posseg as pseg
from collections import Counter
# from itertools import chain
import pandas as pd
import MySQLdb

def analyze_db_connect(database):
    # Connection settings (fill in your own credentials)
    mysql_host    = ''
    mysql_port    = 3306  # default MySQL port, matching the v2.0 config below
    conn_user     = ''
    conn_password = ''
    conn = MySQLdb.connect(host=mysql_host,
                        port=mysql_port,
                        user=conn_user,
                        passwd=conn_password,
                        db=database,
                        charset='utf8',
                        use_unicode=True)
    return conn

conn = analyze_db_connect('')
curs = conn.cursor()

table_name = ''
col_name = ''

sql = "select {0} from {1}".format(col_name, table_name)
curs.execute(sql)
result = curs.fetchall()

# Strip punctuation (ASCII and full-width Chinese)
pattern = r"[!\"#$%&'()*+,-.:;<=>?@[\\\]^_`{|}~——!,。?、¥…():;【】《》‘’“”\s]+"
data = [re.sub(pattern, "", r[0]) for r in result]

# Tokenize with part-of-speech tagging
# (rows are collected in a list; DataFrame.append was removed in pandas 2.0)
rows = []
for line in data:
    for words in pseg.cut(line):
        rows.append({'word': words.word, 'flag': words.flag})
w = pd.DataFrame(rows, columns=['word', 'flag'])

# Keep only noun part-of-speech tags
n_words = w[w.flag.isin(['n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz'])]

# Load the stop word list
stopword = []
with open('C:\\Users\\81491\\Desktop\\数据分析\\数据挖掘\\文本分类\\stopword.txt', encoding='UTF-8') as f:
    for line in f:
        stopword.append(line.strip())

words_new = [word for word in n_words['word'] if word not in stopword]

c = Counter(words_new)
# Word frequencies in descending order
common = c.most_common()
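
The (word, count) pairs in common can feed a word cloud directly. A minimal sketch using the third-party wordcloud package; the font path is an assumption (Chinese text needs a font with CJK glyphs), so adjust it to your system:

from wordcloud import WordCloud

# common is the list of (word, count) tuples produced above
freqs = dict(common)

wc = WordCloud(
    font_path='C:\\Windows\\Fonts\\simhei.ttf',  # assumed CJK-capable font
    width=800,
    height=600,
    background_color='white',
)
wc.generate_from_frequencies(freqs)
wc.to_file('wordcloud.png')
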
Hyhyhyhyhyhyh commented 4 years ago

Stop words

Stop words are words that occur very frequently in text but contribute little to semantic analysis. Such words are usually removed before counting; this shrinks the amount of data to process and keeps meaningless words from dominating the frequency counts. A small illustration follows.
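
A minimal, self-contained illustration; the token list and stop word set here are made-up examples (real lists, like the stopword.txt used above, are much larger):

# Hypothetical tokens from a segmented sentence
tokens = ['我们', '今天', '在', '西湖', '看', '了', '夕阳']

# Hypothetical stop word set
stopwords = {'我们', '在', '了', '看'}

kept = [t for t in tokens if t not in stopwords]
print(kept)  # ['今天', '西湖', '夕阳']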

Hyhyhyhyhyhyh commented 4 years ago

v2.0, function version

Table creation statement

create table word_frequency(
    id         int           auto_increment primary key,
    datatime   timestamp     not null default current_timestamp,
    db_name    varchar(40)   not null,
    table_name varchar(40)   not null,
    col_name   varchar(40)   not null,
    word       varchar(4000) not null,
    frequency  int           not null
);
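
Once the function below has populated this table, the stored frequencies can be read back for inspection. A sketch using pandas; it assumes the db_connection helper defined in the next code block, and the database/table/column names are the ones from the example results above:

import pandas as pd

# Read back the top 10 words for one analyzed column
engine = db_connection('analysis_db')
top10 = pd.read_sql(
    "select word, frequency from word_frequency "
    "where db_name = 'test' and table_name = 'testtab' and col_name = 'content' "
    "order by frequency desc limit 10",
    con=engine,
)
print(top10)
engine.dispose()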

Python code

import re
import jieba
import jieba.posseg as pseg
from collections import Counter
# from itertools import chain
import pandas as pd
from sqlalchemy import create_engine
# from sqlalchemy.orm import sessionmaker, scoped_session

# Configure the database connection
def db_connection(database):
    host    = '127.0.0.1'
    user    = ''
    passwd  = ''
    port    = 3306
    charset = 'utf8mb4'

    engine = create_engine(
        f'mysql+mysqldb://{user}:{passwd}@{host}:{port}/{database}?charset={charset}',
        echo=True,              # log the emitted SQL statements
        max_overflow=0,         # max connections allowed beyond pool_size
        pool_size=5,            # connection pool size
        pool_timeout=30,        # seconds to wait for a pooled connection before raising
        pool_recycle=-1,        # seconds before a pooled connection is recycled (-1 = never)
    )
    # Session = scoped_session(sessionmaker(bind=engine, expire_on_commit=False))
    return engine

def analyze_word_frequency(database, table_name, col_name):
    '''Word frequency analysis
    Analyzes the word frequency of the given column of a table in the given
    database and stores the result in the analysis database.
    '''
    query_engine = db_connection(database)
    insert_engine = db_connection('analysis_db')

    jieba.enable_parallel(4)  # parallel tokenization (not supported on Windows)

    try:
        sql = f'select {col_name} from {table_name}'
        result = pd.read_sql(sql, con=query_engine)

        # Strip punctuation (ASCII and full-width Chinese)
        pattern = r"[!\"#$%&'()*+,-.:;<=>?@[\\\]^_`{|}~——!,。?、¥…():;【】《》‘’“”\s]+"
        data = [re.sub(pattern, "", r) for r in result[col_name]]

        # Tokenize with part-of-speech tagging
        rows = []
        for line in data:
            for words in pseg.cut(line):
                rows.append({'word': words.word, 'flag': words.flag})
        w = pd.DataFrame(rows, columns=['word', 'flag'])
        # Keep only noun part-of-speech tags
        n_words = w[w.flag.isin(['n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz'])]

        # Filter out stop words
        stopword = []
        with open('./stopword.txt', encoding='UTF-8') as f:
            for line in f:
                stopword.append(line.strip())

        words_new = [word for word in n_words['word'] if word not in stopword]

        c = Counter(words_new)
        common = c.most_common()

        df = pd.DataFrame(common, columns=['word', 'frequency'])
        df['db_name']    = database
        df['table_name'] = table_name
        df['col_name']   = col_name
        df.head(30).to_sql('word_frequency', con=insert_engine, if_exists='append', index=False)
    except Exception as e:
        return e
    finally:
        query_engine.dispose()
        insert_engine.dispose()
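
A hypothetical invocation, assuming the credentials in db_connection are filled in and analysis_db contains the word_frequency table created above; it reads test.testtab.content and appends the 30 most frequent nouns to analysis_db.word_frequency:

analyze_word_frequency('test', 'testtab', 'content')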