
Automatic Web Content Extraction and Word Cloud Visualization #26


I recently received a batch of data to analyze. The routine approach would be to parse it with bs4 first and extract the content, but I had heard about automatic web content extraction long ago, so I decided to try it out. CxExtractor comes from cx-extractor-python.

There are several approaches to automatic web content extraction that I currently know of. The one tried in this post is based on line-block distribution: CxExtractor.

Read -> extract -> filter, and that's all there is to it.

import re
import chardet
import requests

class CxExtractor:
    """cx-extractor implemented in Python"""

    __text = []
    # __threshold = 186
    __indexDistribution = []
    # __blocksWidth = 3

    def __init__(self, threshold=86, blocksWidth=3):
        # blocksWidth: number of consecutive lines counted as one block;
        # threshold: minimum characters per block for it to count as body text
        self.__blocksWidth = blocksWidth
        self.__threshold = threshold

    def getText(self, content):
        # Locate the main text by looking at the distribution of character
        # counts over consecutive line blocks.
        if self.__text:
            self.__text = []
        lines = content.split('\n')
        for i in range(len(lines)):
            # blank out whitespace-only lines
            if lines[i] == ' ' or lines[i] == '\n':
                lines[i] = ''
        self.__indexDistribution.clear()
        # character count of each block of blocksWidth consecutive lines
        for i in range(0, len(lines) - self.__blocksWidth):
            wordsNum = 0
            for j in range(i, i + self.__blocksWidth):
                lines[j] = re.sub(r'\s', '', lines[j])  # drop whitespace before counting
                wordsNum += len(lines[j])
            self.__indexDistribution.append(wordsNum)
        start = -1
        end = -1
        boolstart = False
        boolend = False
        # a content region starts where a block rises above the threshold and is
        # followed by non-empty blocks; it ends where the blocks drop back to zero
        for i in range(len(self.__indexDistribution) - 3):
            if (self.__indexDistribution[i] > self.__threshold and (not boolstart)):
                if (self.__indexDistribution[i + 1] != 0 or self.__indexDistribution[i + 2] != 0 or self.__indexDistribution[i + 3] != 0):
                    boolstart = True
                    start = i
                    continue
            if (boolstart):
                if (self.__indexDistribution[i] == 0 or self.__indexDistribution[i + 1] == 0):
                    end = i
                    boolend = True
            tmp = []
            if (boolend):
                for ii in range(start, end + 1):
                    if (len(lines[ii]) < 5):
                        continue
                    tmp.append(lines[ii] + "\n")
                block = "".join(tmp)
                # skip copyright / footer blocks
                if ("Copyright" in block or "版权所有" in block):
                    continue
                self.__text.append(block)
                boolstart = boolend = False
        result = "".join(self.__text)
        return result

    def replaceCharEntity(self, htmlstr):
        # replace common HTML character entities with their literal characters
        CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                         'lt': '<', '60': '<',
                         'gt': '>', '62': '>',
                         'amp': '&', '38': '&',
                         'quot': '"', '34': '"', }
        re_charEntity = re.compile(r'&#?(?P<name>\w+);')
        sz = re_charEntity.search(htmlstr)
        while sz:
            entity = sz.group()
            key = sz.group('name')
            try:
                htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
                sz = re_charEntity.search(htmlstr)
            except KeyError:
                # unknown entity: replace it with an empty string
                htmlstr = re_charEntity.sub('', htmlstr, 1)
                sz = re_charEntity.search(htmlstr)
        return htmlstr

    def getHtml(self, url):
        # fetch a page and decode it with the encoding detected by chardet
        response = requests.get(url)
        encode_info = chardet.detect(response.content)
        response.encoding = encode_info['encoding']
        return response.text

    def readHtml(self, path, coding):
        # read a local HTML file with the given encoding
        with open(path, encoding=coding) as page:
            return page.read()

    def filter_tags(self, htmlstr):
        # strip script/style/textarea/nav blocks, comments and all remaining
        # tags, leaving plain text only
        re_nav = re.compile(r'<nav.+</nav>')
        re_cdata = re.compile(r'//<!\[CDATA\[.*//\]\]>', re.DOTALL)
        re_script = re.compile(
            r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.DOTALL | re.I)
        re_style = re.compile(
            r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.DOTALL | re.I)
        re_textarea = re.compile(
            r'<\s*textarea[^>]*>.*?<\s*/\s*textarea\s*>', re.DOTALL | re.I)
        re_br = re.compile(r'<br\s*?/?>')
        re_h = re.compile(r'</?\w+.*?>', re.DOTALL)        # any remaining tag
        re_comment = re.compile(r'<!--.*?-->', re.DOTALL)  # HTML comments
        re_space = re.compile(' +')
        s = re_cdata.sub('', htmlstr)
        s = re_nav.sub('', s)
        s = re_script.sub('', s)
        s = re_style.sub('', s)
        s = re_textarea.sub('', s)
        s = re_br.sub('', s)
        s = re_h.sub('', s)
        s = re_comment.sub('', s)
        s = re.sub(r'\t', '', s)
        # s = re.sub(' ', '', s)
        s = re_space.sub(' ', s)  # collapse runs of spaces
        s = self.replaceCharEntity(s)
        return s
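
For a live page, getHtml, filter_tags and getText chain together directly: fetch the page, strip the markup, then pull out the main text. A minimal usage sketch, with a placeholder URL that is not from the original post:

from CxExtractor import CxExtractor

cx = CxExtractor()                        # default threshold of 86
html = cx.getHtml('https://example.com')  # read: fetch and decode the page
content = cx.filter_tags(html)            # filter: strip tags, scripts, styles
text = cx.getText(content)                # extract: line-block distribution
print(text)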

You can click through to the repo for the details; it is not hard to understand, and in my scenario it was also simple to apply. The results, however, were not great for my use case, because the HTML here is in report form: the extracted phone data contains basically everything, e.g. WeChat chat records, deleted data, and so on. Report pages are very regular, with no single large body of text, and most of the data is neatly structured, so the accuracy isn't 99%. Still, it's already quite good, and with a little extra processing it works, which saved me a lot of time.

 import glob
 import codecs
 import queue
 import threading
 from CxExtractor import CxExtractor

 cx = CxExtractor(threshold=133)

 uqueue = queue.Queue()

 from bs4 import BeautifulSoup
 import lxml

 def parserfile(f):
     # manual route: pull the text out of the report tables with BeautifulSoup
     # f = open(fpath).read()
     with open(f, encoding='utf-16le') as c:
         content = c.read()

     soup = BeautifulSoup(content, 'lxml')
     texts = []
     for item in soup.select('.selfTable'):
         try:
             text = item.find_all('a')[0].text
             texts.append(text)
         except Exception as e:
             print(e)

     for item in soup.select('.OuterTable'):
         if item:
             for sub in item.find_all('td'):
                 texts.append(sub.text)
     return texts
 # parserfile(html)

 # with codecs.open(html,encoding="utf-16") as f:
 #     parserfile(f.read())

 # parserfile('../html/Contents0.html')
 # get('.OuterTable')
 htmls = glob.glob("../html/*.html")

 def parserfile_auto(htmlpath):
     # automatic route: read -> filter tags -> extract main text with CxExtractor
     html = cx.readHtml(htmlpath, coding='utf-16le')
     content = cx.filter_tags(html)
     s = cx.getText(content)
     return s

 import os
 for html in htmls:
     text = parserfile_auto(html)
     # text = parserfile(html)
     with open('./phone-text-auto/'+os.path.basename(html) + '.txt', 'w', encoding='utf-8') as textfile:
         # t = "\n".join(text)
         textfile.write(text)
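
The "little extra processing" mentioned above depends on what the reports look like; purely as an illustration (the minimum length and deduplication rules here are assumptions, not what was actually done), it could be as simple as:

 # Illustrative cleanup only; the exact post-processing is not shown in the post.
 # Drops very short lines and consecutive duplicate lines from the extracted text.
 def clean_extracted(text, min_len=4):
     cleaned = []
     prev = None
     for line in text.splitlines():
         line = line.strip()
         if len(line) < min_len or line == prev:
             continue
         cleaned.append(line)
         prev = line
     return "\n".join(cleaned)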

The extracted text can then be segmented into words, keywords extracted, and the result rendered as a word cloud for display. One gotcha: if you want to display Chinese, wordcloud cannot render it out of the box; you have to point it at a suitable font file via font_path.


 #coding:utf-8
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 import jieba
 import re
 import jieba.analyse
 # jieba.load_userdict('./dict.ji')
 jieba.enable_parallel(4)

 with open('./zhongwen.txt', errors='ignore') as f:
     text_from_file_with_apath = f.read()

 def stopwordslist(filepath):
     # load one stopword per line
     with open(filepath, 'r', encoding='utf-8') as f:
         stopwords = [line.strip() for line in f]
     return stopwords

 stopwords = stopwordslist('./dict.jieba')

 for i in stopwords:
     # str.replace returns a new string, so the result has to be reassigned
     text_from_file_with_apath = text_from_file_with_apath.replace(i, " ")
     jieba.add_word(i)

 font_path = "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc"
 wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
 wl_space_split = " ".join(wordlist_after_jieba)

 my_wordcloud = WordCloud(font_path=font_path).generate(wl_space_split)
 tags = jieba.analyse.extract_tags(text_from_file_with_apath, topK=100)

 print(",".join(tags))

 plt.imshow(my_wordcloud)
 plt.axis("off")
 plt.show()
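
The keyword list from extract_tags can also drive the cloud directly: wordcloud accepts a frequency dict via generate_from_frequencies, so the jieba weights can be reused instead of the raw text. A small sketch (the output filename is just an example):

 # Build the cloud from jieba's keyword weights instead of the raw text.
 weighted_tags = jieba.analyse.extract_tags(
     text_from_file_with_apath, topK=100, withWeight=True)
 freqs = dict(weighted_tags)

 tag_cloud = WordCloud(font_path=font_path).generate_from_frequencies(freqs)
 tag_cloud.to_file('tag_cloud.png')  # example output path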

I won't include the word cloud image here. As for association analysis, the directions I think are worth pursuing include anomaly detection, transfer-record analysis, chat-record analysis, and text topic models. Analyzing data like this clearly reminds me of the Trump Twitter analysis in the Neo4j sandbox; there is a lot of similarity between the two.
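
On the text-topic-model idea, one rough starting point could be jieba tokenization feeding a gensim LDA model. This is only a sketch under assumptions: `docs` stands in for the extracted report texts, and the topic count is arbitrary.

 # Sketch only: `docs` is a placeholder for the extracted texts and
 # num_topics=5 is an arbitrary choice, not part of the original analysis.
 import jieba
 from gensim import corpora, models

 docs = ["示例文本一", "示例文本二"]  # replace with the extracted report texts
 tokenized = [[w for w in jieba.cut(d) if w.strip()] for d in docs]

 dictionary = corpora.Dictionary(tokenized)
 corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

 lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
 for topic in lda.print_topics():
     print(topic)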