elliotgao2 / gain

Web crawling framework based on asyncio.
GNU General Public License v3.0
2.03k stars 207 forks source link

Add document's own parsing #37

Closed allphfa closed 6 years ago

allphfa commented 6 years ago

Using Firefox 57, you can copy the XPath and CSS paths

selector.py

import re

from lxml import etree
from pyquery import PyQuery as pq

class Selector:
    """Base class for an extraction rule (CSS / XPath / regex).

    Holds the rule string, an optional attribute name to read instead of
    the element text, and an optional post-processing callable.
    """

    def __init__(self, rule, attr=None, process_func=None):
        self.rule = rule
        self.attr = attr
        self.process_func = process_func

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, self.rule)

    # str() and repr() render identically: "ClassName(rule)".
    __str__ = __repr__

    def parse_detail(self, html):
        """Extract a value from *html*; subclasses must implement this."""
        raise NotImplementedError

class Css(Selector):
    """Selector that extracts a value with a CSS rule via PyQuery.

    The special rule 'document' hands the whole document to process_func
    instead of a selection, letting the caller parse it however it wants.
    Returns None when nothing matches.
    """

    def parse_detail(self, html):

        d = pq(html)

        if self.process_func:
            try:
                if self.rule != 'document':
                    d = d(self.rule)
                results = self.process_func(d)
            except IndexError:
                return None
            return results if results else None

        try:
            if self.attr is None:
                return d(self.rule)[0].text
            # d(rule)[0] is an lxml Element, which has .get() but no .attr
            # method — the previous .attr(self.attr, None) call would raise
            # AttributeError. Also guard with IndexError like the text path.
            return d(self.rule)[0].get(self.attr, None)
        except IndexError:
            return None

class Xpath(Selector):
    """Selector that extracts a value with an XPath expression via lxml.

    The special rule 'document' hands the parsed tree to process_func
    instead of an xpath result. Returns None when nothing matches.
    """

    def parse_detail(self, html):
        root = etree.HTML(html)

        if self.process_func:
            try:
                target = root if self.rule == 'document' else root.xpath(self.rule)
                results = self.process_func(target)
            except IndexError:
                return None
            return results if results else None

        try:
            first = root.xpath(self.rule)[0]
        except IndexError:
            return None
        return first.text if self.attr is None else first.get(self.attr, None)

class Regex(Selector):
    """Selector that returns the first regex match from the raw HTML."""

    def parse_detail(self, html):
        matches = re.findall(self.rule, html)
        return matches[0] if matches else None

test.py (The importance of processing functions)

In some cases the crawler's extraction rules are complex, and the caller needs to process the selection themselves.

from gain import Css, Item, Parser, Spider

class Post(Item):
    """Example Item whose title field is a list built by a process_func."""

    # process_func receives the PyQuery selection and maps each node to its
    # text, so self.title ends up as a list of strings.
    title = Css('html body div#content div.layout.fn-clear div#primary.mainbox.fn-left div.ui-box.l-h div.ui-cnt ul.primary-list.min-video-list.fn-clear li h5 a', process_func=lambda nodes: [node.text for node in nodes])

    async def save(self):
        if not hasattr(self, 'title'):
            print('error')
            return
        # title is a list
        for entry in self.title:
            print(entry)

class MySpider(Spider):
    """Example spider wiring the Post item to a follow-link pattern."""

    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    # Raw string: '\d' in a non-raw literal is an invalid escape sequence
    # (DeprecationWarning; SyntaxWarning since Python 3.12). Value unchanged.
    parsers = [Parser(r'/L/lilunpian\d+\.html', Post)]

MySpider.run()
allphfa commented 6 years ago

It is recommended to add an option to pass data between crawled URLs, preserving the relationship between linked pages.

For example, with pages A > B > C: there may be only one download link on C, while B stores C's name information and A stores B's classification information. A single movie's information is spread across different links, which is very troublesome. When A is crawled, an associated ID is generated and passed to B, and then B passes it on to C.

for example


class One(Item):
    """First page in the A > B > C chain; contributes the title."""

    title = Css('.title')

    async def save(self, info):
        if hasattr(self, 'title'):
            # Compare to None with 'is', and fix the mis-indented assignment.
            if info is None:
                info = dict()
            info['title'] = self.title
            return info  # Return transfer information
        else:
            return None

class Two(Item):
    """Second page in the A > B > C chain; contributes the video type."""

    videoType = Css('.videoType')

    async def save(self, info):
        if hasattr(self, 'videoType'):
            # Compare to None with 'is', and fix the mis-indented assignment.
            if info is None:
                info = dict()
            info['videoType'] = self.videoType
            return info  # Return transfer information
        else:
            return None

class videoSave(Item):
    """Final page in the A > B > C chain; persists the accumulated info."""

    videoLink = Css('.videoLink')

    async def save(self, info):
        if hasattr(self, 'videoLink'):
            if info is not None:  # 'is not None' instead of '!= None'
                info['videoLink'] = self.videoLink
                # save func — NOTE(review): saveFunc is not defined in this
                # snippet; presumably supplied by the surrounding project.
                saveFunc(info)
                return info
        return None