elliotgao2 / gain

Web crawling framework based on asyncio.
GNU General Public License v3.0
2.04k stars 207 forks source link

add cssParser #38

Closed allphfa closed 6 years ago

allphfa commented 6 years ago

Parser.py

class BaseParser(object):
    """Hold the configuration and URL bookkeeping state of a parser.

    Args:
        rule: Pattern that identifies the links to follow. ``cssParser``
            passes it to PyQuery as a CSS selector.
        item: Optional item class associated with this parser; stored
            unchanged. Defaults to ``None``.
        attr: Name of the element attribute that ``cssParser`` reads from
            every match. Defaults to ``'href'``.
    """

    def __init__(self, rule, item=None, attr='href'):
        # Caller-supplied configuration, stored as given.
        self.rule, self.item, self.attr = rule, item, attr

        # URL bookkeeping containers, all initially empty. Their exact roles
        # are defined by code outside this excerpt (names suggest: queued,
        # in-progress, already-seen and finished URLs -- TODO confirm).
        self.pre_parse_urls = Queue()
        self.filter_urls = set()
        self.parsing_urls = []
        self.done_urls = []

... (rest of Parser.py omitted) ...

class cssParser(BaseParser):
    """Parser that collects URLs from elements matched by a CSS selector."""

    def abstract_urls(self, html):
        """Return the ``self.attr`` value of each element matching ``self.rule``.

        Args:
            html: Markup to search; handed to PyQuery unchanged.

        Returns:
            A list of attribute values, one per matching element that
            actually carries the attribute, in match order.
        """
        matched = pq(html)(self.rule)
        values = (pq(element).attr(self.attr) for element in matched)
        # ``attr()`` gives None when the element lacks the attribute; such
        # entries are not URLs, so they are dropped instead of being returned.
        return [value for value in values if value is not None]

Example usage:


from gain import Css, Item, Parser, Spider, cssParser
from pyquery import PyQuery as pq

class Post(Item):
    """Item holding the details scraped from one video page."""

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')

    # The remaining fields share one selector; each ``process_func`` picks the
    # ``<p>`` at a fixed index from the matched set.
    videoType = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: ' '.join([link.text for link in pq(pqObj[0])('a')]),
    )
    videoAuthor = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: ' '.join([link.text for link in pq(pqObj[1])('a')]),
    )
    videoNotes = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: pq(pqObj[2]).text(),
    )
    videoLang = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: pq(pqObj[3]).text(),
    )
    videoRegion = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: pq(pqObj[4]).text(),
    )

    # NOTE (from the original author): the extracted title is a list.
    async def save(self):
        """Print every field, but only when all six were set on the item."""
        required = (
            'videoTitle', 'videoType', 'videoAuthor',
            'videoNotes', 'videoLang', 'videoRegion',
        )
        # Incomplete items are skipped silently.
        if not all(hasattr(self, field) for field in required):
            return

        print('片名:%s' % self.videoTitle)    # label: "title"
        print('类型:%s' % self.videoType)     # label: "type"
        print('主演:%s' % self.videoAuthor)   # label: "starring"
        print('%s' % self.videoNotes)
        print('%s' % self.videoLang)
        print('%s' % self.videoRegion)
        print('-------')

class MySpider(Spider):
    """Example spider that chains three CSS-selector parsers."""

    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'

    # Each parser reads the ``href`` of the links its selector matches.
    # Only the last one is bound to an item class (``Post``).
    parsers = [
        # Links inside ``.ui-pages`` whose href starts with "/L/lilunpian".
        cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
        # Links inside ``.primary-list`` whose href starts with "/V/".
        cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
        # Links inside ``.play-list`` whose href starts with "/player/".
        cssParser('.play-list a[href^="/player/"]', Post, attr='href'),
    ]

# Run the spider only when this file is executed as a script, so that merely
# importing the module does not trigger a crawl.
if __name__ == '__main__':
    MySpider.run()