Closed allphfa closed 6 years ago
class BaseParser(object):
    """Base class holding the bookkeeping state shared by URL parsers.

    Args:
        rule: Selection rule used by subclasses to locate links
            (``cssParser`` passes it to PyQuery as a CSS selector).
        item: Optional item class associated with this parser; stored as-is.
        attr: Name of the element attribute that ``cssParser.abstract_urls``
            reads from each matched element. Defaults to ``'href'``.
    """

    def __init__(self, rule, item=None, attr='href'):
        self.rule = rule
        self.item = item
        self.parsing_urls = []
        self.pre_parse_urls = Queue()
        self.filter_urls = set()
        self.done_urls = []
        self.attr = attr

    # NOTE: the remaining BaseParser methods were elided ("ellipsis") in the
    # original snippet and are intentionally not reproduced here.


class cssParser(BaseParser):
    """Parser that extracts URLs from HTML using a CSS selector rule."""

    def abstract_urls(self, html):
        """Return the ``self.attr`` value of every element matching ``self.rule``.

        Args:
            html: Markup (or anything PyQuery accepts) to search.

        Returns:
            A list of attribute values in document order. Elements that do
            not carry the attribute are skipped, so the list never contains
            ``None``.
        """
        matched = pq(html)(self.rule)
        values = (pq(node).attr(self.attr) for node in matched)
        # PyQuery yields None for a missing attribute; such entries are not
        # URLs and would only trip up whoever consumes the list.
        return [value for value in values if value is not None]
from gain import Css, Item, Parser, Spider, cssParser
from pyquery import PyQuery as pq

# CSS rule shared by every field that is read from the intro paragraphs.
_INTRO_PARAGRAPHS = '.intro > li:nth-child(1) > p'

# Attributes that must all be present on a Post before it is printed,
# checked in this order.
_POST_FIELDS = (
    'videoTitle',
    'videoType',
    'videoAuthor',
    'videoNotes',
    'videoLang',
    'videoRegion',
)


def _anchor_texts(index):
    """Build a process_func joining the text of each <a> under match ``index``."""

    def extract(pqObj):
        anchors = pq(pqObj[index])('a')
        return ' '.join([anchor.text for anchor in anchors])

    return extract


def _paragraph_text(index):
    """Build a process_func returning the text of match ``index``."""

    def extract(pqObj):
        return pq(pqObj[index]).text()

    return extract


class Post(Item):
    """Item for a video detail page; each field is read via a CSS rule.

    The ``process_func`` callables receive an indexable object (presumably
    the PyQuery selection matched by the rule -- confirm against the Css
    implementation) and pick one matched element by position.
    """

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css(_INTRO_PARAGRAPHS, process_func=_anchor_texts(0))
    videoAuthor = Css(_INTRO_PARAGRAPHS, process_func=_anchor_texts(1))
    videoNotes = Css(_INTRO_PARAGRAPHS, process_func=_paragraph_text(2))
    videoLang = Css(_INTRO_PARAGRAPHS, process_func=_paragraph_text(3))
    videoRegion = Css(_INTRO_PARAGRAPHS, process_func=_paragraph_text(4))

    # Original author's note: the title value is a list.
    async def save(self):
        """Print the scraped fields, but only when every field is present."""
        if not all(hasattr(self, field) for field in _POST_FIELDS):
            return
        print('片名:%s' % self.videoTitle)
        print('类型:%s' % self.videoType)
        print('主演:%s' % self.videoAuthor)
        print('%s' % self.videoNotes)
        print('%s' % self.videoLang)
        print('%s' % self.videoRegion)
        print('-------')


class MySpider(Spider):
    """Spider configuration: listing pages -> detail pages -> player pages."""

    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [
        # Pagination links on the listing page.
        cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
        # Links from the listing to each video page.
        cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
        # Player links; the only parser with an item class (Post) attached.
        cssParser('.play-list a[href^="/player/"]', Post, attr='href'),
    ]


MySpider.run()
Parser.py
Example: