Closed — allphfa closed this issue 6 years ago
With concurrency set to 50, a huge number of duplicate links get crawled.
If you don't believe it, try it yourself.
"""Crawl video listings from xinxin46.com with the `gain` async spider and
persist one row per video into a local SQLite database via SQLAlchemy."""
import ast
import re

import requests
from gain import Css, Item, Parser, Spider, cssParser, Xpath
from pyquery import PyQuery as pq
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:////home/dde/test.db', echo=False)
Base = declarative_base()


class videoInfo(Base):
    """ORM row holding the scraped attributes of a single video page."""

    __tablename__ = 'users'

    id = Column(Integer, primary_key=True)
    videoTitle = Column(String)     # title text
    videoType = Column(String)      # genre tags
    videoAuthor = Column(String)    # cast / author line
    videoNotes = Column(String)
    videoLang = Column(String)
    videoRegion = Column(String)
    videoPlayPage = Column(String)  # "link----name----href" lines
    videoPlayLink = Column(String)  # "player----name----line----url" lines


Session = sessionmaker(bind=engine)
session = Session()
Base.metadata.create_all(engine)


class getVideoInfo(Item):
    """gain Item: CSS/XPath extraction rules plus the async save hook."""

    def filterPlayLink(link):
        # NOTE: used as a process_func below, so `link` is the raw XPath
        # result list (the script src attribute), not `self`.
        url = 'http://www.xinxin46.com%s' % link[0]
        # timeout so a stalled server cannot hang a crawl worker forever
        content = requests.get(url, timeout=30).text
        # The play list is embedded in the page as a nested list literal.
        # SECURITY: the original used eval() on remote content, which
        # executes arbitrary code; literal_eval parses data only.
        playUrl = ast.literal_eval(re.findall(r'\[\[.*?\]\]\]', content)[0])[0][1]
        parts = []
        for entry in playUrl:
            # each entry looks like "name$url$player"
            line_name, play_url, player = entry.split('$')
            parts.append('player----{}----{}----{}\n'.format(player, line_name, play_url))
        return ''.join(parts)

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    # The first <li> of .intro holds several <p> rows; the numeric index
    # selects which row each field reads. Rows 0/1 may contain <a> links,
    # in which case the link texts are joined; otherwise plain text is used.
    videoType = Css('.intro > li:nth-child(1) p',
                    process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[0])('a')]) if len(pq(pqObj[0])('a')) > 0 else pq(pqObj).text())
    videoAuthor = Css('.intro > li:nth-child(1) p',
                      process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[1])('a')]) if len(pq(pqObj[1])('a')) > 0 else pq(pqObj).text())
    videoNotes = Css('.intro > li:nth-child(1) p',
                     process_func=lambda pqObj: pq(pqObj[2]).text())
    videoLang = Css('.intro > li:nth-child(1) p',
                    process_func=lambda pqObj: pq(pqObj[3]).text())
    videoRegion = Css('.intro > li:nth-child(1) p',
                      process_func=lambda pqObj: pq(pqObj[4]).text())
    videoPlayPage = Css('.play-list li a[href^="/player/"]',
                        process_func=lambda pqObj: '\n'.join(['link----' + pq(x).text() + '----' + pq(x).attr('href') for x in pqObj]))
    videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src',
                          process_func=filterPlayLink)

    async def save(self):
        """Persist one fully populated item; silently skip partial ones."""
        fields = ('videoTitle', 'videoType', 'videoAuthor', 'videoNotes',
                  'videoLang', 'videoRegion', 'videoPlayPage', 'videoPlayLink')
        if not all(hasattr(self, name) for name in fields):
            return
        session.add(videoInfo(**{name: getattr(self, name) for name in fields}))
        session.commit()


class MySpider(Spider):
    """Follow category pagination, visit video pages, extract getVideoInfo."""

    concurrency = 50
    encoding = 'gbk'  # site is GBK-encoded, not UTF-8
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [
        cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),   # pagination
        cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),  # video pages
        cssParser('.play-list a[href^="/player/"]', getVideoInfo, attr='href'),
    ]


MySpider.run()
session.close()
With concurrency set to 50, a huge number of duplicate links get crawled.
If you don't believe it, try it yourself.