elliotgao2 / gain

Web crawling framework based on asyncio.
GNU General Public License v3.0

Duplicated links bug #39

Closed: allphfa closed this issue 6 years ago

allphfa commented 6 years ago

With concurrency set to 50, the spider fetches a huge number of duplicated links (a ridiculous amount).

If you don't believe it, try it yourself with the script below (a quick duplicate count against the resulting test.db is sketched after it).

from gain import Css, Item, Parser, Spider, cssParser, Xpath
from pyquery import PyQuery as pq
import re
import requests

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String

engine = create_engine('sqlite:////home/dde/test.db', echo=False)
Base = declarative_base()

class videoInfo(Base):
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    videoTitle = Column(String)
    videoType = Column(String)
    videoAuthor = Column(String)
    videoNotes = Column(String)
    videoLang = Column(String)
    videoRegion = Column(String)
    videoPlayPage = Column(String)
    videoPlayLink = Column(String)

Session = sessionmaker(bind=engine)
session = Session()
Base.metadata.create_all(engine)

class getVideoInfo(Item):

    # Passed as a plain process_func below, so it receives the matched value, not self.
    def filterPlayLink(link):
        # Fetch the external player script and pull the play list out of its JS array literal.
        url = 'http://www.xinxin46.com%s' % link[0]
        content = requests.get(url).text
        playUrl = eval(re.findall(r'\[\[.*?\]\]\]', content)[0])[0][1]
        result = str()
        for x in playUrl:
            line, playUrl, player = x.split('$')
            result += 'player----{}----{}----{}\n'.format(player, line, playUrl)
        # result = re.findall(r'/player/.*?/', content)[0][1:-1]+'$$$$'+ result
        return result
    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css('.intro > li:nth-child(1) p',
                    process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[0])('a')]) if len(pq(pqObj[0])('a')) > 0 else pq(pqObj).text())
    videoAuthor = Css('.intro > li:nth-child(1) p',
                      process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[1])('a')]) if len(pq(pqObj[1])('a')) > 0 else pq(pqObj).text())
    videoNotes = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[2]).text())
    videoLang = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[3]).text())
    videoRegion = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[4]).text())
    videoPlayPage = Css('.play-list li a[href^="/player/"]',
                        process_func=lambda pqObj: '\n'.join(['link----' + pq(x).text() + '----' + pq(x).attr('href') for x in pqObj]))

    videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src', process_func=filterPlayLink)

    async def save(self):
        # Persist a row only when every field was extracted successfully.
        fields = ('videoTitle', 'videoType', 'videoAuthor', 'videoNotes',
                  'videoLang', 'videoRegion', 'videoPlayPage', 'videoPlayLink')
        if all(hasattr(self, f) for f in fields):
            """
            if self.videoPlayLink.find('qvod') > -1:
                return

            print('Title: %s' % self.videoTitle)
            print('Type: %s' % self.videoType)
            print('Cast: %s' % self.videoAuthor)
            print('%s' % self.videoNotes)
            print('%s' % self.videoLang)
            print('%s' % self.videoRegion)
            print('%s' % self.videoPlayPage)
            print('%s' % self.videoPlayLink)
            print('-------')
            """
            global session
            addInfo = videoInfo(videoTitle=self.videoTitle,
                                videoType=self.videoType,
                                videoAuthor=self.videoAuthor,
                                videoNotes=self.videoNotes,
                                videoLang=self.videoLang,
                                videoRegion=self.videoRegion,
                                videoPlayPage=self.videoPlayPage,
                                videoPlayLink=self.videoPlayLink)
            session.add(addInfo)
            session.commit()

class MySpider(Spider):
    concurrency = 50
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
               cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
               cssParser('.play-list a[href^="/player/"]', getVideoInfo, attr='href'),
               ]

MySpider.run()

session.close()
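For reference (not part of the original script): once the run has finished, the amount of duplication can be measured by grouping the stored rows by their play page. A minimal sketch, assuming the videoInfo model and Session factory defined above; if the duplicated links are detail pages, each re-crawl shows up as an extra row:

from sqlalchemy import func

check = Session()
duplicates = (check.query(videoInfo.videoPlayPage, func.count(videoInfo.id))
              .group_by(videoInfo.videoPlayPage)
              .having(func.count(videoInfo.id) > 1)
              .all())
for page, count in duplicates:
    print('%s stored %d times' % (page, count))
check.close()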

'''
import requests

a= requests.get('http://www.xinxin46.com/player/baishilingyincangjurudepusuOLshimingantizhidenvhaiFSET680/index-0-0.html').text
print(pq(a)('script[src^="/playdata/"]'))
'''
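This does not address the root cause in gain, but as a stop-gap the save() coroutine above could refuse to insert a page it has already stored. A rough sketch (not the reporter's code), assuming the same videoInfo model and module-level session, with videoPlayPage used as the uniqueness key purely for illustration:

    async def save(self):
        fields = ('videoTitle', 'videoType', 'videoAuthor', 'videoNotes',
                  'videoLang', 'videoRegion', 'videoPlayPage', 'videoPlayLink')
        if not all(hasattr(self, f) for f in fields):
            return
        # Skip the insert when a row for this play page already exists.
        already = session.query(videoInfo).filter_by(videoPlayPage=self.videoPlayPage).first()
        if already is not None:
            return
        session.add(videoInfo(**{f: getattr(self, f) for f in fields}))
        session.commit()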