alecxe / scrapy-fake-useragent

Random User-Agent middleware based on fake-useragent
MIT License

Can't scrapy-fake-useragent work well with pymongo? #33

Closed · doi-h closed this issue 3 years ago

doi-h commented 3 years ago

Hi guys :) I have one issue. When I don't use pymongo in Scrapy (after setting up scrapy-fake-useragent in settings), Scrapy runs fine. But when I use pymongo, Scrapy never works; it keeps stalling. Can you tell me what the problem is? :)

This is the Scrapy code to test:

import scrapy
import pymongo

# MongoDB connection created at module import time,
# i.e. as soon as Scrapy loads this spider module
connection = pymongo.MongoClient()
db_link = connection.CoupangLinkTestDB
db_product = connection.CoupangLinkProductTestDB

class TesterSpider(scrapy.Spider):
    name = 'tester4'

    def start_requests(self):
        yield scrapy.Request(url='https://www.amazon.jp', callback=self.router_start, dont_filter=True)

    def router_start(self, response):
        link = [
            'https://www.amazon.com/s/ref=s9_acss_bw_cts_Computer_T1_w?fst=as%3Aoff&rh=n%3A16225007011%2Cn%3A172456&bbn=16225007011&ie=UTF8&qid=1487012920&rnid=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-4&pf_rd_r=PF0Q3FXYDG3N1QMXFA8Y&pf_rd_t=101&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_i=16225007011',
            'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A193870011&dc&fst=as%3Aoff&pf_rd_i=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_r=PF0Q3FXYDG3N1QMXFA8Y&pf_rd_s=merchandised-search-4&pf_rd_t=101&qid=1487012920&rnid=16225007011&ref=s9_acss_bw_cts_Computer_T2_w',
            'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A13896617011&dc&fst=as%3Aoff&pf_rd_i=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_r=PF0Q3FXYDG3N1QMXFA8Y&pf_rd_s=merchandised-search-4&pf_rd_t=101&qid=1487012920&rnid=16225007011&ref=s9_acss_bw_cts_Computer_T3_w',
            'https://www.amazon.com/s?bbn=16225007011&rh=n%3A16225007011%2Cn%3A172504&dc&fst=as%3Aoff&pf_rd_i=16225007011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=74069509-93ef-4a3c-8dca-a9e3fa773a64&pf_rd_r=PF0Q3FXYDG3N1QMXFA8Y&pf_rd_s=merchandised-search-4&pf_rd_t=101&qid=1487012920&rnid=16225007011&ref=s9_acss_bw_cts_Computer_T4_w',
        ]

        # schedule the same set of links at four priority levels
        for i in link:
            yield scrapy.Request(url=i, callback=self.product_1, priority=20000, dont_filter=True, meta={'link': i})
            print('product_1 request')

        for i in link:
            yield scrapy.Request(url=i, callback=self.product_2, priority=1000, dont_filter=True, meta={'link': i})
            print('product_2 request')

        for i in link:
            yield scrapy.Request(url=i, callback=self.product_3, priority=100, dont_filter=True, meta={'link': i})
            print('product_3 request')

        for i in link:
            yield scrapy.Request(url=i, callback=self.product_4, priority=10, dont_filter=True, meta={'link': i})
            print('product_4 request')

        # db_start = 40
        # db_end = 50

        # for i in db_link.product_6.find({})[db_start:db_end]:  

        #     yield scrapy.Request(url=i['link'], callback=self.product_1,  priority=20000, dont_filter=True, meta={'link':i['link']})
        #     print('product_1 request')

        # for i in db_link.product_6.find({})[db_start:db_end]:  

        #     yield scrapy.Request(url=i['link'], callback=self.product_2,  priority=1000, dont_filter=True, meta={'link':i['link']})
        #     print('product_2 request')

        # for i in db_link.product_6.find({})[db_start:db_end]:  

        #     yield scrapy.Request(url=i['link'], callback=self.product_3,  priority=100, dont_filter=True, meta={'link':i['link']})
        #     print('product_3 request')

        # for i in db_link.product_6.find({})[db_start:db_end]:  

        #     yield scrapy.Request(url=i['link'], callback=self.product_4,  priority=10, dont_filter=True, meta={'link':i['link']})
        #     print('product_4 request')

    def product_1(self, response):
        print('product_1 run')
        # re-request the same link (note the trailing space appended to the URL)
        yield scrapy.Request(url=response.meta['link'] + ' ', callback=self.product, dont_filter=True)

    def product_2(self, response):
        print('product_2 run')
        yield scrapy.Request(url=response.meta['link'] + ' ', callback=self.product, dont_filter=True)

    def product_3(self, response):
        print('product_3 run')
        yield scrapy.Request(url=response.meta['link'] + ' ', callback=self.product, dont_filter=True)

    def product_4(self, response):
        print('product_4 run')
        yield scrapy.Request(url=response.meta['link'] + ' ', callback=self.product, dont_filter=True)

    def product(self, response):
        # stand-in for real parsing work
        for i in range(1, 1000):
            print(i)
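
A side note on the pymongo part above: the MongoClient is created at import time, before the crawl even starts. The pattern from the Scrapy docs instead keeps the connection in an item pipeline that opens and closes with the spider. A minimal sketch of that pattern (the class name and the 'items' collection are placeholders, not from this project):

import pymongo

class MongoPipeline:
    def open_spider(self, spider):
        # connect only when the crawl actually starts
        self.client = pymongo.MongoClient()
        self.db = self.client.CoupangLinkTestDB

    def close_spider(self, spider):
        # release the connection when the crawl ends
        self.client.close()

    def process_item(self, item, spider):
        # write each scraped item to a collection
        self.db.items.insert_one(dict(item))
        return item

It would be enabled with something like ITEM_PIPELINES = {'myproject.pipelines.MongoPipeline': 300} in settings.py (the module path here is hypothetical).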

This is settings.py:

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
    'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
}

FAKEUSERAGENT_PROVIDERS = [
    'scrapy_fake_useragent.providers.FakeUserAgentProvider',
    'scrapy_fake_useragent.providers.FakerProvider',  
    'scrapy_fake_useragent.providers.FixedUserAgentProvider',
]
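
For reference, the providers are tried in the order listed, and FixedUserAgentProvider just returns whatever USER_AGENT is set to in the same file. A minimal sketch of the related settings (values are only examples, not recommendations):

# last-resort fixed UA string, used by FixedUserAgentProvider
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

# optionally pin the UA family for the first two providers
FAKE_USERAGENT_RANDOM_UA_TYPE = 'firefox'   # for FakeUserAgentProvider
FAKER_RANDOM_UA_TYPE = 'chrome'             # for FakerProvider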
alecxe commented 3 years ago

Hi @doi-h! This would definitely be a good question for https://stackoverflow.com/. Make sure to provide errors/logs with your question as well. Thanks!
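
For example, running the spider like this writes the full debug log to a file you can attach to the question (spider name taken from the code above):

scrapy crawl tester4 -L DEBUG --logfile run.log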