scrapy / scrapy

Scrapy, a fast high-level web crawling & scraping framework for Python.
https://scrapy.org
BSD 3-Clause "New" or "Revised" License
50.99k stars 10.34k forks source link

Not able to use requests inside Scrapy. #6304

Closed virenramani closed 1 month ago

virenramani commented 1 month ago

I have written Python code that sends a POST request to a URL using the requests library. The code works fine when used outside Scrapy. However, when I tried to use the same code in the Scrapy shell, it did not work and returned an error.

I have attached the code below for reference. The first snippet uses the requests library and returns a 200 response when executed. The second snippet uses Scrapy's Request object to send the POST request. Although it uses the same headers, cookies, and body as the first snippet, it does not work in the Scrapy shell and throws an error.

Please note that I have also attached a screenshot of the error message for reference.

import requests

# Endpoint that receives the attachment-download form.
url = "https://nevadaepro.com/bso/external/bidDetail.sdo"

# URL-encoded form body. Its _csrf value is the same token that is sent in
# the XSRF-TOKEN cookie header below.
payload = '_csrf=7012f86b-0a1f-418a-ab61-5860f1e4521a&mode=download&bidId=04SOS-S2794&docId=04SOS-S2794&currentPage=1&querySql=&downloadFileNbr=193715&itemNbr=0&parentUrl=&fromQuote=&destination='
headers = {
  'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
  'cache-control': 'max-age=0',
  'content-type': 'application/x-www-form-urlencoded',
  'cookie': 'XSRF-TOKEN=7012f86b-0a1f-418a-ab61-5860f1e4521a;',
  'origin': 'https://nevadaepro.com',
  'referer': 'https://nevadaepro.com/bso/external/bidDetail.sdo?docId=04SOS-S2794&external=true&parentUrl=close',
  'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"Windows"',
  'sec-fetch-dest': 'document',
  'sec-fetch-mode': 'navigate',
  'sec-fetch-site': 'same-origin',
  'sec-fetch-user': '?1',
  'upgrade-insecure-requests': '1',
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}

# The form body is bound to `payload` above; the original call passed an
# undefined name (`body`), which raises NameError when this snippet is run
# on its own.
response = requests.request("POST", url, headers=headers, data=payload)
print(response.status_code)

This code returns a 200 response, but it doesn't work when used in scrapy shell.

from scrapy import Request

# Endpoint that receives the attachment-download form.
url = 'https://nevadaepro.com/bso/external/bidDetail.sdo'

# One CSRF token shared by the form body, the XSRF-TOKEN cookie and the
# hand-written cookie header. The original snippet put a *different* token in
# `cookies` than in the body, so the request Scrapy actually sent carried a
# cookie that did not match the form's _csrf value.
csrf_token = '7012f86b-0a1f-418a-ab61-5860f1e4521a'

headers = {
  'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
  'cache-control': 'max-age=0',
  'content-type': 'application/x-www-form-urlencoded',
  # NOTE(review): with cookies enabled, Scrapy's cookie middleware builds the
  # Cookie header from the `cookies` argument, not from this entry; it is
  # kept (and kept consistent) for the case where cookies are disabled.
  'cookie': f'XSRF-TOKEN={csrf_token};',
  'origin': 'https://nevadaepro.com',
  'referer': 'https://nevadaepro.com/bso/external/bidDetail.sdo?docId=04SOS-S2794&external=true&parentUrl=close',
  'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"Windows"',
  'sec-fetch-dest': 'document',
  'sec-fetch-mode': 'navigate',
  'sec-fetch-site': 'same-origin',
  'sec-fetch-user': '?1',
  'upgrade-insecure-requests': '1',
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}

# Cookie handed to Scrapy; must carry the same token as the form body.
cookies = {
    "XSRF-TOKEN": csrf_token,
    }

# URL-encoded form body, identical to the one used with `requests`.
body = f'_csrf={csrf_token}&mode=download&bidId=04SOS-S2794&docId=04SOS-S2794&currentPage=1&querySql=&downloadFileNbr=193715&itemNbr=0&parentUrl=&fromQuote=&destination='
request = Request(
    url=url,
    method='POST',
    dont_filter=True,
    cookies=cookies,
    headers=headers,
    body=body,
)

# `fetch` is provided by the Scrapy shell session, not imported here.
fetch(request)

The requests code does not work inside the Scrapy shell, although it works outside Scrapy. image

from urllib.parse import urlencode

import requests
import scrapy
from scrapy.shell import inspect_response

class NevadaEPro(scrapy.Spider):
    """Spider that lists open bids, follows each bid link and probes the
    attachment-download POST for the bid detail page.

    Flow: ``start_requests`` fetches the search page, ``parse`` follows every
    bid link to ``detail_page`` and requests the next result page via the
    paginator's AJAX form, and ``detail_page`` rebuilds the download form and
    posts it with ``requests``.
    """

    name = 'nevadaepro'

    # Search page that lists the open bids.
    _SEARCH_URL = 'https://nevadaepro.com/bso/view/search/external/advancedSearchBid.xhtml?openBids=true'

    # Header values shared by every request this spider builds.
    _ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    _ACCEPT_LANGUAGE = 'en-GB,en-US;q=0.9,en;q=0.8'
    _SEC_CH_UA = '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"'
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'

    # Headers for plain page navigations (search page and bid detail pages).
    _NAVIGATION_HEADERS = {
        "accept": _ACCEPT_HTML,
        "accept-language": _ACCEPT_LANGUAGE,
        "cache-control": "max-age=0",
        "sec-ch-ua": _SEC_CH_UA,
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": _USER_AGENT,
    }

    # Headers for the paginator's partial/ajax POST.
    _AJAX_HEADERS = {
        "accept": "application/xml, text/xml, */*; q=0.01",
        "accept-language": _ACCEPT_LANGUAGE,
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "faces-request": "partial/ajax",
        "origin": "https://nevadaepro.com",
        "referer": "https://nevadaepro.com/bso/view/search/external/advancedSearchBid.xhtml?openBids=true",
        "sec-ch-ua": _SEC_CH_UA,
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": _USER_AGENT,
        "x-requested-with": "XMLHttpRequest",
    }

    def start_requests(self):
        """Yield the request for the first page of open bids.

        ``meta['num']`` carries the offset of the first row of the page and
        starts at 0.
        """
        yield scrapy.Request(
            url=self._SEARCH_URL,
            headers=self._NAVIGATION_HEADERS,
            meta={'num': 0},
        )

    def parse(self, response):
        """Follow every bid link on a result page, then request the next page.

        Yields one GET request per bid link (handled by ``detail_page``) and,
        while the paginator's "next" control is not disabled, one POST request
        for the following page (handled by this method again).
        """
        # inspect_response(response, self)
        # The first pattern matches HTML-escaped markup, the second matches
        # raw markup; whichever yields links first is used.
        half_urls = (
            response.xpath('//*[@id="bidSearchResultsForm:bidResultId"]').re(r'role="row"&gt;&lt;td role="gridcell"&gt;&lt;a href="(.*?)"')
            or response.xpath('//*[@id="bidSearchResultsForm:bidResultId_data"]').re(r'role\="row"><td role\="gridcell"><a href\="(.*?)"')
        )
        # Default to 0 so a request arriving without 'num' cannot break the
        # offset arithmetic below.
        num = response.meta.get('num', 0)
        next_page = response.xpath('.//*[contains(@class,"ui-paginator-next") and not(contains(@class,"disabled"))]').get()
        # Missing inputs become '' instead of None so that the form body never
        # contains the literal text "None".
        csrf_token = response.xpath('//form/input[@name="_csrf"]/@value').get(default='')
        view_state = response.xpath('//form/input[@name="javax.faces.ViewState"]/@value').get(default='')

        for h_url in half_urls:
            yield scrapy.Request(
                response.urljoin(h_url),
                method='GET',
                headers=self._NAVIGATION_HEADERS,
                callback=self.detail_page,
            )

        if next_page:
            num += len(half_urls)
            table = 'bidSearchResultsForm:bidResultId'
            # urlencode() percent-encodes keys and values, so tokens that
            # contain characters such as '+', '/' or '=' survive transport.
            body = urlencode({
                'javax.faces.partial.ajax': 'true',
                'javax.faces.source': table,
                'javax.faces.partial.execute': table,
                'javax.faces.partial.render': table,
                table: table,
                f'{table}_pagination': 'true',
                f'{table}_first': num,
                f'{table}_rows': 25,
                f'{table}_encodeFeature': 'true',
                'bidSearchResultsForm': 'bidSearchResultsForm',
                '_csrf': csrf_token,
                'openBids': 'true',
                'javax.faces.ViewState': view_state,
            })
            yield scrapy.Request(
                response.url,
                method='POST',
                body=body,
                # Same URL for every page, so the duplicate filter is skipped.
                dont_filter=True,
                headers=self._AJAX_HEADERS,
                meta={'num': num},
                callback=self.parse,
            )

    def detail_page(self, response):
        """Rebuild the download form of a bid detail page and POST it.

        Opens an interactive shell on the response, collects the form's input
        values, posts them with ``requests`` and prints the resulting status
        code. Nothing is returned to Scrapy.
        """
        inspect_response(response, self)

        # NOTE(review): the host differs from the 'origin'/'referer' headers
        # sent below (nevadaepro.com) - confirm this is the intended target.
        url = 'http://example.com/bso/external/bidDetail.sdo'

        def form_value(input_name):
            # Value of the named <input> inside a <form>; '' when absent.
            return response.xpath(f'//form/input[@name="{input_name}"]/@value').get(default='')

        _csrf = form_value('_csrf')
        formdata = {
            '_csrf': _csrf,
            'mode': form_value('mode') or 'download',
            'bidId': form_value('bidId'),
            'docId': form_value('docId'),
            'currentPage': form_value('currentPage'),
            'querySql': form_value('querySql'),
            # File number taken from the downloadFile('<n>') link that follows
            # the "File Attachments:" label.
            'downloadFileNbr': response.xpath('//*[contains(text(),"File Attachments:")]//following::*[1]//@href').re_first(r"downloadFile\('(\d+)'\)", default=''),
            'itemNbr': form_value('itemNbr'),
            'parentUrl': form_value('parentUrl'),
            'fromQuote': form_value('fromQuote'),
            'destination': form_value('destination'),
        }
        # Encode from the dict instead of splitting a hand-built string on
        # '=' and '&', which breaks on values containing those characters.
        body = urlencode(formdata)
        headers = {
            'accept': self._ACCEPT_HTML,
            'accept-language': self._ACCEPT_LANGUAGE,
            'cache-control': 'max-age=0',
            'content-type': 'application/x-www-form-urlencoded',
            'cookie': f'XSRF-TOKEN={_csrf};',
            'origin': 'https://nevadaepro.com',
            # NOTE(review): referer is fixed to one docId regardless of the
            # bid being processed - confirm whether response.url is intended.
            'referer': 'https://nevadaepro.com/bso/external/bidDetail.sdo?docId=04SOS-S2794&external=true&parentUrl=close',
            'sec-ch-ua': self._SEC_CH_UA,
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': self._USER_AGENT,
        }
        # Kept under its own name so the Scrapy `response` argument is not
        # shadowed. The timeout stops a stalled server from blocking the
        # callback indefinitely.
        download_response = requests.request("POST", url, headers=headers, data=body, timeout=30)
        # The backslash before 'W' is written explicitly; the printed text is
        # unchanged, but the literal no longer relies on an invalid escape.
        print('\n\\Working or Not : ', download_response.status_code)
        # NOTE: the Scrapy-native alternative that was tried here is to yield
        # scrapy.FormRequest(url, method='POST', formdata=formdata,
        # headers=headers, cookies={"XSRF-TOKEN": _csrf},
        # meta={'dont_merge_cookies': True}, callback=self.saving_files)
        # instead of calling `requests` from inside the callback.

    def saving_files(self, response):
        """Open an interactive shell on the download response for inspection."""
        inspect_response(response, self)
wRAR commented 1 month ago

This is the Scrapy issue tracker; please ask questions about your code on suitable platforms. If you think you've found a bug in Scrapy, you need to provide a minimal reproducible example of it.