xiaoxiaosuaxuan / newscrapy

newscrapy
1 star, 6 forks — source link

用scrapy调试能输出内容但是运行后的results是空的 #26

Open lsy641111 opened 1 year ago

lsy641111 commented 1 year ago

http://app.zgsyb.com.cn/paper/layout/202208/26/l01.html

msLiu98 commented 1 year ago

image

image

# -*- coding: utf-8 -*-
import scrapy
from scrapy import FormRequest
import re
from newscrapy.items import NewscrapyItem
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from newscrapy.tools import dateGen
from urllib.parse import urljoin

class mySpider(scrapy.Spider):
    """Spider for the zgsyb.com.cn e-paper (app.zgsyb.com.cn).

    Crawl flow:
      1. ``start_requests`` seeds the front layout page.
      2. ``parse`` extracts the other layout pages (``lNN.html``).
      3. ``parse_page`` extracts article links (``c/.../cNNN.html``).
      4. ``parse_item`` scrapes title/content into a ``NewscrapyItem``.
    """

    name = "zgsyb"

    def start_requests(self):
        """Seed the crawl with the front-page layout URL."""
        urls = [
            'http://app.zgsyb.com.cn/paper/layout/202208/26/l01.html'
        ]
        for url in urls:
            # Plain GET; FormRequest added nothing here since no form
            # data is posted, and an unintended method change is a common
            # cause of "works in shell, empty results in crawl".
            yield scrapy.Request(url, dont_filter=True)

    def parse(self, response, **kwargs):
        """Find every layout page (l01.html, l02.html, ...) on the front page."""
        # Raw string + escaped dot: the original 'l\d+.html' was a non-raw
        # string (SyntaxWarning on modern Python) and '.' matched any char.
        next_urls = re.findall(r'l\d+\.html', response.text)
        # findall returns duplicates; dict.fromkeys dedupes while keeping order.
        for url in dict.fromkeys(next_urls):
            url_next = f'http://app.zgsyb.com.cn/paper/layout/202208/26/{url}'
            yield scrapy.Request(url_next, callback=self.parse_page)

    def parse_page(self, response, **kwargs):
        """Find every article link on a layout page and request it."""
        # Escape the literal dots and slash-relative prefix explicitly.
        next_urls = re.findall(r'\.\./\.\./\.\./c/\d+/\d+/c\d+\.html', response.text)
        for url in dict.fromkeys(next_urls):
            # Strip the '../' prefixes, then resolve against the paper root.
            href = url.replace('../', '')
            url_next = urljoin('http://app.zgsyb.com.cn/paper/', href)
            yield scrapy.Request(url_next, callback=self.parse_item)

    def parse_item(self, response):
        """Scrape one article page into a NewscrapyItem."""
        title = response.xpath('//h2[@id="Title"]//text()').extract()
        content = response.xpath('//div[@id="ozoom"]//text()').extract()
        # Author/date are not exposed on this site's article pages.
        author = ''
        date = ''
        url = response.url

        item = NewscrapyItem()
        # \s already covers \n, \r and \t, so a single class suffices
        # (the original '[\n\r\s\t]' was redundant and non-raw).
        item['title'] = re.sub(r'\s', '', ','.join(title))
        item['content'] = re.sub(r'\s', '', ','.join(content))
        item['author'] = author
        item['date'] = date
        item['url'] = url
        item['newspaper'] = self.name
        item['imgs'] = ''
        yield item