lsy641111 opened 1 year ago
The LinkExtractor does node-based extraction on the HTML. Its logic is: first, collect all the links present in the response's doc (for example, each `href` node), then match them against the `allow` regex given in our `Rule` to keep the qualifying links (see the red box in Figure 1). The key premise for this logic to hold is that the links exist in the HTML text itself (in the `body` node, not in `script`); if a link is not in the doc, it cannot be extracted.
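For reference, here is a minimal sketch of that `Rule`-based approach, assuming a `CrawlSpider` is used; the `allow` patterns are illustrative guesses for this site, not the project's real rules. It can only find links that appear as real `href` nodes in the body:

```python
# Minimal CrawlSpider sketch of the Rule/LinkExtractor logic described above.
# The allow regexes are illustrative assumptions, not taken from the project.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ZgsybCrawlSpider(CrawlSpider):
    name = "zgsyb_crawl"
    start_urls = ['http://app.zgsyb.com.cn/paper/layout/202208/26/l01.html']

    rules = (
        # Follow layout pages (l01.html, l02.html, ...) found as <a href> nodes.
        Rule(LinkExtractor(allow=r'l\d+\.html'), follow=True),
        # Treat article pages (c<digits>.html) as items.
        Rule(LinkExtractor(allow=r'/c/\d+/\d+/c\d+\.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {'url': response.url}
```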
The actual spider bypasses LinkExtractor entirely and pulls the links out of `response.text` with regular expressions:

```python
# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin

import scrapy
from scrapy import FormRequest  # no form data is sent, so a plain scrapy.Request would work too

from newscrapy.items import NewscrapyItem


class MySpider(scrapy.Spider):
    name = "zgsyb"

    def start_requests(self):
        urls = [
            'http://app.zgsyb.com.cn/paper/layout/202208/26/l01.html',
        ]
        for url in urls:
            yield FormRequest(url, dont_filter=True)

    def parse(self, response, **kwargs):
        # Layout links (l01.html, l02.html, ...) are pulled from the raw text
        # because they cannot be extracted as <a href> nodes.
        next_urls = re.findall(r'l\d+\.html', response.text)
        for url in next_urls:
            url_next = f'http://app.zgsyb.com.cn/paper/layout/202208/26/{url}'
            yield FormRequest(url_next, callback=self.parse_page)

    def parse_page(self, response, **kwargs):
        # Article links appear as relative paths of the form
        # ../../../c/<digits>/<digits>/c<digits>.html in the raw text.
        next_urls = re.findall(r'\.\./\.\./\.\./c/\d+/\d+/c\d+\.html', response.text)
        for url in next_urls:
            href = url.replace('../', '')
            url_next = urljoin('http://app.zgsyb.com.cn/paper/', href)
            yield FormRequest(url_next, callback=self.parse_item)

    def parse_item(self, response):
        title = response.xpath('//h2[@id="Title"]//text()').extract()
        content = response.xpath('//div[@id="ozoom"]//text()').extract()
        author = ''  # not populated by this spider
        date = ''
        url = response.url
        item = NewscrapyItem()
        # \s already covers \n, \r and \t; strip all whitespace from the joined text.
        item['title'] = re.sub(r'\s', '', ','.join(title))
        item['content'] = re.sub(r'\s', '', ','.join(content))
        item['author'] = re.sub(r'\s', '', ','.join(author))
        item['date'] = ''.join(date)
        item['url'] = url
        item['newspaper'] = self.name
        item['imgs'] = ''
        yield item
```
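To check the premise directly, one can compare what LinkExtractor finds in the parsed doc with what a raw regex finds in the response text. This is a hypothetical helper, not part of the original spider; it assumes the article links live only in the raw text (e.g. inside `script`):

```python
# Hypothetical check of the premise above: links that exist only in the raw
# text are invisible to LinkExtractor, which walks the parsed <a href> nodes,
# but a regex over response.text still sees them.
import re

from scrapy.linkextractors import LinkExtractor


def compare_extraction(response):
    dom_links = [link.url for link in
                 LinkExtractor(allow=r'c\d+\.html').extract_links(response)]
    text_links = re.findall(r'\.\./\.\./\.\./c/\d+/\d+/c\d+\.html', response.text)
    return dom_links, text_links
```

If `dom_links` comes back empty while `text_links` is populated, the links never appear as `href` nodes in the body, which is why the `Rule`/LinkExtractor route fails on these pages. The spider itself runs as usual with `scrapy crawl zgsyb`.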