Open LLMApple opened 10 months ago
我改了一下代码,能跑起来了: `# -- coding: utf-8 -- import scrapy import logging import urllib import os import glob import re import pymongo from scrapy.selector import Selector from neo4j import GraphDatabase import logging import time logfilename = time.ctime(time.time()).replace(' ', '') if not os.path.exists('logs\'): os.mkdir('logs\') logfile_name = time.strftime('%d-%b-%y %H-%M-%S', time.localtime()) log_file_path = os.path.join('logs', f'{logfile_name}.log')
logging.basicConfig(filename=log_file_path, filemode='a+', format='%(levelname)s - %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
class BaikeSpider(scrapy.Spider): name = 'baike' allowed_domains = ['baike.baidu.com'] start_urls = ['https://baike.baidu.com/item/文汇报'] db = pymongo.MongoClient("mongodb://127.0.0.1:27017/")["db_kg"] db_baike = db['db_baike'] db_triples = db['db_triples'] olds = set([item['_id'] for item in db_baike.find({}, {'_id': 1})]) if len(olds) > 0: start_urls = ['https://baike.baidu.com/item/'+olds.pop()]
uri = "bolt://localhost:7687"
user = "neo4j" # 替换成您的用户名
password = "123" # 替换成您的密码
driver = GraphDatabase.driver(uri, auth=(user, password), encrypted=False)
def add_node(self, tx, name1, relation, name2):
tx.run("MERGE (a:Node {name: $name1}) "
"MERGE (b:Node {name: $name2}) "
"MERGE (a)-[:"+relation+"]-> (b)",
name1=name1, name2=name2)
print("Nodes and relationship added to Neo4j.")
def parse(self, response):
# print(response.url)
item_name = re.sub('/', '', re.sub('https://baike.baidu.com/item/',
'', urllib.parse.unquote(response.url)))
# 爬取过的直接忽视
if item_name in self.olds:
return
# 将网页内容存入mongodb
try:
text = ''.join(response.xpath('//div[@class="main-content"]').xpath('//div[@class="para"]//text()').getall())
if not text:
text = item_name
self.db_baike.insert_one(
{
'_id': item_name,
'text': text
})
except pymongo.errors.DuplicateKeyError:
pass
# 更新爬取过的item集合
self.olds.add(item_name)
# 爬取页面内的item
items = set(response.xpath(
'//a[contains(@href, "/item/")]/@href').re(r'/item/[A-Za-z0-9%\u4E00-\u9FA5]+'))
for item in items:
new_url = 'https://baike.baidu.com'+urllib.parse.unquote(item)
new_item_name = re.sub(
'/', '', re.sub('https://baike.baidu.com/item/', '', new_url))
if new_item_name not in self.olds:
yield response.follow(new_url, callback=self.parse)
# 处理三元组
entity = ''.join(response.xpath(
'//h1/text()').getall()).replace('/', '')
attrs = response.xpath('//dt[@class="basicInfoItem_Ql5xB itemName_bc1nm"]/text()').getall()
values = response.xpath('//dd[@class="basicInfoItem_Ql5xB itemValue_Kzb4E"]//text()').getall()
if len(attrs) != len(values):
return
with self.driver.session() as session:
try:
for attr, value in zip(attrs, values):
if attr == "" or value == "":
continue
# attr
temp = Selector(text=attr).xpath(
'//dt//text()').getall()
relation_type = re.sub(r'\s+', '', attr)
value = re.sub(r'\s+', '', value)
print(relation_type)
# value
# value = ''.join(Selector(text=value).xpath(
# '//dd/text()|//dd/a//text()').getall())
try:
value = value.replace('\n', '')
logging.warning(entity+'_'+attr+'_'+value)
self.db_triples.insert_one({
"_id": entity+'_'+attr+'_'+value,
"item_name": entity,
"attr": attr,
"value": value, }
)
except pymongo.errors.DuplicateKeyError:
pass
# print(f"session: {session}")
session.write_transaction(
self.add_node, entity, relation_type, value)
except Exception as e:
print(e)
logging.error('\n---'.join(attrs) +
'\n_________________'+'\n---'.join(values))
`
像爬取的text为空,还有就是添加三元组的时候attrs和values也是空的,所以加不到三元组里