lixiang0 / WEB_KG

爬取百度百科中文页面,抽取三元组信息,构建中文知识图谱
http://kg.rubenxiao.com
937 stars 189 forks source link

爬到的东西为空 #30

Open LLMApple opened 10 months ago

LLMApple commented 10 months ago

像爬取的text为空,还有就是添加三元组的时候attrs和values也是空的,所以加不到三元组里

LLMApple commented 10 months ago

我改了一下代码,能跑起来了: `# -- coding: utf-8 -- import scrapy import logging import urllib import os import glob import re import pymongo from scrapy.selector import Selector from neo4j import GraphDatabase import logging import time logfilename = time.ctime(time.time()).replace(' ', '') if not os.path.exists('logs\'): os.mkdir('logs\') logfile_name = time.strftime('%d-%b-%y %H-%M-%S', time.localtime()) log_file_path = os.path.join('logs', f'{logfile_name}.log')

logging.basicConfig(filename=log_file_path, filemode='a+', format='%(levelname)s - %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')

class BaikeSpider(scrapy.Spider): name = 'baike' allowed_domains = ['baike.baidu.com'] start_urls = ['https://baike.baidu.com/item/文汇报'] db = pymongo.MongoClient("mongodb://127.0.0.1:27017/")["db_kg"] db_baike = db['db_baike'] db_triples = db['db_triples'] olds = set([item['_id'] for item in db_baike.find({}, {'_id': 1})]) if len(olds) > 0: start_urls = ['https://baike.baidu.com/item/'+olds.pop()]

uri = "bolt://localhost:7687"
user = "neo4j"  # 替换成您的用户名
password = "123"  # 替换成您的密码
driver = GraphDatabase.driver(uri, auth=(user, password), encrypted=False)

def add_node(self, tx, name1, relation, name2):
    tx.run("MERGE (a:Node {name: $name1}) "
           "MERGE (b:Node {name: $name2}) "
           "MERGE (a)-[:"+relation+"]-> (b)",
           name1=name1, name2=name2)

    print("Nodes and relationship added to Neo4j.")

def parse(self, response):
    # print(response.url)
    item_name = re.sub('/', '', re.sub('https://baike.baidu.com/item/',
                                       '', urllib.parse.unquote(response.url)))
    # 爬取过的直接忽视
    if item_name in self.olds:
        return
    # 将网页内容存入mongodb
    try:
        text = ''.join(response.xpath('//div[@class="main-content"]').xpath('//div[@class="para"]//text()').getall())
        if not text:
            text = item_name
        self.db_baike.insert_one(
            {
                '_id': item_name,
                'text': text
            })
    except pymongo.errors.DuplicateKeyError:
        pass
    # 更新爬取过的item集合
    self.olds.add(item_name)
    # 爬取页面内的item
    items = set(response.xpath(
        '//a[contains(@href, "/item/")]/@href').re(r'/item/[A-Za-z0-9%\u4E00-\u9FA5]+'))
    for item in items:
        new_url = 'https://baike.baidu.com'+urllib.parse.unquote(item)
        new_item_name = re.sub(
            '/', '', re.sub('https://baike.baidu.com/item/', '', new_url))
        if new_item_name not in self.olds:
            yield response.follow(new_url, callback=self.parse)

    # 处理三元组
    entity = ''.join(response.xpath(
        '//h1/text()').getall()).replace('/', '')
    attrs = response.xpath('//dt[@class="basicInfoItem_Ql5xB itemName_bc1nm"]/text()').getall()
    values = response.xpath('//dd[@class="basicInfoItem_Ql5xB itemValue_Kzb4E"]//text()').getall()

    if len(attrs) != len(values):
        return
    with self.driver.session() as session:
        try:
            for attr, value in zip(attrs, values):
                if attr == "" or value == "":
                    continue

                # attr
                temp = Selector(text=attr).xpath(
                    '//dt//text()').getall()
                relation_type = re.sub(r'\s+', '', attr)
                value = re.sub(r'\s+', '', value)
                print(relation_type)
                # value
                # value = ''.join(Selector(text=value).xpath(
                #     '//dd/text()|//dd/a//text()').getall())
                try:
                    value = value.replace('\n', '')
                    logging.warning(entity+'_'+attr+'_'+value)
                    self.db_triples.insert_one({
                        "_id": entity+'_'+attr+'_'+value,
                        "item_name": entity,
                        "attr": attr,
                        "value": value, }
                    )
                except pymongo.errors.DuplicateKeyError:
                    pass

                # print(f"session: {session}")
                session.write_transaction(
                    self.add_node, entity, relation_type, value)
        except Exception as e:
            print(e)
            logging.error('\n---'.join(attrs) +
                          '\n_________________'+'\n---'.join(values))

`