opencitations / oc_meta

ISC License
8 stars 5 forks source link

Discrepancy between the number of BRs in dump files (RDF and CSV) and the number of BRs in the triplestore #20

Open eliarizzetto opened 9 months ago

eliarizzetto commented 9 months ago

In version 5 of the “OpenCitations Meta CSV dataset of all bibliographic metadata” (https://doi.org/10.6084/m9.figshare.21747461.v5) and version 5 of the “OpenCitations Meta RDF dataset of all bibliographic metadata and its provenance information” (https://doi.org/10.6084/m9.figshare.21747536.v5) the number of bibliographic resources differs from the number of bibliographic resources in the triplestore (105,953,699 BRs, from what can be read in the two datasets’ metadata on their Figshare pages).

More specifically, 99,270,517 bibliographic resources can be found in the CSV files, and 105,912,463 bibliographic resources can be found in the RDF files. As concerns the CSV files, the smaller number of bibliographic resources may be — at least in part — due to the fact that entities are counted by OMID, and the OMIDs of journal issues and journal volumes are not represented in the CSV dump files.

The observations can be reproduced with the following script.

import csv
from os import listdir
from os.path import join, isdir
from tqdm import tqdm
import re
from zipfile import ZipFile
import json

def get_br_data_from_rdf(br_rdf_path):
    """
    Yield bibliographic-resource nodes from the nested RDF dump archive.

    The outer zip at *br_rdf_path* contains inner zip archives; every
    inner archive that is not provenance data holds JSON-LD files whose
    '@graph' entries are yielded one by one.

    :param br_rdf_path: Path to the outer zip archive of the BR RDF dump.
    :return: Yields JSON-LD node dictionaries.
    """
    with ZipFile(br_rdf_path) as outer_archive:
        # Only data archives: skip provenance entries and non-zip members.
        data_archives = [
            name for name in outer_archive.namelist()
            if name.endswith('.zip') and 'prov' not in name
        ]
        for inner_name in data_archives:
            with ZipFile(outer_archive.open(inner_name)) as inner_archive:
                json_names = [n for n in inner_archive.namelist() if n.endswith('.json')]
                for json_name in json_names:
                    with inner_archive.open(json_name) as fh:
                        parsed: list = json.load(fh)
                    for graph_container in parsed:
                        yield from graph_container['@graph']

def read_csv_tables(*dirs):
    """
    Read the non-compressed CSV output tables found in one or more
    directories and yield each row as a dictionary.

    :param dirs: One or more paths to existing directories, provided as
        variable-length arguments.
    :return: Yields rows as dictionaries.
    :raises ValueError: If an argument is not an existing directory path.
    """
    # Raise the default field size limit: some rows carry very long fields.
    csv.field_size_limit(131072 * 12)
    for folder in dirs:
        # Guard clause: fail on the first argument that is not a directory,
        # after any preceding directories have already been consumed.
        if not isdir(folder):
            raise ValueError("Each argument must be a string representing the path to an existing directory.")
        csv_names = [name for name in listdir(folder) if name.endswith('.csv')]
        for name in tqdm(csv_names, desc=f"Processing {folder}", unit="file"):
            with open(join(folder, name), 'r', encoding='utf-8') as fh:
                yield from csv.DictReader(fh, dialect='unix')

def count_brs_in_rdf(br_rdf_path):
    """
    Count the bibliographic resources stored in the RDF dump archive.

    :param br_rdf_path: Path to the zip archive containing the BR RDF dump.
    :return: A tuple (total number of BRs counted, number of distinct OMIDs).
    """
    brcount = 0
    brset = set()
    for br in tqdm(get_br_data_from_rdf(br_rdf_path), desc='Counting BRs in RDF files', unit='br'):
        # .get() tolerates JSON-LD nodes that carry no '@id' key (e.g. blank
        # nodes), which would otherwise raise a KeyError and abort the count.
        if br.get('@id'):
            brcount += 1
            # Normalise the full IRI to the compact 'omid:' prefix form
            # before deduplicating in the set.
            brset.add(br['@id'].replace('https://w3id.org/oc/meta/', 'omid:'))

    print('Number of BRs in the RDF files: ', brcount)
    print('Are there duplicates?', len(brset) != brcount)
    return brcount, len(brset)

def count_brs_in_csv(meta_csv_dump):
    """
    Count the distinct bibliographic resources referenced in the CSV dump.

    OMIDs are extracted from the 'id', 'venue', 'volume' and 'issue' fields
    of every row, so venues, volumes and issues that only appear as
    references are counted too.

    :param meta_csv_dump: Directory containing the uncompressed CSV files.
    :return: A tuple (number of distinct OMIDs, number of rows read).
    """
    all_brs = set()
    row_count = 0
    # Compile once and hoist out of the loop: the pattern is applied to
    # every one of ~100M rows.
    pattern = re.compile(r'omid:[^ \[\]]+')

    reader = read_csv_tables(meta_csv_dump)

    for row in reader:
        row_count += 1
        interested_fields = ' '.join([row['id'], row['venue'], row['volume'], row['issue']])
        # findall already returns a list; set.update() deduplicates, so no
        # intermediate set() wrapper is needed.
        all_brs.update(pattern.findall(interested_fields))

    print('Total number of BRs in CSV files: ', len(all_brs))
    print('Total number of rows: ', row_count)

    return (len(all_brs), row_count)

if __name__ == '__main__':
    # Directory holding the uncompressed files of the CSV dump.
    csv_dump_path = 'path/to/csv/files'
    print(count_brs_in_csv(csv_dump_path))
    # Path to the (still compressed) zip archive of the BR RDF dump.
    br_rdf_path = 'path/to/rdf/br.zip'
    print(count_brs_in_rdf(br_rdf_path))