DataONEorg / metrics-service

An efficient database and REST API for delivering aggregated data set metrics to clients.
Apache License 2.0
2 stars 1 forks source link

Queries for citation metrics differ between repository-wide and dataset-level queries #88

Open vchendrix opened 2 years ago

vchendrix commented 2 years ago

In trying to assess the total number of citations for the ESS-DIVE repository, it was discovered that the citation count is off when querying by repository vs querying citations by individual datasets.

Citations from 1/1/2016 to 11/30/2021

Repository-wide query
  235 Total Count Returned (totalCitations)
  216 Total Citations Returned (citations)
  71 Unique Citations (from citations)
Dataset-level query
  330 Total Citations
  121 Data packages with citations
  217 Unique Citations

Python code

The following Python code was used to generate the counts above. You need to install the pandas and requests libraries. This code was executed in a Jupyter notebook.

import requests
import json

# Import pandas library 
import pandas as pd 
pd.set_option('display.max_colwidth', None)

from ipywidgets import widgets, interact
from IPython.display import display

# Date-range inputs for the queries below (MM/DD/YYYY strings)
from_date = widgets.Text("01/01/2016", description="From Date:")
to_date = widgets.Text("09/30/2021", description="To Date:")

for date_widget in (from_date, to_date):
    display(date_widget)

def get_repo_citations(to_date, from_date="01/01/2016"):
    """
    Query the metrics service for repository-level citations.

    IMPORTANT: These have been found to be incomplete when compared
    to the individual doi queries for citations.

    Returns a tuple of:
      - dict mapping "doi:<target_id>" -> set of citation keys citing it
      - the raw citations mapping from the response
      - the reported total citation count
    """
    request_body = {
        "metricsPage": {"total": 0, "start": 0, "count": 0},
        "metrics": ["citations", "downloads", "views"],
        "filterBy": [
            {
                "filterType": "repository",
                "values": ["urn:node:ESS_DIVE"],
                "interpretAs": "list",
            },
            {
                "filterType": "month",
                "values": [from_date, to_date],
                "interpretAs": "range",
            },
        ],
        "groupBy": ["month"],
    }
    encoded_request = json.dumps(request_body)
    response = requests.get(f"https://logproc-stage-ucsb-1.test.dataone.org/metrics?metricsRequest={encoded_request}")
    repository_results = response.json()['resultDetails']

    # Invert the citations mapping: one entry per cited dataset (target) DOI,
    # collecting the key of every citation record that points at it.
    repo_citations = {}
    for citation_key, citation in repository_results['citations'].items():
        for target in citation['target_id']:
            repo_citations.setdefault(f"doi:{target}", set()).add(citation_key)

    # NOTE(review): the nested ['resultDetails'] lookup below looks suspicious,
    # since repository_results is already response.json()['resultDetails'] --
    # confirm the service really nests 'resultDetails' twice.
    return (repo_citations,
            repository_results['citations'],
            repository_results['resultDetails']['totalCitations'])

def get_citations(to_date, from_date="01/01/2016"):
    """
    Get the citations for the specified date range.

    Queries the ESS-DIVE solr index for every public, non-obsoleted EML
    data package, then queries the DataONE metrics service for each
    package's citations individually.

    Parameters
    ----------
    to_date : str
        End of the date range, formatted MM/DD/YYYY.
    from_date : str
        Start of the date range, formatted MM/DD/YYYY.

    Returns
    -------
    tuple
        (pandas.DataFrame with columns 'citations'/'doi'/'title',
         dict mapping series id -> set of citing source ids)
    """

    # Rows are collected here and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0, and appending row-by-row
    # in a loop is quadratic anyway.
    rows = []

    # IMPORTANT: must use archived=* to get all archived and current data packages
    response = requests.get(f"https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/query/solr?q=formatId:*eml*+AND+NOT+obsoletedBy:*+AND+isPublic:true&fl=id,seriesId,title&wt=json&rows=0&archived=*")
    max_rows = response.json()['response']['numFound']
    print(f"{max_rows} datasets found.")

    # query ESS-DIVE and the metrics service to get the data package citations
    #   TODO: this should be updated to page over the results if it is over 400
    response = requests.get(f"https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/query/solr?q=formatId:*eml*+AND+NOT+obsoletedBy:*+AND+isPublic:true&fl=id,seriesId,title&wt=json&rows={max_rows}&archived=*")
    response_json = response.json()
    individual_citations = dict()

    # Iterate over datasets and query the metrics service for citations
    for d in response_json['response']['docs']:
        # Prefer the seriesId when present and non-empty; fall back to the pid.
        series_id = d.get('seriesId') or d['id']
        title = d['title']

        metrics_request_json = {
            "metricsPage": {"total": 0, "start": 0, "count": 0},
            "metrics": ["citations", "downloads", "views"],
            "filterBy": [
                {"filterType": "dataset", "values": [series_id], "interpretAs": "list"},
                {"filterType": "month", "values": [from_date, to_date], "interpretAs": "range"},
            ],
            "groupBy": ["month"],
        }
        metrics_request = json.dumps(metrics_request_json)

        metrics_response = requests.get(f"https://logproc-stage-ucsb-1.test.dataone.org/metrics?metricsRequest={metrics_request}")

        # Citing source ids reported for this dataset (may contain duplicates)
        unique_citations = [c['source_id'] for c in metrics_response.json()['resultDetails']['citations']]
        print(f"{len(unique_citations)} ", end="")

        rows.append({'citations': len(unique_citations),
                     'doi': series_id,
                     'title': title})
        # BUG FIX: this statement was truncated in the original paste
        # ("...=set(un"); store the de-duplicated citing source ids.
        individual_citations[series_id] = set(unique_citations)

    # BUG FIX: the original paste ended mid-function with no return; the
    # caller unpacks (df, individual_citations), so return exactly that.
    df = pd.DataFrame(rows, columns=['citations', 'doi', 'title'])
    return df, individual_citations

Count the citations

# Dataset level citations
df, individual_citations = get_citations(to_date.value, from_date=from_date.value)

# Keep only the packages that were cited at least once.
df_has_citations = df[df['citations'] > 0]
print(df_has_citations.shape[0])

# Render every cited package as an HTML table, most-cited first.
n_cited = df_has_citations.shape[0]
df_has_citations = df_has_citations.sort_values(by=['citations'], ascending=False).head(n_cited)
from IPython.display import display, HTML
display(HTML(df_has_citations.to_html(index=False)))

# Repository-wide citations
unique_repo_citations, repo_query_result, total_citations = get_repo_citations(to_date.value, from_date=from_date.value)

# Union of every dataset's citing-source set gives the overall unique count.
unique = set()
for citing_sources in individual_citations.values():
    unique.update(citing_sources)

print("Repository-wide query")
print(f"  {total_citations} Total Count Returned (totalCitations)")
print(f"  {len(repo_query_result)} Total Citations Returned (citations)")
print(f"  {len(unique_repo_citations)} Unique Citations (from citations)")
print("Dataset-level query")
print(f"  {df_has_citations['citations'].sum()} Total Citations")
print(f"  {df_has_citations['citations'].count()} Data packages with citations")
print(f"  {len(unique)} Unique Citations")
rushirajnenuji commented 2 years ago

Hey @vchendrix - thank you for filing this bug. I'll look into the issue and get back to you.

rushirajnenuji commented 2 years ago

The ES identifiers index is not up to date with the datasetIdentifierFamily information. I'm working on getting that up to date. Once we have proper datasetIdentifierFamily index, we'll be able to index the identifiers in the citation metadata table. This applies to both repository level citation metrics and portal level citation metrics