wellcomecollection / platform

Wellcome Collection Digital Platform
https://developers.wellcomecollection.org/
MIT License
48 stars 10 forks source link

Don't show a "Location of duplicates: digitised work" note if there's a METS work on the page #5318

Open alexwlchan opened 2 years ago

alexwlchan commented 2 years ago
Screenshot 2021-09-30 at 14 30 07

We don't need this note if there's already a digitised work on the page.

alexwlchan commented 2 years ago

This is the code I vaguely ended up gravitating towards:

from elasticsearch.helpers import scan
import tqdm
from pprint import pprint

# `collections` and `get_es_client` are both used in this snippet but were
# never imported before first use, so running it top-to-bottom raised
# NameError.  Import them here, next to the snippet's other imports.
import collections

from weco_datascience.reporting import get_es_client

es = get_es_client()

index = "calm_catalog"

# Only the "Copies" field is read below, so restrict _source to it.
scanner = scan(
    es,
    index=index,
    query={"_source": "Copies"},
    scroll="1m"
)

# Map each document ID to its "Copies" value, skipping documents where the
# field is missing or empty.
items = {
    it["_id"]: it["_source"].get("Copies")
    for it in tqdm.tqdm(scanner)
    if it["_source"].get("Copies")
}

# Frequency count of the distinct "Copies" notes; filled in further down.
tally = collections.Counter()

# Stock sentences that only say "this has been digitised".  They are removed
# so that just the distinctive remainder of each note gets tallied.
# Order matters: the variants without a trailing full stop are prefixes of
# the variants with one, so the longer forms must be removed first or a stray
# "." would be left behind.
_DIGITISED_BOILERPLATE = (
    "A digitised copy is held by the Wellcome Library as part of Codebreakers: Makers of Modern Genetics.",
    "A digitised copy is held by the Wellcome Library as part of the Codebreakers: Makers of Modern Genetics programme.",
    "A digitised copy is held by the Wellcome Library as part of The Mental Health Archives digitisation project.",
    "A digitised copy is held by the Wellcome Library.",
    "A digitised copy is held by the Wellcome Library as part of The Mental Health Archives digitisation project",
    "A digitised copy is held by the Wellcome Library as part of Codebreakers: Makers of Modern Genetics",
    "This material has been digitised and can be freely accessed online through the Wellcome Library catalogue.",
)


def tidy(v):
    """Return the note *v* with the stock "digitised copy" sentences removed.

    Every phrase in ``_DIGITISED_BOILERPLATE`` is deleted wherever it occurs,
    then leading and trailing whitespace is stripped.  A note consisting only
    of boilerplate therefore becomes the empty string.

    The previous version had an unconditional ``return v`` as its first
    statement, which made all of the clean-up below it unreachable.
    """
    for phrase in _DIGITISED_BOILERPLATE:
        v = v.replace(phrase, "")
    return v.strip()

import termcolor

# Count how often each (tidied) "Copies" note occurs.  A record holds either
# a single note or a list of notes, so treat the single case as a one-element
# list and count everything the same way.
for copies in items.values():
    record_notes = copies if isinstance(copies, list) else [copies]
    tally.update(tidy(note) for note in record_notes)

# Bare expression: shows the notes ordered from most to least frequent when
# run in a notebook/REPL.
tally.most_common()
from weco_datascience.reporting import get_es_client
from elasticsearch.helpers import scan
import tqdm
from pprint import pprint

# Second snippet: fetch every document in the Sierra varfields index whose
# varfield has MARC tag 535 and first indicator 2 (presumably the
# "Location of duplicates" notes this issue is about -- confirm against the
# MARC 535 definition).
es = get_es_client()

index = "sierra_varfields"

# Exact-match conditions; every one of them must hold for a hit.
terms = {
    "varField.marcTag": "535",
    "varField.ind1": "2",
}

scanner = scan(
    es,
    index=index,
    scroll="1m",
    query={
        "query": {
            "bool": {
                # One `term` clause per entry in `terms`.
                "filter": [
                    {"term": {field: value}}
                    for field, value in terms.items()
                ],
            },
        },
    },
)

# Drain the scan into memory, with a progress bar while it runs.
notes = list(tqdm.tqdm(scanner))

But it only affects ~50k works, so I'm not sure it's worth doing right now.