Open alexwlchan opened 2 years ago
This is the code I vaguely ended up gravitating towards:
import collections
from pprint import pprint

import tqdm
from elasticsearch.helpers import scan
from weco_datascience.reporting import get_es_client
# Stream every document in the "calm_catalog" index, asking Elasticsearch to
# return only the "Copies" field of each record.
es = get_es_client()
index = "calm_catalog"

scanner = scan(es, index=index, query={"_source": "Copies"}, scroll="1m")

# Map document ID -> "Copies" value, skipping records where the field is
# missing or empty.
items = {}
for hit in tqdm.tqdm(scanner):
    copies = hit["_source"].get("Copies")
    if copies:
        items[hit["_id"]] = copies

# Running count of how many times each distinct note occurs.
tally = collections.Counter()
# Stock "a digitised copy exists" sentences that tidy() can remove.
# The replacements are applied in this order: each variant ending in a full
# stop comes before the same sentence without one, so no stray "." is left.
_DIGITISED_COPY_NOTES = (
    "A digitised copy is held by the Wellcome Library as part of Codebreakers: Makers of Modern Genetics.",
    "A digitised copy is held by the Wellcome Library as part of the Codebreakers: Makers of Modern Genetics programme.",
    "A digitised copy is held by the Wellcome Library as part of The Mental Health Archives digitisation project.",
    "A digitised copy is held by the Wellcome Library.",
    "A digitised copy is held by the Wellcome Library as part of The Mental Health Archives digitisation project",
    "A digitised copy is held by the Wellcome Library as part of Codebreakers: Makers of Modern Genetics",
    "This material has been digitised and can be freely accessed online through the Wellcome Library catalogue.",
)


def tidy(v, *, strip_boilerplate=False):
    """Return the note *v*, optionally with stock digitisation sentences removed.

    Previously an early ``return v`` made the stripping code unreachable.
    The pass-through behaviour is kept as the default so existing callers see
    no change; the stripping is now reachable through the flag.

    Args:
        v: The note text.
        strip_boilerplate: When true, delete every sentence listed in
            ``_DIGITISED_COPY_NOTES`` from *v* and strip the surrounding
            whitespace from what is left.

    Returns:
        *v* unchanged when ``strip_boilerplate`` is false, otherwise the
        cleaned text (an empty string if the note was only boilerplate).
    """
    if not strip_boilerplate:
        return v

    for sentence in _DIGITISED_COPY_NOTES:
        v = v.replace(sentence, "")
    return v.strip()
import termcolor
# Count every note. A record's value may be a single note or a list of
# notes; lists are flattened so that each entry is tallied on its own.
for copies in items.values():
    entries = copies if isinstance(copies, list) else [copies]
    tally.update(tidy(entry) for entry in entries)

# Notes ordered from most to least frequent (shown when run interactively).
tally.most_common()
from weco_datascience.reporting import get_es_client
from elasticsearch.helpers import scan
import tqdm
from pprint import pprint
# Fetch every document in the "sierra_varfields" index whose varfield has
# MARC tag 535 and ind1 (presumably the first indicator) equal to "2".
es = get_es_client()
index = "sierra_varfields"

terms = {"varField.marcTag": "535", "varField.ind1": "2"}

# One exact-match "term" clause per field; all of them must hold.
term_filters = [{"term": {field: value}} for field, value in terms.items()]

scanner = scan(
    es,
    index=index,
    query={"query": {"bool": {"filter": term_filters}}},
    scroll="1m",
)

# Materialise the full result set, with a progress bar while it downloads.
notes = list(tqdm.tqdm(scanner))
But it only affects ~50k works, so I'm not sure it's worth it right now.
We don't need this note if there's already a digitised work on the page.