Open ntarocco opened 6 years ago
I would like to request a review of the production script below:
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function
import logging
import sqlalchemy
from invenio_db import db
from invenio_indexer.api import RecordIndexer
from invenio_records.api import Record
from invenio_records.models import RecordMetadata
# Dedicated logger for this maintenance script. Both the logger and its file
# handler let everything from DEBUG upwards through, and each line written to
# ./fix_blank_fields.log (relative to the current working directory) carries
# a timestamp and the level name.
logger = logging.getLogger('fix_blank_fields')
formatter = logging.Formatter(fmt='%(asctime)s %(levelname)s %(message)s')

# Configure the handler completely before attaching it to the logger.
fh = logging.FileHandler(filename='./fix_blank_fields.log')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

logger.setLevel(logging.DEBUG)
logger.addHandler(fh)
def nested_field_query(first_level_field, broken_val, broken_key):
    """Build a query for records whose JSON field contains a given phrase.

    The value stored under ``first_level_field`` in ``RecordMetadata.json``
    is cast to a string and searched for ``broken_val`` as a substring.

    :param first_level_field: parent (top level) field key, f.e. 'keywords'
    :param broken_val: phrase to search for, as a string,
        f.e. '"name": ""'
    :param broken_key: key expected to have a broken value; it is only
        type-checked here and does not take part in the query itself
    :return: built query
    :raises ValueError: if any of the arguments is not a string; the message
        names the offending argument
    """
    # Accept both byte and unicode text so the check also passes for unicode
    # values on Python 2; on Python 3 both entries are plain ``str``.
    text_types = (str, type(u''))
    arguments = (
        ('first_level_field', first_level_field),
        ('broken_val', broken_val),
        ('broken_key', broken_key),
    )
    for name, value in arguments:
        if not isinstance(value, text_types):
            raise ValueError(u'Invalid argument. '
                             u'{0} argument must be of type str'.format(name))

    # Compare against the textual form of the JSON sub-document so that a
    # plain substring match can be used.
    field_as_text = sqlalchemy.cast(RecordMetadata.json[first_level_field],
                                    sqlalchemy.String)
    return db.session.query(RecordMetadata).filter(
        field_as_text.contains(broken_val))
class BlankFieldFixer(object):
    """Remove entries with a blank value from a list-valued JSON field.

    The records to process are selected with ``nested_field_query``; each
    entry of ``record[first_level_field]`` whose ``broken_key`` value is an
    empty string is dropped, then the record is saved and reindexed.
    """

    def __init__(self, first_level_field, broken_val, broken_key):
        """
        :param first_level_field: top level key f.e. 'keywords'
        :param broken_val: string value to search for in the JSON field
        :param broken_key: JSON dict key expected to have empty string val
        :raises ValueError: propagated from ``nested_field_query`` when an
            argument is not a string
        """
        super(BlankFieldFixer, self).__init__()
        self.query = nested_field_query(first_level_field, broken_val,
                                        broken_key)
        self.key = first_level_field
        self.broken_val = broken_val
        self.broken_key = broken_key

    def run(self, chunk_size=10):
        """Runs fixing and reindexing operations on the records

        :param chunk_size: how many entries we want to review
        :return: list of the records that have to be fixed manually
            (the fixed records are only reported in the log)
        """
        logger.info('Fixing blank in JSON field \'{0}\' for key \'{1}\'...'.
                    format(self.key, self.broken_key))
        unfixables, fixed = self.check_and_fix(chunk_size)
        logger.info('{0} record(s) fixed, {1} record(s) need manual fixing'
                    .format(len(fixed), len(unfixables)))
        return unfixables

    def _is_blank(self, entry):
        """Tell whether ``entry`` holds an empty string under the broken key.

        Entries that are not dicts or that lack the key are not blank.
        """
        return isinstance(entry, dict) and entry.get(self.broken_key) == ""

    def remove_empty(self, record):
        """Removes empty values

        Drops every entry of ``record[self.key]`` whose ``self.broken_key``
        value is an empty string. The list is filtered in one pass and
        updated in place: popping while enumerating would shift the
        remaining items and skip the entry right after each removed one.

        :param record: mapping holding the list under ``self.key``
        """
        entries = record[self.key]
        entries[:] = [entry for entry in entries
                      if not self._is_blank(entry)]

    def review_records(self):
        """Allows to check the list of broken records"""
        return self.query.all()

    def check_and_fix(self, chunk_size):
        """Fix, save and reindex up to ``chunk_size`` matching records.

        :param chunk_size: maximum number of records fetched from the query
        :return: tuple ``(unfixable_records, fixed)``; the first list holds
            the ``RecordMetadata`` rows left untouched, the second one the
            ``Record`` objects that were modified
        """
        unfixable_records = []
        fixed = []
        indexer = RecordIndexer()
        # Let the database apply the limit instead of loading every match.
        for record_meta in self.query.limit(chunk_size).all():
            entries = record_meta.json[self.key]
            has_valid_entry = any(not self._is_blank(entry)
                                  for entry in entries)
            # it can't leave the blank field: a record is skipped when it has
            # at most one entry or when removing the blanks would empty the
            # list (presumably the schema requires a non-empty list - TODO
            # confirm)
            if len(entries) <= 1 or not has_valid_entry:
                unfixable_records.append(record_meta)
                logger.warning('Record <{0}> can\'t be fixed'
                               .format(record_meta.id))
                continue

            record = Record.get_record(str(record_meta.id))
            logger.info('[START] Fixing record <{0}>...'
                        .format(record_meta.id))
            self.remove_empty(record)
            logger.info('Record <{0}> fixed'.format(record_meta.id))
            record.commit()
            fixed.append(record)
            # Each record is committed separately, before it is reindexed.
            db.session.commit()
            logger.info('Record <{0}> saved to db'.format(record_meta.id))
            indexer.index(record)
            logger.info('[FIN] Record <{0}> indexed.'.format(record_meta.id))
        return unfixable_records, fixed
There are 167 records with empty strings in ['keywords']['name']
and 616 with empty strings in ['contributors']['name']
4772 with empty strings in ['description']
4 with empty string in ['date']
and 22 with empty ['title']
@kprzerwa is it easy to check if we have something else that is empty?
I've added the other fields I've checked to my last comment
We have a specific repo for production scripts, you can create it there: https://gitlab.cern.ch/AIGROUP-cds-admin/production_scripts
wow.. ~5K empty abstracts? can those deposits be edited/published?
@kprzerwa looks good to me!
@ludmilamarian I've checked randomly; the ones I've seen are published, but for some of them I can't open the edit page
4772 with empty strings in ['description']
wow.. ~5K empty abstracts? can those deposits be edited/published?
This could be because there was no description anywhere on CDS and we just added an empty string to pass the validation. But it should be possible to edit/publish them if you add a description in the corresponding field.
BTW, I think this and https://github.com/CERNDocumentServer/cds-videos/issues/1442 are the same :wink:
videos.cern.ch/record/2017767 still needs fixing: https://test-cds-sentry.web.cern.ch/sentry/videos/issues/4195/ videos.cern.ch/record/2036133 has internal server error on publish attempt
Some records have keywords without name, for example:
Some records have authors with empty value, this breaks deposit: