Here's a script that gets a list of all collections on mits2, looks for those collections in the filesystem (in `DDR.config.MEDIA_BASE`), and finds all files in each collection that are missing data in the `sha1`, `sha256`, `md5`, or `size` fields. It writes a list of those files to the `/tmp/` dir.

To run the script you have to be logged in as `ddr` and have the virtualenv activated.
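For example (the activate path and script name here are assumptions; adjust for your install): `su - ddr`, then `source /PATH/TO/VIRTUALENV/bin/activate`, then `python find-missing-hashes.py`.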
```python
from datetime import datetime
import os

from DDR import config, dvcs, identifier, util

# Find files where these fields are blank
CHECK_FIELDS = [
    'sha1', 'sha256', 'md5', 'size'
]

def collections():
    """Get list of collections from Gitolite

    Screen out gitolite-admin, ddr-testing-* and org repos
    """
    print('connecting to Gitolite...')
    gitolite = dvcs.Gitolite(config.GITOLITE)
    gitolite.initialize()
    print('connected %s' % gitolite.connected)
    print('authorized %s' % gitolite.authorized)
    collections = [
        c
        for c in gitolite.collections()
        if 'testing' not in c
        and 'admin' not in c
        and identifier.Identifier(c).model == 'collection'
    ]
    return util.natural_sort(collections)

def collection_files(ci):
    """Get all the file paths in collection
    """
    return util.find_meta_files(
        basedir=ci.path_abs(), recursive=True, model='file'
    )

def empty_fields(file_):
    """Check file for empty fields
    """
    empty_fields = [field for field in CHECK_FIELDS if not getattr(file_, field)]
    if empty_fields:
        return True
    return False

def check_collection(collection_path):
    """Check each file in collection
    """
    ci = identifier.Identifier(collection_path)
    paths = collection_files(ci)
    bad_paths = [
        file_path_abs
        for file_path_abs in paths
        if empty_fields(identifier.Identifier(file_path_abs).object())
    ]
    return paths, bad_paths

def check_collections(cids):
    bad_collections = []
    for n, cid in enumerate(cids):
        collection_path = os.path.join(config.MEDIA_BASE, cid)
        paths, bad_paths = check_collection(collection_path)
        if bad_paths:
            print('%s | %s/%s bad:%s/%s %s' % (
                datetime.now(), n, len(cids), len(bad_paths), len(paths), cid
            ))
        elif paths:
            print('%s | %s/%s bad:%s %s' % (
                datetime.now(), n, len(cids), len(bad_paths), cid
            ))
        else:
            print('%s | %s/%s bad: %s NO FILES PRESENT' % (
                datetime.now(), n, len(cids), cid
            ))
        if len(bad_paths):
            bad_collections.append((cid, bad_paths))
    return bad_collections

def write_bad_filenames(bad_collections, filename):
    with open(filename, 'w') as f:
        for cid, bad_paths in bad_collections:
            f.write(cid + '\n')
            for path in bad_paths:
                f.write(path + '\n')
            f.write('\n')

def main():
    bad_collections = check_collections(collections())
    if bad_collections:
        print('PROBLEMS IN THESE COLLECTIONS')
        for cid, bad_paths in bad_collections:
            print(cid)
        filename = '/tmp/files-missing-hashes'
        print('Writing filenames to %s' % filename)
        write_bad_filenames(bad_collections, filename)
    else:
        print('NO PROBLEMS FOUND')

if __name__ == '__main__':
    main()
```
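For reference, `write_bad_filenames()` writes one section per collection: the collection ID on its own line, then the path of each flagged file, then a blank line. The IDs and paths below are hypothetical, just to show the shape:

```
ddr-test-123
/PATH/TO/MEDIA_BASE/ddr-test-123/.../ddr-test-123-1-master-abc123def4.json
/PATH/TO/MEDIA_BASE/ddr-test-123/.../ddr-test-123-2-mezzanine-0123456789.json

ddr-test-456
...
```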
This should work for fixing the collections. Save it somewhere, then run it once per collection, passing that collection's list of files. It will save changes but not commit them.
```python
import csv, os

import click

from DDR import csvfile, fileio, identifier, util

AGENT = 'cli'

@click.command()
@click.option('--user','-u', help='User name.')
@click.option('--mail','-m', help='User e-mail address.')
@click.argument('filename')
@click.argument('collection')
def main(user, mail, filename, collection):
    git_name = user
    git_mail = mail
    basename = os.path.dirname(os.path.dirname(collection))
    with open(filename, 'r') as f:
        lines = f.readlines()
    fids = [line.strip() for line in lines if line.strip()]
    for fid in fids:
        f = identifier.Identifier(fid, basename).object()
        print(f)
        if f.path_abs and not f.size:
            f.size = os.path.getsize(f.path_abs)
            print('  %s' % f.size)
            f.md5 = util.file_hash(f.path_abs, 'md5')
            print('  %s' % f.md5)
            f.sha1 = util.file_hash(f.path_abs, 'sha1')
            print('  %s' % f.sha1)
            f.sha256 = util.file_hash(f.path_abs, 'sha256')
            print('  %s' % f.sha256)
            exit, status, updated_files = f.save(git_name, git_mail, AGENT, commit=False)
            print(status)

if __name__ == '__main__':
    main()
```
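If the script above is saved as, say, `fix_missing_hashes.py` (a hypothetical name), a per-collection run would look something like `python fix_missing_hashes.py -u USERNAME -m USER@EMAIL /tmp/files-missing-hashes-ddr-test-123 /PATH/TO/MEDIA_BASE/ddr-test-123`. Since it saves with `commit=False`, you can review the changes in each collection repo with `git status` and `git diff` and commit them yourself once satisfied.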
@sarabeckman: please go ahead and give it a try, then close this issue if it works
The script to find files works as planned. Make sure to remove the collection ID headers from each section before using the text files with the fix script (a sketch of that cleanup step follows the script below). I also had a fix to the fix-files script. Here is the new script.
```python
import csv, os

import click

from DDR import csvfile, fileio, identifier, util

AGENT = 'cli'

@click.command()
@click.option('--user','-u', help='User name.')
@click.option('--mail','-m', help='User e-mail address.')
@click.argument('filename')
@click.argument('collection')
def main(user, mail, filename, collection):
    git_name = user
    git_mail = mail
    basename = os.path.dirname(collection)
    with open(filename, 'r') as f:
        lines = f.readlines()
    fids = [line.strip() for line in lines if line.strip()]
    for fid in fids:
        f = identifier.Identifier(fid, basename).object()
        print(f)
        if f.path_abs and not f.size:
            f.size = os.path.getsize(f.path_abs)
            print('  %s' % f.size)
            f.md5 = util.file_hash(f.path_abs, 'md5')
            print('  %s' % f.md5)
            f.sha1 = util.file_hash(f.path_abs, 'sha1')
            print('  %s' % f.sha1)
            f.sha256 = util.file_hash(f.path_abs, 'sha256')
            print('  %s' % f.sha256)
            exit, status, updated_files = f.save(git_name, git_mail, AGENT, commit=False)
            print(status)

if __name__ == '__main__':
    main()
```
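For the header-cleanup step mentioned above, here is a minimal sketch (in/out filenames are assumptions) that splits the finder's output into one list per collection and drops the collection-ID header line from each section:

```python
# Split the finder's output into per-collection file lists, dropping the
# collection-ID header line from each section. Filenames are assumptions.
def split_sections(infile='/tmp/files-missing-hashes'):
    with open(infile, 'r') as f:
        # Sections are separated by blank lines (see write_bad_filenames).
        sections = f.read().split('\n\n')
    for section in sections:
        lines = [l for l in section.strip().splitlines() if l.strip()]
        if not lines:
            continue
        cid = lines[0]      # first line of each section is the collection ID
        paths = lines[1:]   # remaining lines are the flagged file paths
        outfile = '/tmp/files-missing-hashes-%s' % cid
        with open(outfile, 'w') as out:
            out.write('\n'.join(paths) + '\n')
        print('%s: %s files -> %s' % (cid, len(paths), outfile))

if __name__ == '__main__':
    split_sections()
```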
Yay, glad it worked for you!
A group of `File` jsons that were batch updated from csv using the `ddrimport` tool overwrote the existing checksum and other technical metadata with empty values. The set of corrupted jsons is large enough that it cannot be corrected manually. See @sarabeckman's comment at: https://github.com/densho/ddr-cmdln/issues/145#issuecomment-488337249
The script/tool does not need to be integrated with the DDR platform. It can be a one-time use script.