Open grossir opened 5 months ago
We corrected some 31 files with this script, including the example in the previous comment. There are other related bugs that prevent us from correcting more files.
from cl.audio.models import Audio
from cl.scrapers.tasks import process_audio_file
import requests
from django.core.files.base import ContentFile
from django.db import transaction
from cl.scrapers.utils import (
get_binary_content,
get_extension,
)
from importlib import import_module
from django.utils.encoding import force_bytes
from cl.lib.string_utils import trunc
import traceback
from cl.lib.crypto import sha1
def get_valid_download_url(audio):
    """Return a URL the audio file can be re-downloaded from, or None.

    The stored `download_url` is handled per court:

    - ca1, ca3, ca4, ca9, cafc: returned unchanged.
    - ca2: never usable, None is returned.
    - cadc: returned unchanged unless it is one of the known invalid
      placeholder URLs.
    - ca6: the "/recent/" path segment is replaced by "/audio/".
    - ca7: rebuilt under the per-year folder of media.ca7.uscourts.gov.

    :param audio: object exposing `docket.court_id` and `download_url`
    :return: the URL to download from, or None when there is no stored
        URL, the stored URL is known to be dead, or the court has no
        correction rule
    """
    # Courts whose stored download_url is used as is
    download_url_works = {"ca9", "ca4", "ca3", "ca1", "cafc"}
    # No original download url in ca2 will work, seems session bound
    download_url_never_works = {"ca2"}

    court_id = audio.docket.court_id
    url = audio.download_url

    # Nothing to correct without a stored URL; returning early also keeps
    # the string operations below from raising on None
    if not url:
        return None

    if court_id in download_url_works:
        return url

    if court_id in download_url_never_works:
        return None

    if court_id == "cadc":
        # cadc has a bunch of invalid urls
        # https://www.cadc.uscourts.gov/recordings/recordings.nsf/
        # https://www.cadc.uscourts.gov/recordings/recordings.nsf/Pages/Unavailable1
        cadc_root = "https://www.cadc.uscourts.gov/recordings/recordings.nsf/"
        if "Unavailable" in url or url == cadc_root:
            return None
        return url

    if court_id == "ca6":
        return url.replace("/recent/", "/audio/")

    if court_id == "ca7":
        # Original: http://media.ca7.uscourts.gov/sound/external/ds.20-3050.20-3050_03_31_2021.mp3
        # Functioning: https://media.ca7.uscourts.gov/sound/2021/ds.20-3050.20-3050_03_31_2021.mp3
        # The year is the last "_"-separated token before the extension
        year = url.rsplit(".", 1)[0].split("_")[-1]
        file_name = url.split("/")[-1]
        return f"https://media.ca7.uscourts.gov/sound/{year}/{file_name}"

    # Any other court: no correction rule is defined
    return None
# The `process_audio_file` task gets the `local_path_original_file` data
# and sends it to doctor. Doctor returns processed data, which is used to
# create `local_path_mp3`.
# So, if the local_path_original_file is corrupted, we need to re-download
# it from the original URL.

# Timeout, in seconds, passed to `requests.get` so that a stalled court
# server cannot hang the whole run
DOWNLOAD_TIMEOUT = 60

qs = Audio.objects.filter(duration__lte=60)
print(f"Correcting short {qs.count()} audio files")

for audio in qs.select_related("docket"):
    court_id = audio.docket.court_id
    if court_id == "ca9" and audio.duration > 10:
        # ca9 does not need correcting unless the audio lasts very, very
        # little; its audios are actually that short
        continue

    print(f"Processing audio {audio.pk}")
    corrected_url = get_valid_download_url(audio)
    if not corrected_url:
        print("Can't get a valid URL, aborting")
        continue

    try:
        print("Getting corrected url", corrected_url)
        content_request = requests.get(corrected_url, timeout=DOWNLOAD_TIMEOUT)
        content_request.raise_for_status()
        content = content_request.content
        sha1_hash = sha1(force_bytes(content))

        # This will affect many fields in different stages:
        # duration, local_path_original_file, sha1, local_path_mp3
        # So, it's better not to use celery so we don't end up
        # with partially updated data
        with transaction.atomic():
            audio.download_url = corrected_url

            # copy from cl_scrape_oral_arguments
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in [".mp3", ".wma"]:
                # Fall back to the extension found in the URL
                extension = f".{audio.download_url.lower().rsplit('.', 1)[1]}"
            file_name = trunc(audio.case_name.lower(), 75) + extension
            audio.file_with_date = audio.docket.date_argued
            # save=False: the model row is written once, by audio.save()
            audio.local_path_original_file.save(file_name, cf, save=False)
            audio.sha1 = sha1_hash

            audio.save()
            # Called directly (not queued) so it runs inside this transaction
            process_audio_file(pk=audio.pk)
    except Exception:
        # Best effort: report the failure and move on to the next audio.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit stop the script.
        print(traceback.format_exc())
Some `audio_audio` rows have all the file fields and even a duration, but their files are incomplete. They play, but it's only a part of the audio. On the dev DB, I count 72 that have all the files but last less than 60 seconds. We can correct some of these on the fly for our current project, but we should validate durations before insertion.
For example, there is a row where the `local_path_mp3`, `local_path_original_file` and `filepath_ia` audios last only a second. The `download_url` is dead. However, we can track down the original file on the source (play / download), where it lasts 37 minutes and 10 seconds.