Some audio files are incomplete

Some audio_audio rows have all the file fields and even a duration, but their files are incomplete. They play, but it's only a part of the audio. On the dev DB, I count 72 that have all the files but last less than 60 seconds.

We can correct some of these on the fly for our current project, but we should validate durations before insertion.

For example:

{
    "resource_uri": "https://www.courtlistener.com/api/rest/v3/audio/62587/",
    "duration": 1,
    "download_url": "http://www.opn.ca6.uscourts.gov/internet/court_audio/recent/03-15-2019 - Friday/18-1605 Lerod Butler v City of Detroit MI.mp3",
    "local_path_mp3": "https://storage.courtlistener.com/mp3/2019/03/15/lerod_butler_v._city_of_detroit_mi_cl_1.mp3",
    "local_path_original_file": "https://storage.courtlistener.com/mp3/2019/03/15/lerod_butler_v._city_of_detroit_mi_1.mp3",
    "filepath_ia": "https://archive.org/download/gov.uscourts.ca6.18-1605/gov.uscourts.ca6.18-1605.2019-03-15.mp3",

where the local_path_mp3, local_path_original_file and filepath_ia audio files each last only a second. The download_url is dead.

However, we can track down the original file at the source (play / download), where it lasts 37 minutes and 10 seconds.

We corrected some 31 files with this script, including the example in the previous comment. There are other related bugs that prevent us from correcting more files.

from cl.audio.models import Audio
from cl.scrapers.tasks import process_audio_file
import requests
from django.core.files.base import ContentFile
from django.db import transaction
from cl.scrapers.utils import (
    get_binary_content,
    get_extension,
)
from importlib import import_module
from django.utils.encoding import force_bytes
from cl.lib.string_utils import trunc
import traceback
from cl.lib.crypto import sha1

def get_valid_download_url(audio):
    """Return a URL the audio file can be re-downloaded from, if any.

    The stored `download_url` is kept, rewritten or rejected depending on
    the court the audio belongs to.

    :param audio: An Audio object; only `audio.download_url` and
        `audio.docket.court_id` are read.
    :return: The URL to try, or None when no usable URL can be derived:
        the stored URL is empty, the court's links never work, the URL is
        a known cadc placeholder, a ca7 URL does not end in a 4-digit
        year, or the court has no rule here.
    """
    download_url_works = {"ca9", "ca4", "ca3", "ca1", "cafc"}

    # No original download url in ca2 will work, seems session bound
    download_url_never_works = {"ca2"}

    court_id = audio.docket.court_id
    url = audio.download_url

    # A missing URL cannot be corrected for any court; bail out before
    # the string operations below fail on None
    if not url or court_id in download_url_never_works:
        return None

    if court_id in download_url_works:
        return url

    if court_id == "cadc":
        # cadc has a bunch of invalid urls
        # https://www.cadc.uscourts.gov/recordings/recordings.nsf/
        # https://www.cadc.uscourts.gov/recordings/recordings.nsf/Pages/Unavailable1
        cadc_index = "https://www.cadc.uscourts.gov/recordings/recordings.nsf/"
        if "Unavailable" in url or url == cadc_index:
            return None
        return url

    if court_id == "ca6":
        return url.replace("/recent/", "/audio/")

    if court_id == "ca7":
        # Original: http://media.ca7.uscourts.gov/sound/external/ds.20-3050.20-3050_03_31_2021.mp3
        # Functioning: https://media.ca7.uscourts.gov/sound/2021/ds.20-3050.20-3050_03_31_2021.mp3
        file_name = url.split("/")[-1]
        year = url.rsplit(".", 1)[0].split("_")[-1]
        if not (len(year) == 4 and year.isdigit()):
            # The file name does not end in `_<year>`, so the new path
            # cannot be built
            return None
        return f"https://media.ca7.uscourts.gov/sound/{year}/{file_name}"

    # Courts without a rule above have no known working URL
    return None

# The `process_audio_file` task reads the `local_path_original_file` data
# and sends it to doctor. Doctor returns the processed data, which is
# used to create `local_path_mp3`.
# So, if the local_path_original_file is corrupted, we need to re-download
# it from the original URL.
# Candidates: audio rows whose stored duration is 60 or less
qs = Audio.objects.filter(duration__lte=60)
print(f"Correcting short {qs.count()} audio files")

for audio in qs.select_related("docket"):
    court_id = audio.docket.court_id
    if court_id == "ca9" and audio.duration > 10:
        # ca9 does not need correcting unless the audio lasts very, very
        # little; above 10 the audios are actually that short
        continue

    print(f"Processing audio {audio.pk}")

    corrected_url = get_valid_download_url(audio)
    if not corrected_url:
        print("Can't get a valid URL, aborting")
        continue

    try:
        print("Getting corrected url", corrected_url)
        # Without a timeout a single stalled server would hang the whole run
        content_request = requests.get(corrected_url, timeout=60)
        content_request.raise_for_status()
        content = content_request.content
        sha1_hash = sha1(force_bytes(content))

        # This will affect many fields in different stages:
        # duration, local_path_original_file, sha1, local_path_mp3
        # So, it's better not to use celery so we don't end up
        # with partially updated data
        with transaction.atomic():
            audio.download_url = corrected_url

            # copy from cl_scrape_oral_arguments
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in [".mp3", ".wma"]:
                # Fall back to the extension found in the URL itself
                extension = f".{audio.download_url.lower().rsplit('.', 1)[1]}"

            file_name = trunc(audio.case_name.lower(), 75) + extension
            audio.file_with_date = audio.docket.date_argued
            audio.local_path_original_file.save(file_name, cf, save=False)
            audio.sha1 = sha1_hash

            audio.save()

            # Called directly (not queued) so it runs inside this transaction
            process_audio_file(pk=audio.pk)
    except Exception:
        # Deliberately broad: a failing record is reported and skipped so
        # the rest of the batch still runs. Not a bare `except`, so
        # Ctrl-C and SystemExit still stop the script.
        print(traceback.format_exc())

freelawproject / courtlistener

Some audio files are incomplete #4109