bitdruid / python-wayback-machine-downloader

Query and download archive.org as simple as possible.
MIT License
34 stars 2 forks source link

Recovering progress from the filesystem in case of skipfile corruption #8

Closed Ghost-chu closed 5 months ago

Ghost-chu commented 5 months ago

I exited the program with Ctrl+C while it was pulling data. Unfortunately, the interrupt hit while the program was writing to the skipfile, so the skipfile was corrupted and my progress was reset.

I wrote my own code which helped me skip files already in the filesystem and resumed my previous progress. So I am submitting it here in the hope that it will help.


def download(output, snapshot_entry, connection, status_message, no_redirect=False, skipset=None):
    """
    Download a single snapshot URL and save it to its output file below ``output``.

    The URL is skipped without any request when it is already listed in ``skipset`` or when
    its output file already exists on disk (this allows resuming from the filesystem if the
    skipfile was lost or corrupted). Unless ``no_redirect`` is set, HTTP 302 responses are
    followed. gzip decompression is applied if the response is gzip-encoded.
    According to the response status, a status message is written to the console.

    Parameters:
        output: base output location passed to ``sc.create_output``.
        snapshot_entry: mapping with at least "url_archive" and "timestamp"; updated through
            ``sc.snapshot_entry_modify`` ("response", "redirect_timestamp", "redirect_url", "file").
        connection: connection object providing ``request()`` / ``getresponse()``.
        status_message: message prefix that the per-download status lines are appended to.
        no_redirect: if True, 302 responses are not followed.
        skipset: optional collection of already handled URLs, used with ``skip_read`` / ``skip_write``.

    Returns:
        True if the URL was handled (skipped, saved, already existing, filename too long or
        unexpected HTTP status). False if an ``http.client.HTTPException`` occurred or all
        retries failed, so the caller can append the URL to the failed list.
    """
    download_url = snapshot_entry["url_archive"]

    # cheapest check first: the skipset lookup needs no filesystem access
    if skipset and skip_read(skipset, download_url):
        vb.write(f"\nSKIPPING -> URL: {download_url}")
        return True

    # resume from the filesystem: a file that is already present does not need to be fetched again
    output_file = sc.create_output(download_url, snapshot_entry["timestamp"], output)
    if os.path.isfile(output_file):
        vb.write(f"\nSKIPPING -> Local File {output_file}")
        # same bookkeeping as the EXISTING case below: assign the file to the entry (keeps the
        # json/csv result complete) and record the URL so the next run can skip it via skipset
        sc.snapshot_entry_modify(snapshot_entry, "file", output_file)
        if skipset is not None:
            skip_write(skipset, download_url)
        return True

    encoded_download_url = urllib.parse.quote(download_url, safe=':/') # used for GET - otherwise always download_url
    max_retries = 2
    sleep_time = 45
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    for i in range(max_retries):
        try:
            connection.request("GET", encoded_download_url, headers=headers)
            response = connection.getresponse()
            response_data = response.read()
            response_status = response.status
            response_status_message = parse_response_code(response_status)
            sc.snapshot_entry_modify(snapshot_entry, "response", response_status)

            if not no_redirect and response_status == 302:
                status_message = f"{status_message}\n" + \
                    f"REDIRECT   -> HTTP: {response.status} - {response_status_message}\n" + \
                    f"           -> FROM: {download_url}"
                redirect_count = 0
                while response_status == 302:
                    redirect_count += 1
                    if redirect_count > 5:
                        break
                    # NOTE(review): the Location header of the response above is not read, so the
                    # first pass requests the unchanged URL once more before following the redirect
                    connection.request("GET", encoded_download_url, headers=headers)
                    response = connection.getresponse()
                    response_data = response.read()
                    response_status = response.status
                    response_status_message = parse_response_code(response_status)
                    location = response.getheader("Location")
                    if not location:
                        break
                    # report and store the redirect target, not the source URL
                    redirect_target = urljoin(download_url, location)
                    encoded_download_url = urllib.parse.quote(redirect_target, safe=':/')
                    status_message = f"{status_message}\n" + \
                        f"           ->   TO: {redirect_target}"
                    sc.snapshot_entry_modify(snapshot_entry, "redirect_timestamp", url_get_timestamp(location))
                    sc.snapshot_entry_modify(snapshot_entry, "redirect_url", redirect_target)

            if response_status != 200:
                status_message = f"{status_message}\n" + \
                    f"UNEXPECTED -> HTTP: {response_status} - {response_status_message}\n" + \
                    f"           -> URL: {download_url}"
                vb.write(status_message)
                return True

            # work on a copy so a retry starts again from the originally computed path
            target_file = output_file
            target_path = os.path.dirname(target_file)

            # case if target_path is a file, move file to temporary name, create target_path and move file into target_path
            if os.path.isfile(target_path):
                move_index(existpath=target_path)
            else:
                os.makedirs(target_path, exist_ok=True)
            # case if target_file is a directory, create file as index.html in this directory
            if os.path.isdir(target_file):
                target_file = move_index(existfile=target_file, filebuffer=response_data)

            # if filename is too long, skip download
            if len(os.path.basename(target_file)) > 255:
                status_message = f"{status_message}\n" + \
                    f"FILENAME TOO LONG -> HTTP: {response_status} - {response_status_message}\n" + \
                    f"                  -> URL: {download_url}"
                vb.write(status_message)
                if skipset is not None:
                    skip_write(skipset, snapshot_entry["url_archive"])
                return True

            if not os.path.isfile(target_file):
                # decompress before the file is created, so a corrupt payload cannot leave an
                # empty file behind that later counts as an existing download
                if response.getheader('Content-Encoding') == 'gzip':
                    response_data = gzip.decompress(response_data)
                try:
                    with open(target_file, 'wb') as file:
                        file.write(response_data)
                except BaseException:
                    # also covers Ctrl+C: a truncated file would be skipped as "already
                    # downloaded" on the next run, so remove it before passing the error on
                    try:
                        os.remove(target_file)
                    except OSError:
                        pass
                    raise
                status_message = f"{status_message}\n" + \
                    f"SUCCESS    -> HTTP: {response_status} - {response_status_message}"
            else:
                status_message = f"{status_message}\n" + \
                    f"EXISTING   -> HTTP: {response_status} - {response_status_message}"
            status_message = f"{status_message}\n" + \
                f"           -> URL: {download_url}\n" + \
                f"           -> FILE: {target_file}"
            vb.write(status_message)
            sc.snapshot_entry_modify(snapshot_entry, "file", target_file)
            if skipset is not None:
                skip_write(skipset, snapshot_entry["url_archive"])
            return True

        # exception returns false and appends the url to the failed list
        except http.client.HTTPException as e:
            status_message = f"{status_message}\n" + \
                f"EXCEPTION -> ({i+1}/{max_retries}), append to failed_urls: {download_url}\n" + \
                f"          -> {e}"
            vb.write(status_message)
            return False
        # connection timeout (or any other regular error) waits and retries;
        # KeyboardInterrupt / SystemExit are not caught here, so Ctrl+C stops immediately
        except Exception:
            vb.write(status_message)
            time.sleep(sleep_time)
    vb.write(f"FAILED  -> download, append to failed_urls: {download_url}")
    return False
bitdruid commented 5 months ago

we could also just ignore SIGINT in the finally statement. each isfile() produces additional I/O

Ghost-chu commented 5 months ago

we could also just ignore SIGINT in the finally statement. each isfile() produces additional I/O

Actually, checking whether a file exists only touches the directory entries of the file system, and the I/O cost isn't that bad, especially on modern systems with SSD storage. We can also reduce some of that I/O by checking the skipset first and only calling isfile() for URLs that are not in it.

bitdruid commented 5 months ago

Your hint about the keyboard interrupt was pretty good, though. I'm just thinking that, since we have already reduced the I/O and CPU load, we should try to keep these savings as much as possible.

Edit: one remaining problem is that skipped URLs are not assigned to a file, which makes the JSON and CSV results wrong. Fixing this may need additional resources.