wkentaro / gdown

Google Drive Public File Downloader when Curl/Wget Fails
MIT License
4.11k stars 343 forks source link

Suggestion: Skip downloading if the file already downloaded #175

Closed ArtemisDicoTiar closed 2 years ago

ArtemisDicoTiar commented 2 years ago

Downloading a file that already exists at the target path results in a redundant download. I hope there will be an option to skip such downloads in both the CLI and the Python package.

kinoc commented 2 years ago

I have a function that downloads the metadata first, then calls the normal download if the data is valid and the file doesn't exist. Might be useful either way for ideas... `url_set` contains a list of file IDs; `dest_path` contains the target directory with the trailing directory separator already appended, and the directory already exists.

def fetch_file_metadata_from_google_drive(id):
    """Fetch download metadata for a Google Drive file without saving it.

    Issues one streamed GET against the ``uc?id=...`` download endpoint and
    records what the response reveals about the file. The HTTP session and
    the streamed response are always closed before returning, so the
    connection is released even though the body is usually never read.

    Args:
        id: Google Drive file id. The name shadows the ``id`` builtin; it is
            kept so that existing keyword callers continue to work.

    Returns:
        A dict containing:
            ``id``: the file id that was passed in.
            ``url``: the download URL that was requested.
            ``token``: whatever ``get_confirm_token`` returned.
            ``valid``: True only when a filename was parsed from the
                ``Content-Disposition`` response header.
            ``filename``: the parsed filename with path-unsafe characters
                stripped, or ``''`` when ``valid`` is False.
            one entry per response header, with the value stringified.
            ``redirect_url``: present when there was no
                ``Content-Disposition`` header and a confirmation URL could
                be extracted from the response body.
            ``browser_url``: present when extracting that confirmation URL
                raised ``RuntimeError``.
    """
    URL = "https://drive.google.com/uc?id="+id+"&authuser=0&export=download&confirm=t"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"  # NOQA
    }
    meta = {
        'id': id,
        'valid': False,
        'filename': '',
        'url': URL,
    }

    with requests.Session() as session:
        # stream=True defers the body download, so only the headers are
        # transferred unless the confirmation page has to be parsed below.
        response = session.get(
            URL,
            allow_redirects=True,
            headers=headers,
            stream=True,
            verify=True,
        )
        try:
            meta['token'] = get_confirm_token(response)
            for k in response.headers:
                meta[k] = str(response.headers[k])

            if 'Content-Disposition' in response.headers:
                # Get the filename from the response header 'Content-Disposition'
                match = re.search(
                    r'filename="(?P<filename>.+)"',
                    response.headers['Content-Disposition'],
                )
                # Make sure we found the filename field inside Content-Disposition
                if match is None:
                    print('\n\nERROR: Unable to retrieve `dest_path` from `file_id`, please set it manually.')
                    return meta
                if platform == 'win32':
                    # Make it Windows safe, stripping: \/<>:"|?*
                    unsafe = '\\/<>:"|?*'
                else:
                    # Make it macOS and linux safe, stripping: /
                    unsafe = '/'
                remove_characters = {ord(char): None for char in unsafe}

                meta['filename'] = match['filename'].translate(remove_characters)
                meta['valid'] = True
                print(json.dumps(meta, indent=4))
                return meta

            # Need to redirect with confirmation
            try:
                url = get_url_from_gdrive_confirmation(response.text)
            except RuntimeError as e:
                # The whole error report goes to stderr so it stays together
                # when stdout is redirected.
                print("Access denied with the following error:", file=sys.stderr)
                error = "\n".join(textwrap.wrap(str(e)))
                error = indent(error, "\t")
                print("\n", error, "\n", file=sys.stderr)
                print(
                    "You may still be able to access the file from the browser:",
                    file=sys.stderr,
                )
                print("\n\t", URL, "\n", file=sys.stderr)
                meta['browser_url'] = URL
                return meta
            print("redirect_url:", id, "-->", url)
            meta['redirect_url'] = url
            return meta
        finally:
            # A streamed response holds its connection until it is closed.
            response.close()

# Walk every file id in url_set: look up its metadata first, and only start
# the real download when the metadata is valid and no file with that name
# exists under dest_path yet.
for file_id in url_set:
    meta = fetch_file_metadata_from_google_drive(file_id)

    if not meta['valid']:
        print("INVALID:", file_id)
        continue

    # dest_path is expected to already end with a directory separator, so
    # plain concatenation yields the path of the would-be downloaded file.
    epath = dest_path + meta['filename']
    if os.path.exists(epath):
        print("ALREADY HAVE:", meta['filename'])
        continue

    gdown.download(meta['url'], output=dest_path, quiet=False)