mikf / gallery-dl

Command-line program to download image galleries and collections from several image hosting sites
GNU General Public License v2.0
11.4k stars 931 forks source link

Twitter database configuration #2896

Open ArneAnka opened 2 years ago

ArneAnka commented 2 years ago

Hi!

Using version 1.23.0 of gallery-dl

Here is my configuration file:

{
    "extractor":
    {
        "base-directory": "~/Pictures/gallery-dl/",

        "#": "set global archive file for all extractors",
        "archive": "~/Pictures/gallery-dl/archive.sqlite3",

        "#": "add two custom keywords into the metadata dictionary",
        "#": "these can be used to further refine your output directories or filenames",
        "keywords": {"bkey": "", "ckey": ""},
        "#": "make sure that custom keywords are empty, i.e. they don't appear unless specified by the user",
        "keywords-default": "",

        "#": "replace invalid path characters with unicode alternatives",
        "path-restrict": {
            "\\": "⧹",
            "/" : "⧸",
            "|" : "│",
            ":" : "꞉",
            "*" : "∗",
            "?" : "？",
            "\"": "″",
            "<" : "﹤",
            ">" : "﹥"
        },

        "#": "write tags for several *booru sites",
        "postprocessors": [
            {
                "name": "metadata",
                "mode": "tags",
                "whitelist": ["danbooru", "moebooru", "sankaku"]
            }
        ],

        "reddit":
        {
            "#": "only spawn child extractors for links to specific sites",
            "whitelist": ["imgur", "redgifs", "gfycat"],

            "#": "put files from child extractors into the reddit directory",
            "parent-directory": true,

            "#": "transfer metadata to any child extractor as '_reddit'",
            "parent-metadata": "_reddit"
        },

        "imgur":
        {
            "#": "use different directory and filename formats when coming from a reddit post",
            "directory":
            {
                "'_reddit' in locals()": []
            },
            "filename":
            {
                "'_reddit' in locals()": "{_reddit[id]} {id}.{extension}",
                ""                     : "{id}.{extension}"
            }
        },

        "twitter":
        {
            "#": "write text content for *all* tweets",
            "postprocessors": ["content"],
            "text-tweets": true,
            "username": "<username>",
            "password": "<password>"
        }

    },

    "downloader":
    {
        "#": "restrict download speed to 1 MB/s",
        "rate": "1M",

        "#": "show download progress indicator after 2 seconds",
        "progress": 2.0,

        "#": "retry failed downloads up to 3 times",
        "retries": 3,

        "#": "consider a download 'failed' after 8 seconds of inactivity",
        "timeout": 8.0,

        "#": "write '.part' files into a special directory",
        "part-directory": "/tmp/.download/",

        "#": "do not update file modification times",
        "mtime": false,

        "ytdl":
        {
            "#": "use yt-dlp instead of youtube-dl",
            "module": "yt_dlp"
        }
    },

    "output":
    {
        "log": {
            "level": "info",

            "#": "use different ANSI colors for each log level",
            "format": {
                "debug"  : "\u001b[0;37m{name}: {message}\u001b[0m",
                "info"   : "\u001b[1;37m{name}: {message}\u001b[0m",
                "warning": "\u001b[1;33m{name}: {message}\u001b[0m",
                "error"  : "\u001b[1;31m{name}: {message}\u001b[0m"
            }
        },

        "#": "shorten filenames to fit into one terminal line",
        "#": "while also considering wider East-Asian characters",
        "shorten": "eaw",

        "#": "enable ANSI escape sequences on Windows",
        "ansi": true,

        "#": "write logging messages to a separate file",
        "logfile": {
            "path": "~/Pictures/gallery-dl/log.txt",
            "mode": "w",
            "level": "debug"
        },

        "#": "write unrecognized URLs to a separate file",
        "unsupportedfile": {
            "path": "~/Pictures/gallery-dl/unsupported.txt",
            "mode": "a",
            "format": "{asctime} {message}",
            "format-date": "%Y-%m-%d-%H-%M-%S"
        }
    },

    "postprocessor":
    {
        "#": "write 'content' metadata into separate files",
        "content":
        {
            "name" : "metadata",

            "#": "write data for every post instead of each individual file",
            "event": "post",
            "filename": "{post_id|tweet_id|id}.txt",

            "#": "write only the values for 'content' or 'description'",
            "mode" : "custom",
            "format": "{content|description}\n"
        },

        "#": "put files into a '.cbz' archive",
        "cbz":
        {
            "name": "zip",
            "extension": "cbz"
        },

        "#": "various ugoira post processor configurations to create different file formats",
        "ugoira-webm":
        {
            "name": "ugoira",
            "extension": "webm",
            "ffmpeg-args": ["-c:v", "libvpx-vp9", "-an", "-b:v", "0", "-crf", "30"],
            "ffmpeg-twopass": true,
            "ffmpeg-demuxer": "image2"
        },
        "ugoira-mp4":
        {
            "name": "ugoira",
            "extension": "mp4",
            "ffmpeg-args": ["-c:v", "libx264", "-an", "-b:v", "4M", "-preset", "veryslow"],
            "ffmpeg-twopass": true,
            "libx264-prevent-odd": true
        },
        "ugoira-gif":
        {
            "name": "ugoira",
            "extension": "gif",
            "ffmpeg-args": ["-filter_complex", "[0:v] split [a][b];[a] palettegen [p];[b][p] paletteuse"]
        },
        "ugoira-copy": {
            "name": "ugoira",
            "extension": "mkv",
            "ffmpeg-args": ["-c", "copy"],
            "libx264-prevent-odd": false,
            "repeat-last-frame": false
        }
    },

    "#": "use a custom cache file location",
    "cache": {
        "file": "~/Pictures/gallery-dl/cache.sqlite3"
    }
}

I'm investigating whether gallery-dl can write more data to the database (in my case ~/Pictures/gallery-dl/archive.sqlite3), for example the tweet content, images, the post date of the tweet, etc.

I don't know how I would otherwise get that information. I would somehow need to write an extra script (for example, a Python script) that hits the URL once again. I would like to somehow display the tweets locally.

And what about the logs? Every time I run gallery-dl, it seems to check EVERY already downloaded tweet.

VaslD commented 2 years ago

There is an "archive-format" key which controls which fields are stored in the database record. I use it to combine several fields to make a record unique for some other extractor. It should work for "twitter" as well.

Although the archive file is stored as an SQLite 3 database, gallery-dl does not use it as a full-featured database. The archive database works like a checklist. I think gallery-dl uses it solely for de-duplication; that is, when the metadata for a tweet is known, gallery-dl uses your "archive-format" to generate a hash-like digest and compares it with the records already in the database; it then skips the download if the same record exists. It doesn't care what your "archive-format" contains, but each record must be stable (it doesn't change for the same source) and unique across all extractors. The record itself also has no structure; think of it as a single line in a text (txt) file. If you want structure (fields that map back to columns of a table or properties of a class), you need to design one yourself and make it transportable in a single line of text.