Semi-automate identifying matching snaps/flatpaks

Here's a script which

- fetches details for all the flatpaks in Flathub (via the flatpak command above, so it needs flatpak installed, and via the Flathub API)
- fetches details for all the snaps (via the snap names file above and via the local snap store API, so it needs snap installed)
- pairs them up by website, on the assumption that a package's website is a unique key -- a slightly dubious assumption, but it's fairly reasonable, and it does seem to find some pairs
- prints them out

It aggressively caches the data (because fetching six thousand things takes a little while), but it tells you what it has cached so that you can remove it. In particular, it caches all the fetched results in one big file per package type (cache/flatpak-data.json and cache/snap-data.json). If you remove one of those along with the cache of names (cache/flatpak-names.json), it will get the list of flatpaks afresh and fetch details for any new ones, while not refetching the ones it already knows about (the individual HTTP responses are cached separately by requests_cache).

Script first:

#!/usr/bin/env python3

import requests_cache
import subprocess
import json
import os
import socket

from urllib3.connection import HTTPConnection
from urllib3.connectionpool import HTTPConnectionPool
from requests.adapters import HTTPAdapter

import logging

# Root logger threshold: only WARNING and above is emitted by default.
logging.basicConfig(level='WARNING')
# The requests_cache logger is lowered to DEBUG on its own
# (presumably so cache activity is visible while fetching -- confirm).
logging.getLogger('requests_cache').setLevel('DEBUG')

# from https://stackoverflow.com/a/59594889
class SnapdConnection(HTTPConnection):
    """HTTP connection that is carried over a Unix domain socket.

    The socket path is fixed to ``/run/snapd.socket``.
    """

    def __init__(self):
        # The base class requires a host; connect() below does not use it
        # to decide where to connect.
        super().__init__("localhost")

    def connect(self):
        """Create an AF_UNIX stream socket and connect it to snapd's socket."""
        self.sock = socket.socket(
            family=socket.AF_UNIX,
            type=socket.SOCK_STREAM,
        )
        self.sock.connect("/run/snapd.socket")

class SnapdConnectionPool(HTTPConnectionPool):
    """Connection pool that hands out SnapdConnection objects."""

    def __init__(self):
        # The base class requires a host; "localhost" is passed as a fixed value.
        super().__init__("localhost")

    def _new_conn(self):
        """Return a new SnapdConnection each time it is called."""
        return SnapdConnection()

class SnapdAdapter(HTTPAdapter):
    """requests transport adapter that sends every request via SnapdConnectionPool.

    Mount it on a session prefix (the script uses ``http://snapd/``) so that
    requests to that prefix go through a SnapdConnectionPool instead of a
    normal TCP connection pool.
    """

    def get_connection(self, url, proxies=None):
        """Return a new SnapdConnectionPool; ``url`` and ``proxies`` are ignored."""
        return SnapdConnectionPool()

    def get_connection_with_tls_context(self, request, verify,
                                        proxies=None, cert=None):
        """Return the snapd pool on requests >= 2.32 as well.

        Newer requests releases call this hook from ``HTTPAdapter.send()``
        instead of ``get_connection()``. Without this override the request
        would be sent over TCP to the host named in the URL rather than
        over the Unix socket. ``verify`` and ``cert`` are ignored.
        """
        return self.get_connection(request.url, proxies)

def make_cache():
    """Create the cache directory and return an object bundling cache handles.

    Returns:
        A namespace object with these attributes:

        * ``cache_folder`` -- path of the ``cache`` directory next to this file
        * ``flatpak`` -- ``requests_cache.CachedSession`` for Flathub requests
        * ``snap`` -- ``requests_cache.CachedSession`` for snapd requests, with
          ``http://snapd/`` mounted on a ``SnapdAdapter``
        * ``file`` -- callable mapping a file name to a path in ``cache_folder``
    """
    # Local import: this is the only place a plain attribute bag is needed.
    from types import SimpleNamespace

    cache_folder = os.path.join(os.path.dirname(__file__), "cache")
    os.makedirs(cache_folder, exist_ok=True)

    def new_session(name):
        # 404 responses are cached as well as 200s, so a name that the API
        # does not know is not requested again on the next run.
        return requests_cache.CachedSession(
            cache_name=os.path.join(cache_folder, name),
            allowable_codes=(200, 404))

    cache_obj = SimpleNamespace(
        cache_folder=cache_folder,
        flatpak=new_session("flatpak"),
        snap=new_session("snap"),
    )
    cache_obj.snap.mount("http://snapd/", SnapdAdapter())
    # Reads cache_folder at call time, as the original closure did.
    cache_obj.file = lambda x: os.path.join(cache_obj.cache_folder, x)
    return cache_obj

def get_flatpak_names(cache):
    """Return the list of flatpak application IDs, using a JSON file cache.

    If ``flatpak-names.json`` in the cache folder can be read and parsed, its
    content is returned. Otherwise ``flatpak search . --columns=application``
    is run, its output is filtered (blank lines, names containing
    ``Gtk3theme`` and names ending in ``.Sdk`` are dropped), and the result is
    written to the cache file and returned.

    Args:
        cache: object with a ``file(name)`` callable returning a cache path.

    Returns:
        A list of application ID strings.

    Raises:
        subprocess.CalledProcessError: if the ``flatpak`` command exits with a
            non-zero status. Raising avoids caching an empty list after a
            failed run.
        OSError: if ``flatpak`` cannot be started or the cache file cannot be
            written.
    """
    cache_fn = cache.file("flatpak-names.json")
    try:
        with open(cache_fn, encoding="utf-8") as fp:
            cached_names = json.load(fp)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache file: rebuild it below.
        pass
    else:
        print(f"(using cached flatpak name data: rm {cache_fn} to clear)")
        return cached_names

    proc = subprocess.run(["flatpak", "search", ".", "--columns=application"],
                          capture_output=True, encoding="utf-8", check=True)
    # Strip first so the filters see the same text that is stored.
    stripped = (line.strip() for line in proc.stdout.splitlines())
    fns = [name for name in stripped
           if name and "Gtk3theme" not in name and not name.endswith(".Sdk")]
    with open(cache_fn, encoding="utf-8", mode="w") as fp:
        json.dump(fns, fp, indent=2)
    return fns

def get_snap_names(cache, names_path="/var/cache/snapd/names"):
    """Return the snap names listed in snapd's names file.

    Args:
        cache: unused; accepted so the signature parallels get_flatpak_names.
        names_path: path of a text file with one snap name per line.
            Defaults to the file the script has always read.

    Returns:
        A list of the non-blank lines, stripped of surrounding whitespace.

    Raises:
        OSError: if ``names_path`` cannot be opened.
    """
    with open(names_path, encoding="utf-8") as fp:
        stripped = (line.strip() for line in fp)
        return [name for name in stripped if name]

def populate_flatpak_data(fns, cache):
    """Return Flathub app details keyed by application ID, using a file cache.

    If ``flatpak-data.json`` in the cache folder can be read and parsed, its
    content is returned and ``fns`` is not consulted. Otherwise each name in
    ``fns`` is requested from ``https://flathub.org/api/v1/apps/<name>``
    through ``cache.flatpak``; 404 responses and responses whose body is not
    valid JSON are skipped. The collected mapping is written to the cache
    file and returned.

    Args:
        fns: iterable of flatpak application IDs (must support ``len``).
        cache: object providing ``file(name)`` and a ``flatpak`` session with
            ``get(url)`` and a ``_cache_name`` attribute.

    Returns:
        dict mapping application ID to the decoded JSON response.
    """
    cache_fn = cache.file("flatpak-data.json")
    try:
        with open(cache_fn, encoding="utf-8") as fp:
            cached_data = json.load(fp)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache file: refetch below.
        pass
    else:
        print(f"(using cached flatpak detail data: rm {cache_fn} to clear)")
        return cached_data

    print(f"Fetching flatpak data from API (which may be cached; rm {cache.flatpak._cache_name} to clear)")
    flatpak_data = {}
    count = len(fns)
    for idx, flatpak_name in enumerate(fns):
        if idx % 100 == 0:
            print(f"  flatpak data {idx}/{count}")
        url = f"https://flathub.org/api/v1/apps/{flatpak_name}"
        resp = cache.flatpak.get(url)
        if resp.status_code == 404:
            continue
        try:
            flatpak_data[flatpak_name] = resp.json()
        except ValueError:
            # JSON decode errors are ValueError subclasses. The message was
            # missing its f-prefix and printed the placeholders literally.
            print(f"Unexpected flathub data error from {url}, {resp}")
    with open(cache_fn, encoding="utf-8", mode="w") as fp:
        json.dump(flatpak_data, fp, indent=2)
    return flatpak_data

def populate_snap_data(sns, cache):
    """Return snapd ``find`` responses keyed by snap name, using a file cache.

    If ``snap-data.json`` in the cache folder can be read and parsed, its
    content is returned and ``sns`` is not consulted. Otherwise each name in
    ``sns`` is looked up via ``http://snapd/v2/find?name=<name>`` through
    ``cache.snap``. Responses whose body is not valid JSON are reported and
    skipped, matching populate_flatpak_data. The collected mapping is written
    to the cache file and returned.

    Args:
        sns: iterable of snap names (must support ``len``).
        cache: object providing ``file(name)`` and a ``snap`` session with
            ``get(url)`` and a ``_cache_name`` attribute.

    Returns:
        dict mapping snap name to the decoded JSON response.
    """
    cache_fn = cache.file("snap-data.json")
    try:
        with open(cache_fn, encoding="utf-8") as fp:
            cached_data = json.load(fp)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache file: refetch below.
        pass
    else:
        print(f"(using cached snap detail data: rm {cache_fn} to clear)")
        return cached_data

    # Name the snap session's cache here; the message used to point at the
    # flatpak cache.
    print(f"Fetching snap data from API (which may be cached; rm {cache.snap._cache_name} to clear)")
    snap_data = {}
    count = len(sns)
    for idx, snap_name in enumerate(sns):
        if idx % 100 == 0:
            print(f"  snap data {idx}/{count}")
        url = f"http://snapd/v2/find?name={snap_name}"
        resp = cache.snap.get(url)
        try:
            snap_data[snap_name] = resp.json()
        except ValueError:
            # JSON decode errors are ValueError subclasses; skip this snap
            # rather than abort the whole run.
            print(f"Unexpected snapd data error from {url}, {resp}")
    with open(cache_fn, encoding="utf-8", mode="w") as fp:
        json.dump(snap_data, fp, indent=2)
    return snap_data

def get_pairs(flatpaks, snaps):
    """Pair snaps with flatpaks that declare exactly the same website URL.

    Args:
        flatpaks: dict mapping flatpak ID to a details dict; the website is
            ``homepageUrl``, falling back to ``bugtrackerUrl`` when the
            homepage is missing, null or empty.
        snaps: dict mapping snap name to a response dict; only entries with
            ``"status-code" == 200`` are considered, and the website is the
            ``website`` field of the first item of ``result``.

    Returns:
        A sorted list of ``(snap_name, flatpak_id)`` tuples. When several
        packages of one kind share a website, the last one seen is used.
    """
    flatpaks_by_website = {}
    for flatpak_id, details in flatpaks.items():
        # "or" makes the fallback apply to a present-but-empty homepageUrl,
        # not only to a missing key.
        website = details.get("homepageUrl") or details.get("bugtrackerUrl")
        if website:
            flatpaks_by_website[website] = flatpak_id

    snaps_by_website = {}
    for snap_name, response in snaps.items():
        if response.get("status-code") != 200:
            continue
        # An empty or missing result list yields no website instead of
        # raising IndexError.
        results = response.get("result") or [{}]
        website = results[0].get("website")
        if website:
            snaps_by_website[website] = snap_name

    matches = snaps_by_website.keys() & flatpaks_by_website.keys()
    return sorted(
        (snaps_by_website[w], flatpaks_by_website[w])
        for w in matches
    )

def main():
    """Load (or fetch) flatpak and snap data, then print the suggested pairs."""
    cache = make_cache()

    # Both name lists are gathered before any detail data is fetched.
    flatpak_names = get_flatpak_names(cache)
    snap_names = get_snap_names(cache)

    flatpaks = populate_flatpak_data(flatpak_names, cache)
    snaps = populate_snap_data(snap_names, cache)

    pairs = get_pairs(flatpaks, snaps)
    report_lines = [f"{snap} = {flatpak}" for snap, flatpak in pairs]

    print("I suggest that the following are snap/flatpak pairs:")
    print("\n".join(report_lines))
    print(f"({len(pairs)} potential matches)")

# Run the matcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Secondly, the results. This is not a PR against the CSV file because I have not checked most of these, so I don't know whether the decisions it makes are good. But here's the list so someone else can do that.

I suggest that the following are snap/flatpak pairs:
audacity = org.audacityteam.Audacity
bible-multi-the-son-of-man = org.hlwd.sonofman
bisq-desktop = network.bisq.Bisq
bitcoin-core = org.bitcoincore.bitcoin-qt
blender = org.blender.Blender
blockbench-snapcraft = net.blockbench.Blockbench
bolls = life.bolls.bolls
boxy-svg = com.boxy_svg.BoxySVG
cbetar2 = io.github.mrmyhuang.cbetar2
citra-emu = org.citra_emu.citra
cudatext = io.github.cudatext.CudaText-Qt5
czkawka = com.github.qarmin.czkawka
dbeaver-ce = io.dbeaver.DBeaverCommunity
dc-tiledmap = org.mapeditor.Tiled
deja-dup = org.gnome.DejaDup
digitales-klassenzimmer = de.hoppfoundation.klassenzimmer
duolingo-desktop = ro.go.hmlendea.DL-Desktop
firefox = org.mozilla.firefox
fluffychat = im.fluffychat.Fluffychat
foliate = com.github.johnfactotum.Foliate
freac = org.freac.freac
freeplane-mindmapping = org.freeplane.App
fsearch = io.github.cboxdoerfer.FSearch
fsuae = net.fsuae.FS-UAE
gimp = org.gimp.GIMP
gitkraken = com.axosoft.GitKraken
gnome-easytag = org.gnome.EasyTAG
google-play-music-desktop-player = com.googleplaymusicdesktopplayer.GPMDP
gridplayer = com.vzhd1701.gridplayer
gtkhash = org.gtkhash.gtkhash
handbrake-jz = fr.handbrake.ghb
hw-probe = org.linux_hardware.hw-probe
i2pd = website.i2pd.i2pd
icalingua = io.github.Icalingua.Icalingua
inkscape = org.inkscape.Inkscape
jahresarbeit-2003 = com.github.christianrauch.Jahresarbeit-2003
jdreplace = com.gitlab.JakobDev.jdReplace
joplin = net.cozic.joplin_desktop
keepassxc = org.keepassxc.KeePassXC
krop = com.github.arminstraub.krop
librepcb = org.librepcb.LibrePCB
libretrack = ru.proninyaroslav.libretrack
liferea = net.sourceforge.liferea
litteris = com.github.raibtoffoletto.litteris
logarithmplotter = eu.ad5001.LogarithmPlotter
losslesscut = no.mifi.losslesscut
love2d = org.love2d.love2d
m64p = io.github.m64p.m64p
mediaconch-gui = net.mediaarea.MediaConch
melonds = net.kuribo64.melonDS
midori = org.midori_browser.Midori
midterm = app.midterm.MidtermDesktop
motrix = net.agalwood.Motrix
movie-monad = com.lettier.movie-monad
muezzin = io.github.dbchoco.muezzin
munadi = org.munadi.Munadi
neochat = org.kde.neochat
netbeans = org.apache.netbeans
nitrokey-app = com.nitrokey.nitrokey-app
nordpass = com.nordpass.NordPass
notepadqq = com.notepadqq.Notepadqq
nuclear = org.js.nuclear.Nuclear
octave = org.octave.Octave
openscad-nightly = org.openscad.OpenSCAD
opentodolist = net.rpdev.OpenTodoList
openttd = org.openttd.OpenTTD
passky = com.rabbit_company.passky
picard = org.musicbrainz.Picard
pick-colour-picker = org.kryogenix.Pick
poedit = net.poedit.Poedit
postman = com.getpostman.Postman
powertabeditor = com.github.powertab.powertabeditor
qbittorrent-arnatious = org.qbittorrent.qBittorrent
qprompt = com.cuperino.qprompt
quadrix = chat.quadrix.Quadrix
qv2ray = com.github.Qv2ray
redis-desktop-manager = app.resp.RESP
remote-touchpad = com.github.unrud.RemoteTouchpad
retroarch = org.libretro.RetroArch
rocketchat-server = chat.rocket.RocketChat
rpmlauncher = ga.rpmtw.rpmlauncher
runelite = net.runelite.RuneLite
scantailor-advanced = com.github._4lex4.ScanTailor-Advanced
scummvm = org.scummvm.ScummVM
shortwave = de.haeckerfelix.Shortwave
shotcut = org.shotcut.Shotcut
skycheckers = net.zgcoder.skycheckers
slack = com.slack.Slack
sleek = com.github.ransome1.sleek
squirrelsql = net.sourceforge.squirrel_sql
sublime-merge = com.sublimemerge.App
sweethome3d-homedesign = com.sweethome3d.Sweethome3d
synfigstudio = org.synfig.SynfigStudio
szyszka = com.github.qarmin.szyszka
tandem = chat.tandem.Client
teams-for-linux = com.github.IsmaelMartinez.teams_for_linux
the-powder-toy = uk.co.powdertoy.tpt
tradesim = com.github.horaciodrs.tradesim
umbrello = org.kde.kmines
vice-jz = net.sf.VICE
vidcutter = com.ozmartians.VidCutter
video-downloader = com.github.unrud.VideoDownloader
virtualxt = org.virtualxt.VirtualXT
vlc = org.videolan.VLC
wallpaperdownloader = es.estoes.wallpaperDownloader
warzone2100 = net.wz2100.wz2100
whatsapp-for-linux = com.github.eneshecan.WhatsAppForLinux
xmind = net.xmind.XMind
youtube-music-desktop-app = app.ytmdesktop.ytmdesktop
yuzu = org.yuzu_emu.yuzu
zaproxy = org.zaproxy.ZAP
zoom-client = us.zoom.Zoom
(112 potential matches)

popey / unsnap

Semi-automate identifying matching snaps/flatpaks #31