itchio / itch

🎮 The best way to play your itch.io games
https://itch.io/app
MIT License
2.37k stars 210 forks source link

consider accepting shift-jis in zip and falling back to cp437 again #2103

Closed fasterthanlime closed 6 years ago

fasterthanlime commented 6 years ago

Test Go program:

package main

import (
    "bytes"
    "log"
    "os"

    "github.com/pkg/errors"

    "github.com/gogits/chardet"
    "github.com/itchio/arkive/zip"
    "golang.org/x/text/encoding/charmap"
    "golang.org/x/text/encoding/japanese"
)

// main inspects the zip archive named by the first command-line argument and
// logs the decoded name of every entry flagged NonUTF8.
//
// The raw names of all flagged entries are concatenated and handed to chardet
// as a single sample. When the best guess has a confidence above 70 and decode
// supports the detected charset, names are decoded with it; in every other
// case the name is decoded as CP-437 and logged as a fallback.
func main() {
    log.SetOutput(os.Stdout)
    if len(os.Args) < 2 {
        log.Fatalf("usage: %s <file.zip>", os.Args[0])
    }
    filename := os.Args[1]

    zr, err := zip.OpenReader(filename)
    must(err)
    // Release the archive handle on every exit path, including must's panics.
    defer zr.Close()

    // Gather the bytes of every non-UTF-8 name into one detection sample.
    var buf bytes.Buffer
    for _, f := range zr.File {
        if f.NonUTF8 {
            buf.WriteString(f.Name)
        }
    }

    d := chardet.NewTextDetector()
    res, err := d.DetectBest(buf.Bytes())
    must(err)

    // The detection result is shared by all entries, so evaluate it once.
    trusted := res.Confidence > 70

    for _, f := range zr.File {
        if !f.NonUTF8 {
            continue
        }

        if trusted {
            decoded, err := decode(f.Name, res.Charset)
            if err == nil {
                log.Printf("%s: [%s] (%d%% confidence)", decoded, res.Charset, res.Confidence)
                continue
            }
            // decode could not handle the detected charset (or the bytes);
            // use the CP-437 fallback below instead of logging an empty name.
        }

        cp437, err := charmap.CodePage437.NewDecoder().String(f.Name)
        if err != nil {
            log.Printf("%q: cannot decode as CP-437: %v", f.Name, err)
            continue
        }
        log.Printf("%s: [CP-437] (fallback)", cp437)
    }
}

// decode converts input, interpreted as text in the named charset, to a UTF-8
// string. The charset names handled are exactly the ones listed in the switch
// below; "UTF-8" input is returned unchanged. Any other charset name yields an
// empty string and an "Unknown charset" error.
func decode(input string, charset string) (string, error) {
    // convert is the String method of the decoder selected for charset.
    var convert func(string) (string, error)

    switch charset {
    case "UTF-8":
        // Nothing to convert.
        return input, nil
    case "ISO-8859-1":
        convert = charmap.ISO8859_1.NewDecoder().String
    case "ISO-8859-2":
        convert = charmap.ISO8859_2.NewDecoder().String
    case "ISO-8859-9":
        convert = charmap.ISO8859_9.NewDecoder().String
    case "windows-1250":
        convert = charmap.Windows1250.NewDecoder().String
    case "windows-1252":
        convert = charmap.Windows1252.NewDecoder().String
    case "Shift_JIS":
        convert = japanese.ShiftJIS.NewDecoder().String
    case "EUC-JP":
        convert = japanese.EUCJP.NewDecoder().String
    default:
        return "", errors.Errorf("Unknown charset %s", charset)
    }

    return convert(input)
}

// must panics with err when err is non-nil and does nothing otherwise.
func must(err error) {
    if err == nil {
        return
    }
    panic(err)
}

Command line:

for i in *.zip; do echo $i; time cdtest "$i" | head -3; done

Output:

a-dark-place.zip
2018/09/08 03:32:12 A Dark Place/M-ê▌▌-ç▌Ö▌▌▌¥u-ü▌▌-à-ös▌º-Ö-ê▌«▌¡-ò▌+▌ùi▌▌▌▌-Öc▌▌▌«▌ÿ▌»B▌ío▌í-ò▌ù▌«▌+-òx▌¿e-ì-à▌ñ▌+s-à▌¼-Ö▌ú▌Ñ/: [CP-437] (fallback)
futayuri.zip
2018/09/08 03:32:12 futayuri/www/audio/bgm/m_さんさん日和.rpgmvo: [Shift_JIS] (100% confidence)
2018/09/08 03:32:12 futayuri/www/audio/bgm/m_まどろむ時間.rpgmvo: [Shift_JIS] (100% confidence)
2018/09/08 03:32:12 futayuri/www/audio/bgm/m_まどろむ時間2.rpgmvo: [Shift_JIS] (100% confidence)
lizard.zip
2018/09/08 03:32:12 language/text_français.lit: [CP-437] (fallback)
lizard-repacked.zip
typeknight.zip
2018/09/08 03:32:12 type knight/dictionnaires/français.txt: [CP-437] (fallback)
win10-7z.zip
2018/09/08 03:32:12 win10/mémoire.txt: [CP-437] (fallback)
win10-winrar.zip
2018/09/08 03:32:12 win10/mémoire.txt: [CP-437] (fallback)
fasterthanlime commented 6 years ago

I'm curious to find an ISO-8859-{1,2} zip in the wild; I could only find CP-437 and Shift-JIS ones so far.

fasterthanlime commented 6 years ago

The only thing left to do is to release a butler point version, which I will do tomorrow morning.

fasterthanlime commented 6 years ago

butler v14.3.0 shipped with that, and it seems to work fine.