utf8 option fails to convert some subtitles

milahu commented 1 year ago

$ alias subdl='subdl --utf8 --download=all --existing=bypass --force-filename --username=xx --password=xx'

$ subdl Irreversible.2002.mp4 
Searching for subtitles for query=Irreversible.2002...
Found 24 results for 'Irreversible.2002.mp4':
#1952041941 [en] [Rat: 0.0 DL:70016] "Irreversible" Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt 
[...]
Found encoding Johab with a confidence of 99.00%. Converting to utf8.
Downloading #1952041941 to Irreversible.2002.en.1952041941.srt... Traceback (most recent call last):
  File ".subdl-wrapped", line 9, in <module>
    sys.exit(cli())
  File "subdl.py", line 534, in cli
    main(sys.argv[1:])
  File "subdl.py", line 504, in main
    AutoDownloadAndSave(file, search_result, downloaded)
  File "subdl.py", line 339, in AutoDownloadAndSave
    DownloadAndSaveSubtitle(search_result.IDSubtitleFile, output_filename)
  File "subdl.py", line 315, in DownloadAndSaveSubtitle
    s = s.decode(result["encoding"]) # bytes -> str
UnicodeDecodeError: 'johab' codec can't decode byte 0xb4 in position 13235: illegal multibyte sequence

Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt https://www.opensubtitles.org/en/subtitles/3431287/irreversible-en

cd $(mktemp -d)
sub_id=3431287
wget https://dl.opensubtitles.org/en/download/sub/$sub_id
unzip -B $sub_id
chardetect *.srt 
# Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt: Johab with confidence 0.99
iconv -f johab -t utf8 *.srt >/dev/null 
# iconv: illegal input sequence at position 13187
dd if=$(ls *.srt) bs=1 skip=$((13187 - 8)) count=16 status=none | hexdump -C
# 00000000  79 21 0d 0a 2d 20 57 65  b4 72 65 20 67 6f 69 6e  |y!..- We.re goin|

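"johab" sounds wrong for an English subtitle file. Let's try latin1, where the offending byte 0xb4 is the acute accent (´), used here as an apostrophe in "We´re":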

iconv -f latin1 -t utf8 *.srt >/dev/null && echo ok
# ok

Success! So it's a bug in chardet: it misdetects this file as Johab ...

TODO (workaround): when conversion to UTF-8 fails, keep the original file, show a warning, and rename the result file to $basename.noutf8.$extension, for example Irreversible.2002.en.1952041941.noutf8.srt

milahu commented 1 year ago

To turn the failed conversion into a warning instead of a crash:

--- a/subdl.py
+++ b/subdl.py
@@ -312,8 +312,11 @@
         if not result["encoding"] in {"ascii", "utf-8"}:
             print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
             # separate lines for easier debugging
-            s = s.decode(result["encoding"]) # bytes -> str
-            s = s.encode("utf8") # str -> bytes
+            try:
+                s = s.decode(result["encoding"]).encode("utf8") # bytes -> str -> bytes
+            except UnicodeDecodeError as err:
+                print(f"failed to convert {destfilename}: {err}")
+                # keep original encoding of file
     writefile(destfilename, s)
     print("done, wrote %d bytes."% (len(s)), file=sys.stderr)

milahu commented 1 year ago

To use libmagic (through the Python `magic` module) instead of chardet:

--- a/subdl.py
+++ b/subdl.py
@@ -307,13 +307,16 @@
     if options.filter:
         s = filtersub(s)
     if options.utf8:
-        import chardet
-        result = chardet.detect(s)
-        if not result["encoding"] in {"ascii", "utf-8"}:
-            print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
+        import magic
+        result = magic.detect_from_content(s)
+        if not result.encoding in {"us-ascii", "utf-8"}:
+            print(f"Found encoding {result['encoding']}. Converting to utf8.")
             # separate lines for easier debugging
-            s = s.decode(result["encoding"]) # bytes -> str
-            s = s.encode("utf8") # str -> bytes
+            try:
+                s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
+            except UnicodeDecodeError as err:
+                print(f"failed to convert to utf8: {destfilename}: {err}")
+                # keep original encoding of file
     writefile(destfilename, s)
     print("done, wrote %d bytes."% (len(s)), file=sys.stderr)

@@ -436,9 +439,9 @@
         elif option == '--utf8':
             options.utf8 = True
             try:
-                import chardet
+                import magic
             except ModuleNotFoundError:
-                sys.stderr.write("Error: The --utf8 option requires the chardet module from https://pypi.org/project/chardet/ - Hint: pip install chardet\n")
+                sys.stderr.write("Error: The --utf8 option requires the python-magic module from https://pypi.org/project/python-magic/ - Hint: pip install python-magic\n")
                 sys.exit(1)
         elif option == '--list-languages':
             ListLanguages()

--- a/subdl.py
+++ b/subdl.py
@@ -310,12 +310,12 @@
         import magic
         result = magic.detect_from_content(s)
         if not result.encoding in {"us-ascii", "utf-8"}:
-            print(f"Found encoding {result['encoding']}. Converting to utf8.")
+            print(f"Found encoding {result.encoding}. Converting to utf8.")
             # separate lines for easier debugging
             try:
                 s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
             except UnicodeDecodeError as err:
-                print(f"failed to convert to utf8: {destfilename}: {err}")
+                print(f"failed to convert to utf8 from {result.encoding}: {destfilename}: {err}")
                 # keep original encoding of file
     writefile(destfilename, s)
     print("done, wrote %d bytes."% (len(s)), file=sys.stderr)

--- a/subdl.py
+++ b/subdl.py
@@ -309,7 +309,7 @@
     if options.utf8:
         import magic
         result = magic.detect_from_content(s)
-        if not result.encoding in {"us-ascii", "utf-8"}:
+        if not result.encoding in {"us-ascii", "utf-8", "unknown-8bit", "binary"}:
             print(f"Found encoding {result.encoding}. Converting to utf8.")
             # separate lines for easier debugging
             try:

alexanderwink / subdl

utf8 option fails to convert some subtitles #37