alexanderwink / subdl

subdl - command-line tool to download subtitles from opensubtitles.org. Official support for subdl seems to have been dropped. This site is intended for the future use of subdl with community-driven support.
87 stars 18 forks source link

utf8 option fails to convert some subtitles #37

Open milahu opened 1 year ago

milahu commented 1 year ago
$ alias subdl='subdl --utf8 --download=all --existing=bypass --force-filename --username=xx --password=xx'

$ subdl Irreversible.2002.mp4 
Searching for subtitles for query=Irreversible.2002...
Found 24 results for 'Irreversible.2002.mp4':
#1952041941 [en] [Rat: 0.0 DL:70016] "Irreversible" Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt 
[...]
Found encoding Johab with a confidence of 99.00%. Converting to utf8.
Downloading #1952041941 to Irreversible.2002.en.1952041941.srt... Traceback (most recent call last):
  File ".subdl-wrapped", line 9, in <module>
    sys.exit(cli())
  File "subdl.py", line 534, in cli
    main(sys.argv[1:])
  File "subdl.py", line 504, in main
    AutoDownloadAndSave(file, search_result, downloaded)
  File "subdl.py", line 339, in AutoDownloadAndSave
    DownloadAndSaveSubtitle(search_result.IDSubtitleFile, output_filename)
  File "subdl.py", line 315, in DownloadAndSaveSubtitle
    s = s.decode(result["encoding"]) # bytes -> str
UnicodeDecodeError: 'johab' codec can't decode byte 0xb4 in position 13235: illegal multibyte sequence

Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt https://www.opensubtitles.org/en/subtitles/3431287/irreversible-en

cd $(mktemp -d)
sub_id=3431287
wget https://dl.opensubtitles.org/en/download/sub/$sub_id
unzip -B $sub_id
chardetect *.srt 
# Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt: Johab with confidence 0.99
iconv -f johab -t utf8 *.srt >/dev/null 
# iconv: illegal input sequence at position 13187
dd if=$(ls *.srt) bs=1 skip=$((13187 - 8)) count=16 status=none | hexdump -C
# 00000000  79 21 0d 0a 2d 20 57 65  b4 72 65 20 67 6f 69 6e  |y!..- We.re goin|

"johab" sounds weird. lets try latin1

iconv -f latin1 -t utf8 *.srt >/dev/null && echo ok
# ok

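Success! So it's a bug (a misdetection) in chardet: the offending byte 0xb4 is the acute accent (´) in latin1, used here as an apostrophe in "We´re".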

TODO (workaround): when conversion to UTF-8 fails, keep the original file, show a warning, and rename the result file to $basename.noutf8.$extension, for example Irreversible.2002.en.1952041941.noutf8.srt

milahu commented 1 year ago

to make this a warning:

--- a/subdl.py
+++ b/subdl.py
@@ -312,8 +312,11 @@
         if not result["encoding"] in {"ascii", "utf-8"}:
             print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
             # separate lines for easier debugging
-            s = s.decode(result["encoding"]) # bytes -> str
-            s = s.encode("utf8") # str -> bytes
+            try:
+                s = s.decode(result["encoding"]).encode("utf8") # bytes -> str -> bytes
+            except UnicodeDecodeError as err:
+                print(f"failed to convert {destfilename}: {err}")
+                # keep original encoding of file
     writefile(destfilename, s)
     print("done, wrote %d bytes."% (len(s)), file=sys.stderr)
milahu commented 1 year ago

to use libmagic instead of chardet:

--- a/subdl.py
+++ b/subdl.py
@@ -307,13 +307,16 @@
     if options.filter:
         s = filtersub(s)
     if options.utf8:
-        import chardet
-        result = chardet.detect(s)
-        if not result["encoding"] in {"ascii", "utf-8"}:
-            print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
+        import magic
+        result = magic.detect_from_content(s)
+        if not result.encoding in {"us-ascii", "utf-8"}:
+            print(f"Found encoding {result['encoding']}. Converting to utf8.")
             # separate lines for easier debugging
-            s = s.decode(result["encoding"]) # bytes -> str
-            s = s.encode("utf8") # str -> bytes
+            try:
+                s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
+            except UnicodeDecodeError as err:
+                print(f"failed to convert to utf8: {destfilename}: {err}")
+                # keep original encoding of file
     writefile(destfilename, s)
     print("done, wrote %d bytes."% (len(s)), file=sys.stderr)

@@ -436,9 +439,9 @@
         elif option == '--utf8':
             options.utf8 = True
             try:
-                import chardet
+                import magic
             except ModuleNotFoundError:
-                sys.stderr.write("Error: The --utf8 option requires the chardet module from https://pypi.org/project/chardet/ - Hint: pip install chardet\n")
+                sys.stderr.write("Error: The --utf8 option requires the python-magic module from https://pypi.org/project/python-magic/ - Hint: pip install python-magic\n")
                 sys.exit(1)
         elif option == '--list-languages':
             ListLanguages()
--- a/subdl.py
+++ b/subdl.py
@@ -310,12 +310,12 @@
         import magic
         result = magic.detect_from_content(s)
         if not result.encoding in {"us-ascii", "utf-8"}:
-            print(f"Found encoding {result['encoding']}. Converting to utf8.")
+            print(f"Found encoding {result.encoding}. Converting to utf8.")
             # separate lines for easier debugging
             try:
                 s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
             except UnicodeDecodeError as err:
-                print(f"failed to convert to utf8: {destfilename}: {err}")
+                print(f"failed to convert to utf8 from {result.encoding}: {destfilename}: {err}")
                 # keep original encoding of file
     writefile(destfilename, s)
     print("done, wrote %d bytes."% (len(s)), file=sys.stderr)
--- a/subdl.py
+++ b/subdl.py
@@ -309,7 +309,7 @@
     if options.utf8:
         import magic
         result = magic.detect_from_content(s)
-        if not result.encoding in {"us-ascii", "utf-8"}:
+        if not result.encoding in {"us-ascii", "utf-8", "unknown-8bit", "binary"}:
             print(f"Found encoding {result.encoding}. Converting to utf8.")
             # separate lines for easier debugging
             try: