Open milahu opened 1 year ago
To make this a warning:
--- a/subdl.py
+++ b/subdl.py
@@ -312,8 +312,11 @@
if not result["encoding"] in {"ascii", "utf-8"}:
print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
# separate lines for easier debugging
- s = s.decode(result["encoding"]) # bytes -> str
- s = s.encode("utf8") # str -> bytes
+ try:
+ s = s.decode(result["encoding"]).encode("utf8") # bytes -> str -> bytes
+ except UnicodeDecodeError as err:
+ print(f"failed to convert {destfilename}: {err}")
+ # keep original encoding of file
writefile(destfilename, s)
print("done, wrote %d bytes."% (len(s)), file=sys.stderr)
To use libmagic instead of chardet:
--- a/subdl.py
+++ b/subdl.py
@@ -307,13 +307,16 @@
if options.filter:
s = filtersub(s)
if options.utf8:
- import chardet
- result = chardet.detect(s)
- if not result["encoding"] in {"ascii", "utf-8"}:
- print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
+ import magic
+ result = magic.detect_from_content(s)
+ if not result.encoding in {"us-ascii", "utf-8"}:
+ print(f"Found encoding {result['encoding']}. Converting to utf8.")
# separate lines for easier debugging
- s = s.decode(result["encoding"]) # bytes -> str
- s = s.encode("utf8") # str -> bytes
+ try:
+ s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
+ except UnicodeDecodeError as err:
+ print(f"failed to convert to utf8: {destfilename}: {err}")
+ # keep original encoding of file
writefile(destfilename, s)
print("done, wrote %d bytes."% (len(s)), file=sys.stderr)
@@ -436,9 +439,9 @@
elif option == '--utf8':
options.utf8 = True
try:
- import chardet
+ import magic
except ModuleNotFoundError:
- sys.stderr.write("Error: The --utf8 option requires the chardet module from https://pypi.org/project/chardet/ - Hint: pip install chardet\n")
+ sys.stderr.write("Error: The --utf8 option requires the python-magic module from https://pypi.org/project/python-magic/ - Hint: pip install python-magic\n")
sys.exit(1)
elif option == '--list-languages':
ListLanguages()
--- a/subdl.py
+++ b/subdl.py
@@ -310,12 +310,12 @@
import magic
result = magic.detect_from_content(s)
if not result.encoding in {"us-ascii", "utf-8"}:
- print(f"Found encoding {result['encoding']}. Converting to utf8.")
+ print(f"Found encoding {result.encoding}. Converting to utf8.")
# separate lines for easier debugging
try:
s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
except UnicodeDecodeError as err:
- print(f"failed to convert to utf8: {destfilename}: {err}")
+ print(f"failed to convert to utf8 from {result.encoding}: {destfilename}: {err}")
# keep original encoding of file
writefile(destfilename, s)
print("done, wrote %d bytes."% (len(s)), file=sys.stderr)
--- a/subdl.py
+++ b/subdl.py
@@ -309,7 +309,7 @@
if options.utf8:
import magic
result = magic.detect_from_content(s)
- if not result.encoding in {"us-ascii", "utf-8"}:
+ if not result.encoding in {"us-ascii", "utf-8", "unknown-8bit", "binary"}:
print(f"Found encoding {result.encoding}. Converting to utf8.")
# separate lines for easier debugging
try:
Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt https://www.opensubtitles.org/en/subtitles/3431287/irreversible-en
"johab" sounds weird. Let's try latin1.
Success! So it's a bug in chardet ...
TODO: workaround: when conversion to UTF-8 fails, keep the original file, show a warning, and rename the result file to `$basename.noutf8.$extension`, for example `Irreversible.2002.en.1952041941.noutf8.srt`.