Closed qcgm1978 closed 7 months ago
Usage of encodingExists seems incorrect. Something like this? (not tested)
const fs = require('fs')
const jschardet = require('jschardet')
const iconv = require('iconv-lite')
// not sure whether this is necessary
/**
 * Choose the encoding used to decode a buffer, given a charset detection result.
 *
 * @param {{encoding: ?string, confidence: number}} detected - detection result
 *   (in this snippet, the return value of jschardet.detect()).
 * @returns {string} encoding name to hand to iconv.decode().
 */
function fixed_encoding(detected) {
  const {encoding, confidence} = detected
  // cf. Sabaki (ugf.js): distrust weak guesses. A missing name is treated
  // the same way so that the caller never receives null/undefined.
  if (!encoding || confidence <= 0.2) {return 'utf8'}
  // cf. KaTrain (sgf_parser.py)
  const fix = {
    'windows-1252': 'gbk',
    gb2312: 'gbk',
  }
  // The table keys are lower case, but the detected name may be reported in
  // upper or mixed case (e.g. 'GB2312'), so normalize it for the lookup only.
  // Names without a table entry are returned exactly as detected.
  return fix[encoding.toLowerCase()] || encoding
}
// Read the file as raw bytes (no encoding argument), so decoding can wait
// until the charset has been detected.
// NOTE(review): `filename` is not defined in this snippet; presumably supplied by the caller.
const buffer = fs.readFileSync(filename)
// Let jschardet guess the charset from the bytes.
const detected = jschardet.detect(buffer)
// Turn the raw guess into the encoding name actually used for decoding.
const encoding = fixed_encoding(detected)
// Decode the bytes into the SGF text with iconv-lite.
const sgf_str = iconv.decode(buffer, encoding)
Yes, I tested it, and it worked. It's a clever solution.
In the current implementation, .gib and .ngf are supposed to be UTF-8 and GB18030 respectively in xyz2sgf. Is this correct? Is it better to use similar auto-detection for them, too?
It seems that this should also be handled. When opening Sabaki/test/gib/gb2312.gib, garbled text appears. It might be a good idea to handle this uniformly in read_sgf.
and ascii seems to be handled independently:
/**
 * Decode game-record data into a string using an auto-detected encoding.
 *
 * The input is passed to jschardet for detection, the result is mapped
 * through fixed_encoding, and the input is then decoded with iconv.
 *
 * @param buffer_or_str - data to detect and decode.
 * @returns the value returned by iconv.decode().
 */
function get_go_str(buffer_or_str) {
  const guess = jschardet.detect(buffer_or_str);
  return iconv.decode(buffer_or_str, fixed_encoding(guess));
}
// Substitution table: detected encoding name -> encoding to decode with.
// Besides the 'windows-1252' and 'gb2312' entries, this version also maps 'ascii'.
const fix = {
'windows-1252': 'gbk',
ascii:'gb2312',
gb2312: 'gbk',
}
function read_sgf(sgf_str, filename, internally, n = Infinity, enable_save_img = false, img_name, dir) {
sgf_str = get_go_str(sgf_str);
...
thx. It may be safer to apply iconv BEFORE calling xyz2sgf for the conversion from GIB to SGF (014bb03a4).
Yes, it works and gets the correct result. My solution had a problem.
Garbled characters appeared when pasting a link. The reason might be that res.setEncoding('utf8')
was set in the open_url_sub
method, which is not necessarily the correct encoding. Following the explanation in "Use Buffers when decoding", I can solve this problem with the following code:
function open_url_sub(url, u) {
...
const on_get = res => {
...
let chunks = [];
res.on('data', function (chunk) {
chunks.push(chunk);
});
res.on('end', function () {
const buffer = Buffer.concat(chunks);
const decode_str = get_decode_str(buffer);
read_sgf(decode_str);
update_all()
});
}
/**
 * Decode a buffer into a string with iconv.
 *
 * @param buffer - bytes to decode.
 * @param given_encoding - encoding to use; when falsy, the encoding is
 *   obtained by running jschardet on the buffer and passing the result
 *   through fixed_encoding.
 * @returns the value returned by iconv.decode().
 */
function get_decode_str(buffer, given_encoding) {
  let chosen = given_encoding;
  if (!chosen) {
    // No usable encoding supplied: fall back to auto-detection.
    chosen = fixed_encoding(jschardet.detect(buffer));
  }
  return iconv.decode(buffer, chosen);
}
// Variant of fixed_encoding whose table lookup ignores letter case
// (the beginning of the body is elided in this snippet).
function fixed_encoding(detected_by_jschardet) {
...
const fix = {
'windows-1252': 'gbk',
gb2312: 'gbk',
}
// The table keys are lower case, but the detected name may contain capital
// letters, so it is lower-cased for the lookup. A name without a table entry
// is returned unchanged (in its original case).
return fix[encoding.toLowerCase()] || encoding
}
However, SGF files that were already downloaded and saved with the wrong encoding (for example, GBK content saved as UTF-8) cannot be parsed correctly. Double decoding not only produces wrong results; it is also nearly impossible to restore the original bytes, because the UTF-8 conversion is lossy.
You are correct. thx!
new URL(url)
This method returns a file URL string (macOS) that may be encoded, like:
'/Users/dickphilipp/Documents/bar_chart_race/Django/media/images/go/books/%E5%9B%B4%E6%A3%8B%E7%94%B5%E5%AD%90%E4%B9%A6/%E5%90%B4%E6%B8%85%E6%BA%90/%E3%80%8A%E5%A6%82%E4%BD%95%E4%BD%BF%E4%BC%98%E5%8A%BF%E6%A3%8B%E7%A1%AE%E5%AE%9E%E8%B5%A2%E5%88%B0%E6%89%8B%E3%80%8B/%E8%B5%A2%E6%A3%8B%E4%B8%8D%E9%97%B9%E4%BA%8B-1-01.sgf'
It needs to be decoded before the file is read; readFileSync
may throw an error when given such a percent-encoded path.
I forgot to mention this in my previous reply. For example:
/**
 * Read a file and decode its contents via read_buffer_with_iconv.
 *
 * The name may be percent-encoded (for example when it was taken from the
 * pathname of a file: URL), so it is URI-decoded before being opened.
 *
 * @param filename - path of the file, possibly percent-encoded.
 * @param given_encoding - passed through unchanged to read_buffer_with_iconv.
 * @returns whatever read_buffer_with_iconv returns for the file's bytes.
 */
function read_file_with_iconv(filename, given_encoding) {
  let path = filename
  try {
    path = decodeURIComponent(filename)
  } catch (err) {
    // decodeURIComponent throws URIError on a '%' that does not start a valid
    // escape sequence (e.g. '50%.sgf'). Such a name is an ordinary path
    // rather than an encoded one, so use it as given. Anything else is rethrown.
    if (!(err instanceof URIError)) {throw err}
  }
  return read_buffer_with_iconv(fs.readFileSync(path), given_encoding)
}
Ah, is this ok?
--- a/src/main.js
+++ b/src/main.js
@@ -2227,7 +2227,7 @@ function open_url_sub(url, u) {
switch (u.protocol) {
case 'https:': https.get(url, on_get); break;
case 'http:': http.get(url, on_get); break;
- case 'file:': load_sgf_etc(u.pathname); break;
+ case 'file:': load_sgf_etc(decodeURIComponent(u.pathname)); break;
default: toast(`Unsupported protocol: ${url}`); break;
}
}
Yes, it works.
fixed. thx!
When opening some SGF files, garbled text appears. How should the appropriate encoding be chosen to open them? I tried the following code, which seems to solve the problem, but I don't know whether it still works for files written in other languages.