Closed ebassi closed 1 month ago
The assertion is triggered when collating a file name that contains Japanese Unicode glyphs on a UTF-8 file system. To reproduce, create a file named `立秋 feat.ちょこ - プナイプナイせんそう.mp3`.
I can't reproduce this here, and it is unclear what's happening. I tried the following program:
use gio::prelude::*;
/// Reproduction attempt: build a filename collation key for a file whose
/// name contains Japanese glyphs and print it next to the file's names.
fn main() {
    // File name taken from the failing report.
    let file_path = std::path::PathBuf::from("立秋 feat.ちょこ - プナイプナイせんそう.mp3");
    let gfile = gio::File::for_path(&file_path);

    // Create the file on disk and check that it is really there.
    let _ = std::fs::File::create(&file_path).unwrap();
    assert!(file_path.exists());

    // Build the collation key from the (lossily converted) basename.
    let base = gfile.basename().unwrap();
    let collation_key = glib::FilenameCollationKey::from(base.to_string_lossy());

    println!(
        "{} {} {:?}",
        gfile.parse_name(),
        base.display(),
        collation_key
    );
}
and the following patch, which adds non-ASCII test cases:
diff --git a/glib/src/unicollate.rs b/glib/src/unicollate.rs
index 8e43bff5bac..39223532b58 100644
--- a/glib/src/unicollate.rs
+++ b/glib/src/unicollate.rs
@@ -71,6 +71,25 @@ mod tests {
assert_eq!(unsorted, sorted);
}
+ #[test]
+ fn collate_non_ascii() {
+ let mut unsorted = vec![
+ String::from("猫の手も借りたい"),
+ String::from("日本語は難しい"),
+ String::from("ありがとう"),
+ ];
+
+ let sorted = vec![
+ String::from("ありがとう"),
+ String::from("日本語は難しい"),
+ String::from("猫の手も借りたい"),
+ ];
+
+ unsorted.sort_by(|s1, s2| CollationKey::from(&s1).cmp(&CollationKey::from(&s2)));
+
+ assert_eq!(unsorted, sorted);
+ }
+
#[test]
fn collate_filenames() {
let mut unsorted = vec![
@@ -91,4 +110,51 @@ mod tests {
assert_eq!(unsorted, sorted);
}
+
+ #[test]
+ fn collate_filenames_non_ascii() {
+ let mut unsorted = vec![
+ String::from("猫の手も借りたい.foo"),
+ String::from("日本語は難しい.bar"),
+ String::from("ありがとう.baz"),
+ ];
+
+ let sorted = vec![
+ String::from("ありがとう.baz"),
+ String::from("日本語は難しい.bar"),
+ String::from("猫の手も借りたい.foo"),
+ ];
+
+ unsorted.sort_by(|s1, s2| {
+ FilenameCollationKey::from(&s1).cmp(&FilenameCollationKey::from(&s2))
+ });
+
+ assert_eq!(unsorted, sorted);
+ }
+
+ #[test]
+ fn collate_filenames_from_path() {
+ use std::path::PathBuf;
+
+ let mut unsorted = vec![
+ PathBuf::from("猫の手も借りたい.foo"),
+ PathBuf::from("日本語は難しい.bar"),
+ PathBuf::from("ありがとう.baz"),
+ PathBuf::from("立秋 feat.ちょこ - プナイプナイせんそう.baz"),
+ ];
+
+ let sorted = vec![
+ PathBuf::from("ありがとう.baz"),
+ PathBuf::from("日本語は難しい.bar"),
+ PathBuf::from("猫の手も借りたい.foo"),
+ PathBuf::from("立秋 feat.ちょこ - プナイプナイせんそう.baz"),
+ ];
+
+ unsorted.sort_by(|s1, s2| {
+ FilenameCollationKey::from(&s1.to_string_lossy())
+ .cmp(&FilenameCollationKey::from(&s2.to_string_lossy()))
+ });
+
+ assert_eq!(unsorted, sorted);
+ }
}
Also, importing directories containing lots of files with Japanese filenames works fine here in Amberol git main.
The error looks like GLib is returning us something that is not valid UTF-8.
Testing with Python:
>>> import locale
>>> locale.strxfrm("立秋 feat.ちょこ - プナイプナイせん.mp3")
'立秋 feat.ちょこ - プナイプナイせん.mp3'
>>> locale.setlocale(locale.LC_ALL, "en_GB.UTF-8")
'en_GB.UTF-8'
>>> locale.strxfrm("立秋 feat.ちょこ - プナイプナイせん.mp3")
'\udcc9\udbc9ƥƎšɨὰᾆὩύὴὠύὴὠὭᾐǵȪŚ\x01\x1d\x1d\x1d\x1d\x1d\x1d\x1d\x1d5\x1d\x1d\x1d5\x1d\x1d\x1d\x1d\x1d\x1d\x1d\x01\x02\x02\x02\x02\r\x0c\r\x10\x02\x10\x10\x10\x02\x10\x10\r\r\x02\x02\x02\x01\x03㵛\x01懘\x01憊\x01悝\x01擞\x01㵨\x01醉\x01鈺\x01酑\x01㵛\x01㵧\x01㵛\x01釢\x01醪\x01鄀\x01釢\x01醪\x01鄀\x01酴\x01鉶\x01㵨\x01挹\x01搒\x01嶼'
GLib calls `strxfrm()` inside `g_utf8_collate_key_for_filename()`, and apparently `strxfrm()` does not return a UTF-8 encoded string. Ideally, the API documentation should make it clear that the input has to be UTF-8, but that the output is not guaranteed to be.
Turns out that #1329 breaks code that does:
which now generates a debugging assertion:
and a backtrace like this one taken from Amberol:
Originally posted by @ebassi in https://github.com/gtk-rs/gtk-rs-core/issues/1329#issuecomment-2336176266