BurntSushi / ucd-generate

A command line tool to generate Unicode tables as source code.
Apache License 2.0
93 stars 21 forks source link

Add option to generate script as an enum #14

Closed wezm closed 4 years ago

wezm commented 4 years ago

Adds an --enum option to allow ucd-generate to generate script data as a single table.

Sample output:

// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
//  ucd-generate script --enum /home/wmoore/Downloads/ucd-9
//
// ucd-generate is available on crates.io.

pub const SCRIPT_ENUM: &'static [&'static str] = &[
  "Adlam", "Ahom", "Anatolian_Hieroglyphs", "Arabic", "Armenian", "Avestan",
  "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali", "Bhaiksuki",
  "Bopomofo", "Brahmi", "Braille", "Buginese", "Buhid", "Canadian_Aboriginal",
  "Carian", "Caucasian_Albanian", "Chakma", "Cham", "Cherokee", "Common",
  "Coptic", "Cuneiform", "Cypriot", "Cyrillic", "Deseret", "Devanagari",
  "Duployan", "Egyptian_Hieroglyphs", "Elbasan", "Ethiopic", "Georgian",
  "Glagolitic", "Gothic", "Grantha", "Greek", "Gujarati", "Gurmukhi", "Han",
  "Hangul", "Hanunoo", "Hatran", "Hebrew", "Hiragana", "Imperial_Aramaic",
  "Inherited", "Inscriptional_Pahlavi", "Inscriptional_Parthian", "Javanese",
  "Kaithi", "Kannada", "Katakana", "Kayah_Li", "Kharoshthi", "Khmer",
  "Khojki", "Khudawadi", "Lao", "Latin", "Lepcha", "Limbu", "Linear_A",
  "Linear_B", "Lisu", "Lycian", "Lydian", "Mahajani", "Malayalam", "Mandaic",
  "Manichaean", "Marchen", "Meetei_Mayek", "Mende_Kikakui",
  "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao", "Modi", "Mongolian",
  "Mro", "Multani", "Myanmar", "Nabataean", "New_Tai_Lue", "Newa", "Nko",
  "Ogham", "Ol_Chiki", "Old_Hungarian", "Old_Italic", "Old_North_Arabian",
  "Old_Permic", "Old_Persian", "Old_South_Arabian", "Old_Turkic", "Oriya",
  "Osage", "Osmanya", "Pahawh_Hmong", "Palmyrene", "Pau_Cin_Hau", "Phags_Pa",
  "Phoenician", "Psalter_Pahlavi", "Rejang", "Runic", "Samaritan",
  "Saurashtra", "Sharada", "Shavian", "Siddham", "SignWriting", "Sinhala",
  "Sora_Sompeng", "Sundanese", "Syloti_Nagri", "Syriac", "Tagalog",
  "Tagbanwa", "Tai_Le", "Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Tangut",
  "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Tirhuta", "Ugaritic",
  "Vai", "Warang_Citi", "Yi",
];

pub const SCRIPT: &'static [(u32, u32, u8)] = &[
  (0, 64, 23), (65, 90, 61), (91, 96, 23), (97, 122, 61), (123, 169, 23),
  (170, 170, 61), (171, 185, 23), (186, 186, 61), (187, 191, 23),
  (192, 214, 61), (215, 215, 23), (216, 246, 61), (247, 247, 23),
  (248, 696, 61), (697, 735, 23), (736, 740, 61), (741, 745, 23),
  (746, 747, 12), (748, 767, 23), (768, 879, 48), (880, 883, 38),
  (884, 884, 23), (885, 887, 38), (890, 893, 38), (894, 894, 23),
  (895, 895, 38), (900, 900, 38), (901, 901, 23), (902, 902, 38),
  (903, 903, 23), (904, 906, 38), (908, 908, 38), (910, 929, 38),
  (931, 993, 38), (994, 1007, 24), (1008, 1023, 38), (1024, 1156, 27),
  ⋮
];
wezm commented 4 years ago

Related question: is there a reason that ranges_to_enum doesn't generate an actual Rust enum? In this particular case it would be ideal if the output were generated as something like:

use self::Script::*;

#[repr(u8)]
pub enum Script {
  Adlam,
  Ahom,
  Anatolian_Hieroglyphs,
  Arabic,
  Armenian,
  Avestan,
  ⋮
}

pub const SCRIPT: &'static [(u32, u32, Script)] = &[
  (0, 64, Armenian),
  (65, 90, Ahom),
  (91, 96, Armenian),
  (97, 122, Anatolian_Hieroglyphs),
  (123, 169, Armenian),
  ⋮
];

If needed, a method could perhaps also be generated to get a String from the enum.