magiclen / unicode-blocks

This crate contains a list of all unicode blocks and provides some functions to search across them.
MIT License
5 stars 5 forks source link

Feature request: update to unicode 15.1.0 #5

Closed SKalt closed 9 months ago

SKalt commented 9 months ago

Hi @magiclen, would you be interested in re-running your code generator to handle the new characters from Unicode 15.1.0, which was released in September 2023? I'm asking because I was generating a list of Unicode blocks and noticed that the new CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I block was missing.

(Here's the hacky script I used to generate a list of all Unicode blocks, if you're interested in using it for #3.)

```sh
#!/usr/bin/env bash
target="/tmp/unicode_blocks.txt"
if ! [ -f "$target" ]; then
  curl -o "$target" "https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"
fi
echo "/// all the unicode blocks ordered by their range of code points"
echo "pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &["
grep -E '^[0-9A-F]{4,}' "$target" | # find the lines with unicode ranges
  tr '[:lower:]' '[:upper:]' | # convert range names to uppercase
  awk -F '; ' '
    {
      range_name=$2;
      gsub(/[- ]/, "_", range_name); # convert range names into snake_case identifiers
      print range_name "," # you could also print `"// " $1` if you want the range itself
    }
  ' |
  sed 's/^/ /g' | # indent the output
  grep -v "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I" | # remove missing range
  cat - # output the result
echo "];"
```
output ```rs /// all the unicode blocks ordered by their range of code points pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &[ BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A, LATIN_EXTENDED_B, IPA_EXTENSIONS, SPACING_MODIFIER_LETTERS, COMBINING_DIACRITICAL_MARKS, GREEK_AND_COPTIC, CYRILLIC, CYRILLIC_SUPPLEMENT, ARMENIAN, HEBREW, ARABIC, SYRIAC, ARABIC_SUPPLEMENT, THAANA, NKO, SAMARITAN, MANDAIC, SYRIAC_SUPPLEMENT, ARABIC_EXTENDED_B, ARABIC_EXTENDED_A, DEVANAGARI, BENGALI, GURMUKHI, GUJARATI, ORIYA, TAMIL, TELUGU, KANNADA, MALAYALAM, SINHALA, THAI, LAO, TIBETAN, MYANMAR, GEORGIAN, HANGUL_JAMO, ETHIOPIC, ETHIOPIC_SUPPLEMENT, CHEROKEE, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, OGHAM, RUNIC, TAGALOG, HANUNOO, BUHID, TAGBANWA, KHMER, MONGOLIAN, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED, LIMBU, TAI_LE, NEW_TAI_LUE, KHMER_SYMBOLS, BUGINESE, TAI_THAM, COMBINING_DIACRITICAL_MARKS_EXTENDED, BALINESE, SUNDANESE, BATAK, LEPCHA, OL_CHIKI, CYRILLIC_EXTENDED_C, GEORGIAN_EXTENDED, SUNDANESE_SUPPLEMENT, VEDIC_EXTENSIONS, PHONETIC_EXTENSIONS, PHONETIC_EXTENSIONS_SUPPLEMENT, COMBINING_DIACRITICAL_MARKS_SUPPLEMENT, LATIN_EXTENDED_ADDITIONAL, GREEK_EXTENDED, GENERAL_PUNCTUATION, SUPERSCRIPTS_AND_SUBSCRIPTS, CURRENCY_SYMBOLS, COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS, LETTERLIKE_SYMBOLS, NUMBER_FORMS, ARROWS, MATHEMATICAL_OPERATORS, MISCELLANEOUS_TECHNICAL, CONTROL_PICTURES, OPTICAL_CHARACTER_RECOGNITION, ENCLOSED_ALPHANUMERICS, BOX_DRAWING, BLOCK_ELEMENTS, GEOMETRIC_SHAPES, MISCELLANEOUS_SYMBOLS, DINGBATS, MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, SUPPLEMENTAL_ARROWS_A, BRAILLE_PATTERNS, SUPPLEMENTAL_ARROWS_B, MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, SUPPLEMENTAL_MATHEMATICAL_OPERATORS, MISCELLANEOUS_SYMBOLS_AND_ARROWS, GLAGOLITIC, LATIN_EXTENDED_C, COPTIC, GEORGIAN_SUPPLEMENT, TIFINAGH, ETHIOPIC_EXTENDED, CYRILLIC_EXTENDED_A, SUPPLEMENTAL_PUNCTUATION, CJK_RADICALS_SUPPLEMENT, KANGXI_RADICALS, IDEOGRAPHIC_DESCRIPTION_CHARACTERS, CJK_SYMBOLS_AND_PUNCTUATION, HIRAGANA, KATAKANA, BOPOMOFO, 
HANGUL_COMPATIBILITY_JAMO, KANBUN, BOPOMOFO_EXTENDED, CJK_STROKES, KATAKANA_PHONETIC_EXTENSIONS, ENCLOSED_CJK_LETTERS_AND_MONTHS, CJK_COMPATIBILITY, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, YIJING_HEXAGRAM_SYMBOLS, CJK_UNIFIED_IDEOGRAPHS, YI_SYLLABLES, YI_RADICALS, LISU, VAI, CYRILLIC_EXTENDED_B, BAMUM, MODIFIER_TONE_LETTERS, LATIN_EXTENDED_D, SYLOTI_NAGRI, COMMON_INDIC_NUMBER_FORMS, PHAGS_PA, SAURASHTRA, DEVANAGARI_EXTENDED, KAYAH_LI, REJANG, HANGUL_JAMO_EXTENDED_A, JAVANESE, MYANMAR_EXTENDED_B, CHAM, MYANMAR_EXTENDED_A, TAI_VIET, MEETEI_MAYEK_EXTENSIONS, ETHIOPIC_EXTENDED_A, LATIN_EXTENDED_E, CHEROKEE_SUPPLEMENT, MEETEI_MAYEK, HANGUL_SYLLABLES, HANGUL_JAMO_EXTENDED_B, HIGH_SURROGATES, HIGH_PRIVATE_USE_SURROGATES, LOW_SURROGATES, PRIVATE_USE_AREA, CJK_COMPATIBILITY_IDEOGRAPHS, ALPHABETIC_PRESENTATION_FORMS, ARABIC_PRESENTATION_FORMS_A, VARIATION_SELECTORS, VERTICAL_FORMS, COMBINING_HALF_MARKS, CJK_COMPATIBILITY_FORMS, SMALL_FORM_VARIANTS, ARABIC_PRESENTATION_FORMS_B, HALFWIDTH_AND_FULLWIDTH_FORMS, SPECIALS, LINEAR_B_SYLLABARY, LINEAR_B_IDEOGRAMS, AEGEAN_NUMBERS, ANCIENT_GREEK_NUMBERS, ANCIENT_SYMBOLS, PHAISTOS_DISC, LYCIAN, CARIAN, COPTIC_EPACT_NUMBERS, OLD_ITALIC, GOTHIC, OLD_PERMIC, UGARITIC, OLD_PERSIAN, DESERET, SHAVIAN, OSMANYA, OSAGE, ELBASAN, CAUCASIAN_ALBANIAN, VITHKUQI, LINEAR_A, LATIN_EXTENDED_F, CYPRIOT_SYLLABARY, IMPERIAL_ARAMAIC, PALMYRENE, NABATAEAN, HATRAN, PHOENICIAN, LYDIAN, MEROITIC_HIEROGLYPHS, MEROITIC_CURSIVE, KHAROSHTHI, OLD_SOUTH_ARABIAN, OLD_NORTH_ARABIAN, MANICHAEAN, AVESTAN, INSCRIPTIONAL_PARTHIAN, INSCRIPTIONAL_PAHLAVI, PSALTER_PAHLAVI, OLD_TURKIC, OLD_HUNGARIAN, HANIFI_ROHINGYA, RUMI_NUMERAL_SYMBOLS, YEZIDI, ARABIC_EXTENDED_C, OLD_SOGDIAN, SOGDIAN, OLD_UYGHUR, CHORASMIAN, ELYMAIC, BRAHMI, KAITHI, SORA_SOMPENG, CHAKMA, MAHAJANI, SHARADA, SINHALA_ARCHAIC_NUMBERS, KHOJKI, MULTANI, KHUDAWADI, GRANTHA, NEWA, TIRHUTA, SIDDHAM, MODI, MONGOLIAN_SUPPLEMENT, TAKRI, AHOM, DOGRA, WARANG_CITI, DIVES_AKURU, NANDINAGARI, ZANABAZAR_SQUARE, SOYOMBO, 
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A, PAU_CIN_HAU, DEVANAGARI_EXTENDED_A, BHAIKSUKI, MARCHEN, MASARAM_GONDI, GUNJALA_GONDI, MAKASAR, KAWI, LISU_SUPPLEMENT, TAMIL_SUPPLEMENT, CUNEIFORM, CUNEIFORM_NUMBERS_AND_PUNCTUATION, EARLY_DYNASTIC_CUNEIFORM, CYPRO_MINOAN, EGYPTIAN_HIEROGLYPHS, EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS, ANATOLIAN_HIEROGLYPHS, BAMUM_SUPPLEMENT, MRO, TANGSA, BASSA_VAH, PAHAWH_HMONG, MEDEFAIDRIN, MIAO, IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION, TANGUT, TANGUT_COMPONENTS, KHITAN_SMALL_SCRIPT, TANGUT_SUPPLEMENT, KANA_EXTENDED_B, KANA_SUPPLEMENT, KANA_EXTENDED_A, SMALL_KANA_EXTENSION, NUSHU, DUPLOYAN, SHORTHAND_FORMAT_CONTROLS, ZNAMENNY_MUSICAL_NOTATION, BYZANTINE_MUSICAL_SYMBOLS, MUSICAL_SYMBOLS, ANCIENT_GREEK_MUSICAL_NOTATION, KAKTOVIK_NUMERALS, MAYAN_NUMERALS, TAI_XUAN_JING_SYMBOLS, COUNTING_ROD_NUMERALS, MATHEMATICAL_ALPHANUMERIC_SYMBOLS, SUTTON_SIGNWRITING, LATIN_EXTENDED_G, GLAGOLITIC_SUPPLEMENT, CYRILLIC_EXTENDED_D, NYIAKENG_PUACHUE_HMONG, TOTO, WANCHO, NAG_MUNDARI, ETHIOPIC_EXTENDED_B, MENDE_KIKAKUI, ADLAM, INDIC_SIYAQ_NUMBERS, OTTOMAN_SIYAQ_NUMBERS, ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS, MAHJONG_TILES, DOMINO_TILES, PLAYING_CARDS, ENCLOSED_ALPHANUMERIC_SUPPLEMENT, ENCLOSED_IDEOGRAPHIC_SUPPLEMENT, MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS, EMOTICONS, ORNAMENTAL_DINGBATS, TRANSPORT_AND_MAP_SYMBOLS, ALCHEMICAL_SYMBOLS, GEOMETRIC_SHAPES_EXTENDED, SUPPLEMENTAL_ARROWS_C, SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS, CHESS_SYMBOLS, SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A, SYMBOLS_FOR_LEGACY_COMPUTING, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F, CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H, TAGS, VARIATION_SELECTORS_SUPPLEMENT, SUPPLEMENTARY_PRIVATE_USE_AREA_A, SUPPLEMENTARY_PRIVATE_USE_AREA_B, ]; ```
magiclen commented 9 months ago

Thank you for the news. I have updated the list of Unicode blocks.

Thanks also for the script; I will take a look at it and add an array containing all blocks in the near future.

SKalt commented 9 months ago

Excellent, thank you! 7015958d6 resolves this issue.