HCL Parsing fails on valid UTF-8 identifier

TeamDman commented 2 months ago

tl;dr at bottom section


    /// Reproducer: an `import` block whose `to` traversal ends in the
    /// non-ASCII identifier `écurité` is parsed into a `Body` and unwrapped.
    #[test]
    fn utf8_problem() {
        let source = r#"
            import {
                id = "omitted"
                to = azuread_group.écurité
            }
        "#;
        let _parsed = source.parse::<Body>().unwrap();
    }

is_id_start and is_id_continue filter out non-utf8: Utf8Error { valid_up_to: 0, error_len: None }

backtrace

``` running 1 test test reflow::tests::utf8_problem ... FAILED successes: successes: failures: ---- reflow::tests::utf8_problem stdout ---- thread 'reflow::tests::utf8_problem' panicked at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:243:36: `is_id_start` and `is_id_continue` filter out non-utf8: Utf8Error { valid_up_to: 0, error_len: None } stack backtrace: 0: std::panicking::begin_panic_handler at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\std\src\panicking.rs:652 1: core::panicking::panic_fmt at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\core\src\panicking.rs:72 2: core::result::unwrap_failed at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\core\src\result.rs:1654 3: enum2$,core::str::error::Utf8Error> >::expect,core::str::error::Utf8Error> at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\result.rs:1034 4: hcl_edit::parser::string::from_utf8_unchecked at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:243 5: hcl_edit::parser::string::str_ident::closure$0 at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:197 6: winnow::combinator::parser::impl$3::parse_next > > > at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:80 7: hcl_edit::parser::string::str_ident at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:194 8: core::ops::function::FnMut::call_mut,enum2$ > > > > > (*)(ref_mut$ > > >),tuple$,enum2$ > > > > > (*)(ref_mut$ > > >),hcl_ed at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:78 11: hcl_edit::parser::string::ident at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:188 12: 
core::ops::function::FnMut::call_mut,enum2$ > > > > > (*)(ref_mut$,enum2$ > > > > > (*)(ref_mut$ > >,enum2$,winnow::error::ContextError > >,enum2$,winn at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:78 19: winnow::parser::impl$12::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:977 20: winnow::combinator::parser::impl$3::parse_next,enum2$ > > > > > (*)(ref_mut$,enum2$ > > > > > (*)(ref_mut$,enum2$ > > > > > (*)(ref_mut$,enum2$ > at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:759 28: winnow::combinator::parser::impl$3::parse_next,enum2$ > >,tuple$<>,winnow::error::ContextError > >, at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:78 31: winnow::combinator::multi::repeat1_ > >,hcl_edit::repr::Decorated >,alloc::vec::Vec > >,tuple$<>,winnow::error::ContextEr at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\multi.rs:294 33: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:715 34: winnow::combinator::multi::impl$1::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\multi.rs:291 35: winnow::combinator::parser::impl$3::parse_next > >,tuple$<>, at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:78 36: hcl_edit::parser::expr::traversal::closure$0 at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\expr.rs:204 37: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:715 38: hcl_edit::parser::expr::expr_inner::closure$0 at 
C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\expr.rs:89 39: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:715 40: hcl_edit::parser::expr::parse_expr at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\expr.rs:53 41: hcl_edit::parser::expr::expr at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\expr.rs:42 42: core::ops::function::FnMut::call_mut,enum2$ > > > > > (*)(ref_mut$ at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\ops\function.rs:166 43: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:715 44: winnow::combinator::parser::impl$31::parse_next,enum2$ > > > > > (*)(ref_mut$,enum2$ > > > > > (*)(re at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:78 46: winnow::parser::impl$12::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:977 47: winnow::combinator::parser::impl$3::parse_next,enum2$ > > > > > (*)(ref_mut$ > >,char,enum2$,winnow::error::ContextError >,winnow::combinator::parser::Context > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,tuple$ > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,winnow::combinator::sequence::terminated::closure_env$0 > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError > >,tuple$<>,winnow::error::ContextError >,winnow::combinator::multi::Repeat > > > > > (*)(ref_mut$ > at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\ops\function.rs:166 64: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:715 65: 
winnow::combinator::sequence::preceded::closure$0 > >,ref$ >,hcl_edit::structure::body::Body,winnow::error::ContextError >,enum2$ at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\sequence.rs:41 66: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:715 67: winnow::combinator::parser::impl$31::parse_next > >,ref$ >,hcl_edit::structure::body::Body,winnow::error::ContextError > >,ref$ >,hcl_edit::structure::body::Body,winnow::error::Contex at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:78 69: winnow::parser::impl$12::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:977 70: winnow::combinator::parser::impl$3::parse_next,enum2$ > > > > > (*)(ref_mut$ > >,hcl_edit::structure::body::Body,winnow::error::ContextError >,winnow::combinator::parser::Map > >,hcl_edit::structure::body::Body,winnow::error::ContextError >,tuple$ > >,char,hcl_edit::structure::body::Body,char,winnow::error::ContextError >,winnow::combinator::parser::Context > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,tuple$ > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,winnow::combinator::sequence::terminated::closure_env$0 > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError > >,tuple$<>,winnow::error::ContextError >,winnow::combinator::multi::Repeat > > > > > (*)(ref_mut$ > at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\ops\function.rs:166 90: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:715 91: winnow::combinator::parser::impl$1::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\combinator\parser.rs:34 92: winnow::parser::impl$12::parse_next at 
C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:977 93: winnow::parser::Parser::parse > > > > > (*)(ref_mut$ > > >),wi at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.6\src\parser.rs:64 94: hcl_edit::parser::parse_complete > > > > > (*)(ref_mut$ > > >) at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\mod.rs:72 95: hcl_edit::parser::parse_body at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\mod.rs:39 96: hcl_edit::structure::body::impl$3::from_str at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\structure\body.rs:706 97: core::str::impl$0::parse at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\str\mod.rs:2420 98: tofu::reflow::tests::utf8_problem at .\src\reflow.rs:143 99: tofu::reflow::tests::utf8_problem::closure$0 at .\src\reflow.rs:136 100: core::ops::function::FnOnce::call_once > at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\ops\function.rs:250 101: core::ops::function::FnOnce::call_once at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\core\src\ops\function.rs:250 note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. failures: reflow::tests::utf8_problem test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 1 filtered out; finished in 0.05s ```

Terraform and OpenTofu support labels beginning with utf-8

When I use terraform plan -generate-config-out="generated.tf" followed by terraform init and terraform apply it works with the special characters in the identifier, so Terraform supports it but hcl-rs doesn't D:

# __generated__ by Terraform
# Please review these resources and move them into your main configuration files.

# __generated__ by Terraform from "omitted"
resource "azuread_group" "écurité" {
  administrative_unit_ids    = []
  assignable_to_role         = false
  auto_subscribe_new_members = false
  ...

Do you want to perform these actions?
  OpenTofu will perform the actions described above.
  Only 'yes' will be accepted to approve.

  Enter a value: yes
azuread_group.écurité: Importing... [id=omitted]
azuread_group.écurité: Import complete [id=omitted]

Apply complete! Resources: 1 imported, 0 added, 0 changed, 0 destroyed.

also tried

    /// Reproducer: non-ASCII identifiers used as an attribute name (`é`), a
    /// block label (`"ééé"`) and a traversal segment (`local.é`) are parsed
    /// into a `Body` and unwrapped.
    #[test]
    fn utf8_problem2() {
        let source = r#"
            locals {
                é = 4
            }
            output "ééé" {
            value = local.é
            }
        "#;
        let _parsed = source.parse::<Body>().unwrap();
    }

is_id_start and is_id_continue filter out non-utf8: Utf8Error { valid_up_to: 0, error_len: None }

rust\ignore\processed on  main [!] via 💠 default on 󰠅 OPSScSub
❯ terraform validate
Success! The configuration is valid.

rust\ignore\processed on  main [!] via 💠 default on 󰠅 OPSScSub
❯ terraform apply

Changes to Outputs:
  + ééé    = 4

You can apply this plan to save these new output values to the Terraform state, without changing any real infrastructure.

Do you want to perform these actions?
  Terraform will perform the actions described above.
  Only 'yes' will be accepted to approve.

  Enter a value: yes

Apply complete! Resources: 0 added, 0 changed, 0 destroyed.

Outputs:

ééé = 4

crates:

hcl-rs v0.17.2
hcl-edit v0.8.0

unicode-ident v1.0.12

❯ terraform -version 
Terraform v1.5.7
on windows_amd64
❯ tofu -version     
OpenTofu v1.6.2
on windows_amd64

relevant:

https://github.com/martinohmann/hcl-rs/blob/48de0e7b524cf8619fbf97a9214cc924ecca4697/crates/hcl-edit/src/parser/structure.rs#L63

https://github.com/martinohmann/hcl-rs/blob/48de0e7b524cf8619fbf97a9214cc924ecca4697/crates/hcl-edit/src/parser/string.rs#L220-L226

https://github.com/martinohmann/hcl-rs/blob/48de0e7b524cf8619fbf97a9214cc924ecca4697/crates/hcl-edit/src/parser/string.rs#L193-L200

https://github.com/martinohmann/hcl-rs/blob/48de0e7b524cf8619fbf97a9214cc924ecca4697/crates/hcl-primitives/src/ident.rs#L277-L309

tangent

The Hashicorp [spec](https://github.com/hashicorp/hcl/blob/212a40e528766634a1aa6dd1e820d7936762196e/hclsyntax/spec.md#identifiers) says

> ### Identifiers
>
> Identifiers name entities such as blocks, attributes and expression variables. Identifiers are interpreted as per [UAX #31][uax31] Section 2. Specifically, their syntax is defined in terms of the `ID_Start` and `ID_Continue` character properties as follows:
>
> ```ebnf
> Identifier = ID_Start (ID_Continue | '-')*;
> ```
>
> The Unicode specification provides the normative requirements for identifier parsing. Non-normatively, the spirit of this specification is that `ID_Start` consists of Unicode letter and certain unambiguous punctuation tokens, while `ID_Continue` augments that set with Unicode digits, combining marks, etc.
>
> The dash character `-` is additionally allowed in identifiers, even though that is not part of the unicode `ID_Continue` definition. This is to allow attribute names and block type names to contain dashes, although underscores as word separators are considered the idiomatic usage.

[uax31]: http://unicode.org/reports/tr31/ "Unicode Identifier and Pattern Syntax"

We can search the repo for ID_START
https://github.com/search?q=repo%3Ahashicorp%2Fhcl%20ID_START&type=code

![image](https://github.com/martinohmann/hcl-rs/assets/9356891/26333e7f-e495-4657-addc-8b75899d3039)

which reveals the truth for what is supported
https://github.com/hashicorp/hcl/blob/212a40e528766634a1aa6dd1e820d7936762196e/hclsyntax/unicode_derived.rl

preview:

```rs
ID_Start =
    0x41..0x5A #L& [26] LATIN CAPITAL LETTER A..LATIN CAPI...
  | 0x61..0x7A #L& [26] LATIN SMALL LETTER A..LATIN SMALL ...
  | 0xC2 0xAA #Lo FEMININE ORDINAL INDICATOR
  | 0xC2 0xB5 #L& MICRO SIGN
  | 0xC2 0xBA #Lo MASCULINE ORDINAL INDICATOR
  | 0xC3 0x80..0x96 #L& [23] LATIN CAPITAL LETTER A WITH GRAVE....
  ...
a whole bunch more
```

Looks like the [`unicode_ident`](https://rtic.rs/stable/api/unicode_ident/index.html) crate used by `hcl-rs` is also targeting the [UAX #31][uax31] spec.

> Implementation of [Unicode Standard Annex #31](https://www.unicode.org/reports/tr31/) for determining which char values are valid in programming language identifiers.

I tried asking ChatGPT if "é" is a valid ID_START or ID_CONTINUE character and it said both yes and no when I restarted the chat, so I'm not sure where the spec deviates, it's either Terraform+OpenTofu or the `unicode_ident` crate.

...

I asked again and it gave the following

```python
import regex

char = 'é'

# Check if it's a valid ID_Start character
is_id_start = bool(regex.match(r'\p{ID_Start}', char))

# Check if it's a valid ID_Continue character
is_id_continue = bool(regex.match(r'\p{ID_Continue}', char))

is_id_start, is_id_continue
```

> True, True

https://chatgpt.com/share/e910f089-fee1-47ad-bdbe-e988cde57aee

TL;DR - parsing fails for some reason and I have no idea why

// Empty entry point; the substance of this file is the test module that follows.
fn main() {}

#[cfg(test)]
mod test {
    /// Checks `é` against `unicode_ident`'s XID_Start and XID_Continue
    /// predicates.
    #[test]
    fn unicode_ident_test() {
        let ident = 'é';
        assert!(unicode_ident::is_xid_start(ident));
        assert!(unicode_ident::is_xid_continue(ident));
    }

    /// Checks that `hcl_primitives` accepts `ééé` as an identifier.
    #[test]
    fn hcl_primitives_test() {
        let ident = "ééé";
        assert!(hcl_primitives::ident::is_ident(ident));
    }

    /// Parses `ééé` directly into an `hcl::edit::Ident` and unwraps the result.
    #[test]
    fn hcl_edit_ident_test() {
        let raw = "ééé";
        let _parsed: hcl::edit::Ident = raw.parse().unwrap();
    }

    /// Parses a whole body that uses `é` as an attribute name, inside a block
    /// label and as a traversal segment, and unwraps the result.
    #[test]
    fn hcl_edit_body_test() {
        let source = r#"
            locals {
                é = 4
            }
            output "ééé" {
                value = local.é
            }
        "#;
        let _parsed = source.parse::<hcl::edit::structure::Body>().unwrap();
    }
}

[dependencies]
hcl-primitives = "0.1.5"
hcl-rs = "0.17.2"
unicode-ident = "1.0.12"

running 4 tests
test test::hcl_edit_ident_test ... ok
test test::hcl_primitives_test ... ok
test test::unicode_ident_test ... ok
test test::hcl_edit_body_test ... FAILED

backtrace

``` Finished `test` profile [unoptimized + debuginfo] target(s) in 0.08s Running unittests src/main.rs (target\debug\deps\unicode_ident_uax_31-ae869406db3b55a7.exe) running 4 tests test test::hcl_edit_ident_test ... ok test test::hcl_primitives_test ... ok test test::unicode_ident_test ... ok test test::hcl_edit_body_test ... FAILED failures: ---- test::hcl_edit_body_test stdout ---- thread 'test::hcl_edit_body_test' panicked at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:243:36: `is_id_start` and `is_id_continue` filter out non-utf8: Utf8Error { valid_up_to: 0, error_len: None } stack backtrace: 0: std::panicking::begin_panic_handler at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\std\src\panicking.rs:652 1: core::panicking::panic_fmt at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\core\src\panicking.rs:72 2: core::result::unwrap_failed at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\core\src\result.rs:1654 3: enum2$,core::str::error::Utf8Error> >::expect,core::str::error::Utf8Error> at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\result.rs:1034 4: hcl_edit::parser::string::from_utf8_unchecked at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:243 5: hcl_edit::parser::string::str_ident::closure$0 at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:197 6: winnow::combinator::parser::impl$3::parse_next > > > at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\combinator\parser.rs:82 7: hcl_edit::parser::string::str_ident at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:194 8: core::ops::function::FnMut::call_mut,enum2$ > > > > > (*)(ref_mut$ > > >),tuple$ > >,ref$,winnow::error::ContextError >,enum2$,enum2$ > >,ref$,winnow::error::ContextError >,enum2$ > 
>,ref$,winnow::error::ContextError >,enum2$ at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\combinator\parser.rs:902 15: hcl_edit::parser::string::cut_str_ident at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\string.rs:221 16: core::ops::function::FnMut::call_mut,enum2$ > > > > > (*)(ref_mut$ > > >),tuple$,enum2$ > > > > > (*)(ref_mut$ > > >),winno at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\combinator\parser.rs:761 19: hcl_edit::parser::structure::structure::closure$0 at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\structure.rs:63 20: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:719 21: winnow::parser::impl$13::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:981 22: winnow::combinator::sequence::terminated::closure$0 > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,tuple$ > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,winnow::combinator::sequence::terminated::closure_env$0 > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError > >,tuple$<>,winnow::error::ContextError >,winnow::combinator::multi::Repeat > > > > > (*)(ref_mut$ > at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\ops\function.rs:166 32: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:719 33: winnow::combinator::sequence::preceded::closure$0 > >,ref$ >,hcl_edit::structure::body::Body,winnow::error::ContextError >,enum2$ at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\combinator\sequence.rs:41 34: winnow::parser::impl$0::parse_next at 
C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:719 35: winnow::combinator::parser::impl$31::parse_next > >,ref$ >,hcl_edit::structure::body::Body,winnow::error::ContextError > >,ref$ >,hcl_edit::structure::body::Body,winnow::error::Contex at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\combinator\parser.rs:80 37: winnow::parser::impl$12::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:981 38: winnow::combinator::parser::impl$3::parse_next,enum2$ > > > > > (*)(ref_mut$ > >,hcl_edit::structure::body::Body,winnow::error::ContextError >,winnow::combinator::parser::Map > >,hcl_edit::structure::body::Body,winnow::error::ContextError >,tuple$ > >,char,hcl_edit::structure::body::Body,char,winnow::error::ContextError >,winnow::combinator::parser::Context > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,tuple$ > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError >,winnow::combinator::sequence::terminated::closure_env$0 > >,tuple$,tuple$<>,tuple$<> >,tuple$<>,winnow::error::ContextError > >,tuple$<>,winnow::error::ContextError >,winnow::combinator::multi::Repeat > > > > > (*)(ref_mut$ > at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\ops\function.rs:166 58: winnow::parser::impl$0::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:719 59: winnow::combinator::parser::impl$1::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\combinator\parser.rs:36 60: winnow::parser::impl$12::parse_next at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:981 61: winnow::parser::Parser::parse > > > > > (*)(ref_mut$ > > >),wi at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\winnow-0.6.8\src\parser.rs:66 62: 
hcl_edit::parser::parse_complete > > > > > (*)(ref_mut$ > > >) at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\mod.rs:72 63: hcl_edit::parser::parse_body at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\parser\mod.rs:39 64: hcl_edit::structure::body::impl$3::from_str at C:\Users\teamy\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hcl-edit-0.8.0\src\structure\body.rs:706 65: core::str::impl$0::parse at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\str\mod.rs:2420 66: unicode_ident_uax_31::test::hcl_edit_body_test at .\src\main.rs:36 67: unicode_ident_uax_31::test::hcl_edit_body_test::closure$0 at .\src\main.rs:27 68: core::ops::function::FnOnce::call_once > at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d\library\core\src\ops\function.rs:250 69: core::ops::function::FnOnce::call_once at /rustc/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/library\core\src\ops\function.rs:250 note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. failures: test::hcl_edit_body_test test result: FAILED. 3 passed; 1 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.04s error: test failed, to rerun pass `--bin unicode_ident_uax_31` ```

martinohmann commented 1 month ago

Hello @TeamDman.

First of all, thank you very much for the very detailed bug report and reproducers, that's very helpful.

This is a serious bug in the parser and the current implementation is even unsound because the bug is triggered in an unsafe block.

After digging around a bit I was able to find the root cause for this.

The parser accepts a &str but internally works on a &[u8] because this yields better performance. The input stream is guaranteed to be valid UTF-8, but since the token type is u8 (a single byte) instead of char (which may span multiple bytes), an individual token is not necessarily valid UTF-8 on its own. You probably see where this is going 😉.

These two functions are the culprit here: https://github.com/martinohmann/hcl-rs/blob/main/crates/hcl-edit/src/parser/string.rs#L228-L236

They work on single bytes, checking whether each byte is an ID start/continue character. This obviously excludes multi-byte UTF-8 characters from the mix, so the handling for these is broken. I completely overlooked this in the implementation phase.

There are now multiple alternatives for fixing this:

1. Make the parser work on a &str instead of &[u8]. This will avoid this issue, but also degrades parser performance.
2. Keep working on &[u8] but do something similar to what toml_edit does, which also works on bytes: since the input bytes originated from a &str we know they are always valid UTF-8, so we can parse out runs of bytes, convert them into a &str and defer the validity checks (is_id_start/is_id_continue) for the individual chars until then.
3. Maybe something else?

Alternative 1 is the easiest to implement, but I'd like to avoid harming parser performance too much if it can be avoided, so I'll first try to explore alternative 2, which is a bit more complex to get right, but should have better performance.

Here's a minimal reproducer comparing the current &[u8]-based ident parser code with a variant that works on &str:

/// Minimal reproducer: runs the same identifier grammar once over a `&str`
/// stream and once over a `&[u8]` stream, printing the bytes each variant
/// recognized.
mod issue_350_minimal {
    use winnow::{
        stream::AsChar,
        token::{one_of, take_while},
        PResult, Parser,
    };

    /// Identifier parser over a byte stream: every `u8` token is converted
    /// with `as_char()` and tested individually, then the recognized slice is
    /// converted to `&str` (panicking if that slice is not valid UTF-8).
    fn ident_bytes<'a>(input: &mut &'a [u8]) -> PResult<&'a str> {
        (
            one_of(|byte: u8| hcl_primitives::ident::is_id_start(byte.as_char())),
            take_while(0.., |byte: u8| {
                hcl_primitives::ident::is_id_continue(byte.as_char())
            }),
        )
            .recognize()
            .map(|raw: &[u8]| {
                print_bytes("parsed bytes with `u8` stream", raw);
                let decoded = std::str::from_utf8(raw);
                decoded.expect("`is_id_start` and `is_id_continue` filter out non-utf8")
            })
            .parse_next(input)
    }

    /// Identifier parser over a `&str` stream, where the predicates are
    /// applied to whole `char`s.
    fn ident_str<'a>(input: &mut &'a str) -> PResult<&'a str> {
        (
            one_of(hcl_primitives::ident::is_id_start),
            take_while(0.., hcl_primitives::ident::is_id_continue),
        )
            .recognize()
            .map(|recognized: &str| {
                print_bytes("parsed bytes with `char` stream", recognized.as_bytes());
                recognized
            })
            .parse_next(input)
    }

    /// Prints `desc`, a colon, and then every byte of `buf` as `0xNN `,
    /// followed by a newline.
    fn print_bytes(desc: &str, buf: &[u8]) {
        let hex: String = buf.iter().map(|byte| format!("0x{byte:02x} ")).collect();
        println!("{desc}: {hex}");
    }

    /// Feeds `ééé` to both parsers; the `&str` variant runs first so its
    /// output is printed before the byte variant is exercised.
    #[test]
    fn parse_ident() {
        let mut text = "ééé";
        let mut bytes = text.as_bytes();

        print_bytes("input bytes", bytes);

        let from_str = ident_str.parse(&mut text).unwrap();
        assert_eq!(from_str, "ééé");

        let from_bytes = ident_bytes.parse(&mut bytes).unwrap();
        assert_eq!(from_bytes, "ééé");
    }
}

Output:

running 1 test
test issue_350_minimal::parse_ident ... FAILED

failures:

---- issue_350_minimal::parse_ident stdout ----
input bytes: 0xc3 0xa9 0xc3 0xa9 0xc3 0xa9
parsed bytes with `char` stream: 0xc3 0xa9 0xc3 0xa9 0xc3 0xa9
parsed bytes with `u8` stream: 0xc3
thread 'issue_350_minimal::parse_ident' panicked at crates/hcl-edit/tests/regressions.rs:221:22:
`is_id_start` and `is_id_continue` filter out non-utf8: Utf8Error { valid_up_to: 0, error_len: None }
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

failures:
    issue_350_minimal::parse_ident

test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 9 filtered out; finished in 0.00s

error: test failed, to rerun pass `-p hcl-edit --test regressions`

martinohmann commented 1 month ago

Btw: the reason why the following does not have the same issue is that it works on a &str and not &[u8] to parse the identifier, so the UTF-8 handling is correct here:

let ident = "ééé";
ident.parse::<hcl::edit::Ident>().unwrap();

martinohmann commented 1 month ago

@TeamDman thanks again for the bug report!

I just released the fix for this via hcl-edit 0.8.1 / hcl-rs 0.18.0.

Sadly, the fix decreased the parser performance by 20-30%, but I'm planning to iterate on that to bring performance levels back closer to where they were before. For now, a correct parser is more important than performance.

Funnily enough, I also found another bug related to Unicode in the error reporting and fixed that along the way.

Let me know if you spot other issues around unicode handling.

TeamDman commented 1 month ago

Woohoo! Props for the quick fix 🎉🐇

I'm using import blocks to generate hundreds of thousands of lines of HCL, so I definitely appreciate the care being placed on the performance

Thank you!

martinohmann / hcl-rs

HCL Parsing fails on valid UTF-8 identifier #350