dtolnay / clang-ast

Deserialization logic for efficiently processing Clang's `-ast-dump=json` format
Apache License 2.0
132 stars 13 forks source link

Wrong line number when parsing #30

Open planetA opened 6 months ago

planetA commented 6 months ago

Hello,

it seems that when parsing an AST, line numbers are sometimes mapped incorrectly. Below is a small working example:

tests/lines.rs:

use serde_derive::Deserialize;

pub type Node = clang_ast::Node<Clang>;

/// Node kinds that this reproduction deserializes in detail.
///
/// In the parse result shown in this issue, nodes whose `kind` is not listed
/// here (for example `TranslationUnitDecl` and `CompoundStmt`) come out as
/// `Other`.
#[derive(Deserialize, Debug)]
pub enum Clang {
    FunctionDecl(FunctionDecl),
    CallExpr(CallExpr),
    DeclRefExpr(DeclRefExpr),
    // Unit variant: it has no `loc` or `range` fields of its own.
    Other,
}

/// Location fields for a `TranslationUnitDecl` node.
///
/// NOTE(review): no variant of the `Clang` enum in this snippet wraps this
/// struct, so it appears to be unused by the reproduction.
#[derive(Deserialize, Debug)]
pub struct TranslationUnitDecl{
    // Both fields are optional, so a node without these keys still deserializes.
    pub loc: Option<clang_ast::SourceLocation>,
    pub range: Option<clang_ast::SourceRange>,
}

/// Fields read from a `FunctionDecl` node: its name, location and source range.
///
/// In the parse result shown in this issue, `inner` here is `None` even for
/// `d`; its children appear in the `inner` list of the enclosing `Node`
/// instead.
#[derive(Deserialize, Debug)]
pub struct FunctionDecl {
    pub name: Option<String>,
    pub loc: Option<clang_ast::SourceLocation>,
    pub range: Option<clang_ast::SourceRange>,
    pub inner: Option<Vec<Node>>,
}

/// Fields for a `CompoundStmt` node.
///
/// NOTE(review): no variant of the `Clang` enum in this snippet wraps this
/// struct; in the parse result shown in this issue the `CompoundStmt` node is
/// reported as `Other`.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct CompoundStmt {
    pub name: Option<String>,
    pub loc: Option<clang_ast::SourceLocation>,
    pub range: Option<clang_ast::SourceRange>,
}

/// Fields read from a `CallExpr` node.
///
/// The `CallExpr` in the fixture below only has a `range` key, so `name` and
/// `loc` are `None` in the parse result shown in this issue.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct CallExpr {
    pub name: Option<String>,
    pub loc: Option<clang_ast::SourceLocation>,
    pub range: Option<clang_ast::SourceRange>,
}

/// Fields read from a `DeclRefExpr` node, including the declaration it refers to.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct DeclRefExpr {
    pub name: Option<String>,
    pub loc: Option<clang_ast::SourceLocation>,
    pub range: Option<clang_ast::SourceRange>,
    // With `rename_all = "camelCase"` this field is read from the
    // `referencedDecl` key of the JSON.
    pub referenced_decl: Option<Box<Node>>,
}

// Clang `-ast-dump=json` output (slightly shortened) for this main.c:
//
//     int c();
//     int d() {
//             int j = c();
//     }
//
// Only some of the locations below carry an explicit "line" (or "file") key;
// the others omit it.
const JSON: &str = r#"
{
  "id": "0xac5738",
  "kind": "TranslationUnitDecl",
  "loc": {},
  "range": {
    "begin": {},
    "end": {}
  },
  "inner": [
    {
      "id": "0xb218d0",
      "kind": "FunctionDecl",
      "loc": {
        "offset": 4,
        "file": "main.c",
        "line": 1,
        "col": 5,
        "tokLen": 1
      },
      "range": {
        "begin": {
          "offset": 0,
          "col": 1,
          "tokLen": 3
        },
        "end": {
          "offset": 6,
          "col": 7,
          "tokLen": 1
        }
      },
      "isUsed": true,
      "name": "c",
      "mangledName": "c",
      "type": {
        "qualType": "int ()"
      }
    },
    {
      "id": "0xb219e8",
      "kind": "FunctionDecl",
      "loc": {
        "offset": 13,
        "line": 2,
        "col": 5,
        "tokLen": 1
      },
      "range": {
        "begin": {
          "offset": 9,
          "col": 1,
          "tokLen": 3
        },
        "end": {
          "offset": 40,
          "line": 4,
          "col": 1,
          "tokLen": 1
        }
      },
      "name": "d",
      "mangledName": "d",
      "type": {
        "qualType": "int ()"
      },
      "inner": [
        {
          "id": "0xb21ba8",
          "kind": "CompoundStmt",
          "range": {
            "begin": {
              "offset": 17,
              "line": 2,
              "col": 9,
              "tokLen": 1
            },
            "end": {
              "offset": 40,
              "line": 4,
              "col": 1,
              "tokLen": 1
            }
          },
          "inner": [
            {
              "id": "0xb21b90",
              "kind": "DeclStmt",
              "range": {
                "begin": {
                  "offset": 27,
                  "line": 3,
                  "col": 9,
                  "tokLen": 3
                },
                "end": {
                  "offset": 38,
                  "col": 20,
                  "tokLen": 1
                }
              },
              "inner": [
                {
                  "id": "0xb21aa8",
                  "kind": "VarDecl",
                  "loc": {
                    "offset": 31,
                    "col": 13,
                    "tokLen": 1
                  },
                  "range": {
                    "begin": {
                      "offset": 27,
                      "col": 9,
                      "tokLen": 3
                    },
                    "end": {
                      "offset": 37,
                      "col": 19,
                      "tokLen": 1
                    }
                  },
                  "name": "j",
                  "type": {
                    "qualType": "int"
                  },
                  "init": "c",
                  "inner": [
                    {
                      "id": "0xb21b70",
                      "kind": "CallExpr",
                      "range": {
                        "begin": {
                          "offset": 35,
                          "col": 17,
                          "tokLen": 1
                        },
                        "end": {
                          "offset": 37,
                          "col": 19,
                          "tokLen": 1
                        }
                      },
                      "type": {
                        "qualType": "int"
                      },
                      "valueCategory": "prvalue",
                      "inner": [
                        {
                          "id": "0xb21b58",
                          "kind": "ImplicitCastExpr",
                          "range": {
                            "begin": {
                              "offset": 35,
                              "col": 17,
                              "tokLen": 1
                            },
                            "end": {
                              "offset": 35,
                              "col": 17,
                              "tokLen": 1
                            }
                          },
                          "type": {
                            "qualType": "int (*)()"
                          },
                          "valueCategory": "prvalue",
                          "castKind": "FunctionToPointerDecay",
                          "inner": [
                            {
                              "id": "0xb21b10",
                              "kind": "DeclRefExpr",
                              "range": {
                                "begin": {
                                  "offset": 35,
                                  "col": 17,
                                  "tokLen": 1
                                },
                                "end": {
                                  "offset": 35,
                                  "col": 17,
                                  "tokLen": 1
                                }
                              },
                              "type": {
                                "qualType": "int ()"
                              },
                              "valueCategory": "prvalue",
                              "referencedDecl": {
                                "id": "0xb218d0",
                                "kind": "FunctionDecl",
                                "name": "c",
                                "type": {
                                  "qualType": "int ()"
                                }
                              }
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      ]
    }
  ]
}
"#;

/// Deserializes the `JSON` fixture into a `Node` tree and pretty-prints it.
///
/// There are no assertions: the printed tree is inspected by hand to see
/// which line number each location ended up with.
#[test]
fn test_lines() {
    let parsed = serde_json::from_str::<Node>(JSON).unwrap();

    println!("{:#?}", parsed);
}

The problem is that the referencedDecl should be on line 3, but it is parsed as line 4, which is the end of the enclosing function. This AST is a slightly shortened version of the one produced for the following code:

int c();
int d() {
        int j = c(); // <-- where I want it to point.
} // <-- where c.range points to
planetA commented 6 months ago

Here is what is the result of parsing:

Node {
    id: Id(0xac5738),
    kind: Other,
    inner: [
        Node {
            id: Id(0xb218d0),
            kind: FunctionDecl(
                FunctionDecl {
                    name: Some(
                        "c",
                    ),
                    loc: Some(
                        SourceLocation {
                            spelling_loc: Some(
                                BareSourceLocation {
                                    offset: 4,
                                    file: "main.c",
                                    line: 1,
                                    presumed_file: None,
                                    presumed_line: None,
                                    col: 5,
                                    tok_len: 1,
                                    included_from: None,
                                    is_macro_arg_expansion: false,
                                },
                            ),
                            expansion_loc: Some(
                                BareSourceLocation {
                                    offset: 4,
                                    file: "main.c",
                                    line: 1,
                                    presumed_file: None,
                                    presumed_line: None,
                                    col: 5,
                                    tok_len: 1,
                                    included_from: None,
                                    is_macro_arg_expansion: false,
                                },
                            ),
                        },
                    ),
                    range: Some(
                        SourceRange {
                            begin: SourceLocation {
                                spelling_loc: Some(
                                    BareSourceLocation {
                                        offset: 0,
                                        file: "main.c",
                                        line: 1,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 1,
                                        tok_len: 3,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                                expansion_loc: Some(
                                    BareSourceLocation {
                                        offset: 0,
                                        file: "main.c",
                                        line: 1,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 1,
                                        tok_len: 3,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                            },
                            end: SourceLocation {
                                spelling_loc: Some(
                                    BareSourceLocation {
                                        offset: 6,
                                        file: "main.c",
                                        line: 1,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 7,
                                        tok_len: 1,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                                expansion_loc: Some(
                                    BareSourceLocation {
                                        offset: 6,
                                        file: "main.c",
                                        line: 1,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 7,
                                        tok_len: 1,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                            },
                        },
                    ),
                    inner: None,
                },
            ),
            inner: [],
        },
        Node {
            id: Id(0xb219e8),
            kind: FunctionDecl(
                FunctionDecl {
                    name: Some(
                        "d",
                    ),
                    loc: Some(
                        SourceLocation {
                            spelling_loc: Some(
                                BareSourceLocation {
                                    offset: 13,
                                    file: "main.c",
                                    line: 2,
                                    presumed_file: None,
                                    presumed_line: None,
                                    col: 5,
                                    tok_len: 1,
                                    included_from: None,
                                    is_macro_arg_expansion: false,
                                },
                            ),
                            expansion_loc: Some(
                                BareSourceLocation {
                                    offset: 13,
                                    file: "main.c",
                                    line: 2,
                                    presumed_file: None,
                                    presumed_line: None,
                                    col: 5,
                                    tok_len: 1,
                                    included_from: None,
                                    is_macro_arg_expansion: false,
                                },
                            ),
                        },
                    ),
                    range: Some(
                        SourceRange {
                            begin: SourceLocation {
                                spelling_loc: Some(
                                    BareSourceLocation {
                                        offset: 9,
                                        file: "main.c",
                                        line: 2,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 1,
                                        tok_len: 3,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                                expansion_loc: Some(
                                    BareSourceLocation {
                                        offset: 9,
                                        file: "main.c",
                                        line: 2,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 1,
                                        tok_len: 3,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                            },
                            end: SourceLocation {
                                spelling_loc: Some(
                                    BareSourceLocation {
                                        offset: 40,
                                        file: "main.c",
                                        line: 4,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 1,
                                        tok_len: 1,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                                expansion_loc: Some(
                                    BareSourceLocation {
                                        offset: 40,
                                        file: "main.c",
                                        line: 4,
                                        presumed_file: None,
                                        presumed_line: None,
                                        col: 1,
                                        tok_len: 1,
                                        included_from: None,
                                        is_macro_arg_expansion: false,
                                    },
                                ),
                            },
                        },
                    ),
                    inner: None,
                },
            ),
            inner: [
                Node {
                    id: Id(0xb21ba8),
                    kind: Other,
                    inner: [
                        Node {
                            id: Id(0xb21b90),
                            kind: Other,
                            inner: [
                                Node {
                                    id: Id(0xb21aa8),
                                    kind: Other,
                                    inner: [
                                        Node {
                                            id: Id(0xb21b70),
                                            kind: CallExpr(
                                                CallExpr {
                                                    name: None,
                                                    loc: None,
                                                    range: Some(
                                                        SourceRange {
                                                            begin: SourceLocation {
                                                                spelling_loc: Some(
                                                                    BareSourceLocation {
                                                                        offset: 35,
                                                                        file: "main.c",
                                                                        line: 4,
                                                                        presumed_file: None,
                                                                        presumed_line: None,
                                                                        col: 17,
                                                                        tok_len: 1,
                                                                        included_from: None,
                                                                        is_macro_arg_expansion: false,
                                                                    },
                                                                ),
                                                                expansion_loc: Some(
                                                                    BareSourceLocation {
                                                                        offset: 35,
                                                                        file: "main.c",
                                                                        line: 4,
                                                                        presumed_file: None,
                                                                        presumed_line: None,
                                                                        col: 17,
                                                                        tok_len: 1,
                                                                        included_from: None,
                                                                        is_macro_arg_expansion: false,
                                                                    },
                                                                ),
                                                            },
                                                            end: SourceLocation {
                                                                spelling_loc: Some(
                                                                    BareSourceLocation {
                                                                        offset: 37,
                                                                        file: "main.c",
                                                                        line: 4,
                                                                        presumed_file: None,
                                                                        presumed_line: None,
                                                                        col: 19,
                                                                        tok_len: 1,
                                                                        included_from: None,
                                                                        is_macro_arg_expansion: false,
                                                                    },
                                                                ),
                                                                expansion_loc: Some(
                                                                    BareSourceLocation {
                                                                        offset: 37,
                                                                        file: "main.c",
                                                                        line: 4,
                                                                        presumed_file: None,
                                                                        presumed_line: None,
                                                                        col: 19,
                                                                        tok_len: 1,
                                                                        included_from: None,
                                                                        is_macro_arg_expansion: false,
                                                                    },
                                                                ),
                                                            },
                                                        },
                                                    ),
                                                },
                                            ),
                                            inner: [
                                                Node {
                                                    id: Id(0xb21b58),
                                                    kind: Other,
                                                    inner: [
                                                        Node {
                                                            id: Id(0xb21b10),
                                                            kind: DeclRefExpr(
                                                                DeclRefExpr {
                                                                    name: None,
                                                                    loc: None,
                                                                    range: Some(
                                                                        SourceRange {
                                                                            begin: SourceLocation {
                                                                                spelling_loc: Some(
                                                                                    BareSourceLocation {
                                                                                        offset: 35,
                                                                                        file: "main.c",
                                                                                        line: 4,
                                                                                        presumed_file: None,
                                                                                        presumed_line: None,
                                                                                        col: 17,
                                                                                        tok_len: 1,
                                                                                        included_from: None,
                                                                                        is_macro_arg_expansion: false,
                                                                                    },
                                                                                ),
                                                                                expansion_loc: Some(
                                                                                    BareSourceLocation {
                                                                                        offset: 35,
                                                                                        file: "main.c",
                                                                                        line: 4,
                                                                                        presumed_file: None,
                                                                                        presumed_line: None,
                                                                                        col: 17,
                                                                                        tok_len: 1,
                                                                                        included_from: None,
                                                                                        is_macro_arg_expansion: false,
                                                                                    },
                                                                                ),
                                                                            },
                                                                            end: SourceLocation {
                                                                                spelling_loc: Some(
                                                                                    BareSourceLocation {
                                                                                        offset: 35,
                                                                                        file: "main.c",
                                                                                        line: 4,
                                                                                        presumed_file: None,
                                                                                        presumed_line: None,
                                                                                        col: 17,
                                                                                        tok_len: 1,
                                                                                        included_from: None,
                                                                                        is_macro_arg_expansion: false,
                                                                                    },
                                                                                ),
                                                                                expansion_loc: Some(
                                                                                    BareSourceLocation {
                                                                                        offset: 35,
                                                                                        file: "main.c",
                                                                                        line: 4,
                                                                                        presumed_file: None,
                                                                                        presumed_line: None,
                                                                                        col: 17,
                                                                                        tok_len: 1,
                                                                                        included_from: None,
                                                                                        is_macro_arg_expansion: false,
                                                                                    },
                                                                                ),
                                                                            },
                                                                        },
                                                                    ),
                                                                    referenced_decl: Some(
                                                                        Node {
                                                                            id: Id(0xb218d0),
                                                                            kind: FunctionDecl(
                                                                                FunctionDecl {
                                                                                    name: Some(
                                                                                        "c",
                                                                                    ),
                                                                                    loc: None,
                                                                                    range: None,
                                                                                    inner: None,
                                                                                },
                                                                            ),
                                                                            inner: [],
                                                                        },
                                                                    ),
                                                                },
                                                            ),
                                                            inner: [],
                                                        },
                                                    ],
                                                },
                                            ],
                                        },
                                    ],
                                },
                            ],
                        },
                    ],
                },
            ],
        },
    ],
}
planetA commented 6 months ago

Looking at the code, I would say that LAST_LOC_FILENAME and LAST_LOC_LINE are fundamentally wrong, because they hold the last used value, which is going to be the one stored for the end of a range. Instead, the value from the parent should be used.

planetA commented 6 months ago

I looked into the algorithm once again. Conceptually, I think you implemented it correctly. The problem happens when some nodes do not have "loc" and "range" fields. Then these fields are going to be skipped for the purpose of setting LAST_LOC_FILENAME and LAST_LOC_LINE.

For example, by adding the following Other struct, I could work around the problem:

/// Workaround version of the `Clang` enum: `Other` now carries a payload
/// struct instead of being a unit variant.
#[derive(Deserialize, Debug)]
pub enum Clang {
    FunctionDecl(FunctionDecl),
    CallExpr(CallExpr),
    DeclRefExpr(DeclRefExpr),
    // Wraps the `Other` struct, which declares `loc` and `range` fields.
    Other(Other),
}

// ...

/// Payload for the `Other` variant in the workaround.
///
/// It declares `loc` and `range` so that nodes of every other kind also
/// deserialize their location fields; per this issue, adding it worked around
/// the wrong line numbers.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Other {
    pub name: Option<String>,
    pub loc: Option<clang_ast::SourceLocation>,
    pub range: Option<clang_ast::SourceRange>,
}
planetA commented 6 months ago

I think it makes sense to make loc and range mandatory fields in the Node struct. Or, loc and range should be accounted for in NodeVisitor. What do you think?

dtolnay commented 6 months ago

That's right, I think we need every variant to have loc and range if you're planning to look at the loc or range of any variant.

I experimented with a different approach in #35 where the Deserializer would inject correct values of loc and range, rather than having the Deserialize impl in charge of the bookkeeping. So for example if the input were:

{
  "id": "0xb218d0",
  "kind": "FunctionDecl",
  "loc": {
    "offset": 4,
    "file": "main.c",
    "line": 1,
    "col": 5,
    "tokLen": 1
  },
  "range": {"..."},
  "inner": [
    {
      "id": "0xb21aa8",
      "kind": "VarDecl",
      "loc": {
        "offset": 31,
        "col": 13,
        "tokLen": 1
      },
      "..."

then the Deserialize impl would see the following in the inner node (whether or not it actually cares about deserializing loc):

      "id": "0xb21aa8",
      "kind": "VarDecl",
      "loc": {
        "offset": 31,
        "col": 13,
        "tokLen": 1,
        "file": "main.c",
        "line": 1
      },

I think it can work, but it needs some more effort to complete it.