tobymao / sqlglot

Python SQL Parser and Transpiler
https://sqlglot.com/
MIT License
6.5k stars 671 forks source link

Duckdb - Error tokenizing multiple named parameters #3618

Closed bigluck closed 3 months ago

bigluck commented 3 months ago

Before you file an issue

Fully reproducible code snippet I'm writing a transformer that remap duckdb named parameters into a valid type based on the user's inputs.

def resolve_user_params(query: str, params: dict[str, Any], dialect='duckdb') -> str:
    errors: set[str] = set()

    def _transform(node: sqlglot.expressions.Expression) -> sqlglot.expressions.Expression:
        if isinstance(node, sqlglot.expressions.Placeholder):
            key = node.name
            if key not in params:
                errors.add(f'Variable ${key} is an unknown user_param')
                return node
            value = params[key]
            match value:
                case bool():
                    return sqlglot.expressions.Boolean(this=value)
                case int():
                    return sqlglot.expressions.Literal(this=value, is_string=False)
                case float():
                    return sqlglot.expressions.Literal(this=value, is_string=False)
                case str():
                    return sqlglot.expressions.Literal(this=value, is_string=True)
                case _:
                    errors.add(f'Variable ${key} is not a valid type, got {type(value)}')
            return node
        return node

    parsed_query = sqlglot.parse_one(query, read=dialect)
    # print(repr(parsed_query))

    transformed_query = parsed_query.transform(_transform)
    final_query = transformed_query.sql(dialect=dialect, comments=False)

    print('query', query)
    print(' - params', params)
    print(' - errors', errors)
    print(' - final_query', final_query)

    return final_query

The function works with a basic query like:

resolve_user_params(
    query="SELECT * FROM taxi_zones WHERE LocationID > $location_id_min AND LocationID IS NOT NULL",
    params={
        "location_id_min": 124,
    },
)

# query SELECT * FROM taxi_zones WHERE LocationID > $location_id_min AND LocationID IS NOT NULL
# - params {'location_id_min': 124}
# - errors set()
# - final_query SELECT * FROM taxi_zones WHERE LocationID > 124 AND NOT LocationID IS NULL

but it fails when I pass multiple named parameters:

resolve_user_params(
    query="SELECT * FROM taxi_zones WHERE LocationID > $location_id_min AND Borough = $borough",
    params={
        "location_id_min": 124,
        "borough": 12,
    },
)

Traceback (most recent call last):
  File ".venv/lib/python3.12/site-packages/sqlglot/tokens.py", line 970, in tokenize
    self._scan()
  File ".venv/lib/python3.12/site-packages/sqlglot/tokens.py", line 1003, in _scan
    self._scan_keywords()
  File ".venv/lib/python3.12/site-packages/sqlglot/tokens.py", line 1134, in _scan_keywords
    if self._scan_string(word):
       ^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/sqlglot/tokens.py", line 1306, in _scan_string
    text = self._extract_string(end, unescape_sequences=token_type != TokenType.RAW_STRING)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/sqlglot/tokens.py", line 1385, in _extract_string
    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
sqlglot.errors.TokenError: Missing $location_id_min AND Borough = $ from 1:44

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File ".venv/lib/python3.12/site-packages/marimo/_runtime/runner/cell_runner.py", line 302, in run
    return_value = execute_cell(cell, self.glbls)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/marimo/_ast/cell.py", line 460, in execute_cell
    return eval(cell.last_expr, glbls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/pg/0dm9b2p54yn_wqd89jd7j_wm0000gn/T/marimo_4911/__marimo__cell_lEQa__output.py", line 31, in <module>
  File "/var/folders/pg/0dm9b2p54yn_wqd89jd7j_wm0000gn/T/marimo_4911/__marimo__cell_bkHC_.py", line 26, in resolve_user_params
    parsed_query = sqlglot.parse_one(query, read='duckdb')
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/sqlglot/__init__.py", line 136, in parse_one
    result = dialect.parse(sql, **opts)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/sqlglot/dialects/dialect.py", line 518, in parse
    return self.parser(**opts).parse(self.tokenize(sql), sql)
                                     ^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/sqlglot/dialects/dialect.py", line 535, in tokenize
    return self.tokenizer.tokenize(sql)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".venv/lib/python3.12/site-packages/sqlglot/tokens.py", line 975, in tokenize
    raise TokenError(f"Error tokenizing '{context}'") from e
sqlglot.errors.TokenError: Error tokenizing 'cationID > $location_id_min AND Borough = $boroug'

Official Documentation

bigluck commented 3 months ago

Thanks @georgesittas