eliben / pycparser

:snake: Complete C99 parser in pure Python
Other
3.21k stars 612 forks source link

TypeError: 'NoneType' object is not callable #530

Closed ToSeven closed 5 months ago

ToSeven commented 5 months ago

I fuzzed pycparser and found the crash below. e3caed063291b07617f8ee73d9d332b0

eliben commented 5 months ago

Please create a minimal reproducer that I can use to reproduce the crash

ToSeven commented 5 months ago

This is the PoC file

import json
import sys
import re

# This is not required if you've installed pycparser into
# your site-packages/ with setup.py
#
sys.path.extend(['.', '..'])

from pycparser import parse_file, c_ast,c_parser
from pycparser.plyparser import Coord

RE_CHILD_ARRAY = re.compile(r'(.*)\[(.*)\]')
RE_INTERNAL_ATTR = re.compile('__.*__')

class CJsonError(Exception):
    """ Error raised by the ast <-> dict/json conversion helpers. """

def memodict(fn):
    """
    Memoize a one-argument function using a dict subclass as the cache.

    The returned callable is the cache's bound ``__getitem__``: a lookup
    that hits returns the stored value, and a miss falls through to
    ``__missing__``, which calls ``fn`` once and stores the result.

    """
    class _ResultCache(dict):
        def __missing__(self, key):
            # Only reached on a cache miss; compute, remember, return.
            value = fn(key)
            self[key] = value
            return value

    cache = _ResultCache()
    return cache.__getitem__

@memodict
def child_attrs_of(klass):
    """
    Return the set of child attribute names of a Node class.

    These are the entries of ``klass.__slots__`` that do not match
    RE_INTERNAL_ATTR and are not listed in ``klass.attr_names``.
    The result is memoized per class to avoid repeating the same
    string matching for every node.

    """
    visible_slots = {
        slot for slot in klass.__slots__
        if RE_INTERNAL_ATTR.match(slot) is None
    }
    return visible_slots.difference(klass.attr_names)

def to_dict(node):
    """
    Recursively convert an ast node into its dict representation.

    The resulting dict holds, in order: the node class name under
    '_nodetype', every attribute listed in the class's ``attr_names``,
    the coord as a string (or None), each child converted recursively,
    and finally None for any child attribute the node did not yield.

    Raises CJsonError if the entries of a child array do not arrive in
    increasing index order starting at 0.

    """
    node_type = node.__class__

    # Metadata, followed by the node's own (non-child) attributes.
    out = {'_nodetype': node_type.__name__}
    for attr_name in node_type.attr_names:
        out[attr_name] = getattr(node, attr_name)

    # Coord object is stored in its string form, or None when falsy.
    out['coord'] = str(node.coord) if node.coord else None

    # Child names are either simple (e.g. 'value') or indexed array
    # entries (e.g. 'block_items[1]').
    for child_name, child in node.children():
        indexed = RE_CHILD_ARRAY.match(child_name)
        if indexed is None:
            out[child_name] = to_dict(child)
            continue

        array_name, raw_index = indexed.groups()
        position = int(raw_index)
        # Array entries are expected in order, so verify then append.
        siblings = out.setdefault(array_name, [])
        if position != len(siblings):
            raise CJsonError('Internal ast error. Array {} out of order. '
                'Expected index {}, got {}'.format(
                array_name, len(siblings), position))
        siblings.append(to_dict(child))

    # Child attributes the node did not yield still get a None entry so
    # that every child attribute appears in the json.
    for absent in child_attrs_of(node_type):
        out.setdefault(absent, None)

    return out

def to_json(node, **kwargs):
    """
    Serialize an ast node to a json string.

    Any keyword arguments are forwarded unchanged to ``json.dumps``.

    """
    as_dict = to_dict(node)
    return json.dumps(as_dict, **kwargs)

def file_to_dict(filename):
    """
    Parse the C file at ``filename`` (with ``use_cpp=True``) and return
    the dict representation of its ast.

    """
    return to_dict(parse_file(filename, use_cpp=True))

def file_to_json(filename, **kwargs):
    """
    Parse the C file at ``filename`` (with ``use_cpp=True``) and return
    its ast as a json string.

    Keyword arguments are passed through to ``to_json``.

    """
    return to_json(parse_file(filename, use_cpp=True), **kwargs)

def _parse_coord(coord_str):
    """ Parse coord string (file:line[:column]) into Coord object. """
    if coord_str is None:
        return None

    vals = coord_str.split(':')
    vals.extend([None] * 3)
    filename, line, column = vals[:3]
    return Coord(filename, line, column)

def _convert_to_obj(value):
    """
    Convert an object in the dict representation into an object.
    Note: Mutually recursive with from_dict.

    """
    value_type = type(value)
    if value_type == dict:
        return from_dict(value)
    elif value_type == list:
        return [_convert_to_obj(item) for item in value]
    else:
        # String
        return value

def from_dict(node_dict):
    """
    Recursively build an ast from its dict representation.

    ``node_dict`` must contain a '_nodetype' key naming a class in
    ``c_ast``; every other key is passed to that class's constructor as
    a keyword argument, with 'coord' parsed by _parse_coord and all
    other values converted by _convert_to_obj.

    The input dict is not modified, so the same dict can be reused
    after the call (for example to build a second ast or to dump it).

    Raises KeyError if '_nodetype' is missing and AttributeError if it
    names something that ``c_ast`` does not define.

    """
    # Read (do not pop) the node type so the caller's dict stays intact.
    klass = getattr(c_ast, node_dict['_nodetype'])

    # Create a new dict containing the key-value pairs which we can pass
    # to node constructors.
    objs = {}
    for key, value in node_dict.items():
        if key == '_nodetype':
            # Metadata only; not a constructor argument.
            continue
        if key == 'coord':
            objs[key] = _parse_coord(value)
        else:
            objs[key] = _convert_to_obj(value)

    # Use keyword parameters, which works thanks to beautifully consistent
    # ast Node initializers.
    return klass(**objs)

def from_json(ast_json):
    """
    Rebuild an ast from its json string representation.

    The string is decoded with ``json.loads`` and the resulting dict is
    handed to from_dict.

    """
    decoded = json.loads(ast_json)
    return from_dict(decoded)

#------------------------------------------------------------------------------
if __name__ == "__main__":
    # Proof-of-concept driver: parse a fuzzer-generated input and round-trip
    # the resulting ast through the dict/json helpers defined above.
    _c_parser = c_parser.CParser(
            lex_optimize=True,
            yacc_debug=False,
            yacc_optimize=False)
    # The first argument is the input text found by fuzzing (it is not
    # meant to be valid C). The second argument is an empty string --
    # presumably the filename used in coords; confirm against CParser.parse.
    ast=_c_parser.parse(
        """i
{i[
#66666666
#66666666666666666L66666666
#6666666666666666666666666666666*i""",
        ''
    )
    # Only reached if parsing succeeds: ast -> dict -> ast -> json on stdout.
    ast_dict=to_dict(ast)
    ast = from_dict(ast_dict)
    print(to_json(ast, sort_keys=True, indent=4))
eliben commented 5 months ago

I get this thrown for the given C code sample:

ValueError: invalid literal for int() with base 10: '66666666666666666L'

Looks pretty reasonable to me. Perhaps the TypeError comes from some wrapping code that's not inside pycparser?

ToSeven commented 5 months ago

test.py

from pycparser import c_parser

if __name__ == "__main__":
    # Reduced reproducer: only constructs the parser and feeds it the
    # fuzzer-generated input; no dict/json conversion is involved.
    _c_parser = c_parser.CParser(
            lex_optimize=True,
            yacc_debug=False,
            yacc_optimize=False)
    # The input text is kept exactly as reported, including the
    # undecodable characters on its last line. The second argument is an
    # empty string -- presumably the filename; confirm against CParser.parse.
    ast=_c_parser.parse(
        """i
{i[
#66666666
#66666666666666666L66666666
#6666666666666666666666666666666�*�i""",
        ''
    )

image

ToSeven commented 5 months ago

@eliben I have removed all unrelated statements to get the simplest possible reproducer. It shows that the root cause of the crash is that the parser does not validate its input.