python / cpython

The Python programming language
https://www.python.org
Other
63.54k stars 30.44k forks source link

Suspicion of Buffer Overrun in CPython 3.11.1 on Windows 64-bit #100566

Closed hajimen closed 1 year ago

hajimen commented 1 year ago

Bug report

The code below might look redundant, but I need the length to reproduce the bug.

import typing as ty
import collections.abc as abc
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path

class ColorBase(object):
    # Base class for the colour types in this reproduction. Subclasses list
    # the attribute names that __hash__ reads in VALUES.
    VALUES = []

    def __eq__(self, other):
        # Compares equal to every object, regardless of type or contents.
        return True

    def __hash__(self) -> int:
        # NOTE(review): the parenthesised argument is a generator expression,
        # so this hashes the generator object itself rather than the attribute
        # values. Generators use the default identity-based hash, and a new
        # generator is created on every call, so the result is not tied to the
        # values named in VALUES and is not guaranteed to be stable for one
        # instance. Combined with __eq__ always returning True, equal objects
        # can therefore have different hashes.
        return hash((getattr(self, n) for n in self.VALUES))

class BaseRGBColor(ColorBase):
    # RGB colour. VALUES names the three channel attributes set in __init__,
    # which are the attributes ColorBase.__hash__ iterates over.
    VALUES = ["rgb_r", "rgb_g", "rgb_b"]

    def __init__(self, rgb_r, rgb_g, rgb_b, is_upscaled=False):
        """
        """
        # Each channel is coerced to float before being stored.
        # is_upscaled is accepted but never read in this body.
        super(BaseRGBColor, self).__init__()
        self.rgb_r = float(rgb_r)
        self.rgb_g = float(rgb_g)
        self.rgb_b = float(rgb_b)

class sRGBColor(BaseRGBColor):
    # Adds nothing to BaseRGBColor; used as the default Style colour below.
    pass

class Orientation(Enum):
    # Single-member enum; auto() assigns the member's value.
    LeftTop = auto()

class Side(Enum):
    # Single-member enum; auto() assigns the member's value.
    Top = auto()

# Module-level aliases used as Style defaults. Left and Top are bound to the
# same Orientation member.
Left = Orientation.LeftTop
Top = Left
TopSide = Side.Top

@dataclass(frozen=True)
class Style:
    # Immutable record used as a dict key further down (Legend({s1: 'S1'})).
    # With frozen=True and the default eq=True, dataclass generates __eq__ and
    # __hash__ from the fields, so hashing a Style also hashes its `color`.
    size: float
    x_loc: float
    y_loc: float
    h_o: Orientation = Left
    v_o: Orientation = Top
    # One sRGBColor instance, created when the class body runs and shared as
    # the default by every Style that does not pass `color`.
    color: ColorBase = sRGBColor(0, 0, 0)
    align: Orientation = Left
    side: Side = TopSide

# Type aliases for operator operands. The class names are given as strings
# because Manuscript and Descriptor are defined later in the file.
# MDT: a single Manuscript or Descriptor; MDLT: a sequence of them.
MDT = ty.Union['Manuscript', 'Descriptor']
MDLT = abc.Sequence[ty.Union['Manuscript', 'Descriptor']]

def _manuscript_matmul(self: 'Descriptor', r, synth: bool):
    # Shared implementation of `descriptor @ r` for the Descriptor subclasses
    # below. Returns a Manuscript in which the field named by self.m_name
    # holds this descriptor, or its `%` combination with another value.
    #
    #   self  -- the left-hand descriptor; its m_name selects the field.
    #   r     -- the right-hand operand.
    #   synth -- when true, the first branch is taken regardless of r's type.
    #
    # Raises Exception('Bad code') when r matches none of the branches.
    k = self.m_name
    if synth or isinstance(r, type(self)):
        # Forced, or r is an instance of self's own class: combine with `%`.
        return Manuscript(**{k: self % r})
    elif isinstance(r, Manuscript):
        # Start from r's non-None fields; a value already stored under k is
        # combined as `existing % self`, otherwise self fills the slot.
        ret = r.dict()
        if k in ret:
            ret[k] = ret[k] % self
        else:
            ret[k] = self
        return Manuscript(**ret)
    elif isinstance(r, Descriptor):
        # Two descriptors: each goes into the field named by its own m_name.
        # If both names are equal, r's entry is the one kept by the dict.
        return Manuscript(**{k: self, r.m_name: r})
    else:
        raise Exception('Bad code')

class Descriptor:
    # Base class for values stored in a Manuscript field.
    # m_name is the name of that field; the empty string is a placeholder
    # that the concrete descriptor classes in this file replace.
    m_name = ''

    def __matmul__(self, r) -> 'Manuscript':
        # `self @ r`; not implemented on the base class.
        raise NotImplementedError()

    def __mod__(self, r) -> 'Descriptor':
        # `self % r`; not implemented on the base class, and no subclass
        # visible in this file overrides it.
        raise NotImplementedError()

@dataclass
class Manuscript:
    # Container with one optional slot per descriptor kind. Each field name
    # matches the m_name of the descriptor class stored in it.
    row: ty.Optional['Row'] = None
    color_conversion_intent: ty.Optional['ColorConversionIntent'] = None
    legend: ty.Optional['Legend'] = None

    def dict(self):
        # Return a new dict of the instance attributes whose value is not None.
        return {k: v for k, v in self.__dict__.items() if v is not None}

    def __matmul__(self: 'Manuscript', r: MDT):
        # `self @ r`: build a new Manuscript from self's non-None fields
        # merged with r. Raises Exception when r is neither a Manuscript nor
        # a Descriptor.
        ld = self.dict()
        ret = ld.copy()
        if isinstance(r, Manuscript):
            # r's fields overwrite self's first; fields set on both sides are
            # then replaced by `left @ right`.
            rd = r.dict()
            ret.update(rd)
            for k in set(ld.keys()) | set(rd.keys()):
                if k in ld and k in rd:
                    ret[k] = ld[k] @ rd[k]
            return Manuscript(**ret)
        elif isinstance(r, Descriptor):
            # Store r under its m_name; an existing value is combined as
            # `existing % r`.
            k = r.m_name
            if k in ret:
                ret[k] = ret[k] % r
            else:
                ret[k] = r
            return Manuscript(**ret)
        else:
            raise Exception('@ rvalue should be Manuscript or Descriptor.')

    def __rshift__(self, r: MDLT) -> MDLT:
        # `self >> r`: apply `self @ v` to each element of r and return the
        # results as a list. ty.cast only informs type checkers; at runtime it
        # returns its argument unchanged.
        return [ty.cast(Manuscript, self @ v) for v in r]

# Type variables: K and V parameterise DictCombinable; V is also used by
# Overlay.
K = ty.TypeVar('K')
V = ty.TypeVar('V')

class DictCombinable(Descriptor, ty.Generic[K, V]):
    # Descriptor that wraps a dict.
    # NOTE(review): defining __eq__ without __hash__ sets __hash__ to None,
    # so instances of this class are unhashable.
    # kv is a class-level list shared by all instances unless a subclass
    # rebinds it; it is not read anywhere in this file.
    kv: list[type] = []

    def __init__(self, d: ty.Optional[ty.Dict[K, V]] = None) -> None:
        # Keep a reference to the caller's dict (no copy), or start with a
        # fresh empty dict when none is given.
        self.d: dict[K, V] = {} if d is None else d

    def __matmul__(self, r: ty.Union[tuple, MDT]):
        # Delegate to the shared helper; a tuple operand sets synth=True.
        return _manuscript_matmul(self, r, isinstance(r, tuple))

    def __eq__(self, r) -> bool:
        # The dict comparison runs first and its result is kept in a local,
        # then the type check is applied.
        # NOTE(review): r.d is read before the isinstance check, so comparing
        # against an object without a `d` attribute raises AttributeError.
        ret = (r.d == self.d)
        return isinstance(r, type(self)) and ret
        # return isinstance(r, type(self)) and r.d == self.d

class Legend(DictCombinable[Style, str]):
    # Dict-backed descriptor keyed by Style with str values; stored in
    # Manuscript.legend.
    kv = [Style, str]
    m_name = 'legend'

class ColorConversionIntent(Descriptor, Enum):
    # Enum-based descriptor with a single member.
    Perceptual = auto()

    def __matmul__(self, r: MDT):
        # Delegate to the shared helper; synth is always False here.
        return _manuscript_matmul(self, r, False)

# m_name is set after the class is created rather than in the class body.
# NOTE(review): presumably because a plain assignment inside an Enum body
# would define another member — confirm.
ColorConversionIntent.m_name = 'color_conversion_intent'
# Module-level shorthand for the single member.
Perceptual = ColorConversionIntent.Perceptual

class Overlay(Descriptor, ty.Generic[V]):
    # Descriptor that wraps a single value.
    def __init__(self, v: V) -> None:
        # Store the wrapped value as-is.
        self.v = v

    def __matmul__(self, r: MDT):
        # Delegate to the shared helper; synth is always False here.
        return _manuscript_matmul(self, r, False)

class Row(Overlay[int]):
    # Overlay parameterised with int; stored in Manuscript.row.
    m_name = 'row'

# Not referenced again in this script.
FONT_PATH = Path('foo.ttf')
# Manuscript with color_conversion_intent and row set.
m = Perceptual @ Row(1)
# Two Styles that differ only in y_loc; both use the shared default color.
s1 = Style(3., 0., 0.)
s2 = Style(3., 0., 5.)
# Building each dict literal hashes its Style key at this point.
ls = [Legend({s1: 'S1'}), Legend({s2: 'S2'})]
# m @ each Legend, collected into a list.
ms = m >> ls
# d1 is the dict literal created for ls[0]; d2 is a second, separately built
# dict literal. Both hold the single entry s1 -> 'S1', with the same s1 object
# as key.
d1 = ms[0].legend.d
d2 = (Perceptual @ Row(1) @ Legend({s1: 'S1'})).legend.d
# First compare the dicts directly, then compare their item lists.
print(d1 == d2)
print([(k, v) for k, v in d1.items()] == [(k, v) for k, v in d2.items()])

The output is:

False
True

This is obviously wrong. Just removing the redundant line FONT_PATH = Path('foo.ttf') makes the wrong output go away, so I strongly suspect a buffer overrun.

Environment

sobolevn commented 1 year ago

Any chance to simplify this example? 🙏 Right now it is very hard to understand what is going on.

ronaldoussoren commented 1 year ago

For me this script often prints False and True, as for the OP, but at times it also prints True and True (as one would expect). This often points to an inconsistency between __eq__ and __hash__, given that this is related to dict equality.

And indeed, the implementation of ColorBase does not conform to the contract for a hashable object: two instances can be equal but have different hashes. Replacing the __hash__ implementation by one that returns a constant value fixes the issue.

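For objects that are used as dict keys:

  • a == b must imply that hash(a) == hash(b)
  • The hash must not change after construction (or at least not after being used as a dict key)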

hajimen commented 1 year ago

> For me this script often prints False and True, as for the OP, but at times it also prints True and True (as one would expect). This often points to an inconsistency between __eq__ and __hash__, given that this is related to dict equality.
>
> And indeed, the implementation of ColorBase does not conform to the contract for a hashable object: two instances can be equal but have different hashes. Replacing the __hash__ implementation by one that returns a constant value fixes the issue.
>
> For objects that are used as dict keys:
>
>   • a == b must imply that hash(a) == hash(b)
>   • The hash must not change after construction (or at least not after being used as a dict key)

Thanks ronaldoussoren, but did you try appending the line below?

print([(hash(k), hash(v)) for k, v in d1.items()] == [(hash(k), hash(v)) for k, v in d2.items()])

The line prints True even when print(d1 == d2) line prints False.