darvid / python-hyperscan

🐍 A CPython extension for the Hyperscan regular expression matching library.
https://python-hyperscan.readthedocs.io/en/latest/
MIT License
165 stars 28 forks source link

How to match an exact string with hyperscan like with re.findall #39

Open rafikg opened 2 years ago

rafikg commented 2 years ago
from typing import Optional, Any, List
import hyperscan

def on_match(id: int, start: int, end: int, flags: int, context: Optional[Any] = None) -> Optional[bool]:
    """Record one reported match in ``context['results']``.

    Appends an ``(id, start, end)`` tuple to the list stored under the
    ``'results'`` key of ``context``. ``flags`` is accepted but not used.

    Always returns ``0``.
    """
    collected = context['results']
    hit = (id, start, end)
    collected.append(hit)
    return 0

# One row per pattern; the columns are split into parallel tuples below
# because db.compile() takes expressions, ids and flags separately.
patterns = (
    # expression,  id, flags
    (
        br'O+M',
        0,
        hyperscan.HS_FLAG_CASELESS | hyperscan.HS_FLAG_SOM_LEFTMOST,
    ),
)
expressions = tuple(row[0] for row in patterns)
ids = tuple(row[1] for row in patterns)
flags = tuple(row[2] for row in patterns)

db = hyperscan.Database()
db.compile(
    expressions=expressions,
    ids=ids,
    elements=len(patterns),
    flags=flags,
)

# The sample words are joined with newlines and scanned as one bytes buffer,
# so reported offsets are relative to the joined text, not to each word.
lines = ['Om', 'OOm', 'oom', 'sroom', 'communication', 'surveillance']
text = "\n".join(lines).encode()
print(text)

# on_match appends an (id, start, end) tuple to context['results'] per match.
context = {'results': []}
db.scan(text, match_event_handler=on_match, context=context)

for result in context['results']:
    print(result)

(0, 0, 2)   -> Om
(0, 3, 6)   -> OOm
(0, 7, 10)  -> oom
(0, 13, 16) -> oom (in sroom)
(0, 18, 20) -> om (in communication)

With re.findall(), the same pattern on the same text gives:

re.findall(rb'O+M', text, flags=re.IGNORECASE)
[b'Om', b'OOm', b'oom', b'oom']