The following repro-case demonstrates the problem:
-- -*- haskell -*-
{
module Main where
import qualified Data.ByteString as B
import Data.Word
}
-- Scan the input as 8-bit bytes (latin1) rather than as UTF-8.
%encoding "latin1"
:-
-- A run of one or more non-NUL bytes yields False.
<0> [\x01-\xff]+ { False }
-- A single NUL byte yields True.
<0> [\x00] { True }
{
-- | The scanner reads its input directly from a strict 'B.ByteString'.
type AlexInput = B.ByteString

-- | Split the first byte off the input.
--
-- Returns 'Nothing' when the input is empty, otherwise the leading byte
-- paired with the rest of the input.
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte input = B.uncons input
-- | Previous-character lookup, which Alex consults for left-context patterns.
--
-- None of the rules in this specification use a left context, so this is
-- not expected to be evaluated. If it ever is, fail with a message that
-- names the culprit instead of a bare @Prelude.undefined@.
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar _ =
  error "alexInputPrevChar: not implemented (no rule in this lexer uses left context)"
-- The definition of alexScan is generated by @alex@; only its type
-- signature is written out here.
alexScan :: AlexInput -> Int -> AlexReturn Bool
{-
Each tuple below is what main prints for one sample input:
(input, B.take len input, remaining input, len,
 B.length input - B.length remaining input, token)

GOOD cases (len equals the number of bytes consumed):
("012\NUL3","012","\NUL3",3,3,False)
("\NUL0","\NUL","0",1,1,True)
("012","012","",3,3,False)
BAD case (len is reported as 5 although 8 bytes were consumed):
("0@P`p\128\144\160","0@P`p","",5,8,False)
expected:
("0@P`p\128\144\160","0@P`p\128\144\160","",8,8,False)
-}
-- | Run the scanner once, in start code 0, on each sample input and print
-- what it matched.
--
-- For every token the printed tuple is
-- @(input, B.take len input, remaining input, len, consumed bytes, token)@,
-- where \"consumed bytes\" is @B.length inp - B.length inp'@. The reported
-- @len@ is expected to equal the number of consumed bytes; the last sample
-- (the one containing bytes >= 0x80) is the one that violates this.
main :: IO ()
main = do
  go (B.pack [0x30,0x31,0x32,0x00,0x33]) -- GOOD
  go (B.pack [0x00,0x30]) -- GOOD
  go (B.pack [0x30,0x31,0x32]) -- GOOD
  go (B.pack [0x30,0x40,0x50,0x60,0x70,0x80,0x90,0xa0]) -- BAD
  where
    go inp = case alexScan inp 0 of
      -- expected invariant: len == B.length inp - B.length inp'
      AlexToken inp' len b ->
        print (inp, B.take len inp, inp', len, B.length inp - B.length inp', b)
      -- The samples above are non-empty and every byte value is covered by
      -- a rule with an action, so the remaining results are not expected.
      -- They are reported explicitly rather than left to a pattern-match
      -- failure.
      AlexSkip _ _ -> putStrLn "unexpected result: AlexSkip"
      AlexEOF -> putStrLn "unexpected result: AlexEOF"
      AlexError _ -> putStrLn "unexpected result: AlexError"
}
The cause is most likely the one already pointed out in #63, i.e. the code at
https://github.com/simonmar/alex/blob/ff84f447bbca5f3b660fcdc5c3124920c7197b1c/templates/GenericTemplate.hs#L178-L180
which tries to count code points encoded in UTF-8. That makes no sense in the 8-bit clean
--latin1
mode, where every input byte is a character of its own.