haskell / haskeline

A Haskell library for line input in command-line programs.
https://hackage.haskell.org/package/haskeline
BSD 3-Clause "New" or "Revised" License
221 stars 75 forks source link

win32Term does not paste UTF-8 characters #54

Closed RyanGlScott closed 7 years ago

RyanGlScott commented 7 years ago

A bizarre discrepancy that I discovered when using GHCi (or any Haskeline-based terminal, actually) on both Windows on Linux. If you have a string which contains sufficiently fancy UTF-8 characters and attempt to copy-and-paste it into Haskeline using a native Windows console, it'll just drop the fancy characters!

For instance, if you have the string "Error: Не удается найти указанный файл" and try to copy-and-paste it into GHCi on PowerShell, you'll get:

haskeline1

(It does paste it some extra whitespace characters, for some bizarre reason...)

But if you paste the same string into, say, MSYS2, it works fine!

haskeline2

So the issue appears to be local to win32Term.

I'm not intimately familiar with how win32Term reads input, so I can't say if this is a Haskeline-specific issue or a problem with the way Win32 API calls (e.g., ReadConsoleInput) are being used.

RyanGlScott commented 7 years ago

I did a little bit of digging, and I've managed to isolate the problem down to a single IO function. Here's the code you'll need:

{-# LANGUAGE ScopedTypeVariables #-}
module Main (main) where

import Control.Concurrent
import Control.Exception
import Control.Monad
import Data.Char
import Data.Maybe
import Foreign
import Foreign.C
import Graphics.Win32.Misc
import System.Win32

#include <windows.h>

main :: IO ()
main = do
  stdIN <- getStdHandle sTD_INPUT_HANDLE
  ch <- newChan
  win32WithEvent stdIN ch $ \x -> x >>= print

win32WithEvent :: HANDLE -> Chan Event -> (IO Event -> IO a) -> IO a
win32WithEvent h eventChan f = f $ getEvent h eventChan

keyEventLoop :: IO [Event] -> Chan Event -> IO Event
keyEventLoop readEvents eventChan = do
    -- first, see if any events are already queued up (from a key/ctrl-c
    -- event or from a previous call to getEvent where we read in multiple
    -- keys)
    isEmpty <- isEmptyChan eventChan
    if not isEmpty
        then readChan eventChan
        else do
            lock <- newEmptyMVar
            tid <- forkIO $ handleErrorEvent (readerLoop lock)
            readChan eventChan `finally` do
                            putMVar lock ()
                            killThread tid
  where
    readerLoop lock = do
        es <- readEvents
        if null es
            then readerLoop lock
            else -- Use the lock to work around the fact that writeList2Chan
                 -- isn't atomic.  Otherwise, some events could be ignored if
                 -- the subthread is killed before it saves them in the chan.
                 bracket_ (putMVar lock ()) (takeMVar lock) $
                    writeList2Chan eventChan es
    handleErrorEvent = handle $ \e -> case fromException e of
                                Just ThreadKilled -> return ()
                                _ -> writeChan eventChan (ErrorEvent e)

foreign import ccall "windows.h ReadConsoleInputW" c_ReadConsoleInput
    :: HANDLE -> Ptr () -> DWORD -> Ptr DWORD -> IO Bool

foreign import ccall "windows.h WaitForSingleObject" c_WaitForSingleObject
    :: HANDLE -> DWORD -> IO DWORD

foreign import ccall "windows.h GetNumberOfConsoleInputEvents"
    c_GetNumberOfConsoleInputEvents :: HANDLE -> Ptr DWORD -> IO Bool

getNumberOfEvents :: HANDLE -> IO Int
getNumberOfEvents h = alloca $ \numEventsPtr -> do
    failIfFalse_ "GetNumberOfConsoleInputEvents"
        $ c_GetNumberOfConsoleInputEvents h numEventsPtr
    fmap fromEnum $ peek numEventsPtr

getEvent :: HANDLE -> Chan Event -> IO Event
getEvent h = keyEventLoop (eventReader h)

eventReader :: HANDLE -> IO [Event]
eventReader h = do
    let waitTime = 500 -- milliseconds
    ret <- c_WaitForSingleObject h waitTime
    yield -- otherwise, the above foreign call causes the loop to never
          -- respond to the killThread
    if ret /= (#const WAIT_OBJECT_0)
        then eventReader h
        else do
            es <- readEvents h
            return $ mapMaybe processEvent es

processEvent :: InputEvent -> Maybe Event
processEvent KeyEvent {keyDown = True, unicodeChar = c, virtualKeyCode = vc,
                    controlKeyState = cstate}
    = fmap (\e -> KeyInput [Key modifier' e]) $ keyFromCode vc `mplus` simpleKeyChar
  where
    simpleKeyChar = guard (c /= '\NUL') >> return (KeyChar c)
    testMod ck = (cstate .&. ck) /= 0
    modifier' = if hasMeta modifier && hasControl modifier
                    then noModifier {hasShift = hasShift modifier}
                    else modifier
    modifier = Modifier {hasMeta = testMod ((#const RIGHT_ALT_PRESSED)
                                        .|. (#const LEFT_ALT_PRESSED))
                        ,hasControl = testMod ((#const RIGHT_CTRL_PRESSED)
                                        .|. (#const LEFT_CTRL_PRESSED))
                                    && not (c > '\NUL' && c <= '\031')
                        ,hasShift = testMod (#const SHIFT_PRESSED)
                                    && not (isPrint c)
                        }

processEvent WindowEvent = Just WindowResize
processEvent _ = Nothing

keyFromCode :: WORD -> Maybe BaseKey
keyFromCode (#const VK_BACK) = Just Backspace
keyFromCode (#const VK_LEFT) = Just LeftKey
keyFromCode (#const VK_RIGHT) = Just RightKey
keyFromCode (#const VK_UP) = Just UpKey
keyFromCode (#const VK_DOWN) = Just DownKey
keyFromCode (#const VK_DELETE) = Just Delete
keyFromCode (#const VK_HOME) = Just Home
keyFromCode (#const VK_END) = Just End
keyFromCode (#const VK_PRIOR) = Just PageUp
keyFromCode (#const VK_NEXT) = Just PageDown
-- The Windows console will return '\r' when return is pressed.
keyFromCode (#const VK_RETURN) = Just (KeyChar '\n')
-- TODO: KillLine?
-- TODO: function keys.
keyFromCode _ = Nothing

data InputEvent = KeyEvent {keyDown :: BOOL,
                          repeatCount :: WORD,
                          virtualKeyCode :: WORD,
                          virtualScanCode :: WORD,
                          unicodeChar :: Char,
                          controlKeyState :: DWORD}
            -- TODO: WINDOW_BUFFER_SIZE_RECORD
            -- I cant figure out how the user generates them.
           | WindowEvent
           | OtherEvent
                        deriving Show

peekEvent :: Ptr () -> IO InputEvent
peekEvent pRecord = do
    eventType :: WORD <- (#peek INPUT_RECORD, EventType) pRecord
    let eventPtr = (#ptr INPUT_RECORD, Event) pRecord
    case eventType of
        (#const KEY_EVENT) -> getKeyEvent eventPtr
        (#const WINDOW_BUFFER_SIZE_EVENT) -> return WindowEvent
        _ -> return OtherEvent

readEvents :: HANDLE -> IO [InputEvent]
readEvents h = do
    n <- getNumberOfEvents h
    alloca $ \numEventsPtr ->
        allocaBytes (n * #size INPUT_RECORD) $ \pRecord -> do
            failIfFalse_ "ReadConsoleInput"
                $ c_ReadConsoleInput h pRecord (toEnum n) numEventsPtr
            numRead <- fmap fromEnum $ peek numEventsPtr
            forM [0..toEnum numRead-1] $ \i -> peekEvent
                $ pRecord `plusPtr` (i * #size INPUT_RECORD)

getKeyEvent :: Ptr () -> IO InputEvent
getKeyEvent p = do
    kDown' <- (#peek KEY_EVENT_RECORD, bKeyDown) p
    repeat' <- (#peek KEY_EVENT_RECORD, wRepeatCount) p
    keyCode <- (#peek KEY_EVENT_RECORD, wVirtualKeyCode) p
    scanCode <- (#peek KEY_EVENT_RECORD, wVirtualScanCode) p
    char :: CWchar <- (#peek KEY_EVENT_RECORD, uChar) p
    state <- (#peek KEY_EVENT_RECORD, dwControlKeyState) p
    return KeyEvent {keyDown = kDown',
                            repeatCount = repeat',
                            virtualKeyCode = keyCode,
                            virtualScanCode = scanCode,
                            unicodeChar = toEnum (fromEnum char),
                            controlKeyState = state}

data Event
  = WindowResize
  | KeyInput [Key]
  | ErrorEvent SomeException
  | ExternalPrint String
  deriving Show

data Key = Key Modifier BaseKey
            deriving (Show,Eq,Ord)

data Modifier = Modifier {hasControl, hasMeta, hasShift :: Bool}
            deriving (Eq,Ord)

instance Show Modifier where
    show m = show $ catMaybes [maybeUse hasControl "ctrl"
                        , maybeUse hasMeta "meta"
                        , maybeUse hasShift "shift"
                        ]
        where
            maybeUse f str = if f m then Just str else Nothing

noModifier :: Modifier
noModifier = Modifier False False False

data BaseKey = KeyChar Char
             | FunKey Int
             | LeftKey | RightKey | DownKey | UpKey
             -- TODO: is KillLine really a key?
             | KillLine | Home | End | PageDown | PageUp
             | Backspace | Delete
            deriving (Show,Eq,Ord)

Stick this into an *.hsc file, run hsc2hs on it, and then load the resulting *.hs file. Now run main, and it should wait for a single input event. Here's where it gets interesting: if you try copy-and-pasting a character like "Λ", it won't read it! And if you copy a string that has normal ASCII characters after it, like "Λ, lowercase λ", and then paste it, then lo and behold:

> main
KeyInput [Key [] (KeyChar ',')]

It'll ignore Λ and skip right to the comma!

RyanGlScott commented 7 years ago

The plot thickens. If I change the implementation of eventReader to print out every Event as it receives it:

eventReader :: HANDLE -> IO [Event]
eventReader h = do
    let waitTime = 500 -- milliseconds
    ret <- c_WaitForSingleObject h waitTime
    yield -- otherwise, the above foreign call causes the loop to never
          -- respond to the killThread
    if ret /= (#const WAIT_OBJECT_0)
        then eventReader h
        else do
            es <- readEvents h
            traverse print es -- <- NOTICE THIS PART
            return $ mapMaybe processEvent es

Load it into GHCi, type main, and then paste the string "Λ, lowercase λ", this is what happens:

$ ghci .\Example.hs
GHCi, version 8.0.2: http://www.haskell.org/ghc/  :? for help
Loaded GHCi configuration from C:\Users\RyanGlScott\AppData\Roaming\ghc\ghci.conf
[1 of 1] Compiling Main             ( Example.hs, interpreted )

Example.hsc:30:16: warning: [-Wdeprecations]
    In the use of ‘isEmptyChan’
    (imported from Control.Concurrent, but defined in Control.Concurrent.Chan):
    Deprecated: "if you need this operation, use Control.Concurrent.STM.TChan instead.  See <http://ghc.haskell.org/trac/ghc/ticket/4154> for details"
Ok, modules loaded: Main.
> main
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 13, virtualScanCode = 28, unicodeChar = '\r', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 18, virtualScanCode = 56, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 98, virtualScanCode = 80, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 98, virtualScanCode = 80, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 18, virtualScanCode = 56, unicodeChar = '\923', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 188, virtualScanCode = 51, unicodeChar = ',', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 188, virtualScanCode = 51, unicodeChar = ',', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 32, virtualScanCode = 57, unicodeChar = ' ', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 32, virtualScanCode = 57, unicodeChar = ' ', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 76, virtualScanCode = 38, unicodeChar = 'l', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 76, virtualScanCode = 38, unicodeChar = 'l', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 79, virtualScanCode = 24, unicodeChar = 'o', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 79, virtualScanCode = 24, unicodeChar = 'o', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 87, virtualScanCode = 17, unicodeChar = 'w', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 87, virtualScanCode = 17, unicodeChar = 'w', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 69, virtualScanCode = 18, unicodeChar = 'e', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 69, virtualScanCode = 18, unicodeChar = 'e', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 82, virtualScanCode = 19, unicodeChar = 'r', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 82, virtualScanCode = 19, unicodeChar = 'r', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 67, virtualScanCode = 46, unicodeChar = 'c', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 67, virtualScanCode = 46, unicodeChar = 'c', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 65, virtualScanCode = 30, unicodeChar = 'a', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 65, virtualScanCode = 30, unicodeChar = 'a', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 83, virtualScanCode = 31, unicodeChar = 's', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 83, virtualScanCode = 31, unicodeChar = 's', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 69, virtualScanCode = 18, unicodeChar = 'e', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 69, virtualScanCode = 18, unicodeChar = 'e', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 32, virtualScanCode = 57, unicodeChar = ' ', controlKeyState = 0}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 32, virtualScanCode = 57, unicodeChar = ' ', controlKeyState = 0}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 18, virtualScanCode = 56, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 97, virtualScanCode = 79, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = True, repeatCount = 1, virtualKeyCode = 98, virtualScanCode = 80, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 98, virtualScanCode = 80, unicodeChar = '\NUL', controlKeyState = 2}
KeyEvent {keyDown = False, repeatCount = 1, virtualKeyCode = 18, virtualScanCode = 56, unicodeChar = '\955', controlKeyState = 0}
KeyInput [Key [] (KeyChar ',')]

Reading this, it becomes clearer why Haskeline is skipping over Λ (or \923, as rendered above): it doesn't receive a KeyEvent for Λ with keyDown = True! As a result, processEvent is throwing away the one Λ KeyEvent it receives, in which keyDown = False.

RyanGlScott commented 7 years ago

One way around this is to mimic the behavior of libuv in this situation. libuv ignores keyup events unless the left alt key was held and a valid Unicode character was emitted. Experimentally, all my attempts at pasting in Unicode characters seems to meet these two criteria, so I think this shouldn't work here as well.

judah commented 7 years ago

Thanks for taking the time to dig into this and find a solution!