diff options
-rw-r--r-- | compiler/parser/Lexer.x | 78 | ||||
-rw-r--r-- | testsuite/tests/parser/should_compile/T13986.hs | 5 | ||||
-rw-r--r-- | testsuite/tests/parser/should_compile/all.T | 1 |
3 files changed, 59 insertions, 25 deletions
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index 8c17315fb0..3bf249bd7e 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -129,38 +129,38 @@ import ApiAnnotation -- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs -- Any changes here should likely be reflected there. -$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte. +$unispace = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $nl = [\n\r\f] $whitechar = [$nl\v\ $unispace] $white_no_nl = $whitechar # \n -- TODO #8424 $tab = \t $ascdigit = 0-9 -$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetByte. +$unidigit = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $decdigit = $ascdigit -- for now, should really be $digit (ToDo) $digit = [$ascdigit $unidigit] $special = [\(\)\,\;\[\]\`\{\}] $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:] -$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte. +$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $symbol = [$ascsymbol $unisymbol] # [$special \_\"\'] -$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetByte. +$unilarge = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $asclarge = [A-Z] $large = [$asclarge $unilarge] -$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetByte. +$unismall = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $ascsmall = [a-z] $small = [$ascsmall $unismall \_] -$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte. +$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $graphic = [$small $large $symbol $digit $special $unigraphic \"\'] $binit = 0-1 $octit = 0-7 $hexit = [$decdigit A-F a-f] -$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte. +$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex]. 
$idchar = [$small $large $digit $uniidchar \'] $pragmachar = [$small $large $digit] @@ -1968,27 +1968,29 @@ getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk data AlexInput = AI RealSrcLoc StringBuffer -alexInputPrevChar :: AlexInput -> Char -alexInputPrevChar (AI _ buf) = prevChar buf '\n' +{- +Note [Unicode in Alex] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Although newer versions of Alex support unicode, this grammar is processed with +the old style '--latin1' behaviour. This means that when implementing the +functions --- backwards compatibility for Alex 2.x -alexGetChar :: AlexInput -> Maybe (Char,AlexInput) -alexGetChar inp = case alexGetByte inp of - Nothing -> Nothing - Just (b,i) -> c `seq` Just (c,i) - where c = chr $ fromIntegral b + alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) + alexInputPrevChar :: AlexInput -> Char -alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) -alexGetByte (AI loc s) - | atEnd s = Nothing - | otherwise = byte `seq` loc' `seq` s' `seq` - --trace (show (ord c)) $ - Just (byte, (AI loc' s')) - where (c,s') = nextChar s - loc' = advanceSrcLoc loc c - byte = fromIntegral $ ord adj_c +which Alex uses to take apart our 'AlexInput', we must + + * return a latin1 character in the 'Word8' that 'alexGetByte' expects + * return a latin1 character in 'alexInputPrevChar'. + +We handle this in 'adjustChar' by squishing entire classes of unicode +characters into single bytes. +-} - non_graphic = '\x00' +{-# INLINE adjustChar #-} +adjustChar :: Char -> Word8 +adjustChar c = fromIntegral $ ord adj_c + where non_graphic = '\x00' upper = '\x01' lower = '\x02' digit = '\x03' @@ -2034,6 +2036,32 @@ alexGetByte (AI loc s) Space -> space _other -> non_graphic +-- Getting the previous 'Char' isn't enough here - we need to convert it into +-- the same format that 'alexGetByte' would have produced. +-- +-- See Note [Unicode in Alex] and #13986. 
+alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc)) + where pc = prevChar buf '\n' + +-- backwards compatibility for Alex 2.x +alexGetChar :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar inp = case alexGetByte inp of + Nothing -> Nothing + Just (b,i) -> c `seq` Just (c,i) + where c = chr $ fromIntegral b + +-- See Note [Unicode in Alex] +alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) +alexGetByte (AI loc s) + | atEnd s = Nothing + | otherwise = byte `seq` loc' `seq` s' `seq` + --trace (show (ord c)) $ + Just (byte, (AI loc' s')) + where (c,s') = nextChar s + loc' = advanceSrcLoc loc c + byte = adjustChar c + -- This version does not squash unicode characters, it is used when -- lexing strings. alexGetChar' :: AlexInput -> Maybe (Char,AlexInput) diff --git a/testsuite/tests/parser/should_compile/T13986.hs b/testsuite/tests/parser/should_compile/T13986.hs new file mode 100644 index 0000000000..b1b4882a35 --- /dev/null +++ b/testsuite/tests/parser/should_compile/T13986.hs @@ -0,0 +1,5 @@ +{-# LANGUAGE TypeApplications #-} + +module T13986 where + +foo x₁@True = 10 diff --git a/testsuite/tests/parser/should_compile/all.T b/testsuite/tests/parser/should_compile/all.T index c008bd439a..e2f68f6e96 100644 --- a/testsuite/tests/parser/should_compile/all.T +++ b/testsuite/tests/parser/should_compile/all.T @@ -109,3 +109,4 @@ test('DumpRenamedAst', normal, compile, ['-dsuppress-uniques -ddump-rn-ast'] test('DumpTypecheckedAst', normal, compile, ['-dsuppress-uniques -ddump-tc-ast']) test('T13747', normal, compile, ['']) test('T14189', normal, compile, ['-dsuppress-uniques -ddump-rn-ast']) +test('T13986', normal, compile, ['']) |