summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/parser/Lexer.x115
1 files changed, 73 insertions, 42 deletions
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 88a0f07d90..cfe795585b 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -1,15 +1,20 @@
-----------------------------------------------------------------------------
-- (c) The University of Glasgow, 2006
--
--- GHC's lexer.
+-- GHC's lexer for Haskell 2010 [1].
--
--- This is a combination of an Alex-generated lexer from a regex
--- definition, with some hand-coded bits.
+-- This is a combination of an Alex-generated lexer [2] from a regex
+-- definition, with some hand-coded bits. [3]
--
-- Completely accurate information about token-spans within the source
-- file is maintained. Every token has a start and end RealSrcLoc
-- attached to it.
--
+-- References:
+-- [1] https://www.haskell.org/onlinereport/haskell2010/haskellch2.html
+-- [2] http://www.haskell.org/alex/
+-- [3] https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/Parser
+--
-----------------------------------------------------------------------------
-- ToDo / known bugs:
@@ -31,6 +36,10 @@
-- form? This is quite difficult to achieve. We don't do it for
-- qualified varids.
+
+-- -----------------------------------------------------------------------------
+-- Alex "Haskell code fragment top"
+
{
-- XXX The above flags turn off warnings in the generated code:
{-# LANGUAGE BangPatterns #-}
@@ -91,48 +100,55 @@ import Data.Ratio
import Data.Word
}
-$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
-$whitechar = [\ \n\r\f\v $unispace]
-$white_no_nl = $whitechar # \n
+
+-- -----------------------------------------------------------------------------
+-- Alex "Character set macros"
+
+$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte.
+$nl = [\n\r\f]
+$whitechar = [$nl\v\ $unispace]
+$white_no_nl = $whitechar # \n -- TODO #8424
$tab = \t
$ascdigit = 0-9
-$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
+$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetByte.
$decdigit = $ascdigit -- for now, should really be $digit (ToDo)
$digit = [$ascdigit $unidigit]
$special = [\(\)\,\;\[\]\`\{\}]
-$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
-$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
-$symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\']
+$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte.
+$symbol = [$ascsymbol $unisymbol] # [$special \_\"\']
-$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
+$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetByte.
$asclarge = [A-Z]
$large = [$asclarge $unilarge]
-$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
+$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetByte.
$ascsmall = [a-z]
$small = [$ascsmall $unismall \_]
-$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
-$graphic = [$small $large $symbol $digit $special $unigraphic \:\"\']
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte.
+$graphic = [$small $large $symbol $digit $special $unigraphic \"\']
$binit = 0-1
$octit = 0-7
$hexit = [$decdigit A-F a-f]
-$symchar = [$symbol \:]
-$nl = [\n\r]
$idchar = [$small $large $digit \']
$pragmachar = [$small $large $digit]
$docsym = [\| \^ \* \$]
-@varid = $small $idchar*
-@conid = $large $idchar*
-@varsym = $symbol $symchar*
-@consym = \: $symchar*
+-- -----------------------------------------------------------------------------
+-- Alex "Regular expression macros"
+
+@varid = $small $idchar* -- variable identifiers
+@conid = $large $idchar* -- constructor identifiers
+
+@varsym = ($symbol # \:) $symbol* -- variable (operator) symbol
+@consym = \: $symbol* -- constructor (operator) symbol
@decimal = $decdigit+
@binary = $binit+
@@ -140,8 +156,11 @@ $docsym = [\| \^ \* \$]
@hexadecimal = $hexit+
@exponent = [eE] [\-\+]? @decimal
--- we support the hierarchical module name extension:
@qual = (@conid \.)+
+@qvarid = @qual @varid
+@qconid = @qual @conid
+@qvarsym = @qual @varsym
+@qconsym = @qual @consym
@floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
@@ -150,9 +169,17 @@ $docsym = [\| \^ \* \$]
@negative = \-
@signed = @negative ?
+
+-- -----------------------------------------------------------------------------
+-- Alex "Identifier"
+
haskell :-
--- everywhere: skip whitespace and comments
+
+-- -----------------------------------------------------------------------------
+-- Alex "Rules"
+
+-- everywhere: skip whitespace
$white_no_nl+ ;
$tab+ { warn Opt_WarnTabs (text "Tab character") }
@@ -179,7 +206,7 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
-- have a Haddock comment). The rules then munch the rest of the line.
"-- " ~[$docsym \#] .* { lineCommentToken }
-"--" [^$symbol : \ ] .* { lineCommentToken }
+"--" [^$symbol \ ] .* { lineCommentToken }
-- Next, match Haddock comments if no -haddock flag
@@ -191,7 +218,7 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
-- make sure that the first non-dash character isn't a symbol, and munch the
-- rest of the line.
-"---"\-* [^$symbol :] .* { lineCommentToken }
+"---"\-* ~$symbol .* { lineCommentToken }
-- Since the previous rules all match dashes followed by at least one
-- character, we also need to match a whole line filled with just dashes.
@@ -252,13 +279,13 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
-- single-line line pragmas, of the form
-- # <line> "<file>" <extra-stuff> \n
-<line_prag1> $decdigit+ { setLine line_prag1a }
+<line_prag1> @decimal { setLine line_prag1a }
<line_prag1a> \" [$graphic \ ]* \" { setFile line_prag1b }
<line_prag1b> .* { pop }
-- Haskell-style line pragmas, of the form
-- {-# LINE <line> "<file>" #-}
-<line_prag2> $decdigit+ { setLine line_prag2a }
+<line_prag2> @decimal { setLine line_prag2a }
<line_prag2a> \" [$graphic \ ]* \" { setFile line_prag2b }
<line_prag2b> "#-}"|"-}" { pop }
-- NOTE: accept -} at the end of a LINE pragma, for compatibility
@@ -341,8 +368,8 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
{ lex_quasiquote_tok }
-- qualified quasi-quote (#5555)
- "[" @qual @varid "|" / { ifExtension qqEnabled }
- { lex_qquasiquote_tok }
+ "[" @qvarid "|" / { ifExtension qqEnabled }
+ { lex_qquasiquote_tok }
}
<0> {
@@ -376,15 +403,15 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
}
<0,option_prags> {
- @qual @varid { idtoken qvarid }
- @qual @conid { idtoken qconid }
+ @qvarid { idtoken qvarid }
+ @qconid { idtoken qconid }
@varid { varid }
@conid { idtoken conid }
}
<0> {
- @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
- @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
+ @qvarid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
+ @qconid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
@varid "#"+ / { ifExtension magicHashEnabled } { varid }
@conid "#"+ / { ifExtension magicHashEnabled } { idtoken conid }
}
@@ -392,8 +419,8 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
-- ToDo: - move `var` and (sym) into lexical syntax?
-- - remove backquote from $special?
<0> {
- @qual @varsym { idtoken qvarsym }
- @qual @consym { idtoken qconsym }
+ @qvarsym { idtoken qvarsym }
+ @qconsym { idtoken qconsym }
@varsym { varsym }
@consym { consym }
}
@@ -453,6 +480,10 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
\" { lex_string_tok }
}
+
+-- -----------------------------------------------------------------------------
+-- Alex "Haskell code fragment bottom"
+
{
-- -----------------------------------------------------------------------------
-- The token type
@@ -467,6 +498,7 @@ data Token
| ITdo
| ITelse
| IThiding
+ | ITforeign
| ITif
| ITimport
| ITin
@@ -484,7 +516,6 @@ data Token
| ITwhere
| ITforall -- GHC extension keywords
- | ITforeign
| ITexport
| ITlabel
| ITdynamic
@@ -1738,13 +1769,13 @@ alexGetByte (AI loc s)
loc' = advanceSrcLoc loc c
byte = fromIntegral $ ord adj_c
- non_graphic = '\x0'
- upper = '\x1'
- lower = '\x2'
- digit = '\x3'
- symbol = '\x4'
- space = '\x5'
- other_graphic = '\x6'
+ non_graphic = '\x00'
+ upper = '\x01'
+ lower = '\x02'
+ digit = '\x03'
+ symbol = '\x04'
+ space = '\x05'
+ other_graphic = '\x06'
adj_c
| c <= '\x06' = non_graphic