diff options
Diffstat (limited to 'compiler/GHC/Parser/HaddockLex.x')
| -rw-r--r-- | compiler/GHC/Parser/HaddockLex.x | 201 |
1 files changed, 201 insertions, 0 deletions
diff --git a/compiler/GHC/Parser/HaddockLex.x b/compiler/GHC/Parser/HaddockLex.x new file mode 100644 index 0000000000..e215769f9e --- /dev/null +++ b/compiler/GHC/Parser/HaddockLex.x @@ -0,0 +1,201 @@ +{ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# OPTIONS_GHC -funbox-strict-fields #-} + +module GHC.Parser.HaddockLex (lexHsDoc, lexStringLiteral) where + +import GHC.Prelude + +import GHC.Data.FastString +import GHC.Hs.Doc +import GHC.Parser.Lexer +import GHC.Parser.Annotation +import GHC.Types.SrcLoc +import GHC.Types.SourceText +import GHC.Data.StringBuffer +import qualified GHC.Data.Strict as Strict +import GHC.Types.Name.Reader +import GHC.Utils.Outputable +import GHC.Utils.Error +import GHC.Utils.Encoding +import GHC.Hs.Extension + +import qualified GHC.Data.EnumSet as EnumSet + +import Data.Maybe +import Data.Word + +import Data.ByteString ( ByteString ) +import qualified Data.ByteString as BS + +import qualified GHC.LanguageExtensions as LangExt +} + +-- ----------------------------------------------------------------------------- +-- Alex "Character set macros" +-- Copied from GHC/Parser/Lexer.x + +-- NB: The logic behind these definitions is also reflected in "GHC.Utils.Lexeme" +-- Any changes here should likely be reflected there. +$unispace = \x05 -- Trick Alex into handling Unicode. See Note [Unicode in Alex]. +$nl = [\n\r\f] +$whitechar = [$nl\v\ $unispace] +$white_no_nl = $whitechar # \n -- TODO #8424 +$tab = \t + +$ascdigit = 0-9 +$unidigit = \x03 -- Trick Alex into handling Unicode. See Note [Unicode in Alex]. +$decdigit = $ascdigit -- exactly $ascdigit, no more no less. +$digit = [$ascdigit $unidigit] + +$special = [\(\)\,\;\[\]\`\{\}] +$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:] +$unisymbol = \x04 -- Trick Alex into handling Unicode. See Note [Unicode in Alex]. +$symbol = [$ascsymbol $unisymbol] # [$special \_\"\'] + +$unilarge = \x01 -- Trick Alex into handling Unicode. See Note [Unicode in Alex]. +$asclarge = [A-Z] +$large = [$asclarge $unilarge] + +$unismall = \x02 -- Trick Alex into handling Unicode. See Note [Unicode in Alex]. +$ascsmall = [a-z] +$small = [$ascsmall $unismall \_] + +$uniidchar = \x07 -- Trick Alex into handling Unicode. See Note [Unicode in Alex]. +$idchar = [$small $large $digit $uniidchar \'] + +$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex]. +$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\'] + +$alpha = [$small $large] + +-- The character sets marked "TODO" are mostly overly inclusive +-- and should be defined more precisely once alex has better +-- support for unicode character sets (see +-- https://github.com/simonmar/alex/issues/126). + +@id = $alpha $idchar* \#* | $symbol+ +@modname = $large $idchar* +@qualid = (@modname \.)* @id + +:- + \' @qualid \' | \` @qualid \` { getIdentifier 1 } + \'\` @qualid \`\' | \'\( @qualid \)\' | \`\( @qualid \)\` { getIdentifier 2 } + [. \n] ; + +{ +data AlexInput = AlexInput + { alexInput_position :: !RealSrcLoc + , alexInput_string :: !ByteString + } + +-- NB: As long as we don't use a left-context we don't need to track the +-- previous input character. +alexInputPrevChar :: AlexInput -> Word8 +alexInputPrevChar = error "Left-context not supported" + +alexGetByte :: AlexInput -> Maybe (Word8, AlexInput) +alexGetByte (AlexInput p s) = case utf8UnconsByteString s of + Nothing -> Nothing + Just (c,bs) -> Just (adjustChar c, AlexInput (advanceSrcLoc p c) bs) + +alexScanTokens :: RealSrcLoc -> ByteString -> [(RealSrcSpan, ByteString)] +alexScanTokens start str0 = go (AlexInput start str0) + where go inp@(AlexInput pos str) = + case alexScan inp 0 of + AlexSkip inp' _ln -> go inp' + AlexToken inp'@(AlexInput _ str') _ act -> act pos (BS.length str - BS.length str') str : go inp' + AlexEOF -> [] + AlexError (AlexInput p _) -> error $ "lexical error at " ++ show p + +-------------------------------------------------------------------------------- + +-- | Extract identifier from Alex state. +getIdentifier :: Int -- ^ adornment length + -> RealSrcLoc + -> Int + -- ^ Token length + -> ByteString + -- ^ The remaining input beginning with the found token + -> (RealSrcSpan, ByteString) +getIdentifier !i !loc0 !len0 !s0 = + (mkRealSrcSpan loc1 loc2, ident) + where + (adornment, s1) = BS.splitAt i s0 + ident = BS.take (len0 - 2*i) s1 + loc1 = advanceSrcLocBS loc0 adornment + loc2 = advanceSrcLocBS loc1 ident + +advanceSrcLocBS :: RealSrcLoc -> ByteString -> RealSrcLoc +advanceSrcLocBS !loc bs = case utf8UnconsByteString bs of + Nothing -> loc + Just (c, bs') -> advanceSrcLocBS (advanceSrcLoc loc c) bs' + +-- | Lex 'StringLiteral' for warning messages +lexStringLiteral :: P (LocatedN RdrName) -- ^ A precise identifier parser + -> Located StringLiteral + -> Located (WithHsDocIdentifiers StringLiteral GhcPs) +lexStringLiteral identParser (L l sl@(StringLiteral _ fs _)) + = L l (WithHsDocIdentifiers sl idents) + where + bs = bytesFS fs + + idents = mapMaybe (uncurry (validateIdentWith identParser)) plausibleIdents + + plausibleIdents :: [(SrcSpan,ByteString)] + plausibleIdents = case l of + RealSrcSpan span _ -> [(RealSrcSpan span' Strict.Nothing, tok) | (span', tok) <- alexScanTokens (realSrcSpanStart span) bs] + UnhelpfulSpan reason -> [(UnhelpfulSpan reason, tok) | (_, tok) <- alexScanTokens fakeLoc bs] + + fakeLoc = mkRealSrcLoc (mkFastString "") 0 0 + +-- | Lex identifiers from a docstring. +lexHsDoc :: P (LocatedN RdrName) -- ^ A precise identifier parser + -> HsDocString + -> HsDoc GhcPs +lexHsDoc identParser doc = + WithHsDocIdentifiers doc idents + where + docStrings = docStringChunks doc + idents = concat [mapMaybe maybeDocIdentifier (plausibleIdents doc) | doc <- docStrings] + + maybeDocIdentifier :: (SrcSpan, ByteString) -> Maybe (Located RdrName) + maybeDocIdentifier = uncurry (validateIdentWith identParser) + + plausibleIdents :: LHsDocStringChunk -> [(SrcSpan,ByteString)] + plausibleIdents (L (RealSrcSpan span _) (HsDocStringChunk s)) + = [(RealSrcSpan span' Strict.Nothing, tok) | (span', tok) <- alexScanTokens (realSrcSpanStart span) s] + plausibleIdents (L (UnhelpfulSpan reason) (HsDocStringChunk s)) + = [(UnhelpfulSpan reason, tok) | (_, tok) <- alexScanTokens fakeLoc s] -- preserve the original reason + + fakeLoc = mkRealSrcLoc (mkFastString "") 0 0 + +validateIdentWith :: P (LocatedN RdrName) -> SrcSpan -> ByteString -> Maybe (Located RdrName) +validateIdentWith identParser mloc str0 = + let -- These ParserFlags should be as "inclusive" as possible, allowing + -- identifiers defined with any language extension. + pflags = mkParserOpts + (EnumSet.fromList [LangExt.MagicHash]) + dopts + [] + False False False False + dopts = DiagOpts + { diag_warning_flags = EnumSet.empty + , diag_fatal_warning_flags = EnumSet.empty + , diag_warn_is_error = False + , diag_reverse_errors = False + , diag_max_errors = Nothing + , diag_ppr_ctx = defaultSDocContext + } + buffer = stringBufferFromByteString str0 + realSrcLc = case mloc of + RealSrcSpan loc _ -> realSrcSpanStart loc + UnhelpfulSpan _ -> mkRealSrcLoc (mkFastString "") 0 0 + pstate = initParserState pflags buffer realSrcLc + in case unP identParser pstate of + POk _ name -> Just $ case mloc of + RealSrcSpan _ _ -> reLoc name + UnhelpfulSpan _ -> L mloc (unLoc name) -- Preserve the original reason + _ -> Nothing +} |
