Merged in leodemoura/pygments-main (pull request #399)

author: Georg Brandl <georg@python.org> 2014-10-08 09:21:15 +0200
committer: Georg Brandl <georg@python.org> 2014-10-08 09:21:15 +0200
commit: 444fb6fd9b3492040a36fcca672fee8175f8d603 (patch)
tree: 2bd411ca78decf276b95dc6b1788e594b2e35287 /pygments/lexers/ml.py
parent: 491fec23ef01687906f5d71ee718522cd2917926 (diff)
parent: c1bfe4eed3805d3556bffa3c6b9cc2d3f6976205 (diff)
download: pygments-444fb6fd9b3492040a36fcca672fee8175f8d603.tar.gz
1 files changed, 768 insertions, 0 deletions
diff --git a/pygments/lexers/ml.py b/pygments/lexers/ml.py
new file mode 100644
index 00000000..81de39b9
--- /dev/null
+++ b/pygments/lexers/ml.py
@@ -0,0 +1,768 @@
+# -*- coding: utf-8 -*-
+"""
+    pygments.lexers.ml
+    ~~~~~~~~~~~~~~~~~~
+
+    Lexers for ML family languages.
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+
+from pygments.lexer import RegexLexer, include, bygroups, default, words
+from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
+    Number, Punctuation, Error
+
+__all__ = ['SMLLexer', 'OcamlLexer', 'OpaLexer']
+
+
+class SMLLexer(RegexLexer):
+    """
+    For the Standard ML language.
+
+    .. versionadded:: 1.5
+    """
+
+    name = 'Standard ML'
+    aliases = ['sml']
+    filenames = ['*.sml', '*.sig', '*.fun']
+    mimetypes = ['text/x-standardml', 'application/x-standardml']
+
+    alphanumid_reserved = set((
+        # Core
+        'abstype', 'and', 'andalso', 'as', 'case', 'datatype', 'do', 'else',
+        'end', 'exception', 'fn', 'fun', 'handle', 'if', 'in', 'infix',
+        'infixr', 'let', 'local', 'nonfix', 'of', 'op', 'open', 'orelse',
+        'raise', 'rec', 'then', 'type', 'val', 'with', 'withtype', 'while',
+        # Modules
+        'eqtype', 'functor', 'include', 'sharing', 'sig', 'signature',
+        'struct', 'structure', 'where',
+    ))
+
+    symbolicid_reserved = set((
+        # Core
+        ':', '\|', '=', '=>', '->', '#',
+        # Modules
+        ':>',
+    ))
+
+    nonid_reserved = set(('(', ')', '[', ']', '{', '}', ',', ';', '...', '_'))
+
+    alphanumid_re = r"[a-zA-Z][\w']*"
+    symbolicid_re = r"[!%&$#+\-/:<=>?@\\~`^|*]+"
+
+    # A character constant is a sequence of the form #s, where s is a string
+    # constant denoting a string of size one character. This setup just parses
+    # the entire string as either a String.Double or a String.Char (depending
+    # on the argument), even if the String.Char is an erronous
+    # multiple-character string.
+    def stringy(whatkind):
+        return [
+            (r'[^"\\]', whatkind),
+            (r'\\[\\\"abtnvfr]', String.Escape),
+            # Control-character notation is used for codes < 32,
+            # where \^@ == \000
+            (r'\\\^[\x40-\x5e]', String.Escape),
+            # Docs say 'decimal digits'
+            (r'\\[0-9]{3}', String.Escape),
+            (r'\\u[0-9a-fA-F]{4}', String.Escape),
+            (r'\\\s+\\', String.Interpol),
+            (r'"', whatkind, '#pop'),
+        ]
+
+    # Callbacks for distinguishing tokens and reserved words
+    def long_id_callback(self, match):
+        if match.group(1) in self.alphanumid_reserved:
+            token = Error
+        else:
+            token = Name.Namespace
+        yield match.start(1), token, match.group(1)
+        yield match.start(2), Punctuation, match.group(2)
+
+    def end_id_callback(self, match):
+        if match.group(1) in self.alphanumid_reserved:
+            token = Error
+        elif match.group(1) in self.symbolicid_reserved:
+            token = Error
+        else:
+            token = Name
+        yield match.start(1), token, match.group(1)
+
+    def id_callback(self, match):
+        str = match.group(1)
+        if str in self.alphanumid_reserved:
+            token = Keyword.Reserved
+        elif str in self.symbolicid_reserved:
+            token = Punctuation
+        else:
+            token = Name
+        yield match.start(1), token, str
+
+    tokens = {
+        # Whitespace and comments are (almost) everywhere
+        'whitespace': [
+            (r'\s+', Text),
+            (r'\(\*', Comment.Multiline, 'comment'),
+        ],
+
+        'delimiters': [
+            # This lexer treats these delimiters specially:
+            # Delimiters define scopes, and the scope is how the meaning of
+            # the `|' is resolved - is it a case/handle expression, or function
+            # definition by cases? (This is not how the Definition works, but
+            # it's how MLton behaves, see http://mlton.org/SMLNJDeviations)
+            (r'\(|\[|{', Punctuation, 'main'),
+            (r'\)|\]|}', Punctuation, '#pop'),
+            (r'\b(let|if|local)\b(?!\')', Keyword.Reserved, ('main', 'main')),
+            (r'\b(struct|sig|while)\b(?!\')', Keyword.Reserved, 'main'),
+            (r'\b(do|else|end|in|then)\b(?!\')', Keyword.Reserved, '#pop'),
+        ],
+
+        'core': [
+            # Punctuation that doesn't overlap symbolic identifiers
+            (r'(%s)' % '|'.join(re.escape(z) for z in nonid_reserved),
+             Punctuation),
+
+            # Special constants: strings, floats, numbers in decimal and hex
+            (r'#"', String.Char, 'char'),
+            (r'"', String.Double, 'string'),
+            (r'~?0x[0-9a-fA-F]+', Number.Hex),
+            (r'0wx[0-9a-fA-F]+', Number.Hex),
+            (r'0w\d+', Number.Integer),
+            (r'~?\d+\.\d+[eE]~?\d+', Number.Float),
+            (r'~?\d+\.\d+', Number.Float),
+            (r'~?\d+[eE]~?\d+', Number.Float),
+            (r'~?\d+', Number.Integer),
+
+            # Labels
+            (r'#\s*[1-9][0-9]*', Name.Label),
+            (r'#\s*(%s)' % alphanumid_re, Name.Label),
+            (r'#\s+(%s)' % symbolicid_re, Name.Label),
+            # Some reserved words trigger a special, local lexer state change
+            (r'\b(datatype|abstype)\b(?!\')', Keyword.Reserved, 'dname'),
+            (r'(?=\b(exception)\b(?!\'))', Text, ('ename')),
+            (r'\b(functor|include|open|signature|structure)\b(?!\')',
+             Keyword.Reserved, 'sname'),
+            (r'\b(type|eqtype)\b(?!\')', Keyword.Reserved, 'tname'),
+
+            # Regular identifiers, long and otherwise
+            (r'\'[\w\']*', Name.Decorator),
+            (r'(%s)(\.)' % alphanumid_re, long_id_callback, "dotted"),
+            (r'(%s)' % alphanumid_re, id_callback),
+            (r'(%s)' % symbolicid_re, id_callback),
+        ],
+        'dotted': [
+            (r'(%s)(\.)' % alphanumid_re, long_id_callback),
+            (r'(%s)' % alphanumid_re, end_id_callback, "#pop"),
+            (r'(%s)' % symbolicid_re, end_id_callback, "#pop"),
+            (r'\s+', Error),
+            (r'\S+', Error),
+        ],
+
+
+        # Main parser (prevents errors in files that have scoping errors)
+        'root': [
+            default('main')
+        ],
+
+        # In this scope, I expect '|' to not be followed by a function name,
+        # and I expect 'and' to be followed by a binding site
+        'main': [
+            include('whitespace'),
+
+            # Special behavior of val/and/fun
+            (r'\b(val|and)\b(?!\')', Keyword.Reserved, 'vname'),
+            (r'\b(fun)\b(?!\')', Keyword.Reserved,
+             ('#pop', 'main-fun', 'fname')),
+
+            include('delimiters'),
+            include('core'),
+            (r'\S+', Error),
+        ],
+
+        # In this scope, I expect '|' and 'and' to be followed by a function
+        'main-fun': [
+            include('whitespace'),
+
+            (r'\s', Text),
+            (r'\(\*', Comment.Multiline, 'comment'),
+
+            # Special behavior of val/and/fun
+            (r'\b(fun|and)\b(?!\')', Keyword.Reserved, 'fname'),
+            (r'\b(val)\b(?!\')', Keyword.Reserved,
+             ('#pop', 'main', 'vname')),
+
+            # Special behavior of '|' and '|'-manipulating keywords
+            (r'\|', Punctuation, 'fname'),
+            (r'\b(case|handle)\b(?!\')', Keyword.Reserved,
+             ('#pop', 'main')),
+
+            include('delimiters'),
+            include('core'),
+            (r'\S+', Error),
+        ],
+
+        # Character and string parsers
+        'char': stringy(String.Char),
+        'string': stringy(String.Double),
+
+        'breakout': [
+            (r'(?=\b(%s)\b(?!\'))' % '|'.join(alphanumid_reserved), Text, '#pop'),
+        ],
+
+        # Dealing with what comes after module system keywords
+        'sname': [
+            include('whitespace'),
+            include('breakout'),
+
+            (r'(%s)' % alphanumid_re, Name.Namespace),
+            default('#pop'),
+        ],
+
+        # Dealing with what comes after the 'fun' (or 'and' or '|') keyword
+        'fname': [
+            include('whitespace'),
+            (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+            (r'\(', Punctuation, 'tyvarseq'),
+
+            (r'(%s)' % alphanumid_re, Name.Function, '#pop'),
+            (r'(%s)' % symbolicid_re, Name.Function, '#pop'),
+
+            # Ignore interesting function declarations like "fun (x + y) = ..."
+            default('#pop'),
+        ],
+
+        # Dealing with what comes after the 'val' (or 'and') keyword
+        'vname': [
+            include('whitespace'),
+            (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+            (r'\(', Punctuation, 'tyvarseq'),
+
+            (r'(%s)(\s*)(=(?!%s))' % (alphanumid_re, symbolicid_re),
+             bygroups(Name.Variable, Text, Punctuation), '#pop'),
+            (r'(%s)(\s*)(=(?!%s))' % (symbolicid_re, symbolicid_re),
+             bygroups(Name.Variable, Text, Punctuation), '#pop'),
+            (r'(%s)' % alphanumid_re, Name.Variable, '#pop'),
+            (r'(%s)' % symbolicid_re, Name.Variable, '#pop'),
+
+            # Ignore interesting patterns like 'val (x, y)'
+            default('#pop'),
+        ],
+
+        # Dealing with what comes after the 'type' (or 'and') keyword
+        'tname': [
+            include('whitespace'),
+            include('breakout'),
+
+            (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+            (r'\(', Punctuation, 'tyvarseq'),
+            (r'=(?!%s)' % symbolicid_re, Punctuation, ('#pop', 'typbind')),
+
+            (r'(%s)' % alphanumid_re, Keyword.Type),
+            (r'(%s)' % symbolicid_re, Keyword.Type),
+            (r'\S+', Error, '#pop'),
+        ],
+
+        # A type binding includes most identifiers
+        'typbind': [
+            include('whitespace'),
+
+            (r'\b(and)\b(?!\')', Keyword.Reserved, ('#pop', 'tname')),
+
+            include('breakout'),
+            include('core'),
+            (r'\S+', Error, '#pop'),
+        ],
+
+        # Dealing with what comes after the 'datatype' (or 'and') keyword
+        'dname': [
+            include('whitespace'),
+            include('breakout'),
+
+            (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+            (r'\(', Punctuation, 'tyvarseq'),
+            (r'(=)(\s*)(datatype)',
+             bygroups(Punctuation, Text, Keyword.Reserved), '#pop'),
+            (r'=(?!%s)' % symbolicid_re, Punctuation,
+             ('#pop', 'datbind', 'datcon')),
+
+            (r'(%s)' % alphanumid_re, Keyword.Type),
+            (r'(%s)' % symbolicid_re, Keyword.Type),
+            (r'\S+', Error, '#pop'),
+        ],
+
+        # common case - A | B | C of int
+        'datbind': [
+            include('whitespace'),
+
+            (r'\b(and)\b(?!\')', Keyword.Reserved, ('#pop', 'dname')),
+            (r'\b(withtype)\b(?!\')', Keyword.Reserved, ('#pop', 'tname')),
+            (r'\b(of)\b(?!\')', Keyword.Reserved),
+
+            (r'(\|)(\s*)(%s)' % alphanumid_re,
+             bygroups(Punctuation, Text, Name.Class)),
+            (r'(\|)(\s+)(%s)' % symbolicid_re,
+             bygroups(Punctuation, Text, Name.Class)),
+
+            include('breakout'),
+            include('core'),
+            (r'\S+', Error),
+        ],
+
+        # Dealing with what comes after an exception
+        'ename': [
+            include('whitespace'),
+
+            (r'(exception|and)\b(\s+)(%s)' % alphanumid_re,
+             bygroups(Keyword.Reserved, Text, Name.Class)),
+            (r'(exception|and)\b(\s*)(%s)' % symbolicid_re,
+             bygroups(Keyword.Reserved, Text, Name.Class)),
+            (r'\b(of)\b(?!\')', Keyword.Reserved),
+
+            include('breakout'),
+            include('core'),
+            (r'\S+', Error),
+        ],
+
+        'datcon': [
+            include('whitespace'),
+            (r'(%s)' % alphanumid_re, Name.Class, '#pop'),
+            (r'(%s)' % symbolicid_re, Name.Class, '#pop'),
+            (r'\S+', Error, '#pop'),
+        ],
+
+        # Series of type variables
+        'tyvarseq': [
+            (r'\s', Text),
+            (r'\(\*', Comment.Multiline, 'comment'),
+
+            (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+            (alphanumid_re, Name),
+            (r',', Punctuation),
+            (r'\)', Punctuation, '#pop'),
+            (symbolicid_re, Name),
+        ],
+
+        'comment': [
+            (r'[^(*)]', Comment.Multiline),
+            (r'\(\*', Comment.Multiline, '#push'),
+            (r'\*\)', Comment.Multiline, '#pop'),
+            (r'[(*)]', Comment.Multiline),
+        ],
+    }
+
+
+class OcamlLexer(RegexLexer):
+    """
+    For the OCaml language.
+
+    .. versionadded:: 0.7
+    """
+
+    name = 'OCaml'
+    aliases = ['ocaml']
+    filenames = ['*.ml', '*.mli', '*.mll', '*.mly']
+    mimetypes = ['text/x-ocaml']
+
+    keywords = (
+        'as', 'assert', 'begin', 'class', 'constraint', 'do', 'done',
+        'downto', 'else', 'end', 'exception', 'external', 'false',
+        'for', 'fun', 'function', 'functor', 'if', 'in', 'include',
+        'inherit', 'initializer', 'lazy', 'let', 'match', 'method',
+        'module', 'mutable', 'new', 'object', 'of', 'open', 'private',
+        'raise', 'rec', 'sig', 'struct', 'then', 'to', 'true', 'try',
+        'type', 'value', 'val', 'virtual', 'when', 'while', 'with',
+    )
+    keyopts = (
+        '!=', '#', '&', '&&', '\(', '\)', '\*', '\+', ',', '-',
+        '-\.', '->', '\.', '\.\.', ':', '::', ':=', ':>', ';', ';;', '<',
+        '<-', '=', '>', '>]', '>}', '\?', '\?\?', '\[', '\[<', '\[>', '\[\|',
+        ']', '_', '`', '{', '{<', '\|', '\|]', '}', '~'
+    )
+
+    operators = r'[!$%&*+\./:<=>?@^|~-]'
+    word_operators = ('and', 'asr', 'land', 'lor', 'lsl', 'lxor', 'mod', 'or')
+    prefix_syms = r'[!?~]'
+    infix_syms = r'[=<>@^|&+\*/$%-]'
+    primitives = ('unit', 'int', 'float', 'bool', 'string', 'char', 'list', 'array')
+
+    tokens = {
+        'escape-sequence': [
+            (r'\\[\\\"\'ntbr]', String.Escape),
+            (r'\\[0-9]{3}', String.Escape),
+            (r'\\x[0-9a-fA-F]{2}', String.Escape),
+        ],
+        'root': [
+            (r'\s+', Text),
+            (r'false|true|\(\)|\[\]', Name.Builtin.Pseudo),
+            (r'\b([A-Z][\w\']*)(?=\s*\.)', Name.Namespace, 'dotted'),
+            (r'\b([A-Z][\w\']*)', Name.Class),
+            (r'\(\*(?![)])', Comment, 'comment'),
+            (r'\b(%s)\b' % '|'.join(keywords), Keyword),
+            (r'(%s)' % '|'.join(keyopts[::-1]), Operator),
+            (r'(%s|%s)?%s' % (infix_syms, prefix_syms, operators), Operator),
+            (r'\b(%s)\b' % '|'.join(word_operators), Operator.Word),
+            (r'\b(%s)\b' % '|'.join(primitives), Keyword.Type),
+
+            (r"[^\W\d][\w']*", Name),
+
+            (r'-?\d[\d_]*(.[\d_]*)?([eE][+\-]?\d[\d_]*)', Number.Float),
+            (r'0[xX][\da-fA-F][\da-fA-F_]*', Number.Hex),
+            (r'0[oO][0-7][0-7_]*', Number.Oct),
+            (r'0[bB][01][01_]*', Number.Bin),
+            (r'\d[\d_]*', Number.Integer),
+
+            (r"'(?:(\\[\\\"'ntbr ])|(\\[0-9]{3})|(\\x[0-9a-fA-F]{2}))'",
+             String.Char),
+            (r"'.'", String.Char),
+            (r"'", Keyword),  # a stray quote is another syntax element
+
+            (r'"', String.Double, 'string'),
+
+            (r'[~?][a-z][\w\']*:', Name.Variable),
+        ],
+        'comment': [
+            (r'[^(*)]+', Comment),
+            (r'\(\*', Comment, '#push'),
+            (r'\*\)', Comment, '#pop'),
+            (r'[(*)]', Comment),
+        ],
+        'string': [
+            (r'[^\\"]+', String.Double),
+            include('escape-sequence'),
+            (r'\\\n', String.Double),
+            (r'"', String.Double, '#pop'),
+        ],
+        'dotted': [
+            (r'\s+', Text),
+            (r'\.', Punctuation),
+            (r'[A-Z][\w\']*(?=\s*\.)', Name.Namespace),
+            (r'[A-Z][\w\']*', Name.Class, '#pop'),
+            (r'[a-z_][\w\']*', Name, '#pop'),
+        ],
+    }
+
+
+class OpaLexer(RegexLexer):
+    """
+    Lexer for the Opa language (http://opalang.org).
+
+    .. versionadded:: 1.5
+    """
+
+    name = 'Opa'
+    aliases = ['opa']
+    filenames = ['*.opa']
+    mimetypes = ['text/x-opa']
+
+    # most of these aren't strictly keywords
+    # but if you color only real keywords, you might just
+    # as well not color anything
+    keywords = (
+        'and', 'as', 'begin', 'case', 'client', 'css', 'database', 'db', 'do',
+        'else', 'end', 'external', 'forall', 'function', 'if', 'import',
+        'match', 'module', 'or', 'package', 'parser', 'rec', 'server', 'then',
+        'type', 'val', 'with', 'xml_parser',
+    )
+
+    # matches both stuff and `stuff`
+    ident_re = r'(([a-zA-Z_]\w*)|(`[^`]*`))'
+
+    op_re = r'[.=\-<>,@~%/+?*&^!]'
+    punc_re = r'[()\[\],;|]'  # '{' and '}' are treated elsewhere
+                              # because they are also used for inserts
+
+    tokens = {
+        # copied from the caml lexer, should be adapted
+        'escape-sequence': [
+            (r'\\[\\\"\'ntr}]', String.Escape),
+            (r'\\[0-9]{3}', String.Escape),
+            (r'\\x[0-9a-fA-F]{2}', String.Escape),
+        ],
+
+        # factorizing these rules, because they are inserted many times
+        'comments': [
+            (r'/\*', Comment, 'nested-comment'),
+            (r'//.*?$', Comment),
+        ],
+        'comments-and-spaces': [
+            include('comments'),
+            (r'\s+', Text),
+        ],
+
+        'root': [
+            include('comments-and-spaces'),
+            # keywords
+            (words(keywords, prefix=r'\b', suffix=r'\b'), Keyword),
+            # directives
+            # we could parse the actual set of directives instead of anything
+            # starting with @, but this is troublesome
+            # because it needs to be adjusted all the time
+            # and assuming we parse only sources that compile, it is useless
+            (r'@' + ident_re + r'\b', Name.Builtin.Pseudo),
+
+            # number literals
+            (r'-?.[\d]+([eE][+\-]?\d+)', Number.Float),
+            (r'-?\d+.\d*([eE][+\-]?\d+)', Number.Float),
+            (r'-?\d+[eE][+\-]?\d+', Number.Float),
+            (r'0[xX][\da-fA-F]+', Number.Hex),
+            (r'0[oO][0-7]+', Number.Oct),
+            (r'0[bB][01]+', Number.Bin),
+            (r'\d+', Number.Integer),
+            # color literals
+            (r'#[\da-fA-F]{3,6}', Number.Integer),
+
+            # string literals
+            (r'"', String.Double, 'string'),
+            # char literal, should be checked because this is the regexp from
+            # the caml lexer
+            (r"'(?:(\\[\\\"'ntbr ])|(\\[0-9]{3})|(\\x[0-9a-fA-F]{2})|.)'",
+             String.Char),
+
+            # this is meant to deal with embedded exprs in strings
+            # every time we find a '}' we pop a state so that if we were
+            # inside a string, we are back in the string state
+            # as a consequence, we must also push a state every time we find a
+            # '{' or else we will have errors when parsing {} for instance
+            (r'{', Operator, '#push'),
+            (r'}', Operator, '#pop'),
+
+            # html literals
+            # this is a much more strict that the actual parser,
+            # since a<b would not be parsed as html
+            # but then again, the parser is way too lax, and we can't hope
+            # to have something as tolerant
+            (r'<(?=[a-zA-Z>])', String.Single, 'html-open-tag'),
+
+            # db path
+            # matching the '[_]' in '/a[_]' because it is a part
+            # of the syntax of the db path definition
+            # unfortunately, i don't know how to match the ']' in
+            # /a[1], so this is somewhat inconsistent
+            (r'[@?!]?(/\w+)+(\[_\])?', Name.Variable),
+            # putting the same color on <- as on db path, since
+            # it can be used only to mean Db.write
+            (r'<-(?!'+op_re+r')', Name.Variable),
+
+            # 'modules'
+            # although modules are not distinguished by their names as in caml
+            # the standard library seems to follow the convention that modules
+            # only area capitalized
+            (r'\b([A-Z]\w*)(?=\.)', Name.Namespace),
+
+            # operators
+            # = has a special role because this is the only
+            # way to syntactic distinguish binding constructions
+            # unfortunately, this colors the equal in {x=2} too
+            (r'=(?!'+op_re+r')', Keyword),
+            (r'(%s)+' % op_re, Operator),
+            (r'(%s)+' % punc_re, Operator),
+
+            # coercions
+            (r':', Operator, 'type'),
+            # type variables
+            # we need this rule because we don't parse specially type
+            # definitions so in "type t('a) = ...", "'a" is parsed by 'root'
+            ("'"+ident_re, Keyword.Type),
+
+            # id literal, #something, or #{expr}
+            (r'#'+ident_re, String.Single),
+            (r'#(?={)', String.Single),
+
+            # identifiers
+            # this avoids to color '2' in 'a2' as an integer
+            (ident_re, Text),
+
+            # default, not sure if that is needed or not
+            # (r'.', Text),
+        ],
+
+        # it is quite painful to have to parse types to know where they end
+        # this is the general rule for a type
+        # a type is either:
+        # * -> ty
+        # * type-with-slash
+        # * type-with-slash -> ty
+        # * type-with-slash (, type-with-slash)+ -> ty
+        #
+        # the code is pretty funky in here, but this code would roughly
+        # translate in caml to:
+        # let rec type stream =
+        # match stream with
+        # | [< "->";  stream >] -> type stream
+        # | [< "";  stream >] ->
+        #   type_with_slash stream
+        #   type_lhs_1 stream;
+        # and type_1 stream = ...
+        'type': [
+            include('comments-and-spaces'),
+            (r'->', Keyword.Type),
+            default(('#pop', 'type-lhs-1', 'type-with-slash')),
+        ],
+
+        # parses all the atomic or closed constructions in the syntax of type
+        # expressions: record types, tuple types, type constructors, basic type
+        # and type variables
+        'type-1': [
+            include('comments-and-spaces'),
+            (r'\(', Keyword.Type, ('#pop', 'type-tuple')),
+            (r'~?{', Keyword.Type, ('#pop', 'type-record')),
+            (ident_re+r'\(', Keyword.Type, ('#pop', 'type-tuple')),
+            (ident_re, Keyword.Type, '#pop'),
+            ("'"+ident_re, Keyword.Type),
+            # this case is not in the syntax but sometimes
+            # we think we are parsing types when in fact we are parsing
+            # some css, so we just pop the states until we get back into
+            # the root state
+            default('#pop'),
+        ],
+
+        # type-with-slash is either:
+        # * type-1
+        # * type-1 (/ type-1)+
+        'type-with-slash': [
+            include('comments-and-spaces'),
+            default(('#pop', 'slash-type-1', 'type-1')),
+        ],
+        'slash-type-1': [
+            include('comments-and-spaces'),
+            ('/', Keyword.Type, ('#pop', 'type-1')),
+            # same remark as above
+            default('#pop'),
+        ],
+
+        # we go in this state after having parsed a type-with-slash
+        # while trying to parse a type
+        # and at this point we must determine if we are parsing an arrow
+        # type (in which case we must continue parsing) or not (in which
+        # case we stop)
+        'type-lhs-1': [
+            include('comments-and-spaces'),
+            (r'->', Keyword.Type, ('#pop', 'type')),
+            (r'(?=,)', Keyword.Type, ('#pop', 'type-arrow')),
+            default('#pop'),
+        ],
+        'type-arrow': [
+            include('comments-and-spaces'),
+            # the look ahead here allows to parse f(x : int, y : float -> truc)
+            # correctly
+            (r',(?=[^:]*?->)', Keyword.Type, 'type-with-slash'),
+            (r'->', Keyword.Type, ('#pop', 'type')),
+            # same remark as above
+            default('#pop'),
+        ],
+
+        # no need to do precise parsing for tuples and records
+        # because they are closed constructions, so we can simply
+        # find the closing delimiter
+        # note that this function would be not work if the source
+        # contained identifiers like `{)` (although it could be patched
+        # to support it)
+        'type-tuple': [
+            include('comments-and-spaces'),
+            (r'[^\(\)/*]+', Keyword.Type),
+            (r'[/*]', Keyword.Type),
+            (r'\(', Keyword.Type, '#push'),
+            (r'\)', Keyword.Type, '#pop'),
+        ],
+        'type-record': [
+            include('comments-and-spaces'),
+            (r'[^{}/*]+', Keyword.Type),
+            (r'[/*]', Keyword.Type),
+            (r'{', Keyword.Type, '#push'),
+            (r'}', Keyword.Type, '#pop'),
+        ],
+
+        # 'type-tuple': [
+        #     include('comments-and-spaces'),
+        #     (r'\)', Keyword.Type, '#pop'),
+        #     default(('#pop', 'type-tuple-1', 'type-1')),
+        # ],
+        # 'type-tuple-1': [
+        #     include('comments-and-spaces'),
+        #     (r',?\s*\)', Keyword.Type, '#pop'), # ,) is a valid end of tuple, in (1,)
+        #     (r',', Keyword.Type, 'type-1'),
+        # ],
+        # 'type-record':[
+        #     include('comments-and-spaces'),
+        #     (r'}', Keyword.Type, '#pop'),
+        #     (r'~?(?:\w+|`[^`]*`)', Keyword.Type, 'type-record-field-expr'),
+        # ],
+        # 'type-record-field-expr': [
+        #
+        # ],
+
+        'nested-comment': [
+            (r'[^/*]+', Comment),
+            (r'/\*', Comment, '#push'),
+            (r'\*/', Comment, '#pop'),
+            (r'[/*]', Comment),
+        ],
+
+        # the copy pasting between string and single-string
+        # is kinda sad. Is there a way to avoid that??
+        'string': [
+            (r'[^\\"{]+', String.Double),
+            (r'"', String.Double, '#pop'),
+            (r'{', Operator, 'root'),
+            include('escape-sequence'),
+        ],
+        'single-string': [
+            (r'[^\\\'{]+', String.Double),
+            (r'\'', String.Double, '#pop'),
+            (r'{', Operator, 'root'),
+            include('escape-sequence'),
+        ],
+
+        # all the html stuff
+        # can't really reuse some existing html parser
+        # because we must be able to parse embedded expressions
+
+        # we are in this state after someone parsed the '<' that
+        # started the html literal
+        'html-open-tag': [
+            (r'[\w\-:]+', String.Single, ('#pop', 'html-attr')),
+            (r'>', String.Single, ('#pop', 'html-content')),
+        ],
+
+        # we are in this state after someone parsed the '</' that
+        # started the end of the closing tag
+        'html-end-tag': [
+            # this is a star, because </> is allowed
+            (r'[\w\-:]*>', String.Single, '#pop'),
+        ],
+
+        # we are in this state after having parsed '<ident(:ident)?'
+        # we thus parse a possibly empty list of attributes
+        'html-attr': [
+            (r'\s+', Text),
+            (r'[\w\-:]+=', String.Single, 'html-attr-value'),
+            (r'/>', String.Single, '#pop'),
+            (r'>', String.Single, ('#pop', 'html-content')),
+        ],
+
+        'html-attr-value': [
+            (r"'", String.Single, ('#pop', 'single-string')),
+            (r'"', String.Single, ('#pop', 'string')),
+            (r'#'+ident_re, String.Single, '#pop'),
+            (r'#(?={)', String.Single, ('#pop', 'root')),
+            (r'[^"\'{`=<>]+', String.Single, '#pop'),
+            (r'{', Operator, ('#pop', 'root')),  # this is a tail call!
+        ],
+
+        # we should probably deal with '\' escapes here
+        'html-content': [
+            (r'<!--', Comment, 'html-comment'),
+            (r'</', String.Single, ('#pop', 'html-end-tag')),
+            (r'<', String.Single, 'html-open-tag'),
+            (r'{', Operator, 'root'),
+            (r'[^<{]+', String.Single),
+        ],
+
+        'html-comment': [
+            (r'-->', Comment, '#pop'),
+            (r'[^\-]+|-', Comment),
+        ],
+    }
author	Georg Brandl <georg@python.org>	2014-10-08 09:21:15 +0200
committer	Georg Brandl <georg@python.org>	2014-10-08 09:21:15 +0200
commit	444fb6fd9b3492040a36fcca672fee8175f8d603 (patch)
tree	2bd411ca78decf276b95dc6b1788e594b2e35287 /pygments/lexers/ml.py
parent	491fec23ef01687906f5d71ee718522cd2917926 (diff)
parent	c1bfe4eed3805d3556bffa3c6b9cc2d3f6976205 (diff)
download	pygments-444fb6fd9b3492040a36fcca672fee8175f8d603.tar.gz