5 files changed, 297 insertions, 222 deletions
diff --git a/scripts/check_sources.py b/scripts/check_sources.py
index 94c2c15d..5f233887 100755
--- a/scripts/check_sources.py
+++ b/scripts/check_sources.py
@@ -23,8 +23,10 @@ from os.path import join, splitext, abspath
 
 checkers = {}
 
+
 def checker(*suffixes, **kwds):
     only_pkg = kwds.pop('only_pkg', False)
+
     def deco(func):
         for suffix in suffixes:
             checkers.setdefault(suffix, []).append(func)
@@ -38,8 +40,6 @@ copyright_re = re.compile(r'^    :copyright: Copyright 2006-2014 by '
                           r'the Pygments team, see AUTHORS\.$', re.UNICODE)
 copyright_2_re = re.compile(r'^                %s(, %s)*[,.]$' %
                             (name_mail_re, name_mail_re), re.UNICODE)
-coding_re    = re.compile(r'coding[:=]\s*([-\w.]+)')
-not_ix_re    = re.compile(r'\bnot\s+\S+?\s+i[sn]\s\S+')
 is_const_re  = re.compile(r'if.*?==\s+(None|False|True)\b')
 
 misspellings = ["developement", "adress", "verificate",  # ALLOW-MISSPELLING
@@ -48,44 +48,30 @@ misspellings = ["developement", "adress", "verificate",  # ALLOW-MISSPELLING
 
 @checker('.py')
 def check_syntax(fn, lines):
+    if '#!/' in lines[0]:
+        lines = lines[1:]
+    if 'coding:' in lines[0]:
+        lines = lines[1:]
     try:
-        compile(''.join(lines), fn, "exec")
+        compile('\n'.join(lines), fn, "exec")
     except SyntaxError as err:
         yield 0, "not compilable: %s" % err
 
 
 @checker('.py')
 def check_style_and_encoding(fn, lines):
-    encoding = 'ascii'
     for lno, line in enumerate(lines):
-        if len(line) > 90:
+        if len(line) > 110:
             yield lno+1, "line too long"
-        m = not_ix_re.search(line)
-        if m:
-            yield lno+1, '"' + m.group() + '"'
         if is_const_re.search(line):
             yield lno+1, 'using == None/True/False'
-        if lno < 2:
-            co = coding_re.search(line)
-            if co:
-                encoding = co.group(1)
-        try:
-            line.decode(encoding)
-        except AttributeError:
-            # Python 3 - encoding was already checked
-            pass
-        except UnicodeDecodeError as err:
-            yield lno+1, "not decodable: %s\n   Line: %r" % (err, line)
-        except LookupError as err:
-            yield 0, "unknown encoding: %s" % encoding
-            encoding = 'latin1'
 
 
 @checker('.py', only_pkg=True)
 def check_fileheader(fn, lines):
     # line number correction
     c = 1
-    if lines[0:1] == ['#!/usr/bin/env python\n']:
+    if lines[0:1] == ['#!/usr/bin/env python']:
         lines = lines[1:]
         c = 2
 
@@ -94,31 +80,28 @@ def check_fileheader(fn, lines):
     for lno, l in enumerate(lines):
         llist.append(l)
         if lno == 0:
-            if l == '# -*- coding: rot13 -*-\n':
-                # special-case pony package
-                return
-            elif l != '# -*- coding: utf-8 -*-\n':
+            if l != '# -*- coding: utf-8 -*-':
                 yield 1, "missing coding declaration"
         elif lno == 1:
-            if l != '"""\n' and l != 'r"""\n':
+            if l != '"""' and l != 'r"""':
                 yield 2, 'missing docstring begin (""")'
             else:
                 docopen = True
         elif docopen:
-            if l == '"""\n':
+            if l == '"""':
                 # end of docstring
                 if lno <= 4:
                     yield lno+c, "missing module name in docstring"
                 break
 
-            if l != "\n" and l[:4] != '    ' and docopen:
+            if l != "" and l[:4] != '    ' and docopen:
                 yield lno+c, "missing correct docstring indentation"
 
             if lno == 2:
                 # if not in package, don't check the module name
                 modname = fn[:-3].replace('/', '.').replace('.__init__', '')
                 while modname:
-                    if l.lower()[4:-1] == modname:
+                    if l.lower()[4:] == modname:
                         break
                     modname = '.'.join(modname.split('.')[1:])
                 else:
@@ -133,7 +116,7 @@ def check_fileheader(fn, lines):
 
     # check for copyright and license fields
     license = llist[-2:-1]
-    if license != ["    :license: BSD, see LICENSE for details.\n"]:
+    if license != ["    :license: BSD, see LICENSE for details."]:
         yield 0, "no correct license info"
 
     ci = -3
@@ -176,16 +159,19 @@ def main(argv):
     for root, dirs, files in os.walk(path):
         if '.hg' in dirs:
             dirs.remove('.hg')
+        if 'examplefiles' in dirs:
+            dirs.remove('examplefiles')
         if '-i' in opts and abspath(root) in opts['-i']:
             del dirs[:]
             continue
         # XXX: awkward: for the Makefile call: don't check non-package
         #      files for file headers
-        in_pocoo_pkg = root.startswith('./pygments')
+        in_pygments_pkg = root.startswith('./pygments')
         for fn in files:
 
             fn = join(root, fn)
-            if fn[:2] == './': fn = fn[2:]
+            if fn[:2] == './':
+                fn = fn[2:]
 
             if '-i' in opts and abspath(fn) in opts['-i']:
                 continue
@@ -199,15 +185,14 @@ def main(argv):
                 print("Checking %s..." % fn)
 
             try:
-                f = open(fn, 'r')
-                lines = list(f)
+                lines = open(fn, 'rb').read().decode('utf-8').splitlines()
             except (IOError, OSError) as err:
                 print("%s: cannot open: %s" % (fn, err))
                 num += 1
                 continue
 
             for checker in checkerlist:
-                if not in_pocoo_pkg and checker.only_pkg:
+                if not in_pygments_pkg and checker.only_pkg:
                     continue
                 for lno, msg in checker(fn, lines):
                     print(u"%s:%d: %s" % (fn, lno, msg), file=out)
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
new file mode 100755
index 00000000..dfc28ce2
--- /dev/null
+++ b/scripts/debug_lexer.py
@@ -0,0 +1,233 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+    Lexing error finder
+    ~~~~~~~~~~~~~~~~~~~
+
+    For the source files given on the command line, display
+    the text where Error tokens are being generated, along
+    with some context.
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+from __future__ import print_function
+
+import os
+import sys
+
+# always prefer Pygments from source if exists
+srcpath = os.path.join(os.path.dirname(__file__), '..')
+if os.path.isdir(os.path.join(srcpath, 'pygments')):
+    sys.path.insert(0, srcpath)
+
+
+from pygments.lexer import RegexLexer, ProfilingRegexLexer, ProfilingRegexLexerMeta
+from pygments.lexers import get_lexer_by_name, find_lexer_class, \
+    find_lexer_class_for_filename
+from pygments.token import Error, Text, _TokenType
+from pygments.cmdline import _parse_options
+
+
+class DebuggingRegexLexer(RegexLexer):
+    """Make the state stack, position and current match instance attributes."""
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        """
+        Split ``text`` into (position, tokentype, text) tuples.
+
+        ``stack`` is the initial stack (default: ``('root',)``)
+        """
+        self.pos = 0
+        tokendefs = self._tokens
+        self.statestack = list(stack)
+        statetokens = tokendefs[self.statestack[-1]]
+        while 1:
+            for rexmatch, action, new_state in statetokens:
+                self.m = m = rexmatch(text, self.pos)
+                if m:
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield self.pos, action, m.group()
+                        else:
+                            for item in action(self, m):
+                                yield item
+                    self.pos = m.end()
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    self.statestack.pop()
+                                elif state == '#push':
+                                    self.statestack.append(self.statestack[-1])
+                                else:
+                                    self.statestack.append(state)
+                        elif isinstance(new_state, int):
+                            # pop
+                            del self.statestack[new_state:]
+                        elif new_state == '#push':
+                            self.statestack.append(self.statestack[-1])
+                        else:
+                            assert False, 'wrong state def: %r' % new_state
+                        statetokens = tokendefs[self.statestack[-1]]
+                    break
+            else:
+                try:
+                    if text[self.pos] == '\n':
+                        # at EOL, reset state to 'root'
+                        self.pos += 1
+                        self.statestack = ['root']
+                        statetokens = tokendefs['root']
+                        yield self.pos, Text, u'\n'
+                        continue
+                    yield self.pos, Error, text[self.pos]
+                    self.pos += 1
+                except IndexError:
+                    break
+
+
+def main(fn, lexer=None, options={}):
+    if lexer is not None:
+        lxcls = get_lexer_by_name(lexer).__class__
+    else:
+        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
+        if lxcls is None:
+            name, rest = fn.split('_', 1)
+            lxcls = find_lexer_class(name)
+            if lxcls is None:
+                raise AssertionError('no lexer found for file %r' % fn)
+    debug_lexer = False
+    if profile:
+        # does not work for e.g. ExtendedRegexLexers
+        if lxcls.__bases__ == (RegexLexer,):
+            # yes we can!  (change the metaclass)
+            lxcls.__class__ = ProfilingRegexLexerMeta
+            lxcls.__bases__ = (ProfilingRegexLexer,)
+            lxcls._prof_sort_index = profsort
+    else:
+        if lxcls.__bases__ == (RegexLexer,):
+            lxcls.__bases__ = (DebuggingRegexLexer,)
+            debug_lexer = True
+        elif lxcls.__bases__ == (DebuggingRegexLexer,):
+            # already debugged before
+            debug_lexer = True
+        else:
+            # HACK: ExtendedRegexLexer subclasses will only partially work here.
+            lxcls.__bases__ = (DebuggingRegexLexer,)
+            debug_lexer = True
+
+    lx = lxcls(**options)
+    lno = 1
+    if fn == '-':
+        text = sys.stdin.read()
+    else:
+        with open(fn, 'rb') as fp:
+            text = fp.read().decode('utf-8')
+    text = text.strip('\n') + '\n'
+    tokens = []
+    states = []
+
+    def show_token(tok, state):
+        reprs = list(map(repr, tok))
+        print('   ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0], end=' ')
+        if debug_lexer:
+            print(' ' + ' ' * (29-len(reprs[0])) + repr(state), end=' ')
+        print()
+
+    for type, val in lx.get_tokens(text):
+        lno += val.count('\n')
+        if type == Error:
+            print('Error parsing', fn, 'on line', lno)
+            print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
+            if showall:
+                for tok, state in map(None, tokens, states):
+                    show_token(tok, state)
+            else:
+                for i in range(max(len(tokens) - num, 0), len(tokens)):
+                    if debug_lexer:
+                        show_token(tokens[i], states[i])
+                    else:
+                        show_token(tokens[i], None)
+            print('Error token:')
+            l = len(repr(val))
+            print('   ' + repr(val), end=' ')
+            if debug_lexer and hasattr(lx, 'statestack'):
+                print(' ' * (60-l) + repr(lx.statestack), end=' ')
+            print()
+            print()
+            return 1
+        tokens.append((type, val))
+        if debug_lexer:
+            if hasattr(lx, 'statestack'):
+                states.append(lx.statestack[:])
+            else:
+                states.append(None)
+    if showall:
+        for tok, state in zip(tokens, states):
+            show_token(tok, state)
+    return 0
+
+
+def print_help():
+    print('''\
+Pygments development helper to quickly debug lexers.
+
+    scripts/debug_lexer.py [options] file ...
+
+Give one or more filenames to lex them and display possible error tokens
+and/or profiling info.  Files are assumed to be encoded in UTF-8.
+
+Selecting lexer and options:
+
+    -l NAME         use lexer named NAME (default is to guess from
+                    the given filenames)
+    -O OPTIONSTR    use lexer options parsed from OPTIONSTR
+
+Debugging lexing errors:
+
+    -n N            show the last N tokens on error
+    -a              always show all lexed tokens (default is only
+                    to show them when an error occurs)
+
+Profiling:
+
+    -p              use the ProfilingRegexLexer to profile regexes
+                    instead of the debugging lexer
+    -s N            sort profiling output by column N (default is
+                    column 4, the time per call)
+''')
+
+num = 10
+showall = False
+lexer = None
+options = {}
+profile = False
+profsort = 4
+
+if __name__ == '__main__':
+    import getopt
+    opts, args = getopt.getopt(sys.argv[1:], 'n:l:apO:s:h')
+    for opt, val in opts:
+        if opt == '-n':
+            num = int(val)
+        elif opt == '-a':
+            showall = True
+        elif opt == '-l':
+            lexer = val
+        elif opt == '-p':
+            profile = True
+        elif opt == '-s':
+            profsort = int(val)
+        elif opt == '-O':
+            options = _parse_options([val])
+        elif opt == '-h':
+            print_help()
+            sys.exit(0)
+    ret = 0
+    if not args:
+        print_help()
+    for f in args:
+        ret += main(f, lexer, options)
+    sys.exit(bool(ret))
diff --git a/scripts/find_codetags.py b/scripts/find_codetags.py
index f8204e6e..5063f61f 100755
--- a/scripts/find_codetags.py
+++ b/scripts/find_codetags.py
@@ -39,12 +39,12 @@ def escape_html(text):
 
 def process_file(store, filename):
     try:
-        f = open(filename, 'r')
+        fp = open(filename, 'r')
     except (IOError, OSError):
         return False
     llmatch = 0
-    try:
-        for lno, line in enumerate(f):
+    with fp:
+        for lno, line in enumerate(fp):
             # just some random heuristics to filter out binary files
             if lno < 100 and binary_re.search(line):
                 return False
@@ -69,8 +69,6 @@ def process_file(store, filename):
                         continue
                 llmatch = 0
         return True
-    finally:
-        f.close()
 
 
 def main():
@@ -198,13 +196,13 @@ td { padding: 2px 5px 2px 5px;
           '<td class="tag %%(tag)s">%%(tag)s</td>'
           '<td class="who">%%(who)s</td><td class="what">%%(what)s</td></tr>')
 
-    f = open(output, 'w')
     table = '\n'.join(TABLE % fname +
                       '\n'.join(TR % (no % 2,) % entry
                                 for no, entry in enumerate(store[fname]))
                       for fname in sorted(store))
-    f.write(HTML % (', '.join(map(abspath, args)), table))
-    f.close()
+
+    with open(output, 'w') as fp:
+        fp.write(HTML % (', '.join(map(abspath, args)), table))
 
     print("Report written to %s." % output)
     return 0
diff --git a/scripts/find_error.py b/scripts/find_error.py
index 7aaa9bee..ba0b76f1 100755..120000
--- a/scripts/find_error.py
+++ b/scripts/find_error.py
@@ -1,173 +1 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-"""
-    Lexing error finder
-    ~~~~~~~~~~~~~~~~~~~
-
-    For the source files given on the command line, display
-    the text where Error tokens are being generated, along
-    with some context.
-
-    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
-    :license: BSD, see LICENSE for details.
-"""
-
-from __future__ import print_function
-
-import os
-import sys
-
-# always prefer Pygments from source if exists
-srcpath = os.path.join(os.path.dirname(__file__), '..')
-if os.path.isdir(os.path.join(srcpath, 'pygments')):
-    sys.path.insert(0, srcpath)
-
-
-from pygments.lexer import RegexLexer
-from pygments.lexers import get_lexer_for_filename, get_lexer_by_name
-from pygments.token import Error, Text, _TokenType
-from pygments.cmdline import _parse_options
-
-
-class DebuggingRegexLexer(RegexLexer):
-    """Make the state stack, position and current match instance attributes."""
-
-    def get_tokens_unprocessed(self, text, stack=('root',)):
-        """
-        Split ``text`` into (tokentype, text) pairs.
-
-        ``stack`` is the inital stack (default: ``['root']``)
-        """
-        self.pos = 0
-        tokendefs = self._tokens
-        self.statestack = list(stack)
-        statetokens = tokendefs[self.statestack[-1]]
-        while 1:
-            for rexmatch, action, new_state in statetokens:
-                self.m = m = rexmatch(text, self.pos)
-                if m:
-                    if type(action) is _TokenType:
-                        yield self.pos, action, m.group()
-                    else:
-                        for item in action(self, m):
-                            yield item
-                    self.pos = m.end()
-                    if new_state is not None:
-                        # state transition
-                        if isinstance(new_state, tuple):
-                            for state in new_state:
-                                if state == '#pop':
-                                    self.statestack.pop()
-                                elif state == '#push':
-                                    self.statestack.append(self.statestack[-1])
-                                else:
-                                    self.statestack.append(state)
-                        elif isinstance(new_state, int):
-                            # pop
-                            del self.statestack[new_state:]
-                        elif new_state == '#push':
-                            self.statestack.append(self.statestack[-1])
-                        else:
-                            assert False, 'wrong state def: %r' % new_state
-                        statetokens = tokendefs[self.statestack[-1]]
-                    break
-            else:
-                try:
-                    if text[self.pos] == '\n':
-                        # at EOL, reset state to 'root'
-                        self.pos += 1
-                        self.statestack = ['root']
-                        statetokens = tokendefs['root']
-                        yield self.pos, Text, u'\n'
-                        continue
-                    yield self.pos, Error, text[self.pos]
-                    self.pos += 1
-                except IndexError:
-                    break
-
-
-def main(fn, lexer=None, options={}):
-    if lexer is not None:
-        lx = get_lexer_by_name(lexer)
-    else:
-        try:
-            lx = get_lexer_for_filename(os.path.basename(fn), **options)
-        except ValueError:
-            try:
-                name, rest = fn.split('_', 1)
-                lx = get_lexer_by_name(name, **options)
-            except ValueError:
-                raise AssertionError('no lexer found for file %r' % fn)
-    debug_lexer = False
-    # does not work for e.g. ExtendedRegexLexers
-    if lx.__class__.__bases__ == (RegexLexer,):
-        lx.__class__.__bases__ = (DebuggingRegexLexer,)
-        debug_lexer = True
-    elif lx.__class__.__bases__ == (DebuggingRegexLexer,):
-        # already debugged before
-        debug_lexer = True
-    lno = 1
-    text = open(fn, 'U').read()
-    text = text.strip('\n') + '\n'
-    tokens = []
-    states = []
-
-    def show_token(tok, state):
-        reprs = map(repr, tok)
-        print('   ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0], end=' ')
-        if debug_lexer:
-            print(' ' + ' ' * (29-len(reprs[0])) + repr(state), end=' ')
-        print()
-
-    for type, val in lx.get_tokens(text):
-        lno += val.count('\n')
-        if type == Error:
-            print('Error parsing', fn, 'on line', lno)
-            print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
-            if showall:
-                for tok, state in map(None, tokens, states):
-                    show_token(tok, state)
-            else:
-                for i in range(max(len(tokens) - num, 0), len(tokens)):
-                    show_token(tokens[i], states[i])
-            print('Error token:')
-            l = len(repr(val))
-            print('   ' + repr(val), end=' ')
-            if debug_lexer and hasattr(lx, 'statestack'):
-                print(' ' * (60-l) + repr(lx.statestack), end=' ')
-            print()
-            print()
-            return 1
-        tokens.append((type, val))
-        if debug_lexer:
-            if hasattr(lx, 'statestack'):
-                states.append(lx.statestack[:])
-            else:
-                states.append(None)
-    if showall:
-        for tok, state in map(None, tokens, states):
-            show_token(tok, state)
-    return 0
-
-
-num = 10
-showall = False
-lexer = None
-options = {}
-
-if __name__ == '__main__':
-    import getopt
-    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aO:')
-    for opt, val in opts:
-        if opt == '-n':
-            num = int(val)
-        elif opt == '-a':
-            showall = True
-        elif opt == '-l':
-            lexer = val
-        elif opt == '-O':
-            options = _parse_options([val])
-    ret = 0
-    for f in args:
-        ret += main(f, lexer, options)
-    sys.exit(bool(ret))
+debug_lexer.py
+\ No newline at end of file
diff --git a/scripts/get_vimkw.py b/scripts/get_vimkw.py
index 4ea302f4..fc4d5ec6 100644
--- a/scripts/get_vimkw.py
+++ b/scripts/get_vimkw.py
@@ -1,13 +1,42 @@
 from __future__ import print_function
+
 import re
 
+from pygments.util import format_lines
+
 r_line = re.compile(r"^(syn keyword vimCommand contained|syn keyword vimOption "
                     r"contained|syn keyword vimAutoEvent contained)\s+(.*)")
 r_item = re.compile(r"(\w+)(?:\[(\w+)\])?")
 
+HEADER = '''\
+# -*- coding: utf-8 -*-
+"""
+    pygments.lexers._vim_builtins
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    This file is autogenerated by scripts/get_vimkw.py
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+# Split up in multiple functions so it's importable by jython, which has a
+# per-method size limit.
+'''
+
+METHOD = '''\
+def _get%(key)s():
+%(body)s
+    return var
+%(key)s = _get%(key)s()
+'''
+
 def getkw(input, output):
     out = file(output, 'w')
 
+    # Write the static HEADER template at the top of the generated file.
+    print(HEADER, file=out)
+
     output_info = {'command': [], 'option': [], 'auto': []}
     for line in file(input):
         m = r_line.match(line)
@@ -29,9 +58,10 @@ def getkw(input, output):
     output_info['option'].append("('inoremap','inoremap')")
     output_info['option'].append("('vnoremap','vnoremap')")
 
-    for a, b in output_info.items():
-        b.sort()
-        print('%s=[%s]' % (a, ','.join(b)), file=out)
+    for key, keywordlist in output_info.items():
+        keywordlist.sort()
+        body = format_lines('var', keywordlist, raw=True, indent_level=1)
+        print(METHOD % locals(), file=out)
 
 def is_keyword(w, keywords):
     for i in range(len(w), 0, -1):
@@ -40,4 +70,5 @@ def is_keyword(w, keywords):
     return False
 
 if __name__ == "__main__":
-    getkw("/usr/share/vim/vim73/syntax/vim.vim", "temp.py")
+    getkw("/usr/share/vim/vim74/syntax/vim.vim",
+          "pygments/lexers/_vim_builtins.py")