path: root/Lib/tokenize.py
author     Guido van Rossum <guido@python.org>   1997-03-07 00:21:12 +0000
committer  Guido van Rossum <guido@python.org>   1997-03-07 00:21:12 +0000
commit     bd7fd3ce5a1830c14472ae5e2e3b481aaecb0dba (patch)
tree       43e4c7541ef89f48b937bdd67057747995a56bd1 /Lib/tokenize.py
parent     76f85743eda96b0457b05d09813ee1775cf04577 (diff)
download   cpython-bd7fd3ce5a1830c14472ae5e2e3b481aaecb0dba.tar.gz
Ka-Ping's version.
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r--  Lib/tokenize.py  177
1 file changed, 132 insertions(+), 45 deletions(-)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 8f16115410..d6985e0fbc 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -1,63 +1,150 @@
-# This module compiles a regular expression that recognizes Python tokens.
-# It is designed to match the working of the Python tokenizer exactly.
-# It takes care of everything except indentation;
-# note that un-escaped newlines are tokens, too.
-# tokenprog.regs[3] gives the location of the token without whitespace
-# It also defines various subexpressions, but doesn't compile them.
-# See the function test() below for an example of how to use.
+"""tokenize.py (Ka-Ping Yee, 4 March 1997)
-import regex
+This module compiles a regular expression that recognizes Python tokens
+in individual lines of text. The regular expression handles everything
+except indentation, continuations, and triple-quoted strings. The function
+'tokenize.tokenize()' takes care of these things for streams of text. It
+accepts a file-like object and a function, uses the readline() method to
+scan the file, and calls the function once for each token found,
+passing its type, a string containing the token, the line number, the line,
+and the starting and ending positions of the token within the line.
+It is designed to match the working of the Python tokenizer exactly."""
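
The callback protocol the new docstring describes can be exercised in a few
lines. A minimal sketch under the Python of the day, assuming an input file
example.py; print_names is an illustrative callback, not part of the patch:

    from tokenize import tokenize
    from token import NAME

    def print_names(type, token, linenum, line, start, end):
        # invoked once per token; report identifiers and their line numbers
        if type == NAME:
            print('%d %s' % (linenum, token))

    f = open('example.py')
    tokenize(f.readline, print_names)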
-# Note: to get a quoted backslash in a regexp, it must be quadrupled.
+import string, regex
+from token import *
-Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'
+def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
+Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
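
The group() helper builds the Emacs-style \( ... \| ... \) alternation syntax
that the old regex module expects; e.g. group('abc', 'def') yields
'\(abc\|def\)'. Under the modern re module the same helper would be sketched
without the backslashes:

    def group(*choices):
        # modern re syntax: plain parentheses and bars
        return '(' + '|'.join(choices) + ')'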
Name = '[a-zA-Z_][a-zA-Z0-9_]*'
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
-Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
-Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
+Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
-Floatnumber = Pointfloat + '\|' + Expfloat
-Number = Floatnumber + '\|' + Intnumber
+Floatnumber = group(Pointfloat, Expfloat)
+Number = group(Floatnumber, Intnumber)
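
Translated to modern re syntax (plain parentheses and bars), the number
grammar above can be spot-checked; the literals in the loop are illustrative:

    import re
    Exponent = '[eE][-+]?[0-9]+'
    Pointfloat = r'([0-9]+\.[0-9]*|\.[0-9]+)(%s)?' % Exponent
    Expfloat = '[0-9]+' + Exponent
    Intnumber = '(0[xX][0-9a-fA-F]*[lL]?|0[0-7]*[lL]?|[1-9][0-9]*[lL]?)'
    Number = '(%s|%s|%s)' % (Pointfloat, Expfloat, Intnumber)
    for s in ('0xffL', '0777', '42', '3.14e-2', '.5'):
        assert re.match(Number, s)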
-String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
-# Note: this module *recognizes* double quotes, but for backward
-# compatibility, it doesn't *use* them!
+Single = group('^\'', '[^\]\'')
+Double = group('^"', '[^\]"')
+Tsingle = group('^\'\'\'', '[^\]\'\'\'')
+Tdouble = group('^"""', '[^\]"""')
+Triple = group('\'\'\'', '"""')
+String = group('\'' + group('[\].', '[^\'\]') + '*' + group('\'', '[\]\n'),
+ '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))
-Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
+Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
+ '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
-Special = '[:;.,`\n]'
-Funny = Operator + '\|' + Bracket + '\|' + Special
+Special = group('[\]?\r?\n', '[:;.,`\f]')
+Funny = group(Operator, Bracket, Special)
-PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny
-
-Token = Ignore + '\(' + PlainToken + '\)'
+PlainToken = group(Name, Number, Triple, String, Funny)
+Token = Ignore + PlainToken
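
Note the group arithmetic this line sets up: Ignore opens two \(...\) groups,
so the group that group() wraps around PlainToken is number 3, which is why
the scanner below reads tokenprog.regs[3]. A modern re sketch of the same
layout, with a simplified stand-in for PlainToken:

    import re
    Ignore = r'[ \f\t]*(\\\r?\n[ \t]*)*(#.*)?'     # groups 1 and 2
    PlainToken = r'([A-Za-z_][A-Za-z0-9_]*)'       # stand-in: group 3
    m = re.match(Ignore + PlainToken, '   spam')
    assert m.group(3) == 'spam'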
try:
- save_syntax = regex.set_syntax(0) # Use default syntax
- tokenprog = regex.compile(Token)
+ save_syntax = regex.set_syntax(0) # use default syntax
+ tokenprog = regex.compile(Token)
+ endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
+ '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
finally:
- if save_syntax != 0:
- dummy = regex.set_syntax(save_syntax) # Restore original syntax
-
-
-def test(file):
- f = open(file, 'r')
- while 1:
- line = f.readline()
- if not line: break
- i, n = 0, len(line)
- while i < n:
- j = tokenprog.match(line, i)
- if j < 0:
- print 'No token at', `line[i:i+20]` + '...'
- i = i+1
- else:
- i = i+j
- a, b = tokenprog.regs[3]
- if a < b:
- print 'Token:', `line[a:b]`
+ regex.set_syntax(save_syntax) # restore original syntax
+
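
The endprogs table compiled above maps an opening quote to the pattern that
locates its terminator on a later line: either a quote at the start of the
line, or a quote not preceded by a backslash. The single-quote case in modern
re syntax, as a sketch:

    import re
    Single = re.compile(r"(^'|[^\\]')")
    assert Single.search("rest of the string' + more")
    assert Single.search("still \\' open") is None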
+tabsize = 8
+TokenError = 'TokenError'
+def printtoken(type, string, linenum, line, start, end): # for testing
+ print `linenum` + ':', tok_name[type], repr(string)
+
+def tokenize(readline, tokeneater = printtoken):
+ linenum = parenlev = continued = 0
+ namechars, numchars = string.letters + '_', string.digits
+ contstr = ''
+ indents = [0]
+ while 1: # loop over lines in stream
+ line = readline()
+ linenum = linenum + 1
+ if line[-2:] == '\r\n': line = line[:-2] + '\n'
+ pos, max = 0, len(line)
+
+ if contstr: # continued string
+ if not line: raise TokenError, "EOF within multi-line string"
+ if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
+ if endprog.search(line) >= 0:
+ pos = end = endprog.regs[0][1]
+ tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
+ contstr = ''
+ else:
+ contstr = contstr + line
+ continue
+
+ elif parenlev == 0 and not continued: # this is a new statement
+ if not line: break
+ column = 0
+ while 1: # measure leading whitespace
+ if line[pos] == ' ': column = column + 1
+ elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
+ elif line[pos] == '\f': column = 0
+ else: break
+ pos = pos + 1
+ if line[pos] in '#\n': continue # skip comments or blank lines
+
+ if column > indents[-1]: # count indents or dedents
+ indents.append(column)
+ tokeneater(INDENT, '\t', linenum, line, 0, 0)
+ while column < indents[-1]:
+ indents = indents[:-1]
+ tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+
+ else: # continued statement
+ if not line: raise TokenError, "EOF within multi-line statement"
+ continued = 0
+
+ while pos < max:
+ if tokenprog.match(line, pos) > 0: # scan for tokens
+ start, end = tokenprog.regs[3]
+ token = line[start:end]
+ pos = end
+
+ if token[0] in namechars: # ordinary name
+ tokeneater(NAME, token, linenum, line, start, end)
+ elif token[0] in numchars: # ordinary number
+ tokeneater(NUMBER, token, linenum, line, start, end)
+
+ elif token in ('\'\'\'', '"""'): # triple-quoted
+ endprog = endprogs[token]
+ if endprog.search(line, pos) >= 0: # all on one line
+ pos = endprog.regs[0][1]
+ tokeneater(STRING, token, linenum, line, start, pos)
+ else:
+ contstr = line[start:] # multiple lines
+ break
+ elif token[0] in '\'"':
+ if token[-1] == '\n': # continued string
+ endprog, contstr = endprogs[token[0]], line[start:]
+ break
+ else: # ordinary string
+ tokeneater(STRING, token, linenum, line, start, end)
+
+ elif token[0] == '\n':
+ tokeneater(NEWLINE, token, linenum, line, start, end)
+ elif token[0] == '\\': # continued stmt
+ continued = 1
+
+ else:
+ if token[0] in '([{': parenlev = parenlev + 1
+ if token[0] in ')]}': parenlev = parenlev - 1
+ tokeneater(OP, token, linenum, line, start, end)
+ else:
+ tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
+ pos = pos + 1
+
+ for indent in indents[1:]: # pop remaining indent levels
+ tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+
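
The heart of the loop above is the indentation stack: each new statement's
leading-whitespace column is measured (tabs round up to the next multiple of
tabsize), pushed for INDENT and popped for DEDENT. A self-contained modern
sketch of just that logic; indent_events is a hypothetical name:

    def indent_events(lines, tabsize=8):
        indents = [0]
        for line in lines:
            column, pos = 0, 0
            while pos < len(line):            # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column // tabsize + 1) * tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == len(line) or line[pos] in '#\n':
                continue                      # skip blank lines and comments
            if column > indents[-1]:
                indents.append(column)
                yield ('INDENT', line)
            while column < indents[-1]:
                indents.pop()
                yield ('DEDENT', line)
        for indent in indents[1:]:            # close any still-open blocks
            yield ('DEDENT', '')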
+if __name__ == '__main__': # testing
+ import sys
+ file = open(sys.argv[-1])
+ tokenize(file.readline)
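
For comparison, this module's descendant in today's standard library reports
the same per-token information through a generator rather than a callback;
example.py is again illustrative:

    import tokenize
    f = open('example.py')
    for tok in tokenize.generate_tokens(f.readline):
        print('%d: %s %r' % (tok.start[0], tokenize.tok_name[tok.type], tok.string))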