from __future__ import unicode_literals, print_function, absolute_import, division, generators, nested_scopes
import sys
import logging

import ply.lex

logger = logging.getLogger(__name__)

class JsonPathLexerError(Exception):
    pass

class JsonPathLexer(object):
    '''
    A lexical analyzer for JsonPath.
    '''
    
    def __init__(self, debug=False):
        self.debug = debug
        if self.__doc__ is None:
            raise JsonPathLexerError('Docstrings have been removed! By design of PLY, jsonpath-rw requires docstrings. You must not use PYTHONOPTIMIZE=2 or python -OO.')

    def tokenize(self, string):
        '''
        Maps a string to an iterator over tokens. In other words: [char] -> [token]
        '''
        
        new_lexer = ply.lex.lex(module=self, debug=self.debug, errorlog=logger)
        new_lexer.latest_newline = 0
        new_lexer.input(string)

        while True:
            t = new_lexer.token()
            if t is None:
                break
            t.col = t.lexpos - new_lexer.latest_newline
            yield t
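
    # Illustrative usage (a sketch; token attributes follow ply.lex
    # conventions, and single-character literals are typed as themselves):
    #
    #   lexer = JsonPathLexer()
    #   [(t.type, t.value) for t in lexer.tokenize('foo.bar[0]')]
    #   # -> [('ID', 'foo'), ('.', '.'), ('ID', 'bar'),
    #   #     ('[', '['), ('NUMBER', 0), (']', ']')]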

    # ============== PLY Lexer specification ==================
    #
    # This probably should be private but:
    #   - the parser requires access to `tokens` (perhaps they should be defined in a third, shared dependency)
    #   - things like `literals` might be a legitimate part of the public interface.
    #
    # Anyhow, it is pythonic to give some rope to hang oneself with :-)

    literals = ['*', '.', '[', ']', '(', ')', '$', ',', ':', '|', '&']
    
    reserved_words = { 'where': 'WHERE' }

    tokens = ['DOUBLEDOT', 'NUMBER', 'ID', 'NAMED_OPERATOR'] + list(reserved_words.values())

    states = [ ('singlequote', 'exclusive'),
               ('doublequote', 'exclusive'),
               ('backquote', 'exclusive') ]
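
    # The exclusive states above form a small state machine: the lexer enters
    # a quoting state at an opening quote and leaves it at the matching
    # closing quote, so delimiters like '.' inside quotes are not tokenized.
    # For example, the input "'a.b'" lexes as a single ID token 'a.b'.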

    # Normal lexing, rather easy
    t_DOUBLEDOT = r'\.\.'
    t_ignore = ' \t'

    def t_ID(self, t):
        r'[a-zA-Z_@][a-zA-Z0-9_@]*'
        t.type = self.reserved_words.get(t.value, 'ID')
        return t
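
    # For example, the bare word 'where' is promoted to the reserved WHERE
    # token above, while any other identifier (e.g. 'foo' or '@') stays an ID.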

    def t_NUMBER(self, t):
        r'\d+'
        t.value = int(t.value)
        return t

    # Single-quoted strings
    t_singlequote_ignore = ''
    def t_SINGLEQUOTE(self, t):
        r'\''
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.push_state('singlequote')

    def t_singlequote_SINGLEQUOTE(self, t):
        r"([^']|\\')*'"
        t.value = t.value[:-1]
        t.type = 'ID'
        t.lexer.pop_state()
        return t

    def t_singlequote_error(self, t):
        raise JsonPathLexerError('Error on line %s, col %s while lexing singlequoted field: Unexpected character: %s' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
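
    # For instance, tokenizing "'hello world'" consumes the opening quote in
    # t_SINGLEQUOTE (emitting no token), then matches the rest in the
    # 'singlequote' state, yielding one ID token with value 'hello world'
    # (the closing quote is stripped above).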


    # Double-quoted strings
    t_doublequote_ignore = ''
    def t_DOUBLEQUOTE(self, t):
        r'"'
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.push_state('doublequote')

    def t_doublequote_DOUBLEQUOTE(self, t):
        r'([^"]|\\")*"'
        t.value = t.value[:-1]
        t.type = 'ID'
        t.lexer.pop_state()
        return t

    def t_doublequote_error(self, t):
        raise JsonPathLexerError('Error on line %s, col %s while lexing doublequoted field: Unexpected character: %s' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))

    
    # Back-quoted "magic" operators
    t_backquote_ignore = ''
    def t_BACKQUOTE(self, t):
        r'`'
        t.lexer.string_start = t.lexer.lexpos
        t.lexer.push_state('backquote')

    def t_backquote_BACKQUOTE(self, t):
        r'([^`\\]|\\.)*`'
        t.value = t.value[:-1]
        t.type = 'NAMED_OPERATOR'
        t.lexer.pop_state()
        return t

    def t_backquote_error(self, t):
        raise JsonPathLexerError('Error on line %s, col %s while lexing backquoted operator: Unexpected character: %s' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
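
    # Backquoted text becomes a NAMED_OPERATOR token rather than an ID; for
    # example, '`this`' tokenizes to NAMED_OPERATOR with value 'this'.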


    # Counting lines, handling errors
    def t_newline(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.latest_newline = t.lexpos

    def t_error(self, t):
        raise JsonPathLexerError('Error on line %s, col %s: Unexpected character: %s' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))
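
# Running this module directly tokenizes stdin and prints one token per line.
# A sketch of invoking it, assuming the package is importable:
#
#   $ echo '$.foo..bar' | python -m jsonpath_rw.lexer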

if __name__ == '__main__':
    logging.basicConfig()
    lexer = JsonPathLexer(debug=True)
    for token in lexer.tokenize(sys.stdin.read()):
        print('%-20s%s' % (token.value, token.type))