summaryrefslogtreecommitdiff
path: root/benchmarks/generate_json.py
blob: d0c3de091268e7a9db29f71bb0e50f4d3823a2ef (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""Generate a JSON file for json decode benchmarking purposes using freqdata.py.
Works with pypy and python.

Initial setup::

    $ git clone -b pypy-support git://github.com/simplejson/simplejson.git; \
      cd simplejson

To generate::

    $ mkdir -p build/bench; \
      PYTHONPATH=. python benchmarks/generate_json.py > build/bench/feed.json

To run the read benchmark::

    $ python -m timeit -n 1 -r 5 \
      -s 'import simplejson;s = open("build/bench/feed.json", "rb").read()' \
      'simplejson.loads(s)'

To run the write benchmark::

    $ python -m timeit -n 1 -r 5 \
      -s 'import simplejson;s = open("build/bench/feed.json", "rb").read()' \
      -s 'd = simplejson.loads(s)' \
      'simplejson.dumps(d)'

"""
import sys
import operator
from random import Random

import simplejson

from freqdata import FREQ

def main(num_games, freq=FREQ):
    game_keys = sorted([k for k in freq.iterkeys() if not isinstance(k, tuple)])
    rng = Random()
    rng.seed(6935051420896)

    def type_chooser(type_freq):
        types = sorted(type_freq.iteritems(), key=operator.itemgetter(1, 0))
        type_sum = sum([v for (k, v) in types])
        def chooser():
            orig_i = rng.randrange(0, type_sum)
            i = orig_i
            for typ, freq in types:
                if i < freq:
                    return typ
                i -= freq
            assert False, "orig_i=%r type_sum=%r" % (orig_i, type_sum)
        return chooser

    choosers = dict((k, type_chooser(v)) for (k, v) in freq.iteritems())

    def generate_string(typ, (start, stop), coderangegen):
        if typ is str:
            empty, char = '', chr
        else:
            empty, char = u'', unichr
        for _ in xrange(rng.randrange(start, stop)):
            loword, highord = coderangegen()
        return empty.join([char(rng.randrange(*coderangegen()))
                           for _ in xrange(rng.randrange(start, stop))])

    def generate_list(key, typ):
        start, stop = typ[1]
        return [key_chooser((key, typ))
                for _ in xrange(rng.randrange(start, stop))]

    def key_chooser(key):
        typ = choosers[key]()
        if typ is None:
            return None
        elif typ is bool:
            return rng.choice((False, True))
        elif typ is float:
            # actually 100.0 is included in the actual data
            return rng.uniform(0.0, 100.0)
        elif typ is int:
            # this is arbitrary, not from data
            return rng.randrange(0, 10000000)
        elif isinstance(typ, tuple):
            (parent, rest) = typ
            if parent in (str, unicode):
                return generate_string(parent, rest, choosers.get((key, typ)))
            elif parent is list:
                return generate_list(key, typ)
        raise NotImplementedError(typ)

    def digits(n):
        s = str(rng.randrange(0, 10 ** n))
        return ('0' * (n - len(s))) + s

    def timestamp():
        return '{}-{}-{}T{}:{}:{}.{}'.format(
            *(digits(n) for n in (4, 2, 2, 2, 2, 2, 6)))

    def metascore():
        # actually 100.0 is included in the actual data
        return rng.uniform(0.0, 100.0)

    def game():
        d = {}
        for k in game_keys:
            d[k] = key_chooser(k)
        return d

    json = {}
    json['generated'] = timestamp()
    json['games'] = [game() for _ in xrange(num_games)]
    return simplejson.dumps(json, sort_keys=True)

if __name__ == '__main__':
    try:
        num = int(sys.argv[1])
    except IndexError:
        num = 20000
    print main(num)