author     Douglas Bagnall <douglas.bagnall@catalyst.net.nz>    2019-03-31 16:07:57 +1300
committer  Andrew Bartlett <abartlet@samba.org>                 2019-05-01 06:46:36 +0000
commit     bd53819b28bab04408fc7fd7cfecc04a9aff9baf (patch)
tree       b339f5e28c55326121118b78f646f1a20e66c0f6 /script
parent     60620273dba1d7f7ff25710c5dd8fd6d32f2d149 (diff)
download   samba-bd53819b28bab04408fc7fd7cfecc04a9aff9baf.tar.gz
script/attr_count_read: load and correlate all data
This changes script/attr_count_read to take the samba private directory
as an argument and load all the databases at once, printing them as one
big table. It isn't extremely clear what it all means, but it *tries*
to tell you.

With --plot, it will attempt to load matplotlib and plot the number of
requested attributes against the number returned, with the colour of
each point indicating its relative frequency. It is a scatterplot that
wants to be a heatmap.

With --no-casefold, you can get an extra confusing table where, for
instance, something repeatedly asks for "attributeId" which is not
accounted for, while in a completely different row an unrequested
"attributeID" is found many times over.

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Andrew Bartlett <abartlet@samba.org>

Autobuild-User(master): Andrew Bartlett <abartlet@samba.org>
Autobuild-Date(master): Wed May 1 06:46:36 UTC 2019 on sn-devel-184
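For context, each of the per-attribute databases this script now reads is a
plain tdb mapping an attribute name to a single native unsigned int counter,
as unpack_uint() in the diff below shows. A minimal sketch of reading one of
them by hand, assuming Samba's in-tree tdb bindings are available and using a
purely illustrative private-directory path:

    import struct
    import sys
    sys.path.insert(0, "bin/python")   # Samba's in-tree Python bindings
    import tdb

    private_dir = "/path/to/private"   # hypothetical; point at your LDB private dir
    db = tdb.Tdb(private_dir + "/debug/attr_counts_requested.tdb")
    for key in db:                                 # keys are attribute names (bytes)
        count = struct.unpack("I", db[key])[0]     # value is one native unsigned int
        print(key.decode("utf-8"), count)

In normal use the script itself does all of this; you just point it at the
private directory, e.g. script/attr_count_read /path/to/private --plot (path
again illustrative).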
Diffstat (limited to 'script')
-rwxr-xr-x  script/attr_count_read  194
1 file changed, 183 insertions, 11 deletions
diff --git a/script/attr_count_read b/script/attr_count_read
index 69c7c63e729..4338b6bee10 100755
--- a/script/attr_count_read
+++ b/script/attr_count_read
@@ -1,27 +1,199 @@
#!/usr/bin/env python3
+#
+# Copyright (C) Catalyst IT Ltd. 2019
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from __future__ import print_function
import sys
+import argparse
import struct
+import os
+from collections import OrderedDict, Counter
+from pprint import pprint
+
sys.path.insert(0, "bin/python")
import tdb
-def main(filename):
+def unpack_uint(filename, casefold=True):
+ db = tdb.Tdb(filename)
+ d = {}
+ for k in db:
+ v = struct.unpack("I", db[k])[0]
+ k2 = k.decode('utf-8')
+ if casefold:
+ k2 = k2.lower()
+ if k2 in d: # because casefold
+ d[k2] += v
+ else:
+ d[k2] = v
+ return d
+
+
+def unpack_ssize_t_pair(filename, casefold):
db = tdb.Tdb(filename)
pairs = []
- longest = 0
for k in db:
+ key = struct.unpack("nn", k)
v = struct.unpack("I", db[k])[0]
- pairs.append((v, k.decode('utf-8')))
- longest = max(len(k), longest)
+ pairs.append((v, key))
+
+ pairs.sort(reverse=True)
+ #print(pairs)
+ return [(k, v) for (v, k) in pairs]
+
+
+DATABASES = [
+ ('requested', "debug/attr_counts_requested.tdb", unpack_uint,
+ "The attribute was specifically requested."),
+ ('duplicates', "debug/attr_counts_duplicates.tdb", unpack_uint,
+ "Requested more than once in the same request."),
+ ('empty request', "debug/attr_counts_empty_req.tdb", unpack_uint,
+ "No attributes were requested, but these were returned"),
+ ('null request', "debug/attr_counts_null_req.tdb", unpack_uint,
+ "The attribute list was NULL and these were returned."),
+ ('found', "debug/attr_counts_found.tdb", unpack_uint,
+ "The attribute was specifically requested and it was found."),
+ ('not found', "debug/attr_counts_not_found.tdb", unpack_uint,
+ "The attribute was specifically requested but was not found."),
+ ('unwanted', "debug/attr_counts_unwanted.tdb", unpack_uint,
+ "The attribute was not requested and it was found."),
+ ('star match', "debug/attr_counts_star_match.tdb", unpack_uint,
+ 'The attribute was not specifically requested but "*" was.'),
+ ('req vs found', "debug/attr_counts_req_vs_found.tdb", unpack_ssize_t_pair,
+ "How many attributes were requested versus how many were returned."),
+]
+
+
+def plot_pair_data(name, data, doc, lim=90):
+ # Note we keep the matplotlib import internal to this function for
+ # two reasons:
+ # 1. Some people won't have matplotlib, but might want to run the
+ # script.
+ # 2. The import takes hundreds of milliseconds, which is a
+ # nuisance if you don't want graphs.
+ #
+ # This plot could be improved!
+ import matplotlib.pylab as plt
+ fig, ax = plt.subplots()
+ if lim:
+ data2 = []
+ for p, c in data:
+ if p[0] > lim or p[1] > lim:
+ print("not plotting %s: %s" % (p, c))
+ continue
+ data2.append((p, c))
+ skipped = len(data) - len(data2)
+ if skipped:
+ name += " (excluding %d out of range values)" % skipped
+ data = data2
+ xy, counts = zip(*data)
+ x, y = zip(*xy)
+ bins_x = max(x) + 4
+ bins_y = max(y)
+ ax.set_title(name)
+ ax.scatter(x, y, c=counts)
+ plt.show()
+
+
+def print_pair_data(name, data, doc):
+ print(name)
+ print(doc)
+ t = "%14s | %14s | %14s"
+ print(t % ("requested", "returned", "count"))
+ print(t % (('-' * 14,) * 3))
+
+ for xy, count in data:
+ x, y = xy
+ if x == -2:
+ x = 'NULL'
+ elif x == -4:
+ x = '*'
+ print(t % (x, y, count))
+
+
+def print_counts(count_data):
+ all_attrs = Counter()
+ for c in count_data:
+ all_attrs.update(c[1])
+
+ print("found %d attrs" % len(all_attrs))
+ longest = max(len(x) for x in all_attrs)
+
+ #pprint(all_attrs)
+ rows = OrderedDict()
+ for a, _ in all_attrs.most_common():
+ rows[a] = [a]
+
+ for col_name, counts, doc in count_data:
+ for attr, row in rows.items():
+ d = counts.get(attr, '')
+ row.append(d)
+
+ print("%15s: %s" % (col_name, doc))
+ print()
+
+ t = "%{}s".format(longest)
+ for c in count_data:
+ t += " | %{}s".format(max(len(c[0]), 7))
+
+ h = t % (("attribute",) + tuple(c[0] for c in count_data))
+ print(h)
+ print("-" * len(h))
+
+ for attr, row in rows.items():
+ print(t % tuple(row))
+ pass
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('LDB_PRIVATE_DIR',
+ help="read attr counts in this directory")
+ parser.add_argument('--plot', action="store_true",
+ help='attempt to draw graphs')
+ parser.add_argument('--no-casefold', action="store_false",
+ default=True, dest="casefold",
+ help='See all the encountered case variants')
+ args = parser.parse_args()
+
+ if not os.path.isdir(args.LDB_PRIVATE_DIR):
+ parser.print_usage()
+ sys.exit(1)
- pairs.sort()
- for v, k in pairs:
- print("%*s: %7d" % (longest, k, v))
+ count_data = []
+ pair_data = []
+ for k, filename, unpacker, doc in DATABASES:
+ filename = os.path.join(args.LDB_PRIVATE_DIR, filename)
+ try:
+ d = unpacker(filename, casefold=args.casefold)
+ except (RuntimeError, IOError) as e:
+ print("could not parse %s: %s" % (filename, e))
+ continue
+ if unpacker is unpack_ssize_t_pair:
+ pair_data.append((k, d, doc))
+ else:
+ count_data.append((k, d, doc))
+ for k, v, doc in pair_data:
+ if args.plot:
+ plot_pair_data(k, v, doc)
+ print_pair_data(k, v, doc)
-if len(sys.argv) < 2:
- print("Usage: attr_count_read <tdb-file>")
- sys.exit(1)
+ print()
+ print_counts(count_data)
-main(sys.argv[1])
+main()
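For reference, the "req vs found" database read by unpack_ssize_t_pair() above
uses a different layout: each key packs two native ssize_t values (attributes
requested, attributes returned), with -2 standing in for a NULL attribute list
and -4 for a "*" request, as print_pair_data() suggests, and each value is
again a single counter. A rough sketch along the same lines, with the path
once more only illustrative:

    import struct
    import sys
    sys.path.insert(0, "bin/python")   # Samba's in-tree Python bindings
    import tdb

    private_dir = "/path/to/private"   # hypothetical LDB private directory
    db = tdb.Tdb(private_dir + "/debug/attr_counts_req_vs_found.tdb")
    for key in db:
        requested, returned = struct.unpack("nn", key)   # two native ssize_t values
        count = struct.unpack("I", db[key])[0]
        if requested == -2:
            requested = "NULL"   # a NULL attribute list was passed
        elif requested == -4:
            requested = "*"      # "*" was requested
        print(requested, returned, count)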