import gdb
import bson
from pprint import pprint

DEBUGGING = False
'''
Public API to be called by users. The input `ident` is a string of the form:
'collection-2--4547167393143767234'.

From within gdb type:

  python dump_pages_for_table('collection-2--4547167393143767234')

Some behaviors/limitations:
* Disk images of data are not deserialized into their separate key/value pairs.
* If update chain WT_UPDATEs are valid bson, the values will be parsed and output as BSON maps.
* If updates are not bson (e.g: index entries), they will be output as a raw byte array.
* WT_UPDATE structures have a pretty printer registered. Disabling pretty printers will result in
  more raw output.
* Any `file:*.wt` can be output, e.g: `_mdb_catalog` or `WiredTiger`. Though the output may be less
  supported/of lower quality.
'''


def dump_pages_for_table(ident):
    """Print the pages of the table named `ident` from the inferior being debugged.

    `ident` is the table name without the `file:` prefix and `.wt` suffix; both
    are added here before the data handle lookup.

    The connection is found by evaluating `session->iface->connection` in the
    currently selected frame, so a frame with a usable `session` variable must
    be selected first. Every failure is reported with `print` and the function
    returns None; nothing is raised for the handled cases.
    """
    # gdb.lookup_type raises gdb.error (rather than returning a falsy value)
    # when the type is unknown, so the failure has to be caught for the hint
    # below to ever be shown.
    try:
        conn_impl_type = gdb.lookup_type("WT_CONNECTION_IMPL")
    except gdb.error:
        conn_impl_type = None
    if not conn_impl_type:
        print('WT_CONNECTION_IMPL type not found. Try invoking this function from a different '
              'thread and frame.')
        return

    conn_impl_ptr_type = conn_impl_type.pointer()
    dbg('impl', conn_impl_ptr_type)

    # The expression fails to evaluate when the selected frame has no `session`
    # symbol; that case falls through to the same hint as a null/unaddressable
    # result.
    conn_ptr = None
    try:
        conn_ptr = gdb.parse_and_eval("session->iface->connection")
    except gdb.error:
        pass

    if not conn_ptr or not conn_ptr.address:
        print('Failed to find a suitable `WT_SESSION session` object to extract a connection '
              'object from. Try finding an eviction thread and frame, e.g: '
              '`__wt_evict_thread_run`. If the session is optimized out, try going up stack '
              'frames until the variable is in a local scope rather than a function input.')
        return

    conn = conn_ptr.reinterpret_cast(conn_impl_ptr_type).dereference()
    dbg('conn', conn)

    data_handle, all_dhs = get_data_handle(conn, 'file:{}.wt'.format(ident))
    if data_handle is None:
        print('Data handle not found for ident. Ident: `{}`'.format(ident))
        print('All known data handles:')
        pprint(all_dhs)
        return

    dump_handle(data_handle)


# Private API.
def dbg(ident, var):
    """Print diagnostics about `var`, labelled `ident`, when DEBUGGING is set.

    For a gdb.Value the type, address and string rendering are printed in
    addition to the Python type and public attribute names. Does nothing when
    DEBUGGING is false.
    """
    if not DEBUGGING:
        return

    is_gdb_value = isinstance(var, gdb.Value)

    print('----------')
    if is_gdb_value:
        print('{}: ({}*){}'.format(ident, var.type, var.address))
    else:
        print(ident)
    print(' ' + str(type(var)))
    for name in dir(var):
        if not name.startswith("__"):
            print(' ' + name)

    if is_gdb_value:
        print('\n Fields:')
        print('\t' + '\n\t'.join(str(var).split('\n')))


def walk_wt_list(lst):
    """Return the dereferenced nodes of a queue, in order.

    Starts at `lst['tqh_first']` and follows `node['q']['tqe_next']` until a
    null pointer is reached.
    """
    ret = []
    node = lst['tqh_first']
    dbg('node', node)
    while node:
        ret.append(node.dereference())
        node = node['q']['tqe_next']
    return ret


def get_data_handle(conn, handle_name):
    """Find the data handle called `handle_name` on the connection's `dhqh` list.

    Returns a `(handle, names)` tuple. `handle` is the matching entry or None
    (the last match wins if several entries share the name). `names` lists
    every handle whose name starts with `file:`, with the first five and last
    three characters removed (the `file:` prefix and, presumably, `.wt`).
    """
    dbg('datahandles', conn['dhqh'])
    ret = None
    all_file_dhs = []
    for handle in walk_wt_list(conn['dhqh']):
        # Read the string out of the inferior once per handle.
        name = handle['name'].string()
        if name.startswith('file:'):
            all_file_dhs.append(name[5:-3])
        if name == handle_name:
            ret = handle
    return ret, all_file_dhs


def get_btree_handle(dhandle):
    """Return `dhandle['handle']` reinterpreted as a WT_BTREE and dereferenced."""
    btree_ptr_type = gdb.lookup_type('WT_BTREE').pointer()
    return dhandle['handle'].reinterpret_cast(btree_ptr_type).dereference()


def dump_update_chain(update_chain):
    """Print every entry of an update chain, following `next` until null.

    Each entry is printed using its gdb string rendering followed by the
    decoded BSON document, or `None` when the entry is not decoded.
    """
    while update_chain:
        dbg('update', update_chain)
        wt_val = update_chain.dereference()
        dbg('wt_val', wt_val)

        obj = None
        # Only type 3 is attempted as BSON (presumably WT_UPDATE_STANDARD --
        # TODO confirm against the WiredTiger version in use). The value bytes
        # are needed for nothing else, so other types skip the memory read.
        if wt_val['type'] == 3:
            val_bytes = gdb.selected_inferior().read_memory(wt_val['data'], wt_val['size'])
            try:
                obj = bson.decode_all(val_bytes)[0]
            except Exception:
                # Best effort: values that are not valid BSON are shown as
                # None. KeyboardInterrupt is deliberately not caught so a long
                # dump can still be interrupted.
                obj = None

        print(' ' + '\n '.join(str(wt_val).split('\n')) + " " + str(obj) + " =>")
        update_chain = update_chain['next']

    print(' λ (End of update chain)')


def dump_insert_list(wt_insert):
    """Print the key of one insert entry followed by its update chain.

    The key bytes are read from the inferior at the entry's own address plus
    `u.key.offset`, for `u.key.size` bytes.
    """
    key_struct = wt_insert['u']['key']
    key = gdb.selected_inferior().read_memory(
        int(wt_insert.address) + key_struct['offset'], key_struct['size']).tobytes()
    print('Key: ' + str(key))
    print('Value:')
    dump_update_chain(wt_insert['upd'])


def dump_skip_list(wt_insert_head):
    """Print every entry reachable through element 0 of the `head`/`next` arrays."""
    # A zero address for the `head` member means there is nothing to walk.
    if not wt_insert_head['head'].address:
        return

    wt_insert = wt_insert_head['head'][0]
    idx = 0
    while wt_insert:
        dump_insert_list(wt_insert.dereference())
        dbg('insert' + str(idx), wt_insert.dereference())
        idx += 1
        wt_insert = wt_insert['next'][0]


def dump_modified(leaf_page):
    """Print the insert list and update list under the page's `modify` field.

    Only the first element of `u2.row_leaf.insert` is walked. The update list
    is indexed from 0 up to the page's `entries` count.
    """
    print("Modify:")
    if not leaf_page['modify']:
        print("No modifies")
        return

    leaf_modify = leaf_page['modify'].dereference()
    dbg('modify', leaf_modify)

    row_leaf_insert = leaf_modify['u2']['row_leaf']['insert']
    dbg('row store', row_leaf_insert)
    if row_leaf_insert:
        print("Insert list:")
        dump_skip_list(row_leaf_insert.dereference().dereference())
    else:
        print("No insert list")

    row_leaf_update = leaf_modify['u2']['row_leaf']['update']
    if row_leaf_update:
        print("Update list:")
        for i in range(int(leaf_page['entries'])):
            dump_update_chain(row_leaf_update[i])
    else:
        print("No update list")


def dump_disk(leaf_page):
    """Print the raw bytes of the page's disk image, excluding its headers."""
    dbg('in-memory page:', leaf_page)
    dsk = leaf_page['dsk'].dereference()
    if int(dsk.address) == 0:
        print("No page loaded from disk.")
        return

    dbg('on-disk page:', dsk)
    wt_page_header_size = 28
    wt_block_header_size = 12
    header_size = wt_page_header_size + wt_block_header_size

    # NOTE(review): per WiredTiger's WT_PAGE_HEADER definition, `mem_size` is
    # the size of the whole image including the headers skipped here. Reading
    # `mem_size` bytes from past the headers would run beyond the end of the
    # image, so the header size is subtracted from the length.
    payload_size = int(dsk['mem_size']) - header_size
    if payload_size > 0:
        page_bytes = gdb.selected_inferior().read_memory(
            int(dsk.address) + header_size, payload_size).tobytes()
    else:
        page_bytes = b''
    print("Dsk:\n" + str(page_bytes))


def dump_handle(dhandle):
    """Print the disk image and modifications of each page under the root.

    Every entry of the root page's index is treated as a leaf page, so trees
    deeper than two levels are not descended.
    """
    print("Dumping: " + dhandle['name'].string())
    btree = get_btree_handle(dhandle)
    root = btree['root']
    root_page = root['page'].dereference()
    dbg('btree', btree)
    dbg('root', btree['root'])
    dbg('root page', root_page)

    rpindex = root_page['u']['intl']['__index'].dereference()
    for idx in range(int(rpindex['entries'])):
        dbg('rpindex', rpindex)
        dbg('rp-pre-index', rpindex['index'].dereference().dereference())
        leaf_page = rpindex['index'][idx].dereference()['page'].dereference()
        dbg('leaf', leaf_page)
        dump_disk(leaf_page)
        dump_modified(leaf_page)