Rev 5366: Implement a custom parser for chk btree leaves. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
John Arbash Meinel
john at arbash-meinel.com
Tue Aug 3 21:56:54 BST 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-btree-chk-leaf
------------------------------------------------------------
revno: 5366
revision-id: john at arbash-meinel.com-20100803205639-k23colcozyd14440
parent: pqm at pqm.ubuntu.com-20100802234934-d963xmqwx5gzevr0
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-btree-chk-leaf
timestamp: Tue 2010-08-03 15:56:39 -0500
message:
Implement a custom parser for chk btree leaves.
The basic parsing is actually quicker (47us to build this new structure vs 110us
for the old one), most likely because we do 1 malloc per leaf rather than N.
Memory size is also much smaller. We'll give a little of that back if we
convert back and forth to keys, etc.
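
For reference, here is a minimal pure-Python sketch of the leaf layout the new
parser consumes: a 'type=leaf\n' header followed by one line per record of the
form 'sha1:<40 hex>\0\0<block_offset> <block_length> <record_start> <record_end>\n'
(there are no reference lists, so the middle field is empty). The function name
is illustrative only and is not part of bzrlib:

    import binascii

    def parse_chk_leaf(data):
        """Map binary sha1 -> (block_offset, block_length, record_start, record_end)."""
        if not data.startswith('type=leaf\n'):
            raise ValueError('not a leaf node: %r' % (data[:10],))
        records = {}
        for line in data[len('type=leaf\n'):].splitlines():
            key, refs, value = line.split('\0', 2)
            if not key.startswith('sha1:') or len(key) != 45:
                raise ValueError('unexpected key %r' % (key,))
            offset, length, start, end = [int(v) for v in value.split()]
            records[binascii.unhexlify(key[5:])] = (offset, length, start, end)
        return records

The pyx version gets its win by storing exactly those parsed values in a single
malloc'd array of fixed-size C structs, instead of building a dict of tuples
and value strings the way _LeafNode does.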
-------------- next part --------------
=== modified file 'bzrlib/_btree_serializer_pyx.pyx'
--- a/bzrlib/_btree_serializer_pyx.pyx 2010-02-17 17:11:16 +0000
+++ b/bzrlib/_btree_serializer_pyx.pyx 2010-08-03 20:56:39 +0000
@@ -33,6 +33,7 @@
char *PyString_AsString(object p) except NULL
object PyString_FromStringAndSize(char *, Py_ssize_t)
PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)
+ object PyString_FromFormat(char *, ...)
int PyString_CheckExact(object s)
int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)
Py_ssize_t PyString_Size(object p)
@@ -49,13 +50,19 @@
PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
void Py_INCREF(object)
void Py_DECREF_ptr "Py_DECREF" (PyObject *)
+ void *PyMem_Malloc(size_t nbytes)
+ void PyMem_Free(void *)
+ void memset(void *, int, size_t)
cdef extern from "string.h":
void *memcpy(void *dest, void *src, size_t n)
void *memchr(void *s, int c, size_t n)
+ int memcmp(void *s1, void *s2, size_t n)
# GNU extension
# void *memrchr(void *s, int c, size_t n)
int strncmp(char *s1, char *s2, size_t n)
+ unsigned long strtoul(char *s1, char **out, int base)
+ unsigned long long strtoull(char *s1, char **out, int base)
# It seems we need to import the definitions so that the pyrex compiler has
# local names to access them.
@@ -315,6 +322,305 @@
return parser.parse()
+ctypedef struct gc_chk_sha1_record:
+ unsigned long long block_offset
+ unsigned int block_length
+ unsigned int record_start
+ unsigned int record_end
+ char sha1[20]
+
+
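+# Translation tables for hex sha1s: _unhexbuf maps an ASCII byte to its hex
+# value (-1 if it is not a hex digit), and _hexbuf maps a nibble back to its
+# lowercase ASCII hex character.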
+cdef int _unhexbuf[256]
+cdef char *_hexbuf = '0123456789abcdef'
+
+cdef _populate_unhexbuf():
+ cdef unsigned char a
+ cdef int i
+ for i from 0 <= i < 256:
+ _unhexbuf[i] = -1
+ for a in '0123456789':
+ _unhexbuf[a] = a - c'0'
+ for a in 'abcdef':
+ _unhexbuf[a] = a - c'a' + 10
+ for a in 'ABCDEF':
+ _unhexbuf[a] = a - c'A' + 10
+
+
+cdef int _unhexlify_sha1(char *as_hex, char *as_bin):
+ """Take the hex sha1 in as_hex and make it binary in as_bin"""
+ cdef int top
+ cdef int bot
+ cdef int i
+ cdef char *cur
+
+ cur = as_hex
+ for i from 0 <= i < 20:
+ top = _unhexbuf[<unsigned char>(cur[0])]
+ cur += 1
+ bot = _unhexbuf[<unsigned char>(cur[0])]
+ cur += 1
+ if top == -1 or bot == -1:
+ return 0
+ as_bin[i] = (top << 4) + bot
+ return 1
+
+
+cdef void _hexlify_sha1(char *as_bin, char *as_hex):
+ cdef int i, j
+ cdef char c
+
+ j = 0
+ for i from 0 <= i < 20:
+ c = as_bin[i]
+ as_hex[j] = _hexbuf[(c>>4)&0xf]
+ j += 1
+ as_hex[j] = _hexbuf[(c)&0xf]
+ j += 1
+
+
+cdef class GCCHKSHA1LeafNode:
+ """Track all the entries for a given leaf node."""
+
+ cdef public int num_entries
+ cdef gc_chk_sha1_record *entries
+ # TODO: Consider what a mini-index might look like. Something that could
+ # let us quickly jump to a subset range, rather than doing pure
+ # bisect all the time.
+
+ def __sizeof__(self):
+ return (sizeof(GCCHKSHA1LeafNode)
+ + sizeof(gc_chk_sha1_record)*self.num_entries)
+
+ def __dealloc__(self):
+ if self.entries != NULL:
+ PyMem_Free(self.entries)
+ self.entries = NULL
+
+ def __init__(self, bytes):
+ self._parse_bytes(bytes)
+
+ property min_key:
+ def __get__(self):
+ pass
+
+ property max_key:
+ def __get__(self):
+ pass
+
+ cdef int _key_to_sha1(self, key, char *sha1):
+ """Map a key into its sha1 content.
+
+ :param key: A tuple of style ('sha1:abcd...',)
+ :param sha1: A char buffer of 20 bytes
+ :return: 1 if this could be converted, 0 otherwise
+ """
+ cdef char *c_val
+ if not PyTuple_CheckExact(key) and not StaticTuple_CheckExact(key):
+ return 0
+ #? raise TypeError('Keys must be a tuple or StaticTuple')
+ if len(key) != 1:
+ return 0
+ val = key[0]
+ if not PyString_CheckExact(val) or PyString_GET_SIZE(val) != 45:
+ return 0
+ c_val = PyString_AS_STRING(val)
+ if strncmp(c_val, 'sha1:', 5) != 0:
+ return 0
+ if not _unhexlify_sha1(c_val + 5, sha1):
+ return 0
+ return 1
+
+ cdef StaticTuple _sha1_to_key(self, char *sha1):
+ """Compute a ('sha1:abcd',) key for a given sha1."""
+ cdef StaticTuple key
+ cdef object hexxed
+ cdef char *c_buf
+ hexxed = PyString_FromStringAndSize(NULL, 45)
+ c_buf = PyString_AS_STRING(hexxed)
+ memcpy(c_buf, 'sha1:', 5)
+ _hexlify_sha1(sha1, c_buf+5)
+ key = StaticTuple_New(1)
+ Py_INCREF(hexxed)
+ StaticTuple_SET_ITEM(key, 0, hexxed)
+ return key
+
+ cdef StaticTuple _record_to_value_and_refs(self,
+ gc_chk_sha1_record *record):
+ """Extract the refs and value part of this record."""
+ cdef StaticTuple value_and_refs
+ cdef StaticTuple empty
+ value_and_refs = StaticTuple_New(2)
+ # This is really inefficient to go from a logical state back to a
+ # string, but it makes things work a bit better internally for now.
+ value = PyString_FromFormat('%llu %u %u %u',
+ record.block_offset, record.block_length,
+ record.record_start, record.record_end)
+ Py_INCREF(value)
+ StaticTuple_SET_ITEM(value_and_refs, 0, value)
+ # Always empty refs
+ empty = StaticTuple_New(0)
+ Py_INCREF(empty)
+ StaticTuple_SET_ITEM(value_and_refs, 1, empty)
+ return value_and_refs
+
+ cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):
+ """Turn a given record back into a fully fledged item.
+ """
+ cdef StaticTuple item
+ cdef StaticTuple key
+ cdef StaticTuple value_and_refs
+ cdef object value
+ key = self._sha1_to_key(record.sha1)
+ item = StaticTuple_New(2)
+ Py_INCREF(key)
+ StaticTuple_SET_ITEM(item, 0, key)
+ value_and_refs = self._record_to_value_and_refs(record)
+ Py_INCREF(value_and_refs)
+ StaticTuple_SET_ITEM(item, 1, value_and_refs)
+ return item
+
+ cdef gc_chk_sha1_record* _lookup_record(self, char *sha1):
+ """Find a gc_chk_sha1_record that matches the sha1 supplied."""
+ # For right now we iterate, in the future we should bisect, or create
+ # a local index, or use the sha1 as a hash into a local table, etc.
+ cdef int i
+ for i from 0 <= i < self.num_entries:
+ if memcmp(self.entries[i].sha1, sha1, 20) == 0:
+ return &self.entries[i]
+ return NULL
+
+ def __contains__(self, key):
+ cdef char sha1[20]
+ cdef gc_chk_sha1_record *record
+ if not self._key_to_sha1(key, sha1):
+ # If it isn't a sha1 key, then it won't be in this leaf node
+ return False
+ return self._lookup_record(sha1) != NULL
+
+ def __getitem__(self, key):
+ cdef char sha1[20]
+ cdef gc_chk_sha1_record *record = NULL
+ if self._key_to_sha1(key, sha1):
+ record = self._lookup_record(sha1)
+ if record == NULL:
+ raise KeyError('key %r is not present' % (key,))
+ return self._record_to_value_and_refs(record)
+
+ def __len__(self):
+ return self.num_entries
+
+ def all_keys(self):
+ cdef int i
+ cdef list result
+ result = []
+ for i from 0 <= i < self.num_entries:
+ result.append(self._sha1_to_key(self.entries[i].sha1))
+ return result
+
+ def all_items(self):
+ cdef int i
+ cdef list result
+ result = []
+ for i from 0 <= i < self.num_entries:
+ result.append(self._record_to_item(self.entries + i))
+ return result
+
+ cdef _parse_bytes(self, bytes):
+ """Parse the string 'bytes' into content."""
+ cdef char *c_bytes
+ cdef char *c_content
+ cdef char *c_cur
+ cdef char *c_end
+ cdef char *c_next
+ cdef char *c_sha1_prefix
+ cdef void *buf
+ cdef Py_ssize_t n_bytes
+ cdef int num_entries
+ cdef int entry
+ cdef int interesting_char
+ cdef int i
+ cdef gc_chk_sha1_record *cur_record
+
+ if not PyString_CheckExact(bytes):
+ raise TypeError('We only support parsing plain 8-bit strings.')
+ # Pass 1, count how many entries there will be
+ n_bytes = PyString_GET_SIZE(bytes)
+ c_bytes = PyString_AS_STRING(bytes)
+ c_end = c_bytes + n_bytes
+ if strncmp(c_bytes, 'type=leaf\n', 10):
+ raise ValueError("bytes did not start with 'type=leaf\\n': %r"
+ % (bytes[:10],))
+ c_content = c_bytes + 10
+ c_cur = c_content
+ num_entries = 0
+ while c_cur != NULL and c_cur < c_end:
+ c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);
+ if c_cur == NULL:
+ break
+ c_cur += 1
+ num_entries += 1
+ # Now allocate the memory for these items, and go to town
+ # We allocate both the offsets and the entries in the same malloc. we
+ # should probably pay a bit closer attention to alignment
+ buf = PyMem_Malloc(num_entries *
+ (sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))
+ if buf == NULL:
+ raise MemoryError('failed to allocate %d entries' % (num_entries,))
+ self.entries = <gc_chk_sha1_record*>buf
+ self.num_entries = num_entries
+ c_cur = c_content
+ cur_record = self.entries
+ entry = 0
+ interesting_char = -1
+ while c_cur != NULL and c_cur < c_end and entry < num_entries:
+ if strncmp(c_cur, 'sha1:', 5):
+ raise ValueError('At byte %d, line did not start with sha1: %r'
+ % (c_cur - c_bytes, safe_string_from_size(c_cur, 10)))
+ c_cur += 5
+ c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
+ if c_next == NULL or (c_next - c_cur != 40):
+ raise ValueError('Line did not contain 40 hex bytes')
+ if not _unhexlify_sha1(c_cur, cur_record.sha1):
+ raise ValueError('We failed to unhexlify')
+ if interesting_char == -1:
+ interesting_char = 20
+ c_sha1_prefix = cur_record.sha1
+ elif interesting_char > 0:
+ for i from 0 <= i < interesting_char:
+ if cur_record.sha1[i] != c_sha1_prefix[i]:
+ interesting_char = i
+ break
+ c_cur = c_next + 1
+ if c_cur[0] != c'\0':
+ raise ValueError('only 1 null, not 2 as expected')
+ c_cur += 1
+ cur_record.block_offset = strtoull(c_cur, &c_next, 10)
+ if c_cur == c_next or c_next[0] != c' ':
+ raise ValueError('Failed to parse block offset')
+ c_cur = c_next + 1
+ cur_record.block_length = strtoul(c_cur, &c_next, 10)
+ if c_cur == c_next or c_next[0] != c' ':
+ raise ValueError('Failed to parse block length')
+ c_cur = c_next + 1
+ cur_record.record_start = strtoul(c_cur, &c_next, 10)
+ if c_cur == c_next or c_next[0] != c' ':
+ raise ValueError('Failed to parse record start')
+ c_cur = c_next + 1
+ cur_record.record_end = strtoul(c_cur, &c_next, 10)
+ if c_cur == c_next or c_next[0] != c'\n':
+ raise ValueError('Failed to parse record end')
+ c_cur = c_next + 1
+ entry += 1
+ cur_record = self.entries + entry
+ # Pass 3: build the offset map
+ # The idea with the offset map is that we should be able to quickly
+ # jump to the key that matches a given sha1. We know that the keys are
+ # in sorted order, and we know that a lot of the prefix is going to be
+ # the same across them.
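+ # (One possible shape, illustrative only: remember how many leading bytes
+ # all the sha1s share, then keep a small table of entry offsets indexed by
+ # the first byte that differs, so a lookup only bisects within one bucket.)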
+
+
+
+
+def _parse_into_chk(bytes, key_length, ref_list_length):
+ """Parse into a format optimized for chk records."""
+ assert key_length == 1
+ assert ref_list_length == 0
+ return GCCHKSHA1LeafNode(bytes)
+
+
def _flatten_node(node, reference_lists):
"""Convert a node into the serialized form.
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2010-06-20 11:18:38 +0000
+++ b/bzrlib/btree_index.py 2010-08-03 20:56:39 +0000
@@ -605,7 +605,7 @@
class _LeafNode(object):
"""A leaf node for a serialised B+Tree index."""
- __slots__ = ('keys', 'min_key', 'max_key')
+ __slots__ = ('_keys', 'min_key', 'max_key')
def __init__(self, bytes, key_length, ref_list_length):
"""Parse bytes to create a leaf node object."""
@@ -617,7 +617,28 @@
self.max_key = key_list[-1][0]
else:
self.min_key = self.max_key = None
- self.keys = dict(key_list)
+ self._keys = dict(key_list)
+
+ def __len__(self):
+ return len(self._keys)
+
+ def __contains__(self, key):
+ return key in self._keys
+
+ def __getitem__(self, key):
+ return self._keys[key]
+
+ def all_items(self):
+ """Return a sorted list of (key, (value, refs)) items"""
+ items = self._keys.items()
+ items.sort()
+ return items
+
+ def all_keys(self):
+ """Return a sorted list of all keys."""
+ keys = self._keys.keys()
+ keys.sort()
+ return keys
class _InternalNode(object):
@@ -672,6 +693,7 @@
self._recommended_pages = self._compute_recommended_pages()
self._root_node = None
self._base_offset = offset
+ self._leaf_klass = _LeafNode
# Default max size is 100,000 leave values
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
if unlimited_cache:
@@ -950,7 +972,7 @@
"""Cache directly from key => value, skipping the btree."""
if self._leaf_value_cache is not None:
for node in nodes.itervalues():
- for key, value in node.keys.iteritems():
+ for key, value in node.all_items():
if key in self._leaf_value_cache:
# Don't add the rest of the keys, we've seen this node
# before.
@@ -980,10 +1002,10 @@
if self._row_offsets[-1] == 1:
# There is only the root node, and we read that via key_count()
if self.node_ref_lists:
- for key, (value, refs) in sorted(self._root_node.keys.items()):
+ for key, (value, refs) in self._root_node.all_items():
yield (self, key, value, refs)
else:
- for key, (value, refs) in sorted(self._root_node.keys.items()):
+ for key, (value, refs) in self._root_node.all_items():
yield (self, key, value)
return
start_of_leaves = self._row_offsets[-2]
@@ -999,11 +1021,11 @@
# for spilling index builds to disk.
if self.node_ref_lists:
for _, node in nodes:
- for key, (value, refs) in sorted(node.keys.items()):
+ for key, (value, refs) in node.all_items():
yield (self, key, value, refs)
else:
for _, node in nodes:
- for key, (value, refs) in sorted(node.keys.items()):
+ for key, (value, refs) in node.all_items():
yield (self, key, value)
@staticmethod
@@ -1170,8 +1192,8 @@
continue
node = nodes[node_index]
for next_sub_key in sub_keys:
- if next_sub_key in node.keys:
- value, refs = node.keys[next_sub_key]
+ if next_sub_key in node:
+ value, refs = node[next_sub_key]
if self.node_ref_lists:
yield (self, next_sub_key, value, refs)
else:
@@ -1245,14 +1267,13 @@
# sub_keys is all of the keys we are looking for that should exist
# on this page, if they aren't here, then they won't be found
node = nodes[node_index]
- node_keys = node.keys
parents_to_check = set()
for next_sub_key in sub_keys:
- if next_sub_key not in node_keys:
+ if next_sub_key not in node:
# This one is just not present in the index at all
missing_keys.add(next_sub_key)
else:
- value, refs = node_keys[next_sub_key]
+ value, refs = node[next_sub_key]
parent_keys = refs[ref_list_num]
parent_map[next_sub_key] = parent_keys
parents_to_check.update(parent_keys)
@@ -1265,8 +1286,8 @@
while parents_to_check:
next_parents_to_check = set()
for key in parents_to_check:
- if key in node_keys:
- value, refs = node_keys[key]
+ if key in node:
+ value, refs = node[key]
parent_keys = refs[ref_list_num]
parent_map[key] = parent_keys
next_parents_to_check.update(parent_keys)
@@ -1546,7 +1567,8 @@
continue
bytes = zlib.decompress(data)
if bytes.startswith(_LEAF_FLAG):
- node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
+ node = self._leaf_klass(bytes, self._key_length,
+ self.node_ref_lists)
elif bytes.startswith(_INTERNAL_FLAG):
node = _InternalNode(bytes)
else:
=== modified file 'bzrlib/python-compat.h'
--- a/bzrlib/python-compat.h 2009-10-21 20:53:21 +0000
+++ b/bzrlib/python-compat.h 2010-08-03 20:56:39 +0000
@@ -65,6 +65,7 @@
#if !defined(S_ISLNK)
#define S_ISLNK(mode) (0)
#endif
+ #define strtoull _strtoui64
#else /* Not win32 */
/* For htonl */
#include "arpa/inet.h"
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py 2010-06-20 11:18:38 +0000
+++ b/bzrlib/tests/test_btree_index.py 2010-08-03 20:56:39 +0000
@@ -216,12 +216,12 @@
leaf1_bytes = zlib.decompress(leaf1)
sorted_node_keys = sorted(node[0] for node in nodes)
node = btree_index._LeafNode(leaf1_bytes, 1, 0)
- self.assertEqual(231, len(node.keys))
- self.assertEqual(sorted_node_keys[:231], sorted(node.keys))
+ self.assertEqual(231, len(node))
+ self.assertEqual(sorted_node_keys[:231], node.all_keys())
leaf2_bytes = zlib.decompress(leaf2)
node = btree_index._LeafNode(leaf2_bytes, 1, 0)
- self.assertEqual(400 - 231, len(node.keys))
- self.assertEqual(sorted_node_keys[231:], sorted(node.keys))
+ self.assertEqual(400 - 231, len(node))
+ self.assertEqual(sorted_node_keys[231:], node.all_keys())
def test_last_page_rounded_1_layer(self):
builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
@@ -241,9 +241,9 @@
leaf2 = content[74:]
leaf2_bytes = zlib.decompress(leaf2)
node = btree_index._LeafNode(leaf2_bytes, 1, 0)
- self.assertEqual(10, len(node.keys))
+ self.assertEqual(10, len(node))
sorted_node_keys = sorted(node[0] for node in nodes)
- self.assertEqual(sorted_node_keys, sorted(node.keys))
+ self.assertEqual(sorted_node_keys, node.all_keys())
def test_last_page_not_rounded_2_layer(self):
builder = btree_index.BTreeBuilder(key_elements=1, reference_lists=0)
@@ -263,9 +263,9 @@
leaf2 = content[8192:]
leaf2_bytes = zlib.decompress(leaf2)
node = btree_index._LeafNode(leaf2_bytes, 1, 0)
- self.assertEqual(400 - 231, len(node.keys))
+ self.assertEqual(400 - 231, len(node))
sorted_node_keys = sorted(node[0] for node in nodes)
- self.assertEqual(sorted_node_keys[231:], sorted(node.keys))
+ self.assertEqual(sorted_node_keys[231:], node.all_keys())
def test_three_level_tree_details(self):
# The left most pointer in the second internal node in a row should
@@ -302,7 +302,7 @@
# in the second node it points at
pos = index._row_offsets[2] + internal_node2.offset + 1
leaf = index._get_leaf_nodes([pos])[pos]
- self.assertTrue(internal_node2.keys[0] in leaf.keys)
+ self.assertTrue(internal_node2.keys[0] in leaf)
def test_2_leaves_2_2(self):
builder = btree_index.BTreeBuilder(key_elements=2, reference_lists=2)
@@ -728,7 +728,7 @@
nodes = dict(index._read_nodes([0]))
self.assertEqual([0], nodes.keys())
node = nodes[0]
- self.assertEqual([('key',)], node.keys.keys())
+ self.assertEqual([('key',)], node.all_keys())
self.assertEqual([('get', 'index')], trans._activity)
def test__read_nodes_no_size_multiple_pages(self):
@@ -1112,7 +1112,7 @@
min_l2_key = l2.min_key
max_l1_key = l1.max_key
self.assertTrue(max_l1_key < min_l2_key)
- parents_min_l2_key = l2.keys[min_l2_key][1][0]
+ parents_min_l2_key = l2[min_l2_key][1][0]
self.assertEqual((l1.max_key,), parents_min_l2_key)
# Now, whatever key we select that would fall on the second page,
# should give us all the parents until the page break
@@ -1131,7 +1131,7 @@
parent_map = {}
search_keys = index._find_ancestors([max_l1_key], 0, parent_map,
missing_keys)
- self.assertEqual(sorted(l1.keys), sorted(parent_map))
+ self.assertEqual(l1.all_keys(), sorted(parent_map))
self.assertEqual(set(), missing_keys)
self.assertEqual(set(), search_keys)
@@ -1205,7 +1205,7 @@
("2222222222222222222222222222222222222222",): ("value:2", ()),
("3333333333333333333333333333333333333333",): ("value:3", ()),
("4444444444444444444444444444444444444444",): ("value:4", ()),
- }, node.keys)
+ }, dict(node.all_items()))
def test_LeafNode_2_2(self):
node_bytes = ("type=leaf\n"
@@ -1225,7 +1225,7 @@
('11', '33'): ('value:3',
((('11', 'ref22'),), (('11', 'ref22'), ('11', 'ref22')))),
('11', '44'): ('value:4', ((), (('11', 'ref00'),)))
- }, node.keys)
+ }, dict(node.all_items()))
def test_InternalNode_1(self):
node_bytes = ("type=internal\n"
@@ -1266,7 +1266,7 @@
('11', '33'): ('value:3',
((('11', 'ref22'),), (('11', 'ref22'), ('11', 'ref22')))),
('11', '44'): ('value:4', ((), (('11', 'ref00'),)))
- }, node.keys)
+ }, dict(node.all_items()))
def assertFlattened(self, expected, key, value, refs):
flat_key, flat_line = self.parse_btree._flatten_node(