Rev 5521: (broken) Just doing a snapshot. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Fri Nov 19 20:05:26 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5521
revision-id: john at arbash-meinel.com-20101119200510-rslzmtz8ofvjez5i
parent: john at arbash-meinel.com-20101119184852-64g7pbzu1z9sl9k7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Fri 2010-11-19 14:05:10 -0600
message:
(broken) Just doing a snapshot.
Starting to move the internals of RabinIndex to using the RabinHashMap.
Still quite a bit to sort out. RabinBucket and rabin_offset were used
rather extensively in the rest of the code.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-18 21:56:06 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-19 20:05:10 +0000
@@ -60,6 +60,10 @@
cdef int MAX_DELTA_OP_SIZE
# This is a copy instruction of 64kB at the largest possible offset, etc.
MAX_DELTA_OP_SIZE = (5 + 5 + 1 + RABIN_WINDOW + 7)
+cdef int MAX_MATCH_PRE
+MAX_MATCH_PRE = 255
+cdef int MAX_MATCH_TAIL
+MAX_MATCH_TAIL = 65535
cdef struct rabin_entry:
@@ -222,7 +226,7 @@
# collisions). Determine how we want to make this better
# One option is to use a search. Do 2 loops,
# for slot in old_table:
- # val = slot.val
+ # val = slot.rabin_val
# for sub_slot in search(old_table, val):
# insert(new_table, sub_slot)
# mark(sub_slot, completed=True)
@@ -277,6 +281,9 @@
table_size = self.table_mask + 1
total_entries = self.entry_count + new_entries
# We allow the table to become 2/3rds full before we resize
+ # TODO: We may want to shrink at this point if we find that we
+ # over-allocated. (Say we got 1MB of all 0's, then we end up with
+ # a single entry in the map, but reserved lots of space)
if (self.table_mask != 0 and total_entries * 3 < table_size * 2):
# We have enough room already
return
@@ -367,33 +374,6 @@
return matches
-cdef class rabin_offset:
- # Pointer to the bytes just after the bytes that match this hash
- # (eg, if you have hash('abc') = 20, then for 'abcd' we will point to 'd')
- # (Note that if the source listing changes, we'll need to move this
- # pointer.)
- cdef const_data ptr
- # The next rabin_offset that matches this val
- cdef public rabin_offset next
- # The 32-bit Rabin value that matches this location
- cdef public unsigned int val
- # The source that we can find this string in (TODO: consider a source
- # pointer vs a source count)
- cdef public unsigned short source
-
- def __repr__(self):
- return '%s(%d, %d)' % (self.__class__.__name__, self.val, self.source)
-
-
-cdef class RabinBucket:
- cdef public rabin_offset first
- cdef public unsigned int count
-
- def __init__(self):
- self.first = None
- self.count = 0
-
-
cdef class SourceInfo:
# The actual content data for this buffer, should be exactly a PyString
# (8-bit string) type
@@ -478,11 +458,7 @@
A given sequence of bytes will generate a "RABIN" value, this is then
indexed via a hash map.
"""
-
- cdef public unsigned int hash_mask
- cdef public unsigned int num_entries
- # List of RabinBucket objects
- cdef public object buckets
+ cdef public RabinHashMap _hash_map
# List of SourceInfo objects, must always be < 64k
cdef public object sources
cdef public unsigned int total_source_bytes
@@ -493,134 +469,44 @@
return self.total_source_bytes
def __init__(self):
- self.hash_mask = 0
- self.buckets = [None]
+ self._hash_map = RabinHashMap()
# Just initialize the basic buckets to start with
- self._ensure_enough_buckets(0)
- self.num_entries = 0
self.sources = []
self.total_source_bytes = 0
self.max_bucket_size = MAX_HASH_BUCKET_ENTRIES
+ cdef _find_source_for_ptr(self, const_data ptr):
+ """Search through the source list for which one holds this pointer."""
+ cdef SourceInfo info
+ cdef char *start, *end
+
+ if ptr == NULL:
+ return -1, None
+ for idx, info in enumerate(self.sources):
+ start = PyString_AS_STRING(info.buf)
+ end = start + PyString_GET_SIZE(info.buf)
+ if ptr >= start and ptr <= end:
+ return idx, info
+ return -1, None
+
def _matches_as_dict(self):
- cdef RabinBucket bucket
cdef SourceInfo source
- cdef rabin_offset offset
+ cdef unsigned int i
+ cdef Py_ssize_t local_offset
+ cdef rabin_entry *slot
matches = {}
- for bucket in self.buckets:
- if bucket is None:
+ for i from 0 <= i <= self._hash_map.table_mask:
+ slot = self._hash_map._table + i
+ if (slot.rabin_val == 0
+ and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
continue
- offset = bucket.first
- while offset is not None:
- source = self.sources[offset.source]
- local_offset = offset.ptr - PyString_AS_STRING(source.buf)
- matches[offset.val] = source.start_offset + local_offset
- offset = offset.next
+ idx, source = self._find_source_for_ptr(slot.ptr)
+ if source is not None:
+ local_offset = slot.ptr - PyString_AS_STRING(source.buf)
+ matches[slot.rabin_val] = source.start_offset + local_offset
return matches
- def _compute_num_hash_buckets(self, new_size):
- """Determine a power-of-two number of hash buckets to use.
-
- We know that the new content will have no more than "new_size /
- RABIN_WINDOW" rabin_offsets that we will want to create. And we know
- that we already have '.num_entries'. So we add the two together, and
- find a power-of-two that is big enough to hold everything. (Note that
- we default to wanting *some* collisions.)
- """
- cdef unsigned long long max_buckets
- cdef unsigned long long raw_size
- cdef unsigned long long num_entries
- cdef long num_buckets
- cdef unsigned int exponent
-
- raw_size = new_size
- if raw_size > 0:
- # From the C code:
- # /* Determine index hash size. Note that indexing skips the
- # first byte to allow for optimizing the Rabin's polynomial
- # initialization in create_delta(). */
- raw_size = raw_size - 1
- num_entries = raw_size / RABIN_WINDOW
- num_entries += self.num_entries
- # Heuristic, default to trying to fit 4 items per hash bucket
- max_buckets = num_entries / DEFAULT_NUM_COLLISIONS
- # Minimum of 16 buckets
- exponent = DEFAULT_MIN_BUCKET_EXPONENT
- # 2^30 is the largest that fits cleanly in a 32-bit signed long (PyInt)
- while (1ull << exponent) < max_buckets and exponent < 30:
- exponent += 1
- num_buckets = 1 << exponent
- return num_buckets
-
- cdef _add_offset_to_buckets(self, list buckets, long mask,
- rabin_offset offset):
- cdef RabinBucket bucket
- cdef long new_index
-
- new_index = offset.val & mask
- bucket = buckets[new_index]
- if bucket is None:
- bucket = RabinBucket()
- buckets[new_index] = bucket
- offset.next = bucket.first
- bucket.first = offset
- bucket.count += 1
-
- cdef _add_offset(self, rabin_offset offset):
- self._add_offset_to_buckets(self.buckets, self.hash_mask, offset)
- self.num_entries += 1
-
- def _py_add_offset(self, val, source):
- """A python thunk for testing _add_offset
-
- :param val: The Rabin hash value (integer)
- :param source: The source index (integer)
- :return: A newly created rabin_offset that has been created and added
- """
- cdef rabin_offset offset
-
- offset = rabin_offset()
- offset.next = None
- offset.val = val
- offset.source = source
- offset.ptr = NULL
- self._add_offset(offset)
- return offset
-
- def _ensure_enough_buckets(self, new_size):
- """Make sure we have enough buckets ready."""
- cdef long num_buckets
- cdef long new_mask
- cdef RabinBucket bucket
- cdef rabin_offset offset
- cdef list new_buckets
-
- num_buckets = self._compute_num_hash_buckets(new_size)
- if len(self.buckets) == num_buckets:
- # Already done
- return
- # Time to resize
- new_buckets = [None] * num_buckets
- new_mask = num_buckets - 1
- for bucket in self.buckets:
- if bucket is None:
- continue
- # We want to insert them in reverse order, so that we preserve the
- # final ordering. We won't preserve perfect ordering when shrinking
- # and offsets merge into the same bucket, but it should be good
- # enough.
- offsets_to_insert = []
- offset = bucket.first
- while offset is not None:
- offsets_to_insert.append(offset)
- offset = offset.next
- while offsets_to_insert:
- offset = offsets_to_insert.pop()
- self._add_offset_to_buckets(new_buckets, new_mask, offset)
- self.buckets = new_buckets
- self.hash_mask = new_mask
-
def _compute_offsets(self, source):
"""Add the hash entries for this data as a 'source' string."""
cdef unsigned short source_counter
@@ -628,9 +514,11 @@
cdef const_data start
cdef unsigned long num_entries
cdef unsigned int val
+ cdef Py_ssize_t match_pre
+ cdef Py_ssize_t match_tail
cdef int i
cdef Py_ssize_t size
- cdef rabin_offset last
+ cdef rabin_entry *last
cdef RabinBucket bucket
cdef unsigned int hash_offset
@@ -648,27 +536,32 @@
# We walk backwards through the data, so that early matches are the
# first entries in the hash buckets.
start = (<const_data>PyString_AS_STRING(source))
- # TODO: try to align this so the pointers end up on a 4/8-byte
- # alignment
+ end = start + size
+ # Note: trying to align this so the pointers end up on a 4/8-byte
+ # alignment didn't seem to help anything. We only ended up with a
+ # 1/4 chance of having both source and target aligned.
ptr = (start + (num_entries * RABIN_WINDOW) - RABIN_WINDOW)
- last = rabin_offset()
- last.val = ~0 # 0xFFFFFFFF
+ last = NULL
while ptr >= start:
val = 0
for i from 1 <= i <= RABIN_WINDOW:
RABIN_ADD(val, ptr[i])
- if (val == last.val):
+ match_pre = ptr - start
+ if match_pre > MAX_MATCH_PRE:
+ match_pre = MAX_MATCH_PRE
+ match_tail = end - ptr
+ if match_tail > MAX_MATCH_TAIL:
+ match_tail = MAX_MATCH_TAIL
+ if (last != NULL and val == last.rabin_val):
# Keep the most recent of consecutive entries, which means the
# closest to the start of the source
# XXX: test this
last.ptr = (ptr + RABIN_WINDOW)
+ last.match_pre = <unsigned char>match_pre
+ last.match_tail = <unsigned short>match_tail
num_entries -= 1
else:
- last = rabin_offset()
- last.ptr = (ptr + RABIN_WINDOW)
- last.val = val
- last.source = source_counter
- self._add_offset(last)
+ last = self._hash_map.add(ptr, val, match_pre, match_tail)
ptr -= RABIN_WINDOW
def _compute_offsets_from_delta(self, delta_source):
@@ -676,12 +569,15 @@
cdef const_data ptr
cdef const_data start
cdef const_data end
+ cdef const_data insert_start
+ cdef const_data insert_end
+ cdef Py_ssize_t match_pre
+ cdef Py_ssize_t match_tail
cdef unsigned long num_entries
cdef unsigned int val, prev_val
cdef unsigned char c
cdef int i
cdef Py_ssize_t size
- cdef rabin_offset this
cdef RabinBucket bucket
cdef unsigned int hash_offset
@@ -713,8 +609,10 @@
if c & 0x20: ptr += 1
if c & 0x40: ptr += 1
elif c != 0:
+ insert_start = ptr
+ insert_end = ptr + c
# Insert command, grab these bytes to index, length is 'c'
- if ptr + c > end:
+ if insert_end > end:
# Actually an invalid command...
raise ValueError('insert command of %d bytes at offset'
' %d goes off the end of delta bytes'
@@ -724,6 +622,12 @@
# TODO: try to align this so the pointers end up at 4/8-byte
# alignment
while c > RABIN_WINDOW + 3:
+ match_pre = ptr - insert_start
+ if match_pre > MAX_MATCH_PRE:
+ match_pre = MAX_MATCH_PRE
+ match_tail = insert_end - ptr
+ if match_tail > MAX_MATCH_TAIL:
+ match_tail = MAX_MATCH_TAIL
val = 0
# We shift-by-one because the matching code skips the first
# byte
@@ -734,11 +638,7 @@
if val == prev_val:
# Keep only the first of matching sequences
continue
- this = rabin_offset()
- this.ptr = ptr
- this.val = val
- this.source = source_counter
- self._add_offset(this)
+ last = self._hash_map.add(ptr, val, match_pre, match_tail)
num_entries += 1
ptr += c
else:
@@ -831,7 +731,7 @@
start_offset = self.total_source_bytes + extra_offset
new_info = SourceInfo(source, start_offset, is_delta=False)
self.sources.append(new_info)
- self._ensure_enough_buckets(len(source))
+ self._hash_map.reserve(len(source) / RABIN_WINDOW)
self._compute_offsets(source)
self._limit_hash_buckets()
self.total_source_bytes += len(source) + extra_offset
@@ -940,7 +840,7 @@
while offset is not None:
this = offset
offset = offset.next
- if this.val != cur_rabin_val:
+ if this.rabin_val != cur_rabin_val:
continue
ref = this.ptr
ref_source = self.index.sources[this.source]
=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py 2010-11-19 18:48:52 +0000
+++ b/bzrlib/tests/test__delta_index.py 2010-11-19 20:05:10 +0000
@@ -395,137 +395,6 @@
, delta)
-
-class TestRabinIndexComputeNumHashBuckets(TestCaseWithRabinIndex):
-
- _module = None # set by load_tests()
-
- def assertNumBuckets(self, expected, new_size):
- self.assertEqual(expected,
- self.index._compute_num_hash_buckets(new_size))
-
- def test_min_buckets(self):
- self.assertNumBuckets(16, 0)
- self.assertNumBuckets(16, 1)
-
- def test_max_buckets(self):
- self.assertNumBuckets(2**31 / (16 * 4), 2**31)
- self.assertNumBuckets(2**32 / (16 * 4), (2**32 - 1))
- # Even with enormous content, we explicitly cap at 2**30 hash buckets,
- # just for safety sake.
- self.assertNumBuckets(2**30, 2**48)
-
- def test_reasonable_buckets(self):
- # RABIN_WINDOW is 16 bytes. We skip the 'first' byte, so we need enough
- # buckets to fit N entries, and we default to 4 collisions per bucket
- # the value will also always be a power of two
- self.assertNumBuckets(16, 1024)
- # Everything still fits in 16 buckets (16 buckets * 4 entries per
- # bucket * 16 bytes per entry = 1088)
- self.assertNumBuckets(16, 1088)
- # Now we need 17 buckets, so we jump to the next power of 2
- self.assertNumBuckets(32, 1089)
- # This is going to be a common size
- self.assertNumBuckets(65536, 4*1024*1024)
-
- def test_includes_current_entries(self):
- self.assertNumBuckets(16, 1024)
- # if we already have 64 entries, we'll need twice as many buckets
- self.index.num_entries = 64
- self.assertNumBuckets(32, 1024)
-
-
-class TestRabinIndex_ensure_enough_buckets(TestCaseWithRabinIndex):
-
- def test_create_empty_buckets(self):
- self.assertEqual(16, len(self.index.buckets))
- self.assertEqual([None]*16, self.index.buckets)
- self.assertEqual(15, self.index.hash_mask)
-
- def test_buckets_get_reassigned(self):
- bucket = self._module.RabinBucket()
- self.index.buckets[1] = bucket
- self.index.num_entries = 1
- offset = self._module.rabin_offset()
- offset.val = 17
- bucket.first = offset
- bucket.count = 1
- # At this point, we should get a 16-wide buckets array, with the offset
- # moved to the right location
- self.assertEqual(16, len(self.index.buckets))
- self.assertEqual(15, self.index.hash_mask)
- bucket = self.index.buckets[1]
- self.assertIsNot(None, bucket)
- self.assertIs(offset, bucket.first)
- self.assertEqual(1, bucket.count)
- self.assertEqual([None], self.index.buckets[:1])
- self.assertEqual([None]*14, self.index.buckets[2:])
- # Now switch to a wider bucket
- self.index._ensure_enough_buckets(2048)
- self.assertEqual(32, len(self.index.buckets))
- self.assertEqual(31, self.index.hash_mask)
- bucket = self.index.buckets[17]
- self.assertIsNot(None, bucket)
- self.assertIs(offset, bucket.first)
- self.assertEqual(1, bucket.count)
- self.assertEqual([None]*17, self.index.buckets[:17])
- self.assertEqual([None]*14, self.index.buckets[18:])
-
- def test_shrinking_buckets(self):
- self.index._ensure_enough_buckets(2048)
- self.assertEqual(32, len(self.index.buckets))
- self.index._py_add_offset(1, 0)
- self.index._py_add_offset(1, 1)
- self.index._py_add_offset(1, 2)
- bucket = self.index.buckets[1]
- self.assertIsNot(None, bucket)
- offset = bucket.first
- self.assertEqual(2, offset.source)
- offset = offset.next
- self.assertEqual(1, offset.source)
- offset = offset.next
- self.assertEqual(0, offset.source)
- self.assertIs(None, offset.next)
- # After resizing, we want to preserve the order
- self.index._ensure_enough_buckets(0)
- self.assertEqual(16, len(self.index.buckets))
- bucket = self.index.buckets[1]
- self.assertIsNot(None, bucket)
- offset = bucket.first
- self.assertEqual(2, offset.source)
- offset = offset.next
- self.assertEqual(1, offset.source)
- offset = offset.next
- self.assertEqual(0, offset.source)
- self.assertIs(None, offset.next)
-
-
-class TestRabinIndex_add_offset(TestCaseWithRabinIndex):
-
- def test_add_offset_simple(self):
- offset = self.index._py_add_offset(1, 0)
- self.assertEqual(1, offset.val)
- self.assertEqual(0, offset.source)
- self.assertEqual(1, self.index.num_entries)
- self.assertIsNot(None, self.index.buckets[1])
- bucket = self.index.buckets[1]
- self.assertEqual(1, bucket.count)
- offset = bucket.first
- self.assertIsNot(None, offset)
- self.assertIs(None, offset.next)
-
- def test_add_offset_into_existing_bucket(self):
- offset1 = self.index._py_add_offset(1, 0)
- offset17 = self.index._py_add_offset(17, 0)
- self.assertEqual(2, self.index.num_entries)
- self.assertIsNot(None, self.index.buckets[1])
- bucket = self.index.buckets[1]
- self.assertEqual(2, bucket.count)
- self.assertIs(offset17, bucket.first)
- self.assertIs(offset1, offset17.next)
- self.assertIs(None, offset1.next)
-
-
class TestRabinMacros(tests.TestCase):
_module = None # set by load_tests()
More information about the bazaar-commits
mailing list