Rev 5526: Use a dirt-simple algorithm for limiting hash buckets. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Mon Nov 22 18:55:37 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5526
revision-id: john at arbash-meinel.com-20101122185506-x149dtfvzikngk7x
parent: john at arbash-meinel.com-20101122171321-s0m41he5mvzu6v5u
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 12:55:06 -0600
message:
Use a dirt-simple algorithm for limiting hash buckets.
It drops the time for my cyclical-stress-test from 57s down to 1.2s, which
is perfectly comparable to the original code. This needs some testing on
real-world data to see the impact, but at least we've avoided strictly
pathological behavior.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-22 17:13:21 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-22 18:55:06 +0000
@@ -300,6 +300,39 @@
memset(new_table, 0, n_bytes)
self._swap_table(new_table, table_size - 1)
+ cdef _limit_hash_entries(self, unsigned int rabin_val):
+ """Limit the number of collisions with the given rabin val."""
+ # This is a non-global-optimal limiting algorithm. The basic issues
+ # are:
+ # 1) add() is O(num_collisions), because of cyclical data, this can
+ # be O(data_len) which is quite bad.
+ # 2) The old code used a linked list to get O(1) add, and then did a
+ # global reduction on all collisions. However, it then had to
+ # re-allocate everything to pack it into a 'nicer' structure.
+ # 3) side-note: A possible option is to use a bit-field to indicate
+ # that this hash bucket entry has been turned into a linked-list,
+ # and keep O(1ish) inserts.
+ # 4) For now, we just use a dirt-simple algorithm that says
+ # remove-every-other entry when the bucket gets full. We have to
+ # use 'dummy' entries, because collisions work across buckets.
+ cdef RabinEntrySearch search
+ search = RabinEntrySearch(self)
+ search.start(rabin_val)
+
+ search.next()
+ # We will remove all 'odd' entries, by pointing them at the dummy
+ # pointer. We start the loop at an 'even' (0 is even), and then step
+ # once to get to odd, set that to dummy, and step again.
+ while search.entry != NULL:
+ search.next()
+ if search.entry != NULL:
+ search.entry.rabin_val = 0
+ search.entry.ptr = dummy_ptr
+ self.entry_count -= 1
+ # We could set the search.free_slot here, but we don't actually
+ # care.
+ search.next()
+
cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
unsigned int val, Py_ssize_t match_pre,
Py_ssize_t match_tail,
@@ -325,6 +358,8 @@
entry.ptr = ptr
entry.global_offset = global_offset
self.entry_count += 1
+ if search.match_count > MAX_HASH_BUCKET_ENTRIES:
+ self._limit_hash_entries(val)
return entry
def _py_add(self, val, global_offset, pre_bytes, post_bytes):
@@ -672,7 +707,6 @@
self.sources.append(new_info)
self._hash_map.reserve(len(source) / RABIN_WINDOW)
self._compute_offsets(source, start_offset)
- self._limit_hash_buckets()
self.total_source_bytes += len(source) + extra_offset
def add_delta_source(self, delta_source, extra_offset=0):
@@ -686,7 +720,6 @@
# This is an upper bound of the number of entries we'll find
self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
self._compute_offsets_from_delta(delta_source, start_offset)
- self._limit_hash_buckets()
self.total_source_bytes += len(delta_source) + extra_offset
def make_delta(self, content, max_delta_size=0, noisy=False):
=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py 2010-11-22 17:07:34 +0000
+++ b/bzrlib/tests/test__delta_index.py 2010-11-22 18:55:06 +0000
@@ -212,6 +212,14 @@
], self.the_map._py_find_all(None))
+ def test_add_limits_collisions(self):
+ self.the_map.reserve(200)
+ self.assertEqual(1023, self.the_map.table_mask)
+ for i in range(200):
+ self.the_map._py_add(1, i, 0, 0)
+ self.assertTrue(self.the_map.entry_count < 64)
+
+
class TestRabinIndex(TestCaseWithRabinIndex):
def test_basic(self):
@@ -280,22 +288,6 @@
(val2, (val2 + 1) & 1023, 64, 64, 1, 0, 0),
], None)
- def DONT_test_add_source_limits_matching_buckets(self):
- source = ('x'
- + ('1234567890123456'
- 'abcdefghijklmnop' * 100))
- val1 = self._module._py_rabin(source[1:17])
- val2 = self._module._py_rabin(source[17:33])
- # Both of these val will want to be added 100 times, but both should be
- # restricted to 64
- self.index.add_source(source)
- max_count = self._module.MAX_HASH_BUCKET_ENTRIES
- self.assertEqual(max_count*2,
- self.index.num_entries)
- bucket = self.index.buckets[val1 & self.index.hash_mask]
- self.assertBucketCount(max_count, bucket)
- bucket = self.index.buckets[val2 & self.index.hash_mask]
- self.assertBucketCount(max_count, bucket)
def test_add_delta_source(self):
source = ('x'
More information about the bazaar-commits
mailing list