Rev 5527: Found a bug in how we were dealing with RabinEntrySearch. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Mon Nov 22 20:00:17 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5527
revision-id: john at arbash-meinel.com-20101122195946-1lkuqx5ak6090xac
parent: john at arbash-meinel.com-20101122185506-x149dtfvzikngk7x
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 13:59:46 -0600
message:
Found a bug in how we were dealing with RabinEntrySearch.
We were treating end-of-search as entry == NULL, when Search was treating it
as entry.ptr == NULL. However, we have free_slot available, and it does
what we needed for insert purposes. So we'll just use entry == NULL to
signal the end of interesting nodes.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-22 18:55:06 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-22 19:59:46 +0000
@@ -135,6 +135,8 @@
# offset.
# Don't search forever, even if we have bogus entries in the table
while self._step < self._table_mask:
+ # print self.rabin_val, self._step, (
+ # self._next_hash & self._table_mask)
slot = self._table + (self._next_hash & self._table_mask)
self._step += 1
self._next_hash = self._next_hash + self._step
@@ -146,16 +148,32 @@
if slot.rabin_val == 0:
if slot.ptr == NULL:
# We found a blank entry, so this chain is done
- if self.free_slot != NULL:
- self.entry = self.free_slot
- else:
- self.entry = NULL
+ self.entry = NULL
+ if self.free_slot == NULL:
self.free_slot = slot
return
elif slot.ptr == dummy_ptr:
# We found a dummy entry
if self.free_slot == NULL:
self.free_slot = slot
+ # If we got this far, we are in 'infinite loop' territory
+ debug = []
+ cdef int i
+ for i from 0 <= i <= self._table_mask:
+ slot = self._table + i
+ if slot.ptr == NULL:
+ info = '<null>'
+ elif slot.ptr == dummy_ptr:
+ info = '<dummy>'
+ else:
+ info = '0x%08x' % (<intptr_t>slot.ptr,)
+ debug.append((i, info, slot.rabin_val, slot.global_offset))
+ import pprint
+ s = pprint.pformat(debug)
+ raise RuntimeError("We seem to have stepped to far while searching"
+ " for rabin_val = %d, we took %d steps, hash %d\n%s"
+ % (self.rabin_val, self._step,
+ self._next_hash, s))
cdef public unsigned int MIN_TABLE_SIZE
@@ -300,7 +318,8 @@
memset(new_table, 0, n_bytes)
self._swap_table(new_table, table_size - 1)
- cdef _limit_hash_entries(self, unsigned int rabin_val):
+ cdef _limit_hash_entries(self, unsigned int rabin_val,
+ RabinEntrySearch search):
"""Limit the number of collisions with the given rabin val."""
# This is a non-global-optimal limiting algorithm. The basic issues
# are:
@@ -315,23 +334,24 @@
# 4) For now, we just use a dirt-simple algorithm that says
# remove-every-other entry when the bucket gets full. We have to
# use 'dummy' entries, because collisions work across buckets.
- cdef RabinEntrySearch search
- search = RabinEntrySearch(self)
search.start(rabin_val)
-
search.next()
# We will remove all 'odd' entries, by pointing them at the dummy
# pointer. We start the loop at an 'even' (0 is even), and then step
# once to get to odd, set that to dummy, and step again.
+ count = 0
while search.entry != NULL:
search.next()
- if search.entry != NULL:
- search.entry.rabin_val = 0
- search.entry.ptr = dummy_ptr
- self.entry_count -= 1
- # We could set the search.free_slot here, but we don't actually
- # care.
- search.next()
+ if search.entry == NULL:
+ break
+ search.entry.rabin_val = 0
+ search.entry.ptr = dummy_ptr
+ self.entry_count -= 1
+ # We could set the search.free_slot here, but we don't actually
+ # care.
+ count += 1
+ search.next()
+ # print "dummified %d entries" % (count,)
cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
unsigned int val, Py_ssize_t match_pre,
@@ -359,7 +379,7 @@
entry.global_offset = global_offset
self.entry_count += 1
if search.match_count > MAX_HASH_BUCKET_ENTRIES:
- self._limit_hash_entries(val)
+ self._limit_hash_entries(val, search)
return entry
def _py_add(self, val, global_offset, pre_bytes, post_bytes):
=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py 2010-11-22 18:55:06 +0000
+++ b/bzrlib/tests/test__delta_index.py 2010-11-22 19:59:46 +0000
@@ -213,11 +213,20 @@
def test_add_limits_collisions(self):
- self.the_map.reserve(200)
- self.assertEqual(1023, self.the_map.table_mask)
+ self.the_map.reserve(400)
+ # We fill up the map with duplicates, but not in direct order. We make
+ # sure that even when limited, we can still access all the buckets,
+ # etc.
+ val1 = 12345678
+ val2 = 12345679
for i in range(200):
- self.the_map._py_add(1, i, 0, 0)
- self.assertTrue(self.the_map.entry_count < 64)
+ self.the_map._py_add(val1, i, 0, 0)
+ self.the_map._py_add(val2, 2000+i, 0, 0)
+ self.assertTrue(self.the_map.entry_count < 128)
+ self.assertEqual(self.the_map.entry_count,
+ len(self.the_map._py_find_all(None)))
+ self.assertEqual(len(self.the_map._py_find_all(val1)),
+ len(self.the_map._py_find_all(val2)))
class TestRabinIndex(TestCaseWithRabinIndex):
More information about the bazaar-commits
mailing list