Rev 5528: Special casing rabin_value = 0 was a bad idea. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Mon Nov 22 20:19:30 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5528
revision-id: john at arbash-meinel.com-20101122201859-0vxlror7x9gf1r8b
parent: john at arbash-meinel.com-20101122195946-1lkuqx5ak6090xac
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 14:18:59 -0600
message:
Special casing rabin_value = 0 was a bad idea.
Instead, we change _py_add so that it uses a dummy ptr value, just for the
test suite. This isn't 'dummy_ptr' and it isn't NULL, so the rest of the
checks still work.
Ran into real-world data that was computing rabin_hash of null content,
which caused rabin_val == 0, which was acting wrongly in the rest of the
code.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-22 19:59:46 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-22 20:18:59 +0000
@@ -140,22 +140,21 @@
slot = self._table + (self._next_hash & self._table_mask)
self._step += 1
self._next_hash = self._next_hash + self._step
+ if slot.ptr == NULL:
+ # We found a blank entry, so this chain is done
+ self.entry = NULL
+ if self.free_slot == NULL:
+ self.free_slot = slot
+ return
+ elif slot.ptr == dummy_ptr:
+ # We found a dummy entry
+ if self.free_slot == NULL:
+ self.free_slot = slot
if slot.rabin_val == self.rabin_val:
# We found an entry that matches exactly, return it
self.match_count += 1
self.entry = slot
return
- if slot.rabin_val == 0:
- if slot.ptr == NULL:
- # We found a blank entry, so this chain is done
- self.entry = NULL
- if self.free_slot == NULL:
- self.free_slot = slot
- return
- elif slot.ptr == dummy_ptr:
- # We found a dummy entry
- if self.free_slot == NULL:
- self.free_slot = slot
# If we got this far, we are in 'infinite loop' territory
debug = []
cdef int i
@@ -182,6 +181,9 @@
cdef const_data dummy_ptr
dummy_ptr = <const_data>(-1)
+cdef const_data _test_dummy_ptr
+_test_dummy_ptr = <const_data>(-2)
+
cdef class RabinHashMap:
"""Map from RABIN_HASH value to pointers that can match.
@@ -192,8 +194,8 @@
cdef public unsigned int entry_count
cdef public unsigned int table_mask # table_size - 1
# The array of entries.
- # Completely empty slots are noted by having ptr==NULL && rabin_val == 0
- # Dummy entries are noted by having ptr == dummy_ptr && rabin_val == 0
+ # Completely empty slots are noted by having ptr==NULL
+ # Dummy entries are noted by having ptr == dummy_ptr
cdef rabin_entry *_table
# TODO: two options for 'soft' resizing.
@@ -272,8 +274,7 @@
search = RabinEntrySearch(self)
for i from 0 <= i <= old_mask:
slot = old_table + i
- if (slot.rabin_val == 0
- and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
+ if (slot.ptr == NULL or slot.ptr == dummy_ptr):
# An empty slot
continue
search.start(slot.rabin_val)
@@ -391,7 +392,7 @@
"""
assert pre_bytes < 256
assert post_bytes < 65536
- self.add(NULL, global_offset, val, pre_bytes, post_bytes)
+ self.add(_test_dummy_ptr, global_offset, val, pre_bytes, post_bytes)
def _py_find_all(self, val):
"""Find all entries that match the given rabin_val.
@@ -408,9 +409,7 @@
if val is None:
for i from 0 <= i <= self.table_mask:
slot = self._table + i
- if (slot.rabin_val == 0 and slot.ptr == NULL):
- continue
- if (slot.rabin_val == 0 and slot.ptr == dummy_ptr):
+ if slot.ptr == NULL or slot.ptr == dummy_ptr:
# Should we output dummy entries? Could be useful for
# debugging
continue
@@ -425,11 +424,13 @@
search.start(rabin_val)
search.next()
while search.entry != NULL:
- matches.append((search.entry.rabin_val,
- <intptr_t>(search.entry - self._table),
- search.entry.global_offset,
- search.entry.match_pre, search.entry.match_tail,
- search.match_count, search._step))
+ slot = search.entry
+ matches.append((slot.rabin_val,
+ <intptr_t>(slot - self._table),
+ slot.global_offset,
+ slot.match_pre, slot.match_tail,
+ search.match_count, search._step,
+ ))
search.next()
return matches
@@ -544,8 +545,7 @@
matches = {}
for i from 0 <= i <= self._hash_map.table_mask:
slot = self._hash_map._table + i
- if (slot.rabin_val == 0
- and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
+ if (slot.ptr == NULL or slot.ptr == dummy_ptr):
continue
matches[slot.rabin_val] = slot.global_offset
return matches
=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py 2010-11-22 19:59:46 +0000
+++ b/bzrlib/tests/test__delta_index.py 2010-11-22 20:18:59 +0000
@@ -228,6 +228,26 @@
self.assertEqual(len(self.the_map._py_find_all(val1)),
len(self.the_map._py_find_all(val2)))
+ def test_handle_null_rabin_val(self):
+ # rabin_val('\0'*16) == 0
+ # As such, we have to allow it as a valid entry in the hash map, and
+ # match it correctly.
+ self.the_map.reserve(10)
+ self.the_map._py_add(0, 1, 0, 0)
+ self.the_map._py_add(0, 2, 0, 0)
+ self.the_map._py_add(0, 3, 0, 0)
+ self.the_map._py_add(0, 4, 0, 0)
+ self.assertEqual([(0, 0, 1, 0, 0, 1, 1),
+ (0, 1, 2, 0, 0, 2, 2),
+ (0, 3, 3, 0, 0, 3, 3),
+ (0, 6, 4, 0, 0, 4, 4),
+ ], self.the_map._py_find_all(0))
+ self.assertEqual([(0, 0, 1, 0, 0, 0, 0),
+ (0, 1, 2, 0, 0, 0, 0),
+ (0, 3, 3, 0, 0, 0, 0),
+ (0, 6, 4, 0, 0, 0, 0),
+ ], self.the_map._py_find_all(None))
+
class TestRabinIndex(TestCaseWithRabinIndex):
More information about the bazaar-commits
mailing list