Rev 5528: Special casing rabin_value = 0 was a bad idea. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Mon Nov 22 20:19:30 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5528
revision-id: john at arbash-meinel.com-20101122201859-0vxlror7x9gf1r8b
parent: john at arbash-meinel.com-20101122195946-1lkuqx5ak6090xac
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 14:18:59 -0600
message:
  Special casing rabin_value = 0 was a bad idea.
  
  Instead, we change _py_add so that it uses a dummy ptr value, just for the
  test suite. This isn't 'dummy_ptr' and it isn't NULL, so the rest of the
  checks still work.
  
  Ran into real-world data that was computing rabin_hash of null content,
  which caused rabin_val == 0, which was acting wrongly in the rest of the
  code.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-11-22 19:59:46 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-11-22 20:18:59 +0000
@@ -140,22 +140,21 @@
             slot = self._table + (self._next_hash & self._table_mask)
             self._step += 1
             self._next_hash = self._next_hash + self._step
+            if slot.ptr == NULL:
+                # We found a blank entry, so this chain is done
+                self.entry = NULL
+                if self.free_slot == NULL:
+                    self.free_slot = slot
+                return
+            elif slot.ptr == dummy_ptr:
+                # We found a dummy entry
+                if self.free_slot == NULL:
+                    self.free_slot = slot
             if slot.rabin_val == self.rabin_val:
                 # We found an entry that matches exactly, return it
                 self.match_count += 1
                 self.entry = slot
                 return
-            if slot.rabin_val == 0:
-                if slot.ptr == NULL:
-                    # We found a blank entry, so this chain is done
-                    self.entry = NULL
-                    if self.free_slot == NULL:
-                        self.free_slot = slot
-                    return
-                elif slot.ptr == dummy_ptr:
-                    # We found a dummy entry
-                    if self.free_slot == NULL:
-                        self.free_slot = slot
         # If we got this far, we are in 'infinite loop' territory
         debug = []
         cdef int i
@@ -182,6 +181,9 @@
 cdef const_data dummy_ptr
 dummy_ptr = <const_data>(-1)
 
+cdef const_data _test_dummy_ptr
+_test_dummy_ptr = <const_data>(-2)
+
 cdef class RabinHashMap:
     """Map from RABIN_HASH value to pointers that can match.
 
@@ -192,8 +194,8 @@
     cdef public unsigned int entry_count
     cdef public unsigned int table_mask # table_size - 1
     # The array of entries.
-    # Completely empty slots are noted by having ptr==NULL && rabin_val == 0
-    # Dummy entries are noted by having ptr == dummy_ptr && rabin_val == 0
+    # Completely empty slots are noted by having ptr==NULL
+    # Dummy entries are noted by having ptr == dummy_ptr
     cdef rabin_entry *_table
 
     # TODO: two options for 'soft' resizing.
@@ -272,8 +274,7 @@
             search = RabinEntrySearch(self)
             for i from 0 <= i <= old_mask:
                 slot = old_table + i
-                if (slot.rabin_val == 0
-                    and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
+                if (slot.ptr == NULL or slot.ptr == dummy_ptr):
                     # An empty slot
                     continue
                 search.start(slot.rabin_val)
@@ -391,7 +392,7 @@
         """
         assert pre_bytes < 256
         assert post_bytes < 65536
-        self.add(NULL, global_offset, val, pre_bytes, post_bytes)
+        self.add(_test_dummy_ptr, global_offset, val, pre_bytes, post_bytes)
 
     def _py_find_all(self, val):
         """Find all entries that match the given rabin_val.
@@ -408,9 +409,7 @@
         if val is None:
             for i from 0 <= i <= self.table_mask:
                 slot = self._table + i
-                if (slot.rabin_val == 0 and slot.ptr == NULL):
-                    continue
-                if (slot.rabin_val == 0 and slot.ptr == dummy_ptr):
+                if slot.ptr == NULL or slot.ptr == dummy_ptr:
                     # Should we output dummy entries? Could be useful for
                     # debugging
                     continue
@@ -425,11 +424,13 @@
             search.start(rabin_val)
             search.next()
             while search.entry != NULL:
-                matches.append((search.entry.rabin_val,
-                                <intptr_t>(search.entry - self._table),
-                                search.entry.global_offset,
-                                search.entry.match_pre, search.entry.match_tail,
-                                search.match_count, search._step))
+                slot = search.entry
+                matches.append((slot.rabin_val,
+                                <intptr_t>(slot - self._table),
+                                slot.global_offset,
+                                slot.match_pre, slot.match_tail,
+                                search.match_count, search._step,
+                                ))
                 search.next()
         return matches
 
@@ -544,8 +545,7 @@
         matches = {}
         for i from 0 <= i <= self._hash_map.table_mask:
             slot = self._hash_map._table + i
-            if (slot.rabin_val == 0
-                and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
+            if (slot.ptr == NULL or slot.ptr == dummy_ptr):
                 continue
             matches[slot.rabin_val] = slot.global_offset
         return matches

=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py	2010-11-22 19:59:46 +0000
+++ b/bzrlib/tests/test__delta_index.py	2010-11-22 20:18:59 +0000
@@ -228,6 +228,26 @@
         self.assertEqual(len(self.the_map._py_find_all(val1)),
                          len(self.the_map._py_find_all(val2)))
 
+    def test_handle_null_rabin_val(self):
+        # rabin_val('\0'*16) == 0
+        # As such, we have to allow it as a valid entry in the hash map, and
+        # match it correctly.
+        self.the_map.reserve(10)
+        self.the_map._py_add(0, 1, 0, 0)
+        self.the_map._py_add(0, 2, 0, 0)
+        self.the_map._py_add(0, 3, 0, 0)
+        self.the_map._py_add(0, 4, 0, 0)
+        self.assertEqual([(0, 0, 1, 0, 0, 1, 1),
+                          (0, 1, 2, 0, 0, 2, 2),
+                          (0, 3, 3, 0, 0, 3, 3),
+                          (0, 6, 4, 0, 0, 4, 4),
+                         ], self.the_map._py_find_all(0))
+        self.assertEqual([(0, 0, 1, 0, 0, 0, 0),
+                          (0, 1, 2, 0, 0, 0, 0),
+                          (0, 3, 3, 0, 0, 0, 0),
+                          (0, 6, 4, 0, 0, 0, 0),
+                         ], self.the_map._py_find_all(None))
+
 
 class TestRabinIndex(TestCaseWithRabinIndex):
 



More information about the bazaar-commits mailing list