Rev 5527: Found a bug in how we were dealing with RabinEntrySearch. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Mon Nov 22 20:00:17 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5527
revision-id: john at arbash-meinel.com-20101122195946-1lkuqx5ak6090xac
parent: john at arbash-meinel.com-20101122185506-x149dtfvzikngk7x
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 13:59:46 -0600
message:
Found a bug in how we were dealing with RabinEntrySearch.
We were treating end-of-search as entry == NULL, when Search was treating it
as entry.ptr == NULL. However, we have free_slot available, and it does
what we needed for insert purposes. So we'll just use entry == NULL to
signal the end of interesting nodes.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-22 18:55:06 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-22 19:59:46 +0000
@@ -135,6 +135,8 @@
# offset.
# Don't search forever, even if we have bogus entries in the table
while self._step < self._table_mask:
+ # print self.rabin_val, self._step, (
+ # self._next_hash & self._table_mask)
slot = self._table + (self._next_hash & self._table_mask)
self._step += 1
self._next_hash = self._next_hash + self._step
@@ -146,16 +148,32 @@
if slot.rabin_val == 0:
if slot.ptr == NULL:
# We found a blank entry, so this chain is done
- if self.free_slot != NULL:
- self.entry = self.free_slot
- else:
- self.entry = NULL
+ self.entry = NULL
+ if self.free_slot == NULL:
self.free_slot = slot
return
elif slot.ptr == dummy_ptr:
# We found a dummy entry
if self.free_slot == NULL:
self.free_slot = slot
+ # If we got this far, we are in 'infinite loop' territory
+ debug = []
+ cdef int i
+ for i from 0 <= i <= self._table_mask:
+ slot = self._table + i
+ if slot.ptr == NULL:
+ info = '<null>'
+ elif slot.ptr == dummy_ptr:
+ info = '<dummy>'
+ else:
+ info = '0x%08x' % (<intptr_t>slot.ptr,)
+ debug.append((i, info, slot.rabin_val, slot.global_offset))
+ import pprint
+ s = pprint.pformat(debug)
+ raise RuntimeError("We seem to have stepped to far while searching"
+ " for rabin_val = %d, we took %d steps, hash %d\n%s"
+ % (self.rabin_val, self._step,
+ self._next_hash, s))
cdef public unsigned int MIN_TABLE_SIZE
@@ -300,7 +318,8 @@
memset(new_table, 0, n_bytes)
self._swap_table(new_table, table_size - 1)
- cdef _limit_hash_entries(self, unsigned int rabin_val):
+ cdef _limit_hash_entries(self, unsigned int rabin_val,
+ RabinEntrySearch search):
"""Limit the number of collisions with the given rabin val."""
# This is a non-global-optimal limiting algorithm. The basic issues
# are:
@@ -315,23 +334,24 @@
# 4) For now, we just use a dirt-simple algorithm that says
# remove-every-other entry when the bucket gets full. We have to
# use 'dummy' entries, because collisions work across buckets.
- cdef RabinEntrySearch search
- search = RabinEntrySearch(self)
search.start(rabin_val)
-
search.next()
# We will remove all 'odd' entries, by pointing them at the dummy
# pointer. We start the loop at an 'even' (0 is even), and then step
# once to get to odd, set that to dummy, and step again.
+ count = 0
while search.entry != NULL:
search.next()
- if search.entry != NULL:
- search.entry.rabin_val = 0
- search.entry.ptr = dummy_ptr
- self.entry_count -= 1
- # We could set the search.free_slot here, but we don't actually
- # care.
- search.next()
+ if search.entry == NULL:
+ break
+ search.entry.rabin_val = 0
+ search.entry.ptr = dummy_ptr
+ self.entry_count -= 1
+ # We could set the search.free_slot here, but we don't actually
+ # care.
+ count += 1
+ search.next()
+ # print "dummified %d entries" % (count,)
cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
unsigned int val, Py_ssize_t match_pre,
@@ -359,7 +379,7 @@
entry.global_offset = global_offset
self.entry_count += 1
if search.match_count > MAX_HASH_BUCKET_ENTRIES:
- self._limit_hash_entries(val)
+ self._limit_hash_entries(val, search)
return entry
def _py_add(self, val, global_offset, pre_bytes, post_bytes):
=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py 2010-11-22 18:55:06 +0000
+++ b/bzrlib/tests/test__delta_index.py 2010-11-22 19:59:46 +0000
@@ -213,11 +213,20 @@
def test_add_limits_collisions(self):
- self.the_map.reserve(200)
- self.assertEqual(1023, self.the_map.table_mask)
+ self.the_map.reserve(400)
+ # We fill up the map with duplicates, but not in direct order. We make
+ # sure that even when limited, we can still access all the buckets,
+ # etc.
+ val1 = 12345678
+ val2 = 12345679
for i in range(200):
- self.the_map._py_add(1, i, 0, 0)
- self.assertTrue(self.the_map.entry_count < 64)
+ self.the_map._py_add(val1, i, 0, 0)
+ self.the_map._py_add(val2, 2000+i, 0, 0)
+ self.assertTrue(self.the_map.entry_count < 128)
+ self.assertEqual(self.the_map.entry_count,
+ len(self.the_map._py_find_all(None)))
+ self.assertEqual(len(self.the_map._py_find_all(val1)),
+ len(self.the_map._py_find_all(val2)))
class TestRabinIndex(TestCaseWithRabinIndex):
More information about the bazaar-commits
mailing list