Rev 5531: Now working as generally expected. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Tue Nov 23 18:43:27 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5531
revision-id: john at arbash-meinel.com-20101123184250-bwayj00cwd2r2q20
parent: john at arbash-meinel.com-20101122225006-y8e23h0r864l1mnr
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Tue 2010-11-23 12:42:50 -0600
message:
  Now working as generally expected.
  
  
  I'm now seeing a few strings that appear more often than I would expect.
  A run of all spaces (' '*16) is quite frequent in texts.
  I'm fairly happy with the compaction logic, though I think I'd like
  it to be done with C structures, rather than by sorting a Python list
  of tuples.
  
  Current time is 3m39s vs 3m3s, or about 20% slower overall. Slightly
  larger, but still <1% difference.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-11-22 22:50:06 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-11-23 18:42:50 +0000
@@ -85,12 +85,6 @@
     unsigned char match_pre
 
 
-cdef class RabinEntryProxy:
-    """Just a wrapper around a rabin_entry struct, so we can use python."""
-
-    cdef rabin_entry *entry
-
-
 cdef class RabinHashMap
 
 # This encapsulates the context needed to search for multiple values that match
@@ -131,22 +125,27 @@
     cdef rabin_entry* next_with_dummy(self) except? NULL:
         cdef rabin_entry *slot
         while self._step < self._table_mask:
-            # print self.rabin_val, self._step, (
-            #         self._next_hash & self._table_mask)
             slot = self._table + (self._next_hash & self._table_mask)
             self._step += 1
             self._next_hash = self._next_hash + self._step
             if slot.ptr == NULL:
                 # We found a blank entry, so this chain is done
+                if self.free_slot == NULL:
+                    self.free_slot = slot
                 return NULL
             elif slot.ptr == dummy_ptr:
+                if self.free_slot == NULL:
+                    self.free_slot = slot
                 return slot
             if slot.rabin_val != self.rabin_val:
                 continue
             # We found an entry that matches exactly, return it
             self.match_count += 1
             return slot
-        raise RuntimeError('Too many steps in next_with_dummy()')
+        raise RuntimeError("We seem to have stepped too far while searching"
+                           " for rabin_val = %d, we took %d steps, hash %d"
+                           % (self.rabin_val, self._step,
+                              self._next_hash))
 
     cdef next(self):
         cdef rabin_entry *slot
@@ -163,44 +162,15 @@
         # offset.
         # Don't search forever, even if we have bogus entries in the table
         while self._step < self._table_mask:
-            # print self.rabin_val, self._step, (
-            #         self._next_hash & self._table_mask)
-            slot = self._table + (self._next_hash & self._table_mask)
-            self._step += 1
-            self._next_hash = self._next_hash + self._step
-            if slot.ptr == NULL:
-                # We found a blank entry, so this chain is done
-                self.entry = NULL
-                if self.free_slot == NULL:
-                    self.free_slot = slot
-                return
-            elif slot.ptr == dummy_ptr:
-                # We found a dummy entry
-                if self.free_slot == NULL:
-                    self.free_slot = slot
-            if slot.rabin_val == self.rabin_val:
-                # We found an entry that matches exactly, return it
-                self.match_count += 1
-                self.entry = slot
-                return
-        # If we got this far, we are in 'infinite loop' territory
-        debug = []
-        cdef int i
-        for i from 0 <= i <= self._table_mask:
-            slot = self._table + i
-            if slot.ptr == NULL:
-                info = '<null>'
-            elif slot.ptr == dummy_ptr:
-                info = '<dummy>'
-            else:
-                info = '0x%08x' % (<intptr_t>slot.ptr,)
-            debug.append((i, info, slot.rabin_val, slot.global_offset))
-        import pprint
-        s = pprint.pformat(debug)
-        raise RuntimeError("We seem to have stepped to far while searching"
-                           " for rabin_val = %d, we took %d steps, hash %d\n%s"
+            slot = self.next_with_dummy()
+            if (slot != NULL and slot.ptr == dummy_ptr):
+                continue
+            self.entry = slot
+            return
+        raise RuntimeError("We seem to have stepped too far while searching"
+                           " for rabin_val = %d, we took %d steps, hash %d"
                            % (self.rabin_val, self._step,
-                              self._next_hash, s))
+                              self._next_hash))
 
 
 cdef public unsigned int MIN_TABLE_SIZE
@@ -363,21 +333,22 @@
         #   4) For now, we just use a dirt-simple algorithm that says
         #      remove-every-other entry when the bucket gets full. We have to
         #      use 'dummy' entries, because collisions work across buckets.
-        cdef RabinEntryProxy proxy
         cdef list all_entries
         cdef int weight
         cdef unsigned int offset
         cdef rabin_entry *entry
         cdef int i
-        # Must be larger than MAX_HASH_BUCKET_ENTRIES
-        cdef rabin_entry temp_entries[70]
+        cdef int temp_idx
+        # Must be larger than MAX_HASH_BUCKET_ENTRIES. Also, after packing we
+        # add dummy entries, which can cause other hash chains to insert early.
+        # As such, we can have > 64 entries in a chain.
+        cdef rabin_entry temp_entries[128]
 
         search.start(rabin_val)
         search.next()
         all_entries = []
-        i = 0
+        temp_idx = 0
         while search.entry != NULL:
-            proxy.entry = search.entry
             # We're going for a 'weighted entry' style of sorting. The idea is
             # that entries that have a larger area 'around' them can provide
             # better matches than ones with less. Also, that all things equal,
@@ -392,13 +363,13 @@
             # 'pre' bytes is less valuable than 'post' bytes, because of how
             # the matching works, so double-weight post bytes.
             weight += (search.entry.match_tail * 2) + search.entry.match_pre
-            all_entries.append((weight, i))
-            temp_entries[i] = search.entry[0]
-            i += 1
-            if i > 70:
+            all_entries.append((weight, temp_idx))
+            temp_entries[temp_idx] = search.entry[0]
+            temp_idx += 1
+            if temp_idx >= 128:
                 raise RuntimeError('we overflowed the temp_entries list')
+            search.next()
         all_entries.sort(reverse=True)
-        assert len(all_entries) == 64
 
         # Now do the search again, and rewrite the entries. We fill in all
         # slots and all dummy slots with the newly sorted entries. We also
@@ -407,17 +378,26 @@
         search.start(rabin_val)
         entry = search.next_with_dummy()
         while entry != NULL:
-            if i > MAX_HASH_BUCKET_ENTRIES or _keep_map[i] == c'n':
+            if entry.ptr != dummy_ptr:
+                self.entry_count -= 1
+            assert entry.rabin_val == 0 or entry.rabin_val == rabin_val
+            while i < MAX_HASH_BUCKET_ENTRIES and _keep_map[i] == c'n':
+                # Skip entries that are marked to be removed
+                temp_idx = all_entries[i][1]
+                i += 1
+            if i >= MAX_HASH_BUCKET_ENTRIES:
                 # We won't keep this entry, but we have to keep a 'dummy' entry
                 # because of all the other possible hash chains that would step
                 # on it
                 entry.ptr = dummy_ptr
                 entry.rabin_val = 0
             else:
-                entry[0] = temp_entries[all_entries[i][1]]
+                temp_idx = all_entries[i][1]
+                i += 1
+                assert 0 <= temp_idx < 128
+                entry[0] = temp_entries[temp_idx]
+                self.entry_count += 1
             entry = search.next_with_dummy()
-            i += 1
-        self.entry_count -= 32
 
     cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
                           unsigned int val, Py_ssize_t match_pre,
@@ -444,7 +424,7 @@
         entry.ptr = ptr
         entry.global_offset = global_offset
         self.entry_count += 1
-        if search.match_count > MAX_HASH_BUCKET_ENTRIES:
+        if search.match_count + 1 >= MAX_HASH_BUCKET_ENTRIES:
             self._limit_hash_entries(val, search)
         return entry
 

=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py	2010-11-22 20:18:59 +0000
+++ b/bzrlib/tests/test__delta_index.py	2010-11-23 18:42:50 +0000
@@ -225,8 +225,28 @@
         self.assertTrue(self.the_map.entry_count < 128)
         self.assertEqual(self.the_map.entry_count,
                          len(self.the_map._py_find_all(None)))
-        self.assertEqual(len(self.the_map._py_find_all(val1)),
-                         len(self.the_map._py_find_all(val2)))
+        self.assertTrue(len(self.the_map._py_find_all(val1)) < 64)
+        self.assertTrue(len(self.the_map._py_find_all(val2)) < 64)
+
+    def test_add_ugly_collisions(self):
+        self.the_map.reserve(800)
+        # These two values effectively work on the same hash chain.
+        val1 = 12345678
+        val2 = val1 + 2048
+        val3 = val1 + 4096
+        val4 = val1 + 8192
+        for i in range(200):
+            self.the_map._py_add(val1, 1000+i, 0, i)
+            self.the_map._py_add(val2, 2000+i, 0, i)
+            self.the_map._py_add(val3, 3000+i, 0, i)
+            self.the_map._py_add(val4, 4000+i, 0, i)
+        self.assertTrue(self.the_map.entry_count < 64*4)
+        self.assertEqual(self.the_map.entry_count,
+                         len(self.the_map._py_find_all(None)))
+        self.assertTrue(len(self.the_map._py_find_all(val1)) < 64)
+        self.assertTrue(len(self.the_map._py_find_all(val2)) < 64)
+        self.assertTrue(len(self.the_map._py_find_all(val3)) < 64)
+        self.assertTrue(len(self.the_map._py_find_all(val4)) < 64)
 
     def test_handle_null_rabin_val(self):
         # rabin_val('\0'*16) == 0



More information about the bazaar-commits mailing list