Rev 5526: Use a dirt-simple algorithm for limiting hash buckets. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Mon Nov 22 18:55:37 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5526
revision-id: john at arbash-meinel.com-20101122185506-x149dtfvzikngk7x
parent: john at arbash-meinel.com-20101122171321-s0m41he5mvzu6v5u
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 12:55:06 -0600
message:
  Use a dirt-simple algorithm for limiting hash buckets.
  
  It drops the time for my cyclical-stress-test from 57s down to 1.2s, which
  is on par with the original code. It still needs testing on real-world data
  to gauge the impact, but at least we've avoided the strictly pathological
  behavior.
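
For illustration, here is a minimal standalone Python sketch of the idea
(hypothetical names; a plain list stands in for the open-addressed table, and
a DUMMY sentinel plays the role of dummy_ptr, which the real Cython code needs
because collision probes run across buckets):

# Illustrative sketch only -- not the Cython implementation in the patch.
MAX_HASH_BUCKET_ENTRIES = 64      # same constant name as in the real code
DUMMY = object()                  # stands in for dummy_ptr

def add_to_bucket(bucket, entry):
    """Append an entry, then cull every other live entry past the cap."""
    bucket.append(entry)
    live = [i for i, e in enumerate(bucket) if e is not DUMMY]
    if len(live) > MAX_HASH_BUCKET_ENTRIES:
        # 0 is even, so even-numbered live entries survive; odd ones are
        # turned into dummies rather than removed, keeping the slots occupied.
        for n, i in enumerate(live):
            if n % 2 == 1:
                bucket[i] = DUMMY

bucket = []
for i in range(200):              # mirrors the new test: 200 adds, one value
    add_to_bucket(bucket, i)
assert sum(e is not DUMMY for e in bucket) < MAX_HASH_BUCKET_ENTRIES

Because the cull runs each time a chain grows past the cap, the number of live
entries for any single rabin value stays bounded, which is what the new
test_add_limits_collisions test asserts.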
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-11-22 17:13:21 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-11-22 18:55:06 +0000
@@ -300,6 +300,39 @@
         memset(new_table, 0, n_bytes)
         self._swap_table(new_table, table_size - 1)
 
+    cdef _limit_hash_entries(self, unsigned int rabin_val):
+        """Limit the number of collisions with the given rabin val."""
+        # This limiting algorithm is not globally optimal. The basic issues
+        # are:
+        #   1) add() is O(num_collisions); with cyclical data this can be
+        #      O(data_len), which is quite bad.
+        #   2) The old code used a linked list to get O(1) add, and then did a
+        #      global reduction on all collisions. However, it then had to
+        #      re-allocate everything to pack it into a 'nicer' structure.
+        #   3) side-note: A possible option is to use a bit-field to indicate
+        #      that this hash bucket entry has been turned into a linked-list,
+        #      and keep O(1ish) inserts.
+        #   4) For now, we just use a dirt-simple algorithm: remove every
+        #      other entry when the bucket gets full. We have to use 'dummy'
+        #      entries, because collisions work across buckets.
+        cdef RabinEntrySearch search
+        search = RabinEntrySearch(self)
+        search.start(rabin_val)
+
+        search.next()
+        # We remove all 'odd' entries by pointing them at the dummy pointer.
+        # We start the loop at an 'even' entry (0 is even), then step once to
+        # reach an odd entry, set it to dummy, and step again.
+        while search.entry != NULL:
+            search.next()
+            if search.entry != NULL:
+                search.entry.rabin_val = 0
+                search.entry.ptr = dummy_ptr
+                self.entry_count -= 1
+                # We could set the search.free_slot here, but we don't actually
+                # care.
+                search.next()
+
     cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
                           unsigned int val, Py_ssize_t match_pre,
                           Py_ssize_t match_tail,
@@ -325,6 +358,8 @@
         entry.ptr = ptr
         entry.global_offset = global_offset
         self.entry_count += 1
+        if search.match_count > MAX_HASH_BUCKET_ENTRIES:
+            self._limit_hash_entries(val)
         return entry
 
     def _py_add(self, val, global_offset, pre_bytes, post_bytes):
@@ -672,7 +707,6 @@
         self.sources.append(new_info)
         self._hash_map.reserve(len(source) / RABIN_WINDOW)
         self._compute_offsets(source, start_offset)
-        self._limit_hash_buckets()
         self.total_source_bytes += len(source) + extra_offset
 
     def add_delta_source(self, delta_source, extra_offset=0):
@@ -686,7 +720,6 @@
         # This is an upper bound of the number of entries we'll find
         self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
         self._compute_offsets_from_delta(delta_source, start_offset)
-        self._limit_hash_buckets()
         self.total_source_bytes += len(delta_source) + extra_offset
 
     def make_delta(self, content, max_delta_size=0, noisy=False):

=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py	2010-11-22 17:07:34 +0000
+++ b/bzrlib/tests/test__delta_index.py	2010-11-22 18:55:06 +0000
@@ -212,6 +212,14 @@
                          ], self.the_map._py_find_all(None))
 
 
+    def test_add_limits_collisions(self):
+        self.the_map.reserve(200)
+        self.assertEqual(1023, self.the_map.table_mask)
+        for i in range(200):
+            self.the_map._py_add(1, i, 0, 0)
+        self.assertTrue(self.the_map.entry_count < 64)
+
+
 class TestRabinIndex(TestCaseWithRabinIndex):
 
     def test_basic(self):
@@ -280,22 +288,6 @@
                                 (val2, (val2 + 1) & 1023, 64, 64, 1, 0, 0),
                                ], None)
 
-    def DONT_test_add_source_limits_matching_buckets(self):
-        source = ('x'
-            + ('1234567890123456'
-               'abcdefghijklmnop' * 100))
-        val1 = self._module._py_rabin(source[1:17])
-        val2 = self._module._py_rabin(source[17:33])
-        # Both of these val will want to be added 100 times, but both should be
-        # restricted to 64
-        self.index.add_source(source)
-        max_count = self._module.MAX_HASH_BUCKET_ENTRIES
-        self.assertEqual(max_count*2,
-                         self.index.num_entries)
-        bucket = self.index.buckets[val1 & self.index.hash_mask]
-        self.assertBucketCount(max_count, bucket)
-        bucket = self.index.buckets[val2 & self.index.hash_mask]
-        self.assertBucketCount(max_count, bucket)
 
     def test_add_delta_source(self):
         source = ('x'


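As background on the pathological case the commit message refers to: with
cyclical input (the removed test used 'abcdefghijklmnop' * 100), every
repetition of the cycle produces the same rabin values, so each insert for a
given value has to walk the whole existing chain. A rough, hypothetical cost
model of that effect, assuming one step per entry probed:

# Hypothetical cost model, not taken from the patch: count the probe steps
# needed to insert n colliding entries, with and without the cap.
def probe_cost(n, cap=None):
    total, chain = 0, 0
    for _ in range(n):
        total += chain                 # walk the existing chain first
        chain += 1
        if cap is not None and chain > cap:
            chain = (chain + 1) // 2   # the every-other-entry cull
    return total

assert probe_cost(10000) == 10000 * 9999 // 2   # quadratic without a cap
assert probe_cost(10000, cap=64) < 10000 * 64   # roughly linear with it

That quadratic-versus-linear gap is the shape of the 57s-to-1.2s difference
reported above; the exact numbers depend on the real data and constants.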
