Rev 5492: Add code to resize the hash bucket list. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Thu Oct 14 22:46:10 BST 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5492
revision-id: john at arbash-meinel.com-20101014214602-x3o61jymo86qduj7
parent: john at arbash-meinel.com-20101014214537-mpc8t8xrsive2z6m
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Thu 2010-10-14 16:46:02 -0500
message:
  Add code to resize the hash bucket list.
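
The resize amounts to allocating a larger power-of-two bucket list and rehashing every chained offset into its new slot via ``val & mask``. Below is a plain-Python sketch of that rehash step; the real implementation is the Pyrex code in the attached diff, and the ``Bucket``/``Offset`` classes here are illustrative stand-ins, not bzrlib types:

    class Bucket(object):
        """Stand-in for RabinBucket: head of a chain plus a count."""
        def __init__(self):
            self.first = None
            self.count = 0


    class Offset(object):
        """Stand-in for rabin_offset: a 32-bit value and a chain link."""
        def __init__(self, val):
            self.val = val
            self.next = None


    def rehash(old_buckets, num_buckets):
        """Rebuild the bucket list at a new power-of-two size.

        Every offset moves to bucket ``val & (num_buckets - 1)`` by being
        prepended to that bucket's chain.
        """
        new_buckets = [None] * num_buckets
        mask = num_buckets - 1
        for bucket in old_buckets:
            if bucket is None:
                continue
            offset = bucket.first
            while offset is not None:
                # Capture the old chain link first: prepending re-points
                # offset.next at the new bucket's previous head.
                next_offset = offset.next
                target = new_buckets[offset.val & mask]
                if target is None:
                    target = Bucket()
                    new_buckets[offset.val & mask] = target
                offset.next = target.first
                target.first = offset
                target.count += 1
                offset = next_offset
        return new_buckets

Prepending keeps each move O(1), at the cost of reversing the order of offsets within a chain.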
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-10-12 22:26:04 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-10-14 21:46:02 +0000
@@ -37,18 +37,22 @@
     #  pointer.)
     cdef const_data ptr
     # The next rabin_offset that matches this val
-    cdef rabin_offset next
+    cdef public rabin_offset next
     # The 32-bit Rabin value that matches this location
-    cdef unsigned int val
+    cdef public unsigned int val
     # The source that we can find this string in (TODO: consider a source
     # pointer vs a source count)
-    cdef unsigned short source
+    cdef public unsigned short source
 
 
 cdef class RabinBucket:
-    cdef rabin_offset first
+    cdef public rabin_offset first
     cdef public unsigned int count
 
+    def __init__(self):
+        self.first = None
+        self.count = 0
+
 
 cdef class SourceInfo:
     # The actual content data for this buffer, should be exactly a PyString
@@ -81,6 +85,12 @@
     # List of SourceInfo objects, must always be < 64k
     cdef public object sources
 
+    def __init__(self):
+        self.hash_mask = 0
+        self.buckets = [None]
+        self.num_entries = 0
+        self.sources = []
+
     def _compute_num_hash_buckets(self, new_size):
         """Determine a power-of-two number of hash buckets to use.
 
@@ -93,7 +103,7 @@
         cdef unsigned long long max_buckets
         cdef unsigned long long raw_size
         cdef unsigned long long num_entries
-        cdef unsigned long num_buckets
+        cdef long num_buckets
         cdef unsigned int exponent
 
         raw_size = new_size
@@ -109,11 +119,56 @@
         max_buckets = num_entries / DEFAULT_NUM_COLLISIONS
         # Minimum of 16 buckets
         exponent = DEFAULT_MIN_BUCKET_EXPONENT
-        while (1ull << exponent) < max_buckets and exponent < 31:
+        # 2^30 is the largest that fits cleanly in a 32-bit signed long (PyInt)
+        while (1ull << exponent) < max_buckets and exponent < 30:
             exponent += 1
         num_buckets = 1 << exponent
         return num_buckets
 
+    cdef _add_offset_to_buckets(self, list buckets, long mask,
+                                rabin_offset offset):
+        cdef RabinBucket bucket
+        cdef long new_index
+
+        new_index = offset.val & mask
+        bucket = buckets[new_index]
+        if bucket is None:
+            bucket = RabinBucket()
+            buckets[new_index] = bucket
+        offset.next = bucket.first
+        bucket.first = offset
+        bucket.count += 1
+
+    cdef _add_offset(self, rabin_offset offset):
+        self._add_offset_to_buckets(self.buckets, self.hash_mask, offset)
+
+    def _ensure_enough_buckets(self, new_size):
+        """Make sure we have enough buckets ready."""
+        cdef long num_buckets
+        cdef long new_mask
+        cdef RabinBucket bucket
+        cdef rabin_offset offset, next_offset
+        cdef list new_buckets
+
+        num_buckets = self._compute_num_hash_buckets(new_size)
+        if len(self.buckets) == num_buckets:
+            # Already done
+            return
+        # Time to resize
+        new_buckets = [None] * num_buckets
+        new_mask = num_buckets - 1
+        for bucket in self.buckets:
+            if bucket is None:
+                continue
+            offset = bucket.first
+            while offset is not None:
+                # Prepending via _add_offset_to_buckets overwrites offset.next,
+                # so capture the old chain link before re-inserting it.
+                next_offset = offset.next
+                self._add_offset_to_buckets(new_buckets, new_mask, offset)
+                offset = next_offset
+        self.buckets = new_buckets
+
     def add_source(self, source):
         """Add a source of more data to delta against.
 

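The companion sizing rule, ``_compute_num_hash_buckets`` (partly visible in the hunk above), picks the smallest power of two that keeps chains around DEFAULT_NUM_COLLISIONS entries, clamped between 2**4 and, after this revision, 2**30 buckets. A standalone sketch of that rule follows; the constants and the entries-per-size estimate are inferred from the tests below, not copied from the elided method body:

    RABIN_WINDOW = 16           # bytes per indexed entry (per the test comments)
    DEFAULT_NUM_COLLISIONS = 4  # target chain length, inferred from the tests
    MIN_BUCKET_EXPONENT = 4     # never fewer than 2**4 == 16 buckets
    MAX_BUCKET_EXPONENT = 30    # 2**30 is the largest count that fits cleanly
                                # in a 32-bit signed long (the new cap)


    def compute_num_hash_buckets(new_size, existing_entries=0):
        """Pick a power-of-two bucket count for new_size bytes of source."""
        num_entries = new_size // RABIN_WINDOW + existing_entries
        max_buckets = num_entries // DEFAULT_NUM_COLLISIONS
        exponent = MIN_BUCKET_EXPONENT
        while (1 << exponent) < max_buckets and exponent < MAX_BUCKET_EXPONENT:
            exponent += 1
        return 1 << exponent


    # Values matching the updated tests:
    #   compute_num_hash_buckets(0)                         -> 16
    #   compute_num_hash_buckets(2**31)                     -> 2**25
    #   compute_num_hash_buckets(2**48)                     -> 2**30 (the cap)
    #   compute_num_hash_buckets(1024, existing_entries=64) -> 32
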
=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py	2010-10-12 22:26:04 +0000
+++ b/bzrlib/tests/test__delta_index.py	2010-10-14 21:46:02 +0000
@@ -103,27 +103,29 @@
 """
 
 
-class TestRabinIndex(tests.TestCase):
+class TestCaseWithRabinIndex(tests.TestCase):
 
     _module = None # set by load_tests()
 
+    def setUp(self):
+        super(TestCaseWithRabinIndex, self).setUp()
+        self.index = self._module.RabinIndex()
+
+
+class TestRabinIndex(TestCaseWithRabinIndex):
+
     def test_basic(self):
         index = self._module.RabinIndex()
 
     def test_add_source_type_error(self):
-        index = self._module.RabinIndex()
-        self.assertRaises(TypeError, index.add_source, 1)
-        self.assertRaises(TypeError, index.add_source, u'A unicode string')
-
-
-class TestRabinIndexComputeNumHashBuckets(tests.TestCase):
+        self.assertRaises(TypeError, self.index.add_source, 1)
+        self.assertRaises(TypeError, self.index.add_source, u'A unicode string')
+
+
+class TestRabinIndexComputeNumHashBuckets(TestCaseWithRabinIndex):
 
     _module = None # set by load_tests()
 
-    def setUp(self):
-        super(TestRabinIndexComputeNumHashBuckets, self).setUp()
-        self.index = self._module.RabinIndex()
-
     def assertNumBuckets(self, expected, new_size):
         self.assertEqual(expected,
                          self.index._compute_num_hash_buckets(new_size))
@@ -135,9 +137,9 @@
     def test_max_buckets(self):
         self.assertNumBuckets(2**31 / (16 * 4), 2**31)
         self.assertNumBuckets(2**32 / (16 * 4), (2**32 - 1))
-        # Even with enormous content, we explicitly cap at 2**31 hash buckets,
+        # Even with enormous content, we explicitly cap at 2**30 hash buckets,
         # just for safety sake.
-        self.assertNumBuckets(2**31, 2**48)
+        self.assertNumBuckets(2**30, 2**48)
 
     def test_reasonable_buckets(self):
         # RABIN_WINDOW is 16 bytes. We skip the 'first' byte, so we need enough
@@ -157,3 +159,29 @@
         # if we already have 64 entries, we'll need twice as many buckets
         self.index.num_entries = 64
         self.assertNumBuckets(32, 1024)
+
+
+class TestRabinIndex_ensure_enough_buckets(TestCaseWithRabinIndex):
+
+    def test_create_empty_buckets(self):
+        self.index._ensure_enough_buckets(0)
+        self.assertEqual(16, len(self.index.buckets))
+        self.assertEqual([None]*16, self.index.buckets)
+
+    def test_buckets_get_reassigned(self):
+        bucket = self._module.RabinBucket()
+        self.index.buckets[0] = bucket
+        offset = self._module.rabin_offset()
+        offset.val = 1
+        bucket.first = offset
+        bucket.count = 1
+        self.index._ensure_enough_buckets(0)
+        # At this point, we should get a 16-wide buckets array, with the offset
+        # moved to the right location
+        self.assertEqual(16, len(self.index.buckets))
+        bucket = self.index.buckets[1]
+        self.assertIsNot(None, bucket)
+        self.assertIs(offset, bucket.first)
+        self.assertEqual(1, bucket.count)
+        self.assertEqual([None], self.index.buckets[:1])
+        self.assertEqual([None]*14, self.index.buckets[2:])
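
Assuming the extension builds as bzrlib._delta_index_pyx (the tests load it indirectly via load_tests, so the exact module handle here is an assumption), the resize path can be poked at interactively much like the new test above:

    from bzrlib import _delta_index_pyx as _dip

    index = _dip.RabinIndex()            # starts with a single empty bucket slot
    offset = _dip.rabin_offset()
    offset.val = 1
    bucket = _dip.RabinBucket()
    bucket.first = offset
    bucket.count = 1
    index.buckets[0] = bucket

    index._ensure_enough_buckets(0)      # grows to the 16-bucket minimum
    print len(index.buckets)             # 16
    print index.buckets[1].first is offset   # True: rehashed to slot val & 15

Calling _ensure_enough_buckets again with the same size is a no-op, since the method returns early when the bucket count already matches.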


