Rev 5521: (broken) Just doing a snapshot. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Fri Nov 19 20:05:26 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5521
revision-id: john at arbash-meinel.com-20101119200510-rslzmtz8ofvjez5i
parent: john at arbash-meinel.com-20101119184852-64g7pbzu1z9sl9k7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Fri 2010-11-19 14:05:10 -0600
message:
  (broken) Just doing a snapshot.
  
  Starting to move the internals of RabinIndex to using the RabinHashMap.
  Still quite a bit to sort out. RabinBucket and rabin_offset were used
  rather extensively in the rest of the code.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-11-18 21:56:06 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-11-19 20:05:10 +0000
@@ -60,6 +60,10 @@
 cdef int MAX_DELTA_OP_SIZE
 # This is a copy instruction of 64kB at the largest possible offset, etc.
 MAX_DELTA_OP_SIZE = (5 + 5 + 1 + RABIN_WINDOW + 7)
+cdef int MAX_MATCH_PRE
+MAX_MATCH_PRE = 255
+cdef int MAX_MATCH_TAIL
+MAX_MATCH_TAIL = 65535
 
 
 cdef struct rabin_entry:
@@ -222,7 +226,7 @@
         # collisions). Determine how we want to make this better
         # One option is to use a search. Do 2 loops,
         # for slot in old_table:
-        #  val = slot.val
+        #  val = slot.rabin_val
         #  for sub_slot in search(old_table, val):
         #    insert(new_table, sub_slot)
         #    mark(sub_slot, completed=True)
@@ -277,6 +281,9 @@
         table_size = self.table_mask + 1
         total_entries = self.entry_count + new_entries
         # We allow the table to become 2/3rds full before we resize
+        # TODO: We may want to shrink at this point if we find that we
+        #       over-allocated. (Say we got 1MB of all 0's, then we end up with
+        #       a single entry in the map, but reserved lots of space)
         if (self.table_mask != 0 and total_entries * 3 < table_size * 2):
             # We have enough room already
             return
@@ -367,33 +374,6 @@
         return matches
 
 
-cdef class rabin_offset:
-    # Pointer to the bytes just after the bytes that match this hash
-    # (eg, if you have hash('abc') = 20, then for 'abcd' we will point to 'd')
-    # (Note that if the source listing changes, we'll need to move this
-    #  pointer.)
-    cdef const_data ptr
-    # The next rabin_offset that matches this val
-    cdef public rabin_offset next
-    # The 32-bit Rabin value that matches this location
-    cdef public unsigned int val
-    # The source that we can find this string in (TODO: consider a source
-    # pointer vs a source count)
-    cdef public unsigned short source
-
-    def __repr__(self):
-        return '%s(%d, %d)' % (self.__class__.__name__, self.val, self.source)
-
-
-cdef class RabinBucket:
-    cdef public rabin_offset first
-    cdef public unsigned int count
-
-    def __init__(self):
-        self.first = None
-        self.count = 0
-
-
 cdef class SourceInfo:
     # The actual content data for this buffer, should be exactly a PyString
     # (8-bit string) type
@@ -478,11 +458,7 @@
     A given sequence of bytes will generate a "RABIN" value, this is then
     indexed via a hash map.
     """
-
-    cdef public unsigned int hash_mask
-    cdef public unsigned int num_entries
-    # List of RabinBucket objects
-    cdef public object buckets
+    cdef public RabinHashMap _hash_map
     # List of SourceInfo objects, must always be < 64k
     cdef public object sources
     cdef public unsigned int total_source_bytes
@@ -493,134 +469,44 @@
             return self.total_source_bytes
 
     def __init__(self):
-        self.hash_mask = 0
-        self.buckets = [None]
+        self._hash_map = RabinHashMap()
         # Just initialize the basic buckets to start with
-        self._ensure_enough_buckets(0)
-        self.num_entries = 0
         self.sources = []
         self.total_source_bytes = 0
         self.max_bucket_size = MAX_HASH_BUCKET_ENTRIES
 
+    cdef _find_source_for_ptr(self, const_data ptr):
+        """Search through the source list for which one holds this pointer."""
+        cdef SourceInfo info
+        cdef char *start, *end
+
+        if ptr == NULL:
+            return -1, None
+        for idx, info in enumerate(self.sources):
+            start = PyString_AS_STRING(info.buf)
+            end = start + PyString_GET_SIZE(info.buf)
+            if ptr >= start and ptr <= end:
+                return idx, info
+        return -1, None
+
     def _matches_as_dict(self):
-        cdef RabinBucket bucket
         cdef SourceInfo source
-        cdef rabin_offset offset
+        cdef unsigned int i
+        cdef Py_ssize_t local_offset
+        cdef rabin_entry *slot
 
         matches = {}
-        for bucket in self.buckets:
-            if bucket is None:
+        for i from 0 <= i <= self._hash_map.table_mask:
+            slot = self._hash_map._table + i
+            if (slot.rabin_val == 0
+                and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
                 continue
-            offset = bucket.first
-            while offset is not None:
-                source = self.sources[offset.source]
-                local_offset = offset.ptr - PyString_AS_STRING(source.buf)
-                matches[offset.val] = source.start_offset + local_offset
-                offset = offset.next
+            idx, source = self._find_source_for_ptr(slot.ptr)
+            if source is not None:
+                local_offset = slot.ptr - PyString_AS_STRING(source.buf)
+                matches[slot.rabin_val] = source.start_offset + local_offset
         return matches
 
-    def _compute_num_hash_buckets(self, new_size):
-        """Determine a power-of-two number of hash buckets to use.
-
-        We know that the new content will have no more than "new_size /
-        RABIN_WINDOW" rabin_offsets that we will want to create. And we know
-        that we already have '.num_entries'. So we add the two together, and
-        find a power-of-two that is big enough to hold everything. (Note that
-        we default to wanting *some* collisions.)
-        """
-        cdef unsigned long long max_buckets
-        cdef unsigned long long raw_size
-        cdef unsigned long long num_entries
-        cdef long num_buckets
-        cdef unsigned int exponent
-
-        raw_size = new_size
-        if raw_size > 0:
-            # From the C code:
-            # /* Determine index hash size.  Note that indexing skips the
-            #    first byte to allow for optimizing the Rabin's polynomial
-            #    initialization in create_delta(). */
-            raw_size = raw_size - 1
-        num_entries = raw_size / RABIN_WINDOW
-        num_entries += self.num_entries
-        # Heuristic, default to trying to fit 4 items per hash bucket
-        max_buckets = num_entries / DEFAULT_NUM_COLLISIONS
-        # Minimum of 16 buckets
-        exponent = DEFAULT_MIN_BUCKET_EXPONENT
-        # 2^30 is the largest that fits cleanly in a 32-bit signed long (PyInt)
-        while (1ull << exponent) < max_buckets and exponent < 30:
-            exponent += 1
-        num_buckets = 1 << exponent
-        return num_buckets
-
-    cdef _add_offset_to_buckets(self, list buckets, long mask,
-                                rabin_offset offset):
-        cdef RabinBucket bucket
-        cdef long new_index
-
-        new_index = offset.val & mask
-        bucket = buckets[new_index]
-        if bucket is None:
-            bucket = RabinBucket()
-            buckets[new_index] = bucket
-        offset.next = bucket.first
-        bucket.first = offset
-        bucket.count += 1
-
-    cdef _add_offset(self, rabin_offset offset):
-        self._add_offset_to_buckets(self.buckets, self.hash_mask, offset)
-        self.num_entries += 1
-
-    def _py_add_offset(self, val, source):
-        """A python thunk for testing _add_offset
-
-        :param val: The Rabin hash value (integer)
-        :param source: The source index (integer)
-        :return: A newly created rabin_offset that has been created and added
-        """
-        cdef rabin_offset offset
-
-        offset = rabin_offset()
-        offset.next = None
-        offset.val = val
-        offset.source = source
-        offset.ptr = NULL
-        self._add_offset(offset)
-        return offset
-
-    def _ensure_enough_buckets(self, new_size):
-        """Make sure we have enough buckets ready."""
-        cdef long num_buckets
-        cdef long new_mask
-        cdef RabinBucket bucket
-        cdef rabin_offset offset
-        cdef list new_buckets
-
-        num_buckets = self._compute_num_hash_buckets(new_size)
-        if len(self.buckets) == num_buckets:
-            # Already done
-            return
-        # Time to resize
-        new_buckets = [None] * num_buckets
-        new_mask = num_buckets - 1
-        for bucket in self.buckets:
-            if bucket is None:
-                continue
-            # We want to insert them in reverse order, so that we preserve the
-            # final ordering. We won't preserve perfect ordering when shrinking
-            # and offsets merge into the same bucket, but it should be good
-            # enough.
-            offsets_to_insert = []
-            offset = bucket.first
-            while offset is not None:
-                offsets_to_insert.append(offset)
-                offset = offset.next
-            while offsets_to_insert:
-                offset = offsets_to_insert.pop()
-                self._add_offset_to_buckets(new_buckets, new_mask, offset)
-        self.buckets = new_buckets
-        self.hash_mask = new_mask
-
     def _compute_offsets(self, source):
         """Add the hash entries for this data as a 'source' string."""
         cdef unsigned short source_counter
@@ -628,9 +514,11 @@
         cdef const_data start
         cdef unsigned long num_entries
         cdef unsigned int val
+        cdef Py_ssize_t match_pre
+        cdef Py_ssize_t match_tail
         cdef int i
         cdef Py_ssize_t size
-        cdef rabin_offset last
+        cdef rabin_entry *last
         cdef RabinBucket bucket
         cdef unsigned int hash_offset
 
@@ -648,27 +536,32 @@
         # We walk backwards through the data, so that early matches are the
         # first entries in the hash buckets.
         start = (<const_data>PyString_AS_STRING(source))
-        # TODO: try to align this so the pointers end up on a 4/8-byte
-        #       alignment
+        end = start + size
+        # Note: trying to align this so the pointers end up on a 4/8-byte
+        #       alignment didn't seem to help anything. We only ended up with a
+        #       1/4 chance of having both source and target aligned.
         ptr = (start + (num_entries * RABIN_WINDOW) - RABIN_WINDOW)
-        last = rabin_offset()
-        last.val = ~0 # 0xFFFFFFFF
+        last = NULL
         while ptr >= start:
             val = 0
             for i from 1 <= i <= RABIN_WINDOW:
                 RABIN_ADD(val, ptr[i])
-            if (val == last.val):
+            match_pre = ptr - start
+            if match_pre > MAX_MATCH_PRE:
+                match_pre = MAX_MATCH_PRE
+            match_tail = end - ptr
+            if match_tail > MAX_MATCH_TAIL:
+                match_tail = MAX_MATCH_TAIL
+            if (last != NULL and val == last.rabin_val):
                 # Keep the most recent of consecutive entries, which means the
                 # closest to the start of the source
                 # XXX: test this
                 last.ptr = (ptr + RABIN_WINDOW)
+                last.match_pre = <unsigned char>match_pre
+                last.match_tail = <unsigned short>match_tail
                 num_entries -= 1
             else:
-                last = rabin_offset()
-                last.ptr = (ptr + RABIN_WINDOW)
-                last.val = val
-                last.source = source_counter
-                self._add_offset(last)
+                last = self._hash_map.add(ptr, val, match_pre, match_tail)
             ptr -= RABIN_WINDOW
 
     def _compute_offsets_from_delta(self, delta_source):
@@ -676,12 +569,15 @@
         cdef const_data ptr
         cdef const_data start
         cdef const_data end
+        cdef const_data insert_start
+        cdef const_data insert_end
+        cdef Py_ssize_t match_pre
+        cdef Py_ssize_t match_tail
         cdef unsigned long num_entries
         cdef unsigned int val, prev_val
         cdef unsigned char c
         cdef int i
         cdef Py_ssize_t size
-        cdef rabin_offset this
         cdef RabinBucket bucket
         cdef unsigned int hash_offset
 
@@ -713,8 +609,10 @@
                 if c & 0x20: ptr += 1
                 if c & 0x40: ptr += 1
             elif c != 0:
+                insert_start = ptr
+                insert_end = ptr + c
                 # Insert command, grab these bytes to index, length is 'c'
-                if ptr + c > end:
+                if insert_end > end:
                     # Actually an invalid command...
                     raise ValueError('insert command of %d bytes at offset'
                                      ' %d goes off the end of delta bytes'
@@ -724,6 +622,12 @@
                 # TODO: try to align this so the pointers end up at 4/8-byte
                 #       alignment
                 while c > RABIN_WINDOW + 3:
+                    match_pre = ptr - insert_start
+                    if match_pre > MAX_MATCH_PRE:
+                        match_pre = MAX_MATCH_PRE
+                    match_tail = insert_end - ptr
+                    if match_tail > MAX_MATCH_TAIL:
+                        match_tail = MAX_MATCH_TAIL
                     val = 0
                     # We shift-by-one because the matching code skips the first
                     # byte
@@ -734,11 +638,7 @@
                     if val == prev_val:
                         # Keep only the first of matching sequences
                         continue
-                    this = rabin_offset()
-                    this.ptr = ptr
-                    this.val = val
-                    this.source = source_counter
-                    self._add_offset(this)
+                    last = self._hash_map.add(ptr, val, match_pre, match_tail)
                     num_entries += 1
                 ptr += c
             else:
@@ -831,7 +731,7 @@
         start_offset = self.total_source_bytes + extra_offset
         new_info = SourceInfo(source, start_offset, is_delta=False)
         self.sources.append(new_info)
-        self._ensure_enough_buckets(len(source))
+        self._hash_map.reserve(len(source) / RABIN_WINDOW)
         self._compute_offsets(source)
         self._limit_hash_buckets()
         self.total_source_bytes += len(source) + extra_offset
@@ -940,7 +840,7 @@
         while offset is not None:
             this = offset
             offset = offset.next
-            if this.val != cur_rabin_val:
+            if this.rabin_val != cur_rabin_val:
                 continue
             ref = this.ptr
             ref_source = self.index.sources[this.source]

=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py	2010-11-19 18:48:52 +0000
+++ b/bzrlib/tests/test__delta_index.py	2010-11-19 20:05:10 +0000
@@ -395,137 +395,6 @@
             , delta)
 
 
-
-class TestRabinIndexComputeNumHashBuckets(TestCaseWithRabinIndex):
-
-    _module = None # set by load_tests()
-
-    def assertNumBuckets(self, expected, new_size):
-        self.assertEqual(expected,
-                         self.index._compute_num_hash_buckets(new_size))
-
-    def test_min_buckets(self):
-        self.assertNumBuckets(16, 0)
-        self.assertNumBuckets(16, 1)
-
-    def test_max_buckets(self):
-        self.assertNumBuckets(2**31 / (16 * 4), 2**31)
-        self.assertNumBuckets(2**32 / (16 * 4), (2**32 - 1))
-        # Even with enormous content, we explicitly cap at 2**30 hash buckets,
-        # just for safety sake.
-        self.assertNumBuckets(2**30, 2**48)
-
-    def test_reasonable_buckets(self):
-        # RABIN_WINDOW is 16 bytes. We skip the 'first' byte, so we need enough
-        # buckets to fit N entries, and we default to 4 collisions per bucket
-        # the value will also always be a power of two
-        self.assertNumBuckets(16, 1024)
-        # Everything still fits in 16 buckets (16 buckets * 4 entries per
-        # bucket * 16 bytes per entry = 1088)
-        self.assertNumBuckets(16, 1088)
-        # Now we need 17 buckets, so we jump to the next power of 2
-        self.assertNumBuckets(32, 1089)
-        # This is going to be a common size
-        self.assertNumBuckets(65536, 4*1024*1024)
-
-    def test_includes_current_entries(self):
-        self.assertNumBuckets(16, 1024)
-        # if we already have 64 entries, we'll need twice as many buckets
-        self.index.num_entries = 64
-        self.assertNumBuckets(32, 1024)
-
-
-class TestRabinIndex_ensure_enough_buckets(TestCaseWithRabinIndex):
-
-    def test_create_empty_buckets(self):
-        self.assertEqual(16, len(self.index.buckets))
-        self.assertEqual([None]*16, self.index.buckets)
-        self.assertEqual(15, self.index.hash_mask)
-
-    def test_buckets_get_reassigned(self):
-        bucket = self._module.RabinBucket()
-        self.index.buckets[1] = bucket
-        self.index.num_entries = 1
-        offset = self._module.rabin_offset()
-        offset.val = 17
-        bucket.first = offset
-        bucket.count = 1
-        # At this point, we should get a 16-wide buckets array, with the offset
-        # moved to the right location
-        self.assertEqual(16, len(self.index.buckets))
-        self.assertEqual(15, self.index.hash_mask)
-        bucket = self.index.buckets[1]
-        self.assertIsNot(None, bucket)
-        self.assertIs(offset, bucket.first)
-        self.assertEqual(1, bucket.count)
-        self.assertEqual([None], self.index.buckets[:1])
-        self.assertEqual([None]*14, self.index.buckets[2:])
-        # Now switch to a wider bucket
-        self.index._ensure_enough_buckets(2048)
-        self.assertEqual(32, len(self.index.buckets))
-        self.assertEqual(31, self.index.hash_mask)
-        bucket = self.index.buckets[17]
-        self.assertIsNot(None, bucket)
-        self.assertIs(offset, bucket.first)
-        self.assertEqual(1, bucket.count)
-        self.assertEqual([None]*17, self.index.buckets[:17])
-        self.assertEqual([None]*14, self.index.buckets[18:])
-
-    def test_shrinking_buckets(self):
-        self.index._ensure_enough_buckets(2048)
-        self.assertEqual(32, len(self.index.buckets))
-        self.index._py_add_offset(1, 0)
-        self.index._py_add_offset(1, 1)
-        self.index._py_add_offset(1, 2)
-        bucket = self.index.buckets[1]
-        self.assertIsNot(None, bucket)
-        offset = bucket.first
-        self.assertEqual(2, offset.source)
-        offset = offset.next
-        self.assertEqual(1, offset.source)
-        offset = offset.next
-        self.assertEqual(0, offset.source)
-        self.assertIs(None, offset.next)
-        # After resizing, we want to preserve the order
-        self.index._ensure_enough_buckets(0)
-        self.assertEqual(16, len(self.index.buckets))
-        bucket = self.index.buckets[1]
-        self.assertIsNot(None, bucket)
-        offset = bucket.first
-        self.assertEqual(2, offset.source)
-        offset = offset.next
-        self.assertEqual(1, offset.source)
-        offset = offset.next
-        self.assertEqual(0, offset.source)
-        self.assertIs(None, offset.next)
-
-
-class TestRabinIndex_add_offset(TestCaseWithRabinIndex):
-
-    def test_add_offset_simple(self):
-        offset = self.index._py_add_offset(1, 0)
-        self.assertEqual(1, offset.val)
-        self.assertEqual(0, offset.source)
-        self.assertEqual(1, self.index.num_entries)
-        self.assertIsNot(None, self.index.buckets[1])
-        bucket = self.index.buckets[1]
-        self.assertEqual(1, bucket.count)
-        offset = bucket.first
-        self.assertIsNot(None, offset)
-        self.assertIs(None, offset.next)
-
-    def test_add_offset_into_existing_bucket(self):
-        offset1 = self.index._py_add_offset(1, 0)
-        offset17 = self.index._py_add_offset(17, 0)
-        self.assertEqual(2, self.index.num_entries)
-        self.assertIsNot(None, self.index.buckets[1])
-        bucket = self.index.buckets[1]
-        self.assertEqual(2, bucket.count)
-        self.assertIs(offset17, bucket.first)
-        self.assertIs(offset1, offset17.next)
-        self.assertIs(None, offset1.next)
-
-
 class TestRabinMacros(tests.TestCase):
 
     _module = None # set by load_tests()



More information about the bazaar-commits mailing list