Rev 5542: sorting-on-repack doesn't seem to help a lot. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Thu Dec 2 21:26:50 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5542
revision-id: john at arbash-meinel.com-20101202212630-vycb3zf5uy5iz2tc
parent: john at arbash-meinel.com-20101202201238-31yot935saovm9z5
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Thu 2010-12-02 15:26:30 -0600
message:
  sorting-on-repack doesn't seem to help a lot.
  It adds a couple of seconds to 'build' time, because of more overhead and lots of
  'old' search steps.
  I'm a bit surprised at how many old steps are taken; maybe it is because the
  table is now 'overly full', which is why we are resizing in the first place.
  Total time is a fairly significant loss (2m2s). Only a couple of seconds of
  that are due to the resize itself, so the distribution is probably somehow
  'less optimal' now...
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-12-02 20:12:38 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-12-02 21:26:30 +0000
@@ -115,6 +115,9 @@
     unsigned int rabin_val
     # The number of bytes since the start of all sources
     unsigned int global_offset
+    # How many times has this entry been considered a success?
+    unsigned short match_count
+
     # The maximum number of bytes that can be matched around this pointer.
     # Note that this is limited to 64kB, and a theoretical match could be
     # longer. However, that will be handled by a later match
@@ -124,9 +127,6 @@
     # 256 values here
     unsigned char match_pre
 
-    # How many times has this entry been considered a success?
-    unsigned short match_count
-
 
 cdef class RabinHashMap
 
@@ -164,12 +164,11 @@
 
         self.free_slot = NULL
         self._step = 0
-        # Consider, self._next_hash & self._table_mask never see the 'upper'
-        # bits of the rabin_val. Is there a decent way to 'mix that in', to
-        # help keep chains from overlapping too much? (ATM 0x10001 and 0x20001
-        # will overlap for any table size less than 0x10000, at *all* steps) I
-        # don't know how common collisions are, but it may be more-than-normal
-        # because RABIN isn't a super-strong hash.
+        # Note: Adding in the upper bits of rabin_val to _next_hash (via
+        #       something like (rabin_val ^ (rabin_val >> 16)), appears to be
+        #       worse than not.  It might be that ^ is bleeding through similar
+        #       characters (eg, content of 'aaaa' could cause a hash of
+        #       'aa' ^ 'aa' == 00.
         self._next_hash = rabin_val
 
     @cython.profile(False)
@@ -257,7 +256,7 @@
     # Dummy entries are noted by having ptr == dummy_ptr
     cdef rabin_entry *_table
 
-    # TODO: two options for 'soft' resizing.
+    # TODO: two options for 'softer' resizing.
     #   a) 2 active tables. One for 'new' content. When 'new' is full, we
     #      allocate a new table of size(new)*2. We then copy all of the records
     #      from 'old' and free old. Then new items go into new. As an example:
@@ -316,10 +315,15 @@
         # it to the new table. Though this func should be renamed 'move' at
         # that point.
         cdef rabin_entry *slot, *new_slot, *old_table
-        cdef unsigned int i, new_offset, old_mask
-        cdef unsigned long total_matches, total_steps, total_items
-        cdef RabinEntrySearch search
+        cdef unsigned int i, new_offset, old_mask, rabin_val
+        cdef unsigned long total_old_steps, total_steps, total_items
+        cdef RabinEntrySearch search_old, search_new
+        cdef rabin_entry temp_entries[128]
+        cdef int temp_idx
+        cdef list all_entries
 
+        # search_old will maintain pointers to the old table and mask
+        search_old = RabinEntrySearch(self)
         old_table = self._table
         old_mask = self.table_mask
         self._table = new_table
@@ -350,29 +354,51 @@
         #          have to walk over items we already walked. With the current
         #          code w/ 64 entries in a bucket, we walk, 1, 2, 3, 4, 5...
         #          2047 steps.
-        total_steps = total_matches = total_items = 0
+        total_steps = total_old_steps = total_items = 0
+        all_entries = []
         try:
-            search = RabinEntrySearch(self)
+            search_new = RabinEntrySearch(self)
             for i from 0 <= i <= old_mask:
                 slot = old_table + i
                 if (slot.ptr == NULL or slot.ptr == dummy_ptr):
                     # An empty slot
                     continue
-                total_items += 1
-                search.start(slot.rabin_val)
-                while search.free_slot == NULL:
-                    # Note: It should be fine to use search.next() here,
-                    # because there should be no dummy pointers in the new
-                    # table. Might want to validate that, though.
-                    search.next()
-                search.free_slot[0] = slot[0]
-                total_steps += search._step
-                total_matches += search.match_count
+                temp_idx = 0
+                rabin_val = slot.rabin_val
+                search_old.start(rabin_val)
+                search_old.next()
+                del all_entries[:]
+                while search_old.entry != NULL:
+                    temp_entries[temp_idx] = search_old.entry[0]
+                    all_entries.append((self._weight_for_entry(search_old.entry), temp_idx))
+                    temp_idx += 1
+                    if temp_idx >= 128:
+                        raise RuntimeError('we overflowed the temp_entries list')
+                    # We've consumed this entry, delete it
+                    # TODO: We should track search_old steps
+                    search_old.entry.ptr = dummy_ptr
+                    search_old.entry.rabin_val = 0
+                    search_old.next()
+                all_entries.sort(reverse=True)
+                total_old_steps += search_old._step
+
+                search_new.start(rabin_val)
+                for _, temp_idx in all_entries:
+                    total_items += 1
+                    while search_new.free_slot == NULL:
+                        # Note: It should be fine to use search.next() here,
+                        # because there should be no dummy pointers in the new
+                        # table we would want to fill in. Might want to
+                        # validate that, though.
+                        search_new.next()
+                    search_new.free_slot[0] = temp_entries[temp_idx]
+                    search_new.free_slot = NULL
+                total_steps += search_new._step
         finally:
             PyMem_Free(old_table)
-        trace.mutter('              took %d steps, %d matches,'
+        trace.mutter('              took %d steps, %d old_steps,'
                      ' for %d items'
-                     % (total_steps, total_matches,
+                     % (total_steps, total_old_steps,
                         total_items))
 
     def reserve(self, new_entries):
@@ -430,18 +456,51 @@
         #   4) For now, we just use a dirt-simple algorithm that says
         #      remove-every-other entry when the bucket gets full. We have to
         #      use 'dummy' entries, because collisions work across buckets.
-        cdef list all_entries
-        cdef int weight
-        cdef unsigned int offset
-        cdef rabin_entry *entry
-        cdef int i
-        cdef int temp_idx
         # Must be larger than MAX_HASH_BUCKET_ENTRIES. Also, after packing we
         # add dummy entries, which can cause other hash chains to insert early.
         # As such, we can have > 64 entries in a chain.
         cdef rabin_entry temp_entries[128]
-
-        search.start(rabin_val)
+        cdef list all_entries
+
+        search.start(rabin_val)
+        all_entries = self._find_entries_to_sort(search, temp_entries)
+        search.start(rabin_val)
+        self._refill_table_entries(search, all_entries, temp_entries)
+
+    cdef int _weight_for_entry(self, rabin_entry *entry) except? -1:
+        cdef unsigned int offset
+        cdef int weight
+
+        weight = 0
+        offset = entry.global_offset
+        while offset > 0:
+            # Every 8 bits of offset is 1 byte of copy command
+            offset >>= 8
+            weight -= 1
+        # 'pre' bytes is less valuable than 'post' bytes, because of how
+        # the matching works, so double-weight post bytes.
+        # Having hits is very valuable, more so than a byte or two
+        # available
+        weight += ((entry.match_count * 20)
+                 + (entry.match_tail * 4)
+                 + entry.match_pre * 2
+                 + entry.global_offset & 0xF)
+        return weight
+
+    cdef _find_entries_to_sort(self, RabinEntrySearch search,
+                               rabin_entry *temp_entries):
+        """Build up temp_entries into entries that need to be sorted.
+
+        :param search: A RabinEntrySearch object. We assume it has already been
+            "start()"ed and just needs to be iterated.
+        :return: A list of weights for each entry offset.
+        """
+        cdef list all_entries
+        cdef unsigned int offset
+        cdef rabin_entry *entry
+        cdef int i
+        cdef int temp_idx
+
         search.next()
         all_entries = []
         temp_idx = 0
@@ -452,37 +511,34 @@
             # content at the beginning should be favored to content at the end,
             # since the delta is smaller, though only 1 byte per 2^7 bytes
             # offset.
-            offset = search.entry.global_offset
-            while offset > 0:
-                # Every 8 bits of offset is 1 byte of copy command
-                offset >>= 8
-                weight -= 1
-            # 'pre' bytes is less valuable than 'post' bytes, because of how
-            # the matching works, so double-weight post bytes.
-            # Having hits is very valuable, more so than a byte or two
-            # available
-            weight += ((search.entry.match_count * 20)
-                     + (search.entry.match_tail * 4)
-                     + search.entry.match_pre * 2
-                     + search.entry.global_offset & 0xF)
-            all_entries.append((weight, temp_idx))
+            all_entries.append((self._weight_for_entry(search.entry), temp_idx))
             temp_entries[temp_idx] = search.entry[0]
             temp_idx += 1
             if temp_idx >= 128:
                 raise RuntimeError('we overflowed the temp_entries list')
             search.next()
         all_entries.sort(reverse=True)
+        return all_entries
+
+    cdef _refill_table_entries(self, RabinEntrySearch search, list all_entries,
+                               rabin_entry *temp_entries):
+        """Use a sorted list of offsets, a list of entries, and fill the table
+
+        :param search: A RabinEntrySearch used for finding where the entries in
+            the table should go. Should already be seeded with start(rabin_val)
+        """
+        cdef rabin_entry *entry
+        cdef int i
+        cdef int temp_idx
 
         # Now do the search again, and rewrite the entries. We fill in all
         # slots and all dummy slots with the newly sorted entries. We also
         # check _keep_map, to see if this is an entry we should omit.
         i = 0
-        search.start(rabin_val)
         entry = search.next_with_dummy()
         while entry != NULL:
             if entry.ptr != dummy_ptr:
                 self.entry_count -= 1
-            assert entry.rabin_val == 0 or entry.rabin_val == rabin_val
             while i < MAX_HASH_BUCKET_ENTRIES and _keep_map[i] == c'n':
                 # Skip entries that are marked to be removed
                 temp_idx = all_entries[i][1]



More information about the bazaar-commits mailing list