Rev 5539: Moving the key functionality into inline functions drops us down to 1m44.4s in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Thu Dec 2 18:02:19 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5539
revision-id: john at arbash-meinel.com-20101202180208-76zf19m2cdib333d
parent: john at arbash-meinel.com-20101201225008-h8065d63rq03shdr
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Thu 2010-12-02 12:02:08 -0600
message:
Moving the key functionality into inline functions drops us down to 1m44.4s
How do you value the cost of complexity of code, vs the performance you get from the code?
It is hard for me to determine 'good enough'. I at least wanted to experiment with it
to determine where the threshold might be.
(if it shaved 30s out of 100s, then it seems more worthwhile.)
Note the current best time with inlining is 1m45s, the current best time for
existing bzr code is 1m38s, which is still faster...
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-12-01 22:50:08 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-12-02 18:02:08 +0000
@@ -222,7 +222,72 @@
self._next_hash))
@cython.profile(False)
-cdef inline int _peek(RabinEntrySearch self, unsigned int rabin_val): # noexcept
+cdef inline RabinEntrySearch_start(RabinEntrySearch self, unsigned int rabin_val):
+ self.rabin_val = rabin_val
+ self.entry = NULL
+ self.match_count = 0
+
+ self.free_slot = NULL
+ self._step = 0
+ # Consider, self._next_hash & self._table_mask never see the 'upper'
+ # bits of the rabin_val. Is there a decent way to 'mix that in', to
+ # help keep chains from overlapping too much? (ATM 0x10001 and 0x20001
+ # will overlap for any table size less than 0x10000, at *all* steps) I
+ # don't know how common collisions are, but it may be more-than-normal
+ # because RABIN isn't a super-strong hash.
+ self._next_hash = rabin_val
+
+@cython.profile(False)
+cdef inline rabin_entry* RabinEntrySearch_next_with_dummy(RabinEntrySearch self): # noexcept
+ cdef rabin_entry *slot
+ # This uses Quadratic Probing:
+ # http://en.wikipedia.org/wiki/Quadratic_probing
+ # with c1 = c2 = 1/2
+ # This leads to probe locations at:
+ # h0 = hash(k1)
+ # h1 = h0 + 1 = h0 + 1
+ # h2 = h0 + 3 = h1 + 2
+ # h3 = h0 + 6 = h2 + 3
+ # h4 = h0 + 10 = h3 + 4
+ # Note that all of these are '& mask', but that is computed *after* the
+ # offset.
+ while self._step < self._table_mask:
+ slot = self._table + (self._next_hash & self._table_mask)
+ self._step += 1
+ self._next_hash = self._next_hash + self._step
+ if slot.ptr == NULL:
+ # We found a blank entry, so this chain is done
+ if self.free_slot == NULL:
+ self.free_slot = slot
+ return NULL
+ elif slot.ptr == dummy_ptr:
+ if self.free_slot == NULL:
+ self.free_slot = slot
+ return slot
+ if slot.rabin_val != self.rabin_val:
+ continue
+ # We found an entry that matches exactly, return it
+ self.match_count += 1
+ return slot
+ return NULL
+
+@cython.profile(False)
+cdef inline RabinEntrySearch_next(RabinEntrySearch self):
+ cdef rabin_entry *slot
+ # Don't search forever, even if we have bogus entries in the table
+ while self._step < self._table_mask:
+ slot = self.next_with_dummy()
+ if (slot != NULL and slot.ptr == dummy_ptr):
+ continue
+ self.entry = slot
+ return
+ raise RuntimeError("We seem to have stepped too far while searching"
+ " for rabin_val = %d, we took %d steps, hash %d"
+ % (self.rabin_val, self._step,
+ self._next_hash))
+
+@cython.profile(False)
+cdef int _peek(RabinEntrySearch self, unsigned int rabin_val): # noexcept
"""Check and see if there might be anything at a given rabin_val.
The idea is that most lookups fail to find anything, and we want it to
@@ -953,61 +1018,6 @@
self.out[self.out_pos] = <unsigned char>(i)
self.out_pos += 1
- @cython.profile(False)
- cdef int _find_better_match(self, const_data data, const_data data_end,
- unsigned int cur_rabin_val,
- RabinEntrySearch search) except -1:
- cdef rabin_entry *cur_entry
- cdef const_data cur
- cdef const_data ref
- cdef const_data ref_start
- cdef const_data ref_end
- cdef Py_ssize_t cur_match_size
- cdef Py_ssize_t remaining_data_len
- cdef SourceInfo ref_source
- cdef int comparisons
-
- # This is the function that gets triggered the most. Something like
- # 420 calls while repacking bzr. (only one higher is search.next()
- # having 500M calls.) Out of those 420M calls, 392M of those failed to
- # match anything significant (try_again_count).
-
- search.start(cur_rabin_val)
- search.next()
- if search.entry == NULL:
- global nil_match
- nil_match += 1
- remaining_data_len = data_end - data
- comparisons = 0
- while search.entry != NULL:
- cur_entry = search.entry
- search.next()
- ref = cur_entry.ptr
- cur_match_size = remaining_data_len
- if cur_entry.match_tail < remaining_data_len:
- cur_match_size = cur_entry.match_tail
- ref_end = ref + cur_entry.match_tail
- if cur_match_size <= self.match_size:
- # Our current best match is better than this one could possibly
- # be
- continue
- # Now actually compare the bytes, and see how many match
- cur = data
- comparisons += 1
- while cur_match_size > 0 and cur[0] == ref[0]:
- cur += 1
- ref += 1
- cur_match_size -= 1
- cur_match_size = (cur - data)
- if cur_match_size > self.match_size:
- # This match is better
- self.match_size = cur_match_size
- self.match_entry = cur_entry
- if cur_match_size >= 4096:
- # Stop searching for better matches
- return comparisons
- return comparisons
-
cdef _finalize_pending_insert(self):
cdef unsigned char insert_byte
cdef Py_ssize_t out_pos
@@ -1167,7 +1177,7 @@
# Pull in the next data character
RABIN_UPDATE(cur_rabin_val, data[-RABIN_WINDOW], data[0])
if _peek(search, cur_rabin_val):
- count = self._find_better_match(data, data_end, cur_rabin_val,
+ count = _find_better_match(self, data, data_end, cur_rabin_val,
search)
global matches_checked
matches_checked += count
@@ -1203,3 +1213,58 @@
_PyString_Resize(&self.output_buff, self.out_pos)
ret = <object>self.output_buff
return ret
+
+
+@cython.profile(False)
+cdef inline int _find_better_match(_DeltaCreator self, const_data data, const_data data_end,
+ unsigned int cur_rabin_val,
+ RabinEntrySearch search) except -1:
+ cdef rabin_entry *cur_entry
+ cdef const_data cur
+ cdef const_data ref
+ cdef const_data ref_end
+ cdef Py_ssize_t cur_match_size
+ cdef Py_ssize_t remaining_data_len
+ cdef int comparisons
+
+ # This is the function that gets triggered the most. Something like
+ # 420 calls while repacking bzr. (only one higher is search.next()
+ # having 500M calls.) Out of those 420M calls, 392M of those failed to
+ # match anything significant (try_again_count).
+
+ RabinEntrySearch_start(search, cur_rabin_val)
+ RabinEntrySearch_next(search)
+ if search.entry == NULL:
+ global nil_match
+ nil_match += 1
+ remaining_data_len = data_end - data
+ comparisons = 0
+ while search.entry != NULL:
+ cur_entry = search.entry
+ RabinEntrySearch_next(search)
+ ref = cur_entry.ptr
+ cur_match_size = remaining_data_len
+ if cur_entry.match_tail < remaining_data_len:
+ cur_match_size = cur_entry.match_tail
+ ref_end = ref + cur_entry.match_tail
+ if cur_match_size <= self.match_size:
+ # Our current best match is better than this one could possibly
+ # be
+ continue
+ # Now actually compare the bytes, and see how many match
+ cur = data
+ comparisons += 1
+ while cur_match_size > 0 and cur[0] == ref[0]:
+ cur += 1
+ ref += 1
+ cur_match_size -= 1
+ cur_match_size = (cur - data)
+ if cur_match_size > self.match_size:
+ # This match is better
+ self.match_size = cur_match_size
+ self.match_entry = cur_entry
+ if cur_match_size >= 4096:
+ # Stop searching for better matches
+ return comparisons
+ return comparisons
+
More information about the bazaar-commits
mailing list