Rev 5534: Bits of TODO markup. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Tue Nov 23 21:50:28 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5534
revision-id: john at arbash-meinel.com-20101123214953-9yhtoz01kxnr5cct
parent: john at arbash-meinel.com-20101123211007-v5eu386vc5jv0vbj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Tue 2010-11-23 15:49:53 -0600
message:
Bits of TODO markup.
Initial results from pulling the 'RabinEntrySearch' allocation out of
_find_better_match seem to drop the specific time from 120s down to 66s.
It didn't seem to affect wall-clock time, though, so I'm not sure that
number is accurate.
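A minimal, runnable Python sketch of the pattern behind this change may help:
the names mirror the Cython code in the patch below, but the bodies are
simplified stand-ins (a dict-of-lists "hash map" instead of the real C
structures). Before the patch, _find_better_match allocated a fresh
RabinEntrySearch on every call, i.e. once per input byte; afterwards the
caller allocates one object and reuses it across the whole loop:

class RabinEntrySearch(object):
    """Reusable cursor over the entries stored for one rabin hash value."""

    def __init__(self, hash_map):
        self._hash_map = hash_map
        self._iter = iter(())
        self.entry = None

    def start(self, rabin_val):
        # Reset the cursor in place; no allocation between calls.
        self._iter = iter(self._hash_map.get(rabin_val, ()))
        self.next()

    def next(self):
        self.entry = next(self._iter, None)


def _find_better_match(cur_rabin_val, search):
    # After the patch: the caller supplies the search object, and the
    # function returns the number of entries actually compared rather
    # than search.match_count.
    search.start(cur_rabin_val)
    comparisons = 0
    while search.entry is not None:
        comparisons += 1
        # ... the byte-wise comparison against search.entry goes here ...
        search.next()
    return comparisons


def make_delta(hash_map, rabin_vals):
    # This allocation used to live inside _find_better_match, so it ran
    # once per input byte; hoisted out here it runs once per make_delta.
    search = RabinEntrySearch(hash_map)
    matches_checked = 0
    for val in rabin_vals:
        matches_checked += _find_better_match(val, search)
    return matches_checked


if __name__ == '__main__':
    hash_map = {0x1234: ['entry-a', 'entry-b'], 0x5678: ['entry-c']}
    print(make_delta(hash_map, [0x1234, 0x5678, 0x9abc]))  # -> 3

Note that the patch also drops the _timer() bracketing around each
_find_better_match call. One plausible reading of "didn't seem to affect
wall-clock time" is that calling time.clock() twice per input byte was itself
a significant part of the measured 120s, so removing it shrinks the number
being measured more than it shrinks the real work.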
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-23 21:10:07 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-23 21:49:53 +0000
@@ -67,6 +67,7 @@
cdef int MAX_MATCH_TAIL
MAX_MATCH_TAIL = 65535
+
import time
cdef object _timer
_timer = time.clock
@@ -860,7 +861,8 @@
self.out_pos += 1
cdef int _find_better_match(self, const_data data, const_data data_end,
- unsigned int cur_rabin_val) except -1:
+ unsigned int cur_rabin_val,
+ RabinEntrySearch search) except -1:
cdef rabin_entry *cur_entry
cdef const_data cur
cdef const_data ref
@@ -869,12 +871,31 @@
cdef Py_ssize_t cur_match_size
cdef Py_ssize_t remaining_data_len
cdef SourceInfo ref_source
- cdef RabinEntrySearch search
-
- search = RabinEntrySearch(self.index._hash_map)
+ cdef int comparisons
+
+ # TODO: Incorporate feedback. If _find_better_match picks a given entry
+ # as the best match, that entry should be rated up, and moved
+ # earlier in the matches-to-check hierarchy
+ # TODO: We have lots of time to spend during build phase if we can
+ # improve the make_delta phase. Rough numbers are:
+ # spent 0.676s in add_source
+ # spent 1.689s in add_delta_source
+ # spent 132.955s in make_delta
+ # So about 100x more time spent in make_delta than in add_delta
+ # or add_source. add_delta_source is probably just because of the
+ # sheer count difference. (20x more deltas than raw sources,
+ # etc.)
+ # Anything we can do to find the 'good' matches more quickly
+ # will result in a much more significant win. One simple step:
+ # 'move to front' anything that is chosen as a match (see the
+ # sketch after this patch)... Sort more often, etc.
+ #
+
+ # search = RabinEntrySearch(self.index._hash_map)
search.start(cur_rabin_val)
search.next()
remaining_data_len = data_end - data
+ comparisons = 0
while search.entry != NULL:
cur_entry = search.entry
search.next()
@@ -889,6 +910,7 @@
continue
# Now actually compare the bytes, and see how many match
cur = data
+ comparisons += 1
while cur_match_size > 0 and cur[0] == ref[0]:
cur += 1
ref += 1
@@ -900,8 +922,8 @@
self.match_entry = cur_entry
if cur_match_size >= 4096:
# Stop searching for better matches
- return search.match_count
- return search.match_count
+ return comparisons
+ return comparisons
cdef _finalize_pending_insert(self):
cdef unsigned char insert_byte
@@ -1027,6 +1049,7 @@
cdef unsigned char c
cdef Py_ssize_t target_len
cdef int count
+ cdef RabinEntrySearch search
if PyString_GET_SIZE(self.content) < RABIN_WINDOW:
# We don't have enough content to match, so don't try
@@ -1049,17 +1072,17 @@
self.insert_count = RABIN_WINDOW
self.match_size = 0
count = 0
+ search = RabinEntrySearch(self.index._hash_map)
while data < data_end:
if self.match_size < 4096:
# Our current match isn't "good enough", so look for a better
# one
# Pull in the next data character
RABIN_UPDATE(cur_rabin_val, data[-RABIN_WINDOW], data[0])
- t = _timer()
- count = self._find_better_match(data, data_end, cur_rabin_val)
- global find_better_total_time, matches_checked
- find_better_total_time += _timer() - t
- matches_checked += (count - 1)
+ count = self._find_better_match(data, data_end, cur_rabin_val,
+ search)
+ global matches_checked
+ matches_checked += count
if self.match_size < 4:
# Our best match is only 4 extra bytes, just count this as an
# insert
@@ -1068,7 +1091,7 @@
self.match_size = 0
global try_again_count, try_again_matches_checked
try_again_count += 1
- try_again_matches_checked += (count -1)
+ try_again_matches_checked += count
else:
data = self._output_match(data)
if self.match_size < 4096:
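The "move to front" idea in the TODO above is the classic self-organizing
list heuristic: when an entry is chosen as the best match, promote it within
its hash chain so later searches for the same rabin value reach it sooner.
A minimal sketch, assuming a plain Python list as the chain and a scoring
callback; the real structure is a C hash map of rabin_entry records, so this
illustrates the heuristic, not bzrlib's API:

def promote(chain, i):
    # "Transpose" variant of move-to-front: swap the chosen entry one
    # slot toward the head. Entries that keep winning drift to the
    # front; a single lucky hit cannot jump the whole queue.
    if i > 0:
        chain[i - 1], chain[i] = chain[i], chain[i - 1]


def find_better_match(chain, score):
    # Scan the chain for the best-scoring entry, then promote it so the
    # next search for this rabin value tests it one position earlier.
    best_i = max(range(len(chain)), key=lambda i: score(chain[i]))
    best = chain[best_i]
    promote(chain, best_i)
    return best


if __name__ == '__main__':
    scores = {'rare': 1, 'cold': 2, 'hot': 5}
    chain = ['rare', 'cold', 'hot']
    find_better_match(chain, scores.get)  # -> 'hot'; after two calls
    find_better_match(chain, scores.get)  #    chain is ['hot', 'rare', 'cold']
    print(chain)

Whether to transpose or to move the winner all the way to the head is a
tuning question: moving straight to the head adapts faster but lets one lucky
match reorder the whole chain. Either way, the payoff comes from the
early-exit already in the code (a match of 4096+ bytes stops the search), so
putting useful entries first directly cuts the comparisons count that this
revision starts tracking.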