Rev 3829: Tweak the basis selection code. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack

John Arbash Meinel john at arbash-meinel.com
Wed Dec 24 21:42:27 GMT 2008


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack

------------------------------------------------------------
revno: 3829
revision-id: john at arbash-meinel.com-20081224214206-i3i3qzebnf2cmovj
parent: john at arbash-meinel.com-20081224202312-dwabieqsxc212lt1
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: hack
timestamp: Wed 2008-12-24 15:42:06 -0600
message:
  Tweak the basis selection code.
  
  Now we will compute the delta versus all parents that we have cached in this batch.
  We then use the parent which has the shortest delta.
  For conversion speed, this seems to actually save us time.
  Best guess is that because the InventoryEntry objects are already cached,
  computing the delta is relatively cheap, and cheaper than paging and deserializing
  the extra internal and leaf nodes, especially since we are likely to get a
  collision when we are done. Also beneficial is that we end up with the smallest
  compressed size (so far).
  
  Also adding a bunch of counter code to see what the various hit rates are.
  Ties in the delta length favor the left-hand parent.
  With that in place, there seems to be a 6:4 favor of left-hand parent delta being
  shorter.
-------------- next part --------------
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2008-12-20 21:16:24 +0000
+++ b/bzrlib/repository.py	2008-12-24 21:42:06 +0000
@@ -3277,7 +3277,7 @@
             return False
         return True
 
-    def _fetch_batch(self, revision_ids, basis_id, basis_tree):
+    def _fetch_batch(self, revision_ids, basis_id, basis_tree, counter):
         """Fetch across a few revisions.
 
         :param revision_ids: The revisions to copy
@@ -3293,9 +3293,60 @@
         text_keys = set()
         pending_deltas = []
         pending_revisions = []
+        recent_cache = {basis_id: basis_tree}
+        parent_map = self.source.get_parent_map(revision_ids)
         for tree in self.source.revision_trees(revision_ids):
             current_revision_id = tree.get_revision_id()
-            delta = tree.inventory._make_delta(basis_tree.inventory)
+            recent_cache[current_revision_id] = tree
+            parent_ids = parent_map.get(current_revision_id, [])
+            if len(parent_ids) == 0:
+                # No parents to compare against, we just use the current basis
+                counter[2] += 1
+                delta = tree.inventory._make_delta(basis_tree.inventory)
+            elif len(parent_ids) == 1:
+                parent_id = parent_ids[0]
+                if parent_id == basis_id:
+                    # The basis is the one parent, just use it
+                    counter[0] += 1
+                else:
+                    if parent_id in recent_cache:
+                        # the one parent is already cached
+                        counter[1] += 1
+                        basis_id = parent_id
+                        basis_tree = recent_cache[basis_id]
+                    else:
+                        # one parent, but it isn't in the cache
+                        counter[2] += 1
+                delta = tree.inventory._make_delta(basis_tree.inventory)
+            else: # More than one parent
+                deltas = []
+                for idx, parent_id in enumerate(parent_ids):
+                    if parent_id not in recent_cache:
+                        continue
+                    parent_tree = recent_cache[parent_id]
+                    delta = tree.inventory._make_delta(parent_tree.inventory)
+                    deltas.append((len(delta), idx, parent_id, parent_tree, delta))
+                deltas.sort()
+                if len(deltas) == 0:
+                    # Using a random base
+                    counter[2] += 1
+                    delta = tree.inventory._make_delta(basis_tree.inventory)
+                elif len(deltas) == 1:
+                    # Only a single cached parent, use it
+                    _, _, parent_id, basis_tree, delta = deltas[0]
+                    if basis_id == parent_id:
+                        # Turned out to be identical to base
+                        counter[0] += 1
+                    else:
+                        # Using the one cached parent
+                        basis_id = parent_id
+                        counter[1] += 1
+                else:
+                    # We have more than one cached parent, so compare all
+                    # deltas and find the one that is the shortest.
+                    _, idx, basis_id, basis_tree, delta = deltas[0]
+                    counter[3] += 1
+                    counter[4] += idx
             for old_path, new_path, file_id, entry in delta:
                 if new_path is not None:
                     if not (new_path or self.target.supports_rich_root()):
@@ -3344,14 +3395,26 @@
         """
         basis_id, basis_tree = self._get_basis(revision_ids[0])
         batch_size = 100
+        total_counter = [0, 0, 0, 0, 0]
         for offset in range(0, len(revision_ids), batch_size):
             self.target.start_write_group()
             try:
+                counter = [0, 0, 0, 0, 0]
                 pb.update('Transferring revisions', offset,
                           len(revision_ids))
                 batch = revision_ids[offset:offset+batch_size]
                 basis_id, basis_tree = self._fetch_batch(batch,
-                    basis_id, basis_tree)
+                    basis_id, basis_tree, counter)
+                if total_counter[3] > 0:
+                    avg_parent = float(total_counter[4]) / total_counter[3]
+                else:
+                    avg_parent = 0
+                for idx in xrange(len(counter)):
+                    total_counter[idx] += counter[idx]
+                pb.note('Parent is_basis:%d in_cache:%d random:%d, multi: %d'
+                        ' avg: %.3f, totals %s',
+                        counter[0], counter[1], counter[2], counter[3],
+                        avg_parent, total_counter)
             except:
                 self.target.abort_write_group()
                 raise



More information about the bazaar-commits mailing list