Rev 4559: Adding an inventory text cache. in http://bazaar.launchpad.net/~jameinel/bzr/1.18-2a-bundle-performance

John Arbash Meinel john at arbash-meinel.com
Mon Jul 27 19:56:31 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/1.18-2a-bundle-performance

------------------------------------------------------------
revno: 4559
revision-id: john at arbash-meinel.com-20090727185611-sf3mfgry3e5e9hbw
parent: john at arbash-meinel.com-20090727180319-of88nkq0bwjzr9hk
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.18-2a-bundle-performance
timestamp: Mon 2009-07-27 13:56:11 -0500
message:
  Adding an inventory text cache.
  
  Drops the time to insert 577 revs from 3m55s down to 3m14s. Not as much
  of an improvement as I would like, but I'm guessing the bulk of the
  remaining time is spent in add_inventory anyway.
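
To illustrate the approach before the patch itself: the new code keeps an LRU
cache of serialized inventory texts, keyed by revision id and bounded by total
byte size rather than entry count, so parent inventories shared by consecutive
records are not re-read and re-serialized. The standalone sketch below is plain
Python, not bzrlib code; SizeBoundedTextCache and the sample revision ids are
invented for illustration, and it only mimics the parts of
lru_cache.LRUSizeCache that the patch relies on (dict-style insertion, .get()
lookups, and size-based eviction).

# Minimal, standalone sketch of a size-bounded LRU text cache.
from collections import OrderedDict


class SizeBoundedTextCache(object):
    """LRU cache of strings, bounded by the total size of the cached text."""

    def __init__(self, max_bytes):
        self._max_bytes = max_bytes
        self._current_bytes = 0
        self._entries = OrderedDict()  # revision_id -> text, oldest first

    def get(self, key, default=None):
        text = self._entries.get(key)
        if text is None:
            return default
        self._entries.move_to_end(key)  # mark as most recently used
        return text

    def __setitem__(self, key, text):
        if key in self._entries:
            self._current_bytes -= len(self._entries.pop(key))
        self._entries[key] = text
        self._current_bytes += len(text)
        # Evict least-recently-used texts until we are back under budget,
        # but always keep the entry that was just added.
        while self._current_bytes > self._max_bytes and len(self._entries) > 1:
            _, evicted = self._entries.popitem(last=False)
            self._current_bytes -= len(evicted)


# Usage mirrors the patch: look up a parent text, and only re-read and
# re-serialize the inventory (the expensive step) on a cache miss.
inventory_text_cache = SizeBoundedTextCache(10 * 1024 * 1024)
inventory_text_cache['rev-1'] = '<inventory format="7">...</inventory>'
assert inventory_text_cache.get('rev-1') is not None
assert inventory_text_cache.get('rev-2', None) is None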
-------------- next part --------------
=== modified file 'bzrlib/bundle/serializer/v4.py'
--- a/bzrlib/bundle/serializer/v4.py	2009-07-27 17:51:10 +0000
+++ b/bzrlib/bundle/serializer/v4.py	2009-07-27 18:56:11 +0000
@@ -22,12 +22,14 @@
     diff,
     errors,
     iterablefile,
+    lru_cache,
     multiparent,
     osutils,
     pack,
     revision as _mod_revision,
+    serializer,
     trace,
-    serializer,
+    ui,
     )
 from bzrlib.bundle import bundle_data, serializer as bundle_serializer
 from bzrlib import bencode
@@ -600,45 +602,90 @@
             vf_records.append((key, parents, meta['sha1'], d_func(text)))
         versionedfile.add_mpdiffs(vf_records)
 
+    def _get_parent_inventory_texts(self, inventory_text_cache, parent_ids):
+        cached_parent_texts = {}
+        remaining_parent_ids = []
+        for parent_id in parent_ids:
+            p_text = inventory_text_cache.get(parent_id, None)
+            if p_text is None:
+                remaining_parent_ids.append(parent_id)
+            else:
+                cached_parent_texts[parent_id] = p_text
+        ghosts = ()
+        if remaining_parent_ids:
+            # First determine what keys are actually present in the local
+            # inventories object (don't use revisions, as they haven't been
+            # installed yet).
+            parent_keys = [(r,) for r in remaining_parent_ids]
+            present_parent_map = self._repository.inventories.get_parent_map(
+                                        parent_keys)
+            present_parent_ids = []
+            ghosts = set()
+            for p_id in remaining_parent_ids:
+                if (p_id,) in present_parent_map:
+                    present_parent_ids.append(p_id)
+                else:
+                    ghosts.add(p_id)
+            to_string = self._source_serializer.write_inventory_to_string
+            for parent_inv in self._repository.iter_inventories(
+                                    present_parent_ids):
+                p_text = to_string(parent_inv)
+                cached_parent_texts[parent_inv.revision_id] = p_text
+                inventory_text_cache[parent_inv.revision_id] = p_text
+
+        parent_texts = [cached_parent_texts[parent_id]
+                        for parent_id in parent_ids
+                        if parent_id not in ghosts]
+        return parent_texts
+
     def _install_inventory_records(self, records):
         if (self._info['serializer'] == self._repository._serializer.format_num
             and self._repository._serializer.support_altered_by_hack):
             return self._install_mp_records_keys(self._repository.inventories,
                 records)
-        for key, metadata, bytes in records:
-            revision_id = key[-1]
-            parent_ids = metadata['parents']
-            # Note: This assumes the local ghosts are identical to the ghosts
-            #       in the source, as the Bundle serialization format doesn't
-            #       record ghosts.
-            # Find out what is present in the local inventory vf (don't use
-            # revisions vf as those haven't been installed yet.)
-            parent_keys = [(r,) for r in parent_ids]
-            present_parent_map = self._repository.inventories.get_parent_map(
-                                        parent_keys)
-            present_parent_ids = [p_id for p_id in parent_ids
-                                        if (p_id,) in present_parent_map]
-            # TODO: This doesn't do any sort of caching, etc, so expect it to
-            #       perform rather poorly. When transmitting many inventories,
-            #       it will be re-reading and serializing to bytes the
-            #       inventories that it just wrote.
-            parent_invs = list(self._repository.iter_inventories(
-                                present_parent_ids))
-            p_texts = [self._source_serializer.write_inventory_to_string(p)
-                       for p in parent_invs]
-            target_lines = multiparent.MultiParent.from_patch(bytes).to_lines(
-                p_texts)
-            sha1 = osutils.sha_strings(target_lines)
-            if sha1 != metadata['sha1']:
-                raise errors.BadBundle("Can't convert to target format")
-            target_inv = self._source_serializer.read_inventory_from_string(
-                ''.join(target_lines))
-            self._handle_root(target_inv, parent_ids)
-            try:
-                self._repository.add_inventory(revision_id, target_inv,
-                                               parent_ids)
-            except errors.UnsupportedInventoryKind:
-                raise errors.IncompatibleRevision(repr(self._repository))
+        # Use a 10MB text cache, since these are serialized XML inventory
+        # strings. Note that 10MB is fairly small for large projects (a single
+        # inventory can be >5MB). Another possibility is to cache 10-20
+        # inventory texts instead.
+        inventory_text_cache = lru_cache.LRUSizeCache(10*1024*1024)
+        pb = ui.ui_factory.nested_progress_bar()
+        try:
+            num_records = len(records)
+            for idx, (key, metadata, bytes) in enumerate(records):
+                pb.update('installing inventory', idx, num_records)
+                revision_id = key[-1]
+                parent_ids = metadata['parents']
+                # Note: This assumes the local ghosts are identical to the
+                #       ghosts in the source, as the Bundle serialization
+                #       format doesn't record ghosts.
+                p_texts = self._get_parent_inventory_texts(inventory_text_cache,
+                                                           parent_ids)
+                # Why does to_lines() take strings as the source? It seems it
+                # would have to split them into lines internally, and we get
+                # lines back only to join them into a string again.
+                target_lines = multiparent.MultiParent.from_patch(
+                    bytes).to_lines(p_texts)
+                inv_text = ''.join(target_lines)
+                del target_lines
+                sha1 = osutils.sha_string(inv_text)
+                if sha1 != metadata['sha1']:
+                    raise errors.BadBundle("Can't convert to target format")
+                # Add this to the cache so we don't have to extract it again.
+                inventory_text_cache[revision_id] = inv_text
+                target_inv = self._source_serializer.read_inventory_from_string(
+                    inv_text)
+                # TODO: we might try caching the parent inventories themselves,
+                #       and then using inv._make_delta and
+                #       add_inventory_by_delta instead of always using
+                #       add_inventory
+                self._handle_root(target_inv, parent_ids)
+                try:
+                    self._repository.add_inventory(revision_id, target_inv,
+                                                   parent_ids)
+                except errors.UnsupportedInventoryKind:
+                    raise errors.IncompatibleRevision(repr(self._repository))
+        finally:
+            pb.finished()
 
     def _handle_root(self, target_inv, parent_ids):
         revision_id = target_inv.revision_id
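
As an aside on one detail of the hunk above: the old code hashed the
reconstructed inventory with osutils.sha_strings(target_lines), while the new
code joins the lines first and uses osutils.sha_string(inv_text). Both should
produce the same digest, since sha_strings hashes the concatenation of the
strings it is given; the short standalone check below (hashlib only, sample
lines invented, not bzrlib code) illustrates the equivalence.

import hashlib

lines = ['<inventory format="7">\n', '  <file ... />\n', '</inventory>\n']

# Feed the lines into a single SHA-1 one at a time (what sha_strings does).
incremental = hashlib.sha1()
for line in lines:
    incremental.update(line.encode('utf-8'))

# Hash the joined text in one go (what sha_string does).
joined = hashlib.sha1(''.join(lines).encode('utf-8'))

assert incremental.hexdigest() == joined.hexdigest()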


