Rev 5727: Fix bug #737234. Preload all entries for iter_entries_by_dir(). in http://bazaar.launchpad.net/~jameinel/bzr/2.4-cheaper-iter-entries-by-dir
John Arbash Meinel
john at arbash-meinel.com
Fri Mar 18 11:52:15 UTC 2011
At http://bazaar.launchpad.net/~jameinel/bzr/2.4-cheaper-iter-entries-by-dir
------------------------------------------------------------
revno: 5727
revision-id: john at arbash-meinel.com-20110318115158-6s4xs41w01x3xcxv
parent: pqm at pqm.ubuntu.com-20110316073305-1r89t4gis3nawye3
fixes bug(s): https://launchpad.net/bugs/737234
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.4-cheaper-iter-entries-by-dir
timestamp: Fri 2011-03-18 12:51:58 +0100
message:
Fix bug #737234. Preload all entries for iter_entries_by_dir().
None of the users of the API break out of the loop early. So if
the request made is going to touch all entries, then it is best
to just read them all in, and pre-populate all of the
InventoryEntries. Otherwise, ordering-by-directory was fairly
worst-case in large trees, because each directory would then
read all of its file-ids, which would hit most of the CHKMap,
but if it couldn't all be cached in memory, it would be
basically fully (re)read for every directory.
This drops reading gcc-linaro's full inventory from 4m30s to 11s
locally, and decreases the amount of data read from 8GB down to
128MB.
-------------- next part --------------
=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py 2010-09-14 13:12:20 +0000
+++ b/bzrlib/inventory.py 2011-03-18 11:51:58 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005-2010 Canonical Ltd
+# Copyright (C) 2005-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -718,6 +718,14 @@
# if we finished all children, pop it off the stack
stack.pop()
+ def _preload_cache(self):
+ """Populate any caches, we are about to access all items.
+
+ The default implementation does nothing, because CommonInventory doesn't
+ have a cache.
+ """
+ pass
+
def iter_entries_by_dir(self, from_dir=None, specific_file_ids=None,
yield_parents=False):
"""Iterate over the entries in a directory first order.
@@ -736,6 +744,11 @@
specific_file_ids = set(specific_file_ids)
# TODO? Perhaps this should return the from_dir so that the root is
# yielded? or maybe an option?
+ if from_dir is None and specific_file_ids is None:
+ # They are iterating from the root, and have not specified any
+ # specific entries to look at. All current callers fully consume the
+ # iterator, so we can safely assume we are accessing all entries
+ self._preload_cache()
if from_dir is None:
if self.root is None:
return
@@ -1390,6 +1403,7 @@
def __init__(self, search_key_name):
CommonInventory.__init__(self)
self._fileid_to_entry_cache = {}
+ self._fully_cached = False
self._path_to_fileid_cache = {}
self._search_key_name = search_key_name
self.root_id = None
@@ -1969,7 +1983,40 @@
ie = self._bytes_to_entry(entry)
self._fileid_to_entry_cache[file_id] = ie
yield ie
-
+
+ def _preload_cache(self):
+ """Make sure all file-ids are in _fileid_to_entry_cache"""
+ if self._fully_cached:
+ return # No need to do it again
+ # The optimal sort order is to use iteritems() directly
+ cache = self._fileid_to_entry_cache
+ for key, entry in self.id_to_entry.iteritems():
+ file_id = key[0]
+ if file_id not in cache:
+ ie = self._bytes_to_entry(entry)
+ cache[file_id] = ie
+ else:
+ ie = cache[file_id]
+ last_parent_id = last_parent_ie = None
+ pid_items = self.parent_id_basename_to_file_id.iteritems()
+ for key, child_file_id in pid_items:
+ if key == ('', ''): # This is the root
+ assert child_file_id == self.root_id
+ continue
+ parent_id, basename = key
+ ie = cache[child_file_id]
+ if parent_id == last_parent_id:
+ parent_ie = last_parent_ie
+ else:
+ parent_ie = cache[parent_id]
+ assert parent_ie.kind == 'directory'
+ if parent_ie._children is None:
+ parent_ie._children = {}
+ assert basename not in parent_ie._children
+ assert basename == ie.name
+ parent_ie._children[basename] = ie
+ self._fully_cached = True
+
def iter_changes(self, basis):
"""Generate a Tree.iter_changes change list between this and basis.
=== modified file 'bzrlib/tests/test_inv.py'
--- a/bzrlib/tests/test_inv.py 2011-03-14 00:09:25 +0000
+++ b/bzrlib/tests/test_inv.py 2011-03-18 11:51:58 +0000
@@ -1222,6 +1222,38 @@
self.assertIsInstance(ie2.name, unicode)
self.assertEqual(('tree\xce\xa9name', 'tree-root-id', 'tree-rev-id'),
inv._bytes_to_utf8name_key(bytes))
+
+ def test__preload_populates_cache(self):
+ inv = Inventory()
+ inv.revision_id = "revid"
+ inv.root.revision = "rootrev"
+ root_id = inv.root.file_id
+ inv.add(InventoryFile("fileid", "file", root_id))
+ inv["fileid"].revision = "filerev"
+ inv["fileid"].executable = True
+ inv["fileid"].text_sha1 = "ffff"
+ inv["fileid"].text_size = 1
+ inv.add(InventoryDirectory("dirid", "dir", root_id))
+ inv.add(InventoryFile("childid", "child", "dirid"))
+ inv["childid"].revision = "filerev"
+ inv["childid"].executable = False
+ inv["childid"].text_sha1 = "dddd"
+ inv["childid"].text_size = 1
+ chk_bytes = self.get_chk_bytes()
+ chk_inv = CHKInventory.from_inventory(chk_bytes, inv)
+ bytes = ''.join(chk_inv.to_lines())
+ new_inv = CHKInventory.deserialise(chk_bytes, bytes, ("revid",))
+ self.assertEqual({}, new_inv._fileid_to_entry_cache)
+ self.assertFalse(new_inv._fully_cached)
+ new_inv._preload_cache()
+ self.assertEqual(
+ sorted([root_id, "fileid", "dirid", "childid"]),
+ sorted(new_inv._fileid_to_entry_cache.keys()))
+ self.assertTrue(new_inv._fully_cached)
+ ie_root = new_inv._fileid_to_entry_cache[root_id]
+ self.assertEqual(['dir', 'file'], sorted(ie_root._children.keys()))
+ ie_dir = new_inv._fileid_to_entry_cache['dirid']
+ self.assertEqual(['child'], sorted(ie_dir._children.keys()))
class TestCHKInventoryExpand(tests.TestCaseWithMemoryTransport):
=== modified file 'doc/en/release-notes/bzr-2.4.txt'
--- a/doc/en/release-notes/bzr-2.4.txt 2011-03-15 08:07:04 +0000
+++ b/doc/en/release-notes/bzr-2.4.txt 2011-03-18 11:51:58 +0000
@@ -70,6 +70,13 @@
* ``bzr export`` can now create ``.tar.xz`` and ``.tar.lzma`` files.
(Jelmer Vernooij, #551714)
+* Getting all entries from ``CHKInventory.iter_entries_by_dir()`` has been
+ sped up dramatically for large trees. Iterating by dir is not the best
+ way to load data from a CHK inventory, so it preloads all the items in
+ the correct order. (With the gcc-tree, this changes it (re)reading 8GB
+ of CHK data, down to just 150MB.) This has noticeable effects for things
+ like building checkouts, etc. (John Arbash Meinel, #737234)
+
Bug Fixes
*********
More information about the bazaar-commits
mailing list