Rev 5730: (jameinel) Fix bug #737234. When running iter_entries_by_dir preload all the data in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Tue Mar 22 12:29:59 UTC 2011


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5730 [merge]
revision-id: pqm at pqm.ubuntu.com-20110322122953-gsrls0jajhfnobt5
parent: pqm at pqm.ubuntu.com-20110322095711-9bggm9tnxnw9frow
parent: john at arbash-meinel.com-20110322111606-z527d9dyd4si7e00
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2011-03-22 12:29:53 +0000
message:
  (jameinel) Fix bug #737234. When running iter_entries_by_dir preload all the
   data if we are going to be looking at it all. (John A Meinel)
modified:
  bzrlib/inventory.py            inventory.py-20050309040759-6648b84ca2005b37
  bzrlib/tests/test_inv.py       testinv.py-20050722220913-1dc326138d1a5892
  doc/en/release-notes/bzr-2.4.txt bzr2.4.txt-20110114053217-k7ym9jfz243fddjm-1
=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py	2010-09-14 13:12:20 +0000
+++ b/bzrlib/inventory.py	2011-03-22 11:16:06 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005-2010 Canonical Ltd
+# Copyright (C) 2005-2011 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -718,6 +718,14 @@
                 # if we finished all children, pop it off the stack
                 stack.pop()
 
+    def _preload_cache(self):
+        """Populate any caches, we are about to access all items.
+        
+        The default implementation does nothing, because CommonInventory doesn't
+        have a cache.
+        """
+        pass
+    
     def iter_entries_by_dir(self, from_dir=None, specific_file_ids=None,
         yield_parents=False):
         """Iterate over the entries in a directory first order.
@@ -736,6 +744,11 @@
             specific_file_ids = set(specific_file_ids)
         # TODO? Perhaps this should return the from_dir so that the root is
         # yielded? or maybe an option?
+        if from_dir is None and specific_file_ids is None:
+            # They are iterating from the root, and have not specified any
+            # specific entries to look at. All current callers fully consume the
+            # iterator, so we can safely assume we are accessing all entries
+            self._preload_cache()
         if from_dir is None:
             if self.root is None:
                 return
@@ -1390,6 +1403,7 @@
     def __init__(self, search_key_name):
         CommonInventory.__init__(self)
         self._fileid_to_entry_cache = {}
+        self._fully_cached = False
         self._path_to_fileid_cache = {}
         self._search_key_name = search_key_name
         self.root_id = None
@@ -1956,7 +1970,7 @@
 
     def iter_just_entries(self):
         """Iterate over all entries.
-        
+
         Unlike iter_entries(), just the entries are returned (not (path, ie))
         and the order of entries is undefined.
 
@@ -1970,6 +1984,59 @@
                 self._fileid_to_entry_cache[file_id] = ie
             yield ie
 
+    def _preload_cache(self):
+        """Make sure all file-ids are in _fileid_to_entry_cache"""
+        if self._fully_cached:
+            return # No need to do it again
+        # The optimal sort order is to use iteritems() directly
+        cache = self._fileid_to_entry_cache
+        for key, entry in self.id_to_entry.iteritems():
+            file_id = key[0]
+            if file_id not in cache:
+                ie = self._bytes_to_entry(entry)
+                cache[file_id] = ie
+            else:
+                ie = cache[file_id]
+        last_parent_id = last_parent_ie = None
+        pid_items = self.parent_id_basename_to_file_id.iteritems()
+        for key, child_file_id in pid_items:
+            if key == ('', ''): # This is the root
+                if child_file_id != self.root_id:
+                    raise ValueError('Data inconsistency detected.'
+                        ' We expected data with key ("","") to match'
+                        ' the root id, but %s != %s'
+                        % (child_file_id, self.root_id))
+                continue
+            parent_id, basename = key
+            ie = cache[child_file_id]
+            if parent_id == last_parent_id:
+                parent_ie = last_parent_ie
+            else:
+                parent_ie = cache[parent_id]
+            if parent_ie.kind != 'directory':
+                raise ValueError('Data inconsistency detected.'
+                    ' An entry in the parent_id_basename_to_file_id map'
+                    ' has parent_id {%s} but the kind of that object'
+                    ' is %r not "directory"' % (parent_id, parent_ie.kind))
+            if parent_ie._children is None:
+                parent_ie._children = {}
+            basename = basename.decode('utf-8')
+            if basename in parent_ie._children:
+                existing_ie = parent_ie._children[basename]
+                if existing_ie != ie:
+                    raise ValueError('Data inconsistency detected.'
+                        ' Two entries with basename %r were found'
+                        ' in the parent entry {%s}'
+                        % (basename, parent_id))
+            if basename != ie.name:
+                raise ValueError('Data inconsistency detected.'
+                    ' In the parent_id_basename_to_file_id map, file_id'
+                    ' {%s} is listed as having basename %r, but in the'
+                    ' id_to_entry map it is %r'
+                    % (child_file_id, basename, ie.name))
+            parent_ie._children[basename] = ie
+        self._fully_cached = True
+
     def iter_changes(self, basis):
         """Generate a Tree.iter_changes change list between this and basis.
 

=== modified file 'bzrlib/tests/test_inv.py'
--- a/bzrlib/tests/test_inv.py	2011-03-14 00:09:25 +0000
+++ b/bzrlib/tests/test_inv.py	2011-03-22 11:16:06 +0000
@@ -1223,6 +1223,88 @@
         self.assertEqual(('tree\xce\xa9name', 'tree-root-id', 'tree-rev-id'),
                          inv._bytes_to_utf8name_key(bytes))
 
+    def make_basic_utf8_inventory(self):
+        inv = Inventory()
+        inv.revision_id = "revid"
+        inv.root.revision = "rootrev"
+        root_id = inv.root.file_id
+        inv.add(InventoryFile("fileid", u'f\xefle', root_id))
+        inv["fileid"].revision = "filerev"
+        inv["fileid"].text_sha1 = "ffff"
+        inv["fileid"].text_size = 0
+        inv.add(InventoryDirectory("dirid", u'dir-\N{EURO SIGN}', root_id))
+        inv.add(InventoryFile("childid", u'ch\xefld', "dirid"))
+        inv["childid"].revision = "filerev"
+        inv["childid"].text_sha1 = "ffff"
+        inv["childid"].text_size = 0
+        chk_bytes = self.get_chk_bytes()
+        chk_inv = CHKInventory.from_inventory(chk_bytes, inv)
+        bytes = ''.join(chk_inv.to_lines())
+        return CHKInventory.deserialise(chk_bytes, bytes, ("revid",))
+
+    def test__preload_handles_utf8(self):
+        new_inv = self.make_basic_utf8_inventory()
+        self.assertEqual({}, new_inv._fileid_to_entry_cache)
+        self.assertFalse(new_inv._fully_cached)
+        new_inv._preload_cache()
+        self.assertEqual(
+            sorted([new_inv.root_id, "fileid", "dirid", "childid"]),
+            sorted(new_inv._fileid_to_entry_cache.keys()))
+        ie_root = new_inv._fileid_to_entry_cache[new_inv.root_id]
+        self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+                         sorted(ie_root._children.keys()))
+        ie_dir = new_inv._fileid_to_entry_cache['dirid']
+        self.assertEqual([u'ch\xefld'], sorted(ie_dir._children.keys()))
+
+    def test__preload_populates_cache(self):
+        inv = Inventory()
+        inv.revision_id = "revid"
+        inv.root.revision = "rootrev"
+        root_id = inv.root.file_id
+        inv.add(InventoryFile("fileid", "file", root_id))
+        inv["fileid"].revision = "filerev"
+        inv["fileid"].executable = True
+        inv["fileid"].text_sha1 = "ffff"
+        inv["fileid"].text_size = 1
+        inv.add(InventoryDirectory("dirid", "dir", root_id))
+        inv.add(InventoryFile("childid", "child", "dirid"))
+        inv["childid"].revision = "filerev"
+        inv["childid"].executable = False
+        inv["childid"].text_sha1 = "dddd"
+        inv["childid"].text_size = 1
+        chk_bytes = self.get_chk_bytes()
+        chk_inv = CHKInventory.from_inventory(chk_bytes, inv)
+        bytes = ''.join(chk_inv.to_lines())
+        new_inv = CHKInventory.deserialise(chk_bytes, bytes, ("revid",))
+        self.assertEqual({}, new_inv._fileid_to_entry_cache)
+        self.assertFalse(new_inv._fully_cached)
+        new_inv._preload_cache()
+        self.assertEqual(
+            sorted([root_id, "fileid", "dirid", "childid"]),
+            sorted(new_inv._fileid_to_entry_cache.keys()))
+        self.assertTrue(new_inv._fully_cached)
+        ie_root = new_inv._fileid_to_entry_cache[root_id]
+        self.assertEqual(['dir', 'file'], sorted(ie_root._children.keys()))
+        ie_dir = new_inv._fileid_to_entry_cache['dirid']
+        self.assertEqual(['child'], sorted(ie_dir._children.keys()))
+
+    def test__preload_handles_partially_evaluated_inventory(self):
+        new_inv = self.make_basic_utf8_inventory()
+        ie = new_inv[new_inv.root_id]
+        self.assertIs(None, ie._children)
+        self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+                         sorted(ie.children.keys()))
+        # Accessing .children loads _children
+        self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+                         sorted(ie._children.keys()))
+        new_inv._preload_cache()
+        # No change
+        self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+                         sorted(ie._children.keys()))
+        ie_dir = new_inv["dirid"]
+        self.assertEqual([u'ch\xefld'],
+                         sorted(ie_dir._children.keys()))
+
 
 class TestCHKInventoryExpand(tests.TestCaseWithMemoryTransport):
 

=== modified file 'doc/en/release-notes/bzr-2.4.txt'
--- a/doc/en/release-notes/bzr-2.4.txt	2011-03-17 17:18:11 +0000
+++ b/doc/en/release-notes/bzr-2.4.txt	2011-03-22 12:29:53 +0000
@@ -122,6 +122,13 @@
 * ``bzr export`` can now create ``.tar.xz`` and ``.tar.lzma`` files.
   (Jelmer Vernooij, #551714)
 
+* Getting all entries from ``CHKInventory.iter_entries_by_dir()`` has been
+  sped up dramatically for large trees. Iterating by dir is not the best
+  way to load data from a CHK inventory, so it preloads all the items in
+  the correct order. (With the gcc-tree, this goes from (re)reading 8GB
+  of CHK data, down to just 150MB.) This has noticeable effects for things
+  like building checkouts, etc.  (John Arbash Meinel, #737234)
+
 
 Bug Fixes
 *********




More information about the bazaar-commits mailing list