Rev 5632: (jameinel) Bug #737234, in file:///home/pqm/archives/thelove/bzr/2.3/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Wed Mar 23 12:00:14 UTC 2011
At file:///home/pqm/archives/thelove/bzr/2.3/
------------------------------------------------------------
revno: 5632 [merge]
revision-id: pqm at pqm.ubuntu.com-20110323120011-3b3tzjsj0er2rp4y
parent: pqm at pqm.ubuntu.com-20110322111324-5xg4h0hdp23pgc5x
parent: john at arbash-meinel.com-20110323110749-nrq2wd5slj4h7e51
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: 2.3
timestamp: Wed 2011-03-23 12:00:11 +0000
message:
(jameinel) Bug #737234,
preload the inventory during iter_entries_by_dir to avoid thrashing. (John A
Meinel)
modified:
bzrlib/inventory.py inventory.py-20050309040759-6648b84ca2005b37
bzrlib/tests/test_inv.py testinv.py-20050722220913-1dc326138d1a5892
doc/en/release-notes/bzr-2.3.txt NEWS-20050323055033-4e00b5db738777ff
=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py 2010-09-14 13:12:20 +0000
+++ b/bzrlib/inventory.py 2011-03-23 11:07:49 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005-2010 Canonical Ltd
+# Copyright (C) 2005-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -718,6 +718,14 @@
# if we finished all children, pop it off the stack
stack.pop()
+ def _preload_cache(self):
+ """Populate any caches; we are about to access all items.
+
+ The default implementation does nothing, because CommonInventory doesn't
+ have a cache.
+ """
+ pass
+
def iter_entries_by_dir(self, from_dir=None, specific_file_ids=None,
yield_parents=False):
"""Iterate over the entries in a directory first order.
@@ -736,6 +744,11 @@
specific_file_ids = set(specific_file_ids)
# TODO? Perhaps this should return the from_dir so that the root is
# yielded? or maybe an option?
+ if from_dir is None and specific_file_ids is None:
+ # They are iterating from the root, and have not specified any
+ # specific entries to look at. All current callers fully consume the
+ # iterator, so we can safely assume we are accessing all entries
+ self._preload_cache()
if from_dir is None:
if self.root is None:
return
@@ -1390,6 +1403,7 @@
def __init__(self, search_key_name):
CommonInventory.__init__(self)
self._fileid_to_entry_cache = {}
+ self._fully_cached = False
self._path_to_fileid_cache = {}
self._search_key_name = search_key_name
self.root_id = None
@@ -1956,7 +1970,7 @@
def iter_just_entries(self):
"""Iterate over all entries.
-
+
Unlike iter_entries(), just the entries are returned (not (path, ie))
and the order of entries is undefined.
@@ -1970,6 +1984,59 @@
self._fileid_to_entry_cache[file_id] = ie
yield ie
+ def _preload_cache(self):
+ """Make sure all file-ids are in _fileid_to_entry_cache"""
+ if self._fully_cached:
+ return # No need to do it again
+ # The optimal sort order is to use iteritems() directly
+ cache = self._fileid_to_entry_cache
+ for key, entry in self.id_to_entry.iteritems():
+ file_id = key[0]
+ if file_id not in cache:
+ ie = self._bytes_to_entry(entry)
+ cache[file_id] = ie
+ else:
+ ie = cache[file_id]
+ last_parent_id = last_parent_ie = None
+ pid_items = self.parent_id_basename_to_file_id.iteritems()
+ for key, child_file_id in pid_items:
+ if key == ('', ''): # This is the root
+ if child_file_id != self.root_id:
+ raise ValueError('Data inconsistency detected.'
+ ' We expected data with key ("","") to match'
+ ' the root id, but %s != %s'
+ % (child_file_id, self.root_id))
+ continue
+ parent_id, basename = key
+ ie = cache[child_file_id]
+ if parent_id == last_parent_id:
+ parent_ie = last_parent_ie
+ else:
+ parent_ie = cache[parent_id]
+ if parent_ie.kind != 'directory':
+ raise ValueError('Data inconsistency detected.'
+ ' An entry in the parent_id_basename_to_file_id map'
+ ' has parent_id {%s} but the kind of that object'
+ ' is %r not "directory"' % (parent_id, parent_ie.kind))
+ if parent_ie._children is None:
+ parent_ie._children = {}
+ basename = basename.decode('utf-8')
+ if basename in parent_ie._children:
+ existing_ie = parent_ie._children[basename]
+ if existing_ie != ie:
+ raise ValueError('Data inconsistency detected.'
+ ' Two entries with basename %r were found'
+ ' in the parent entry {%s}'
+ % (basename, parent_id))
+ if basename != ie.name:
+ raise ValueError('Data inconsistency detected.'
+ ' In the parent_id_basename_to_file_id map, file_id'
+ ' {%s} is listed as having basename %r, but in the'
+ ' id_to_entry map it is %r'
+ % (child_file_id, basename, ie.name))
+ parent_ie._children[basename] = ie
+ self._fully_cached = True
+
def iter_changes(self, basis):
"""Generate a Tree.iter_changes change list between this and basis.
=== modified file 'bzrlib/tests/test_inv.py'
--- a/bzrlib/tests/test_inv.py 2011-01-12 01:01:53 +0000
+++ b/bzrlib/tests/test_inv.py 2011-03-23 11:07:49 +0000
@@ -1219,6 +1219,88 @@
self.assertEqual(('tree\xce\xa9name', 'tree-root-id', 'tree-rev-id'),
inv._bytes_to_utf8name_key(bytes))
+ def make_basic_utf8_inventory(self):
+ inv = Inventory()
+ inv.revision_id = "revid"
+ inv.root.revision = "rootrev"
+ root_id = inv.root.file_id
+ inv.add(InventoryFile("fileid", u'f\xefle', root_id))
+ inv["fileid"].revision = "filerev"
+ inv["fileid"].text_sha1 = "ffff"
+ inv["fileid"].text_size = 0
+ inv.add(InventoryDirectory("dirid", u'dir-\N{EURO SIGN}', root_id))
+ inv.add(InventoryFile("childid", u'ch\xefld', "dirid"))
+ inv["childid"].revision = "filerev"
+ inv["childid"].text_sha1 = "ffff"
+ inv["childid"].text_size = 0
+ chk_bytes = self.get_chk_bytes()
+ chk_inv = CHKInventory.from_inventory(chk_bytes, inv)
+ bytes = ''.join(chk_inv.to_lines())
+ return CHKInventory.deserialise(chk_bytes, bytes, ("revid",))
+
+ def test__preload_handles_utf8(self):
+ new_inv = self.make_basic_utf8_inventory()
+ self.assertEqual({}, new_inv._fileid_to_entry_cache)
+ self.assertFalse(new_inv._fully_cached)
+ new_inv._preload_cache()
+ self.assertEqual(
+ sorted([new_inv.root_id, "fileid", "dirid", "childid"]),
+ sorted(new_inv._fileid_to_entry_cache.keys()))
+ ie_root = new_inv._fileid_to_entry_cache[new_inv.root_id]
+ self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+ sorted(ie_root._children.keys()))
+ ie_dir = new_inv._fileid_to_entry_cache['dirid']
+ self.assertEqual([u'ch\xefld'], sorted(ie_dir._children.keys()))
+
+ def test__preload_populates_cache(self):
+ inv = Inventory()
+ inv.revision_id = "revid"
+ inv.root.revision = "rootrev"
+ root_id = inv.root.file_id
+ inv.add(InventoryFile("fileid", "file", root_id))
+ inv["fileid"].revision = "filerev"
+ inv["fileid"].executable = True
+ inv["fileid"].text_sha1 = "ffff"
+ inv["fileid"].text_size = 1
+ inv.add(InventoryDirectory("dirid", "dir", root_id))
+ inv.add(InventoryFile("childid", "child", "dirid"))
+ inv["childid"].revision = "filerev"
+ inv["childid"].executable = False
+ inv["childid"].text_sha1 = "dddd"
+ inv["childid"].text_size = 1
+ chk_bytes = self.get_chk_bytes()
+ chk_inv = CHKInventory.from_inventory(chk_bytes, inv)
+ bytes = ''.join(chk_inv.to_lines())
+ new_inv = CHKInventory.deserialise(chk_bytes, bytes, ("revid",))
+ self.assertEqual({}, new_inv._fileid_to_entry_cache)
+ self.assertFalse(new_inv._fully_cached)
+ new_inv._preload_cache()
+ self.assertEqual(
+ sorted([root_id, "fileid", "dirid", "childid"]),
+ sorted(new_inv._fileid_to_entry_cache.keys()))
+ self.assertTrue(new_inv._fully_cached)
+ ie_root = new_inv._fileid_to_entry_cache[root_id]
+ self.assertEqual(['dir', 'file'], sorted(ie_root._children.keys()))
+ ie_dir = new_inv._fileid_to_entry_cache['dirid']
+ self.assertEqual(['child'], sorted(ie_dir._children.keys()))
+
+ def test__preload_handles_partially_evaluated_inventory(self):
+ new_inv = self.make_basic_utf8_inventory()
+ ie = new_inv[new_inv.root_id]
+ self.assertIs(None, ie._children)
+ self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+ sorted(ie.children.keys()))
+ # Accessing .children loads _children
+ self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+ sorted(ie._children.keys()))
+ new_inv._preload_cache()
+ # No change
+ self.assertEqual([u'dir-\N{EURO SIGN}', u'f\xefle'],
+ sorted(ie._children.keys()))
+ ie_dir = new_inv["dirid"]
+ self.assertEqual([u'ch\xefld'],
+ sorted(ie_dir._children.keys()))
+
class TestCHKInventoryExpand(tests.TestCaseWithMemoryTransport):
=== modified file 'doc/en/release-notes/bzr-2.3.txt'
--- a/doc/en/release-notes/bzr-2.3.txt 2011-03-21 14:35:33 +0000
+++ b/doc/en/release-notes/bzr-2.3.txt 2011-03-23 11:07:49 +0000
@@ -26,6 +26,13 @@
.. Improvements to existing commands, especially improved performance
or memory usage, or better results.
+* Getting all entries from ``CHKInventory.iter_entries_by_dir()`` has been
+ sped up dramatically for large trees. Iterating by dir is not the best
+ way to load data from a CHK inventory, so it preloads all the items in
+ the correct order. (With the gcc-tree, this changes it from (re)reading 8GB
+ of CHK data down to just 150MB.) This has noticeable effects for things
+ like building checkouts, etc. (John Arbash Meinel, #737234)
+
Bug Fixes
*********
More information about the bazaar-commits
mailing list