Rev 5674: Start working on some code to expand a path into a bunch of file-ids that represent that path, in http://bazaar.launchpad.net/~jameinel/bzr/2.4-log-subdir
John Arbash Meinel
john at arbash-meinel.com
Tue Feb 22 21:01:19 UTC 2011
At http://bazaar.launchpad.net/~jameinel/bzr/2.4-log-subdir
------------------------------------------------------------
revno: 5674
revision-id: john at arbash-meinel.com-20110222210114-1h9rkncrxpyt67p5
parent: john at arbash-meinel.com-20110219220159-q0f2594tg73m6hda
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.4-log-subdir
timestamp: Tue 2011-02-22 15:01:14 -0600
message:
Start working on some code to expand a path into a bunch of file-ids that
represent that path.
Initial results are pretty promising compared to the existing 'inventory.filter()' code path.
For 2000 mainline revs of bzr, looking at all of bzrlib, the new code took 3.3s vs 54.7s with --filter.
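
As a rough sketch of the intended usage (illustrative only; it assumes the
CHKDeltaSearch class added in the diff below and the 'rev-1' tree layout used
in the new tests):

    # Hypothetical usage sketch, not part of the commit itself.
    from bzrlib import branch, delta_search

    b = branch.Branch.open('.')
    b.lock_read()
    try:
        tree = b.repository.revision_tree('rev-1')
        searcher = delta_search.CHKDeltaSearch(b.repository)
        # Map the path to a file-id, then expand it to its parents (up to
        # the root) and all of its children.
        file_id = tree.path2id('dir/subfile')
        file_ids = searcher._expand_file_ids(tree, [file_id])
        # => set(['a-root-id', 'dir-id', 'subfile-id'])
    finally:
        b.unlock()

The time_delta_search.py script added below drives the same call once per
mainline revision to produce the timing above.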
-------------- next part --------------
=== modified file 'bzrlib/delta_search.py'
--- a/bzrlib/delta_search.py 2011-02-19 22:01:59 +0000
+++ b/bzrlib/delta_search.py 2011-02-22 21:01:14 +0000
@@ -122,15 +122,86 @@
self._path_file_id_chk = None
self._matching_file_ids = None
+ def _find_parent_ids(self, tree, file_ids):
+ """Find the parents up to the root for these file_ids.
+
+ Also, determine which of these entries are directories.
+ :param tree: The Tree whose context we are evaluating
+ :param file_ids: The file_ids whose parents we want to find
+ :return: all_file_ids, directory_ids
+ all_file_ids: All file-ids found that are parents of the
+ supplied ids, including the supplied file_ids themselves.
+ directory_ids: The subset of file_ids that are directories and
+ should therefore be searched for children.
+ """
+ # First find the parent ids. We go via the Inventory interface for this
+ # part: the 'id_to_entry' map takes us from file_id => entry, which gives
+ # us the parent_id. We may change this API a bit, since bytes => entry
+ # seems to be a dominant time factor.
+ next_file_ids = file_ids
+ # These are entries we already know about and don't need to search
+ # again. "None" is the parent of the root file id; we never search for
+ # it.
+ all_file_ids = set()
+ known_parent_ids = set([None])
+ first = True
+ directory_ids = set()
+ while next_file_ids:
+ cur_file_ids = next_file_ids
+ next_file_ids = set()
+ for file_id in cur_file_ids:
+ entry = tree.inventory[file_id]
+ next_file_ids.add(entry.parent_id)
+ all_file_ids.add(file_id)
+ if first and entry.kind == 'directory':
+ directory_ids.add(file_id)
+ first = False
+ # The parent of the root id is None
+ # TODO: If next_file_ids turns out to be much smaller than
+ # known_parent_ids, we probably want a different function
+ # here.
+ next_file_ids.difference_update(known_parent_ids)
+ return all_file_ids, directory_ids
+
+ def _expand_children_ids(self, tree, file_ids, all_file_ids):
+ """find children of the given file_ids, recursively."""
+ # pid_map is (file_id, basename) => child_file_id
+ # Searching for (file_id,) returns all keys that are children.
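+ # (Illustrative example, using ids from the new tests: searching for
+ # [('dir-id',)] yields (('dir-id', 'subfile'), 'subfile-id') for the one
+ # child of 'dir-id'.)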
+ pid_map = tree.inventory.parent_id_basename_to_file_id
+ searched_file_ids = set(file_ids)
+ next_search_keys = [(f_id,) for f_id in file_ids]
+ while next_search_keys:
+ search_keys = next_search_keys
+ next_search_keys = []
+ for pid_basename, file_id in pid_map.iteritems(search_keys):
+ all_file_ids.add(file_id)
+ if file_id not in searched_file_ids:
+ # At this point, file_id may be a file or a directory. But
+ # rather than page in the id_to_entry map and build lots of
+ # InventoryEntry objects, we just search for children:
+ # directories will have children, files won't.
+ # Note that we could abuse the Entry cache for CHKInventory,
+ # and pre-resolve anything that is already in the cache.
+ searched_file_ids.add(file_id)
+ next_search_keys.append((file_id,))
+
+ def _expand_file_ids(self, tree, file_ids):
+ """Expand file ids to include direct parents, and all children."""
+ pid_map = tree.inventory.parent_id_basename_to_file_id
+ if pid_map.key() == self._path_file_id_chk:
+ return self._matching_file_ids
+ all_file_ids, directory_ids = self._find_parent_ids(tree, file_ids)
+ self._expand_children_ids(tree, directory_ids, all_file_ids)
+ self._matching_file_ids = all_file_ids
+ self._path_file_id_chk = pid_map.key()
+ return all_file_ids
+
def _expand_paths(self, tree):
pid_map = tree.inventory.parent_id_basename_to_file_id
if pid_map.key() == self._path_file_id_chk:
return self._matching_file_ids
file_ids = tree.paths2ids(self.search_paths)
- file_ids, _ = tree.inventory._expand_fileids_to_parents_and_children(
- file_ids)
- self._matching_file_ids = file_ids
- self._path_file_id_chk = pid_map.key()
+ file_ids = self._expand_file_ids(tree, file_ids)
return file_ids
def _path_search(self, revisions, trees):
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2011-02-11 17:12:35 +0000
+++ b/bzrlib/tests/__init__.py 2011-02-22 21:01:14 +0000
@@ -3748,9 +3748,10 @@
'bzrlib.tests.test_conflicts',
'bzrlib.tests.test_counted_lock',
'bzrlib.tests.test_crash',
+ 'bzrlib.tests.test_debug',
'bzrlib.tests.test_decorators',
'bzrlib.tests.test_delta',
- 'bzrlib.tests.test_debug',
+ 'bzrlib.tests.test_delta_search',
'bzrlib.tests.test_deprecated_graph',
'bzrlib.tests.test_diff',
'bzrlib.tests.test_directory_service',
=== added file 'bzrlib/tests/test_delta_search.py'
--- a/bzrlib/tests/test_delta_search.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_delta_search.py 2011-02-22 21:01:14 +0000
@@ -0,0 +1,73 @@
+# Copyright (C) 2011 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Search for data in historical revisions."""
+
+from bzrlib import (
+ delta_search,
+ tests,
+ )
+
+
+class TestCHKDeltaSearch(tests.TestCaseWithMemoryTransport):
+ # Note: These tests assume that the default repository format supports CHK,
+ # which is true for the foreseeable future.
+
+ def test__expand_file_ids_single(self):
+ builder = self.make_branch_builder('build')
+ builder.build_snapshot('rev-1', None,
+ [('add', ('', 'a-root-id', 'directory', None)),
+ ('add', ('file', 'file-id', 'file', 'contents\n')),
+ ])
+ branch = builder.get_branch()
+ self.addCleanup(branch.lock_read().unlock)
+ search = delta_search.CHKDeltaSearch(branch.repository)
+ tree = branch.repository.revision_tree('rev-1')
+ self.assertEqual(set(['a-root-id', 'file-id']),
+ search._expand_file_ids(tree, ['file-id']))
+
+ def test__expand_file_ids_subdir(self):
+ builder = self.make_branch_builder('build')
+ builder.build_snapshot('rev-1', None,
+ [('add', ('', 'a-root-id', 'directory', None)),
+ ('add', ('file', 'file-id', 'file', 'contents\n')),
+ ('add', ('dir', 'dir-id', 'directory', None)),
+ ('add', ('dir/subfile', 'subfile-id', 'file', 'contents\n')),
+ ])
+ branch = builder.get_branch()
+ self.addCleanup(branch.lock_read().unlock)
+ search = delta_search.CHKDeltaSearch(branch.repository)
+ tree = branch.repository.revision_tree('rev-1')
+ # We should include dir and root, but not 'file'
+ self.assertEqual(set(['a-root-id', 'dir-id', 'subfile-id']),
+ search._expand_file_ids(tree, ['subfile-id']))
+
+ def test__expand_file_ids_includes_children(self):
+ builder = self.make_branch_builder('build')
+ builder.build_snapshot('rev-1', None,
+ [('add', ('', 'a-root-id', 'directory', None)),
+ ('add', ('file', 'file-id', 'file', 'contents\n')),
+ ('add', ('dir', 'dir-id', 'directory', None)),
+ ('add', ('dir/subfile', 'subfile-id', 'file', 'contents\n')),
+ ])
+ branch = builder.get_branch()
+ self.addCleanup(branch.lock_read().unlock)
+ search = delta_search.CHKDeltaSearch(branch.repository)
+ tree = branch.repository.revision_tree('rev-1')
+ # We should recurse up to dir and root, and down to subfile, but not
+ # across to file.
+ self.assertEqual(set(['a-root-id', 'dir-id', 'subfile-id']),
+ search._expand_file_ids(tree, ['dir-id']))
=== added file 'time_delta_search.py'
--- a/time_delta_search.py 1970-01-01 00:00:00 +0000
+++ b/time_delta_search.py 2011-02-22 21:01:14 +0000
@@ -0,0 +1,48 @@
+import optparse
+import time
+from bzrlib import branch, initialize, trace, ui
+
+p = optparse.OptionParser()
+p.add_option('--path', type='str', default='bzrlib', help='Set the search path')
+p.add_option('--num-revs', type='int', default=100,
+ help='Set how many revs to process, 0 => do all of them')
+p.add_option('--filter', action='store_true', default=False,
+ help='Use inventory.filter() instead of delta_search')
+p.add_option('--show-count', action='store_true', default=False,
+ help='Show the count of file_ids per revision.')
+p.add_option('--no-cache', dest='cache', action='store_false', default=True,
+ help='Disable caching based on pid_map chk')
+opts, args = p.parse_args()
+
+with initialize():
+ b = branch.Branch.open('.')
+ b.lock_read()
+ search = b.repository.get_delta_searcher()
+ file_id = None
+ tstart = time.time()
+ count = 0
+ total_count = 0
+ pb = ui.ui_factory.nested_progress_bar()
+ for idx, rev_id in enumerate(
+ b.repository.iter_reverse_revision_history(b.last_revision())):
+ tree = b.repository.revision_tree(rev_id)
+ if file_id is None:
+ file_id = tree.path2id(opts.path)
+ if opts.filter:
+ file_ids = tree.inventory.filter([file_id])._byid.keys()
+ else:
+ file_ids = search._expand_file_ids(tree, [file_id])
+ if not opts.cache:
+ search = b.repository.get_delta_searcher()
+ if opts.show_count:
+ print rev_id, len(file_ids)
+ count += 1
+ total_count += len(file_ids)
+ pb.update('counting', idx, opts.num_revs)
+ if opts.num_revs and count >= opts.num_revs:
+ break
+ tend = time.time()
+
+ trace.note('Found total %d file_ids' % (total_count,))
+ trace.note('Took %.3fs for %d revisions and path: %s'
+ % (tend - tstart, count, opts.path))