Rev 2991: Add a new repositoy method _generate_text_key_index for use by reconcile/check. in http://people.ubuntu.com/~robertc/baz2.0/reconcile

Robert Collins robertc at robertcollins.net
Thu Nov 15 02:07:51 GMT 2007


At http://people.ubuntu.com/~robertc/baz2.0/reconcile

------------------------------------------------------------
revno: 2991
revision-id:robertc at robertcollins.net-20071115020737-51o1kmv3jsf9x8aa
parent: robertc at robertcollins.net-20071114042424-ww2cu4nqfo5fv6c4
committer: Robert Collins <robertc at robertcollins.net>
branch nick: _generate_text_key_index
timestamp: Thu 2007-11-15 13:07:37 +1100
message:
  Add a new repositoy method _generate_text_key_index for use by reconcile/check.
added:
  bzrlib/tests/repository_implementations/test__generate_text_key_index.py test__generate_text_-20071114232121-00h9fd8qg8kjfa5k-1
modified:
  bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
  bzrlib/remote.py               remote.py-20060720103555-yeeg2x51vn0rbtdp-1
  bzrlib/repository.py           rev_storage.py-20051111201905-119e9401e46257e3
  bzrlib/tests/repository_implementations/__init__.py __init__.py-20060131092037-9564957a7d4a841b
  bzrlib/tests/repository_implementations/test_check_reconcile.py test_broken.py-20070928125406-62236394w0jpbpd6-2
  bzrlib/tests/test_graph.py     test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
=== added file 'bzrlib/tests/repository_implementations/test__generate_text_key_index.py'
--- a/bzrlib/tests/repository_implementations/test__generate_text_key_index.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/repository_implementations/test__generate_text_key_index.py	2007-11-15 02:07:37 +0000
@@ -0,0 +1,30 @@
+# Copyright (C) 2007 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+"""Tests for the _generate_text_key_index API."""
+
+
+from bzrlib.tests.repository_implementations import TestCaseWithRepository
+
+
+class TestGenerateTextKeyIndex(TestCaseWithRepository):
+
+    def test_empty(self):
+        repo = self.make_repository('.')
+        repo.lock_read()
+        self.addCleanup(repo.unlock)
+        self.assertEqual({}, repo._generate_text_key_index())

=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py	2007-10-24 06:48:13 +0000
+++ b/bzrlib/graph.py	2007-11-15 02:07:37 +0000
@@ -44,6 +44,17 @@
 # 2. Since len(['D', 'E']) > 1, find_lca('D', 'E') => ['A']
 
 
+class DictParentsProvider(object):
+
+    def __init__(self, ancestry):
+        self.ancestry = ancestry
+
+    def __repr__(self):
+        return 'DictParentsProvider(%r)' % self.ancestry
+
+    def get_parents(self, revisions):
+        return [self.ancestry.get(r, None) for r in revisions]
+
 
 class _StackedParentsProvider(object):
 

=== modified file 'bzrlib/remote.py'
--- a/bzrlib/remote.py	2007-11-14 04:24:24 +0000
+++ b/bzrlib/remote.py	2007-11-15 02:07:37 +0000
@@ -319,6 +319,17 @@
         self._ensure_real()
         return self._real_repository.find_text_key_references()
 
+    def _generate_text_key_index(self):
+        """Generate a new text key index for the repository.
+
+        This is an expensive function that will take considerable time to run.
+
+        :return: A dict mapping (file_id, revision_id) tuples to a list of
+            parents, also (file_id, revision_id) tuples.
+        """
+        self._ensure_real()
+        return self._real_repository._generate_text_key_index()
+
     def get_revision_graph(self, revision_id=None):
         """See Repository.get_revision_graph()."""
         if revision_id is None:

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2007-11-14 04:24:24 +0000
+++ b/bzrlib/repository.py	2007-11-15 02:07:37 +0000
@@ -39,6 +39,7 @@
     revision as _mod_revision,
     symbol_versioning,
     transactions,
+    tsort,
     ui,
     )
 from bzrlib.bundle import serializer
@@ -1227,6 +1228,91 @@
                 raise errors.NoSuchIdInRepository(self, file_id)
             yield callable_data, weave.get_lines(revision_id)
 
+    def _generate_text_key_index(self):
+        """Generate a new text key index for the repository.
+
+        This is an expensive function that will take considerable time to run.
+
+        :return: A dict mapping text keys ((file_id, revision_id) tuples) to a
+            list of parents, also text keys. When a given key has no parents,
+            the parents list will be [NULL_REVISION].
+        """
+        # All revisions, to find inventory parents.
+        revision_graph = self.get_revision_graph_with_ghosts()
+        ancestors = revision_graph.get_ancestors()
+        revision_order = tsort.topo_sort(ancestors)
+        invalid_keys = set()
+        revision_keys = {}
+        for revision_id in revision_order:
+            revision_keys[revision_id] = set()
+        text_key_references = self.find_text_key_references()
+        text_count = len(text_key_references)
+        # a cache of the text keys to allow reuse; costs a dict of all the
+        # keys, but saves a 2-tuple for every child of a given key.
+        text_key_cache = {}
+        for text_key, valid in text_key_references.iteritems():
+            if not valid:
+                invalid_keys.add(text_key)
+            else:
+                revision_keys[text_key[1]].add(text_key)
+            text_key_cache[text_key] = text_key
+        del text_key_references
+        text_index = {}
+        text_graph = graph.Graph(graph.DictParentsProvider(text_index))
+        NULL_REVISION = _mod_revision.NULL_REVISION
+        pb = ui.ui_factory.nested_progress_bar()
+        batch_size = 10 # should be ~150MB on a 55K path tree
+        batch_count = len(revision_order) / batch_size + 1
+        processed_texts = 0
+        pb.update("Calculating text parents.", processed_texts, text_count)
+        for offset in xrange(batch_count):
+            to_query = revision_order[offset * batch_size:(offset + 1) *
+                batch_size]
+            if not to_query:
+                break
+            for rev_tree in self.revision_trees(to_query):
+                revision_id = rev_tree.get_revision_id()
+                parent_ids = ancestors[revision_id]
+                for text_key in revision_keys[revision_id]:
+                    pb.update("Calculating text parents.", processed_texts)
+                    processed_texts += 1
+                    candidate_parents = []
+                    for parent_id in parent_ids:
+                        parent_text_key = (text_key[0], parent_id)
+                        try:
+                            check_parent = parent_text_key not in \
+                                revision_keys[parent_id]
+                        except KeyError:
+                            # the parent parent_id is a ghost:
+                            check_parent = False
+                            # truncate the derived graph against this ghost.
+                            parent_text_key = None
+                        if check_parent:
+                            # look at the parent commit details inventories to
+                            # determine possible candidates in the per file graph.
+                            # TODO: cache here.
+                            parent_inv = self.revision_tree(parent_id).inventory
+                            parent_entry = parent_inv._byid.get(
+                                text_key[0], None)
+                            if parent_entry is not None:
+                                parent_text_key = (
+                                    text_key[0], parent_entry.revision)
+                            else:
+                                parent_text_key = None
+                        if parent_text_key is not None:
+                            candidate_parents.append(
+                                text_key_cache[parent_text_key])
+                    parent_heads = text_graph.heads(candidate_parents)
+                    new_parents = list(parent_heads)
+                    new_parents.sort(key=lambda x:candidate_parents.index(x))
+                    if new_parents == []:
+                        new_parents = [NULL_REVISION]
+                    text_index[text_key] = new_parents
+
+        for text_key in invalid_keys:
+            text_index[text_key] = [NULL_REVISION]
+        return text_index
+
     def item_keys_introduced_by(self, revision_ids, _files_pb=None):
         """Get an iterable listing the keys of all the data introduced by a set
         of revision IDs.

=== modified file 'bzrlib/tests/repository_implementations/__init__.py'
--- a/bzrlib/tests/repository_implementations/__init__.py	2007-11-14 04:24:24 +0000
+++ b/bzrlib/tests/repository_implementations/__init__.py	2007-11-15 02:07:37 +0000
@@ -27,6 +27,7 @@
 from bzrlib import (
     repository,
     )
+from bzrlib.revision import NULL_REVISION
 from bzrlib.repofmt import (
     weaverepo,
     )
@@ -143,6 +144,13 @@
     def corrected_fulltexts(self):
         return []
 
+    def repository_text_key_index(self):
+        result = {}
+        if self.versioned_root:
+            result.update(self.versioned_repository_text_keys())
+        result.update(self.repository_text_keys())
+        return result
+
 
 class UndamagedRepositoryScenario(BrokenRepoScenario):
     """A scenario where the repository has no damage.
@@ -177,6 +185,12 @@
         result.update({('a-file-id', 'rev1a'): True})
         return result
 
+    def repository_text_keys(self):
+        return {('a-file-id', 'rev1a'):[NULL_REVISION]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'rev1a'):[NULL_REVISION]}
+
 
 class FileParentIsNotInRevisionAncestryScenario(BrokenRepoScenario):
     """A scenario where a revision 'rev2' has 'a-file' with a
@@ -236,6 +250,14 @@
                        ('a-file-id', 'rev2'): True})
         return result
 
+    def repository_text_keys(self):
+        return {('a-file-id', 'rev1a'):[NULL_REVISION],
+                ('a-file-id', 'rev2'):[('a-file-id', 'rev1a')]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'rev1a'):[NULL_REVISION],
+                ('TREE_ROOT', 'rev2'):[('TREE_ROOT', 'rev1a')]}
+
 
 class FileParentHasInaccessibleInventoryScenario(BrokenRepoScenario):
     """A scenario where a revision 'rev3' containing 'a-file' modified in
@@ -296,6 +318,14 @@
                        ('a-file-id', 'rev3'): True})
         return result
 
+    def repository_text_keys(self):
+        return {('a-file-id', 'rev2'):[NULL_REVISION],
+                ('a-file-id', 'rev3'):[NULL_REVISION]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'rev2'):[NULL_REVISION],
+                ('TREE_ROOT', 'rev3'):[NULL_REVISION]}
+
 
 class FileParentsNotReferencedByAnyInventoryScenario(BrokenRepoScenario):
     """A scenario where a repository with file 'a-file' which has extra
@@ -433,6 +463,24 @@
                        ('a-file-id', 'rev5'): True})
         return result
 
+    def repository_text_keys(self):
+        return {('a-file-id', 'rev1a'): [NULL_REVISION],
+                 ('a-file-id', 'rev2c'): [('a-file-id', 'rev1a')],
+                 ('a-file-id', 'rev3'): [('a-file-id', 'rev1a')],
+                 ('a-file-id', 'rev4'): [('a-file-id', 'rev1a')],
+                 ('a-file-id', 'rev5'): [('a-file-id', 'rev2c')]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'rev1a'): [NULL_REVISION],
+                ('TREE_ROOT', 'rev2'): [('TREE_ROOT', 'rev1a')],
+                ('TREE_ROOT', 'rev2b'): [('TREE_ROOT', 'rev1a')],
+                ('TREE_ROOT', 'rev2c'): [('TREE_ROOT', 'rev1a')],
+                ('TREE_ROOT', 'rev3'): [('TREE_ROOT', 'rev1a')],
+                ('TREE_ROOT', 'rev4'):
+                    [('TREE_ROOT', 'rev2'), ('TREE_ROOT', 'rev2b')],
+                ('TREE_ROOT', 'rev5'):
+                    [('TREE_ROOT', 'rev2'), ('TREE_ROOT', 'rev2c')]}
+
 
 class UnreferencedFileParentsFromNoOpMergeScenario(BrokenRepoScenario):
     """
@@ -516,6 +564,20 @@
                        ('a-file-id', 'rev4'): True})
         return result
 
+    def repository_text_keys(self):
+        return {('a-file-id', 'rev1a'): [NULL_REVISION],
+                ('a-file-id', 'rev1b'): [NULL_REVISION],
+                ('a-file-id', 'rev2'): [NULL_REVISION],
+                ('a-file-id', 'rev4'): [('a-file-id', 'rev2')]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'rev1a'): [NULL_REVISION],
+                ('TREE_ROOT', 'rev1b'): [NULL_REVISION],
+                ('TREE_ROOT', 'rev2'):
+                    [('TREE_ROOT', 'rev1a'), ('TREE_ROOT', 'rev1b')],
+                ('TREE_ROOT', 'rev3'): [('TREE_ROOT', 'rev2')],
+                ('TREE_ROOT', 'rev4'): [('TREE_ROOT', 'rev3')]}
+
 
 class TooManyParentsScenario(BrokenRepoScenario):
     """A scenario where 'broken-revision' of 'a-file' claims to have parents
@@ -577,6 +639,18 @@
                        ('a-file-id', 'good-parent'): True})
         return result
              
+    def repository_text_keys(self):
+        return {('a-file-id', 'bad-parent'): [NULL_REVISION],
+                ('a-file-id', 'broken-revision'):
+                    [('a-file-id', 'good-parent')],
+                ('a-file-id', 'good-parent'): [('a-file-id', 'bad-parent')]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'bad-parent'): [NULL_REVISION],
+                ('TREE_ROOT', 'broken-revision'):
+                    [('TREE_ROOT', 'good-parent')],
+                ('TREE_ROOT', 'good-parent'): [('TREE_ROOT', 'bad-parent')]}
+
 
 class ClaimedFileParentDidNotModifyFileScenario(BrokenRepoScenario):
     """A scenario where the file parent is the same as the revision parent, but
@@ -646,6 +720,17 @@
         result.update({('a-file-id', 'basis'): True,
                        ('a-file-id', 'current'): True})
         return result
+
+    def repository_text_keys(self):
+        return {('a-file-id', 'basis'): [NULL_REVISION],
+                ('a-file-id', 'current'): [('a-file-id', 'basis')]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'basis'): ['null:'],
+                ('TREE_ROOT', 'current'):
+                    [('TREE_ROOT', 'modified-something-else')],
+                ('TREE_ROOT', 'modified-something-else'):
+                    [('TREE_ROOT', 'basis')]}
             
 
 class IncorrectlyOrderedParentsScenario(BrokenRepoScenario):
@@ -724,6 +809,22 @@
                        ('a-file-id', 'parent-1'): True,
                        ('a-file-id', 'parent-2'): True})
         return result
+
+    def repository_text_keys(self):
+        return {('a-file-id', 'broken-revision-1-2'):
+                    [('a-file-id', 'parent-1'), ('a-file-id', 'parent-2')],
+                ('a-file-id', 'broken-revision-2-1'):
+                    [('a-file-id', 'parent-2'), ('a-file-id', 'parent-1')],
+                ('a-file-id', 'parent-1'): [NULL_REVISION],
+                ('a-file-id', 'parent-2'): [NULL_REVISION]}
+
+    def versioned_repository_text_keys(self):
+        return {('TREE_ROOT', 'broken-revision-1-2'):
+                    [('TREE_ROOT', 'parent-1'), ('TREE_ROOT', 'parent-2')],
+                ('TREE_ROOT', 'broken-revision-2-1'):
+                    [('TREE_ROOT', 'parent-2'), ('TREE_ROOT', 'parent-1')],
+                ('TREE_ROOT', 'parent-1'): [NULL_REVISION],
+                ('TREE_ROOT', 'parent-2'): [NULL_REVISION]}
                
 
 all_broken_scenario_classes = [
@@ -770,6 +871,7 @@
         'test_fetch',
         'test_fileid_involved',
         'test_find_text_key_references',
+        'test__generate_text_key_index',
         'test_has_same_location',
         'test_is_write_locked',
         'test_iter_reverse_revision_history',

=== modified file 'bzrlib/tests/repository_implementations/test_check_reconcile.py'
--- a/bzrlib/tests/repository_implementations/test_check_reconcile.py	2007-11-14 04:24:24 +0000
+++ b/bzrlib/tests/repository_implementations/test_check_reconcile.py	2007-11-15 02:07:37 +0000
@@ -206,3 +206,11 @@
         self.addCleanup(repo.unlock)
         self.assertEqual(scenario.repository_text_key_references(),
             repo.find_text_key_references())
+
+    def test__generate_text_key_index(self):
+        """Test that the generated text key index has all entries."""
+        repo, scenario = self.prepare_test_repository()
+        repo.lock_read()
+        self.addCleanup(repo.unlock)
+        self.assertEqual(scenario.repository_text_key_index(),
+            repo._generate_text_key_index())

=== modified file 'bzrlib/tests/test_graph.py'
--- a/bzrlib/tests/test_graph.py	2007-10-22 05:44:49 +0000
+++ b/bzrlib/tests/test_graph.py	2007-11-15 02:07:37 +0000
@@ -145,18 +145,6 @@
         return self._real_parents_provider.get_parents(nodes)
 
 
-class DictParentsProvider(object):
-
-    def __init__(self, ancestry):
-        self.ancestry = ancestry
-
-    def __repr__(self):
-        return 'DictParentsProvider(%r)' % self.ancestry
-
-    def get_parents(self, revisions):
-        return [self.ancestry.get(r, None) for r in revisions]
-
-
 class TestGraph(TestCaseWithMemoryTransport):
 
     def make_graph(self, ancestors):
@@ -303,8 +291,8 @@
 
     def test_stacked_parents_provider(self):
 
-        parents1 = DictParentsProvider({'rev2': ['rev3']})
-        parents2 = DictParentsProvider({'rev1': ['rev4']})
+        parents1 = _mod_graph.DictParentsProvider({'rev2': ['rev3']})
+        parents2 = _mod_graph.DictParentsProvider({'rev1': ['rev4']})
         stacked = _mod_graph._StackedParentsProvider([parents1, parents2])
         self.assertEqual([['rev4',], ['rev3']],
                          stacked.get_parents(['rev1', 'rev2']))



More information about the bazaar-commits mailing list