Rev 2747: some support for storing inventories into packs in http://sourcefrog.net/bzr/inv-split

Wed Aug 29 09:08:55 BST 2007

At http://sourcefrog.net/bzr/inv-split

------------------------------------------------------------
revno: 2747
revision-id: mbp at sourcefrog.net-20070829080854-xwt7zhkap0nwgj74
parent: mbp at sourcefrog.net-20070829064742-7jfhlbl7y2d82os6
committer: Martin Pool <mbp at sourcefrog.net>
branch nick: inv-split
timestamp: Wed 2007-08-29 18:08:54 +1000
message:
  some support for storing inventories into packs
modified:
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/inventory_split.py      inventory_lazy.py-20070822123225-v3guzmdkesxlfesa-1
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/tests/test_inventory_split.py test_inventory_lazy.-20070822123233-9yyaaq16ypoy6rpt-1
  bzrlib/tests/test_pack_repository.py test_pack_repository-20070828111851-nof5soh31tidz2dq-1
=== modified file 'bzrlib/index.py'

--- a/bzrlib/index.py	2007-08-24 22:36:01 +0000
+++ b/bzrlib/index.py	2007-08-29 08:08:54 +0000
@@ -598,7 +598,10 @@
     def add_nodes(self, nodes):
         """Add nodes to the index.
 
-        :param nodes: An iterable of (key, node_refs, value) entries to add.
+        :param nodes: An iterable of index entries to add.  If this
+            index has reference lists, each entry is a
+            (key, value, node_refs) tuple; otherwise each is (key, value) --
+            the empty reference list can (and must) be omitted.
         """
         if self.reference_lists:
             for (key, value, node_refs) in nodes:

=== modified file 'bzrlib/inventory_split.py'
--- a/bzrlib/inventory_split.py	2007-08-29 06:47:42 +0000
+++ b/bzrlib/inventory_split.py	2007-08-29 08:08:54 +0000
@@ -30,6 +30,7 @@
 
 from bzrlib import (
     errors,
+    osutils,
     xml5,
     )
 from bzrlib.inventory import (
@@ -93,6 +94,9 @@
     def _iter_serialized_parts(self):
         """Yield a sequence of serialized hunks for this inventory.
 
+        Yields (dir_hash, dir_bytes) pairs, where dir_bytes is the bencoded
+        representation of a directory and dir_hash is its sha_string hash.
+
         Each of these needs to be inserted into the repository to 
         completely store the inventory.
         """
@@ -118,7 +122,8 @@
             else:
                 raise NotImplementedError(
                     "don't know how to encode %r" % ie)
-        yield bencode(tuples)
+        dir_bytes = bencode(tuples)
+        yield osutils.sha_string(dir_bytes), dir_bytes
 
 
 # TODO: Index by path, as well as by id.

=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py	2007-08-28 11:00:49 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2007-08-29 08:08:54 +0000
@@ -1153,7 +1153,8 @@
         self.repo._inv_write_index = InMemoryGraphIndex(reference_lists=2)
         # if we have created an inventory index, add the new write index to it
         if getattr(self.repo, '_inv_all_indices', None) is not None:
-            self.repo._inv_all_indices.insert_index(0, self.repo._inv_write_index)
+            self.repo._inv_all_indices.insert_index(0,
+                self.repo._inv_write_index)
             # we don't bother updating the knit layer, because there is not
             # defined interface for adding inventories that should need the 
             # existing knit to be changed - its all behind 'repo.add_inventory'.
@@ -1190,8 +1191,42 @@
         self._inventory_add_lines(inv_vf, revision_id, parents,
                                   osutils.split_lines(inv_text))
         return inv_sha1
-        import pdb;pdb.set_trace()
-
+
+    def _add_split_inventory(self, split_inventory):
+        """Add a SplitInventory part by part into the repository.
+
+        This must be called in a write group.  Returns the last hash written.
+        """
+        index_additions = []
+        for dir_hash, dir_bytes in split_inventory._iter_serialized_parts():
+            # TODO: look in the existing index; if this text is in there then
+            # don't write it again.
+            self._add_content_by_hash(dir_hash, dir_bytes)
+        # Return the last hash, which should be the root.  If no parts were
+        # yielded, dir_hash is unbound and this raises; that is reasonable.
+        return dir_hash
+
+    def _add_content_by_hash(self, new_hash, new_bytes):
+        """Add content to the repository indexed by hash.
+        """
+        offset, length = self._open_pack_writer.add_bytes_record(
+            new_bytes, [(new_hash,)])
+        key = (new_hash,)
+        value = "%d %d" % (offset, length)
+        self._hash_write_index.add_nodes([(key, value)])
+
+    def _start_hash_index(self):
+        self._hash_write_index = InMemoryGraphIndex(reference_lists=0)
+
+    def _abort_hash_index(self):
+        del self._hash_write_index
+
+    def _commit_hash_index(self, new_pack_name):
+        new_hash_index_name = new_pack_name + '.hix'
+        self.transport.put_file(new_hash_index_name,
+                self._hash_write_index.finish())
+        del self._hash_write_index
+                    
 
 class GraphKnitRepository1(_GraphKnitRepositoryBase, KnitRepository):
     """Experimental graph-knit using repository."""
@@ -1242,6 +1277,7 @@
         self._revision_store.setup()
         self.weave_store.setup()
         self._inv_thunk.setup()
+        self._start_hash_index()
 
     def _commit_write_group(self):
         data_inserted = (self._revision_store.data_inserted() or
@@ -1268,6 +1304,7 @@
             self.weave_store.flush(new_name)
             self._inv_thunk.flush(new_name)
             self._revision_store.flush(new_name)
+            self._commit_hash_index(new_name)
             self._write_stream.close()
             self._upload_transport.rename(self._open_pack_tuple[1],
                 '../packs/' + new_name + '.pack')
@@ -1362,6 +1399,7 @@
         self._revision_store.setup()
         self.weave_store.setup()
         self._inv_thunk.setup()
+        self._start_hash_index()
 
     def _commit_write_group(self):
         data_inserted = (self._revision_store.data_inserted() or
@@ -1380,6 +1418,7 @@
             self.weave_store.flush(new_name)
             self._inv_thunk.flush(new_name)
             self._revision_store.flush(new_name)
+            self._commit_hash_index(new_name)
             self._write_stream.close()
             self._upload_transport.rename(self._open_pack_tuple[1],
                 '../packs/' + new_name + '.pack')

=== modified file 'bzrlib/tests/test_inventory_split.py'
--- a/bzrlib/tests/test_inventory_split.py	2007-08-29 06:47:42 +0000
+++ b/bzrlib/tests/test_inventory_split.py	2007-08-29 08:08:54 +0000
@@ -16,6 +16,7 @@
 
 
 from bzrlib.inventory_split import SplitInventory
+from bzrlib.osutils import sha_string
 from bzrlib.tests import (
         KnownFailure,
         TestCaseInTempDir,
@@ -43,8 +44,8 @@
         # an inventory that contains nothing but the root returns just one
         # empty directory entry.  
         self.assertEqual(1, len(parts))
-        self.assertEqual('le', parts[0])
-        self.assertEqualBencoded(
+        self.assertEqual((sha_string('le'), 'le'), parts[0])
+        self.checkDirText(
             [],
             parts[0])
 
@@ -53,11 +54,12 @@
         inv.add_path('f', kind='file', file_id='f-id')
         parts = list(inv._iter_serialized_parts())
         self.assertEqual(1, len(parts))
-        self.assertEqualBencoded([['.', 'f-id', 'f']],
+        self.checkDirText([['.', 'f-id', 'f']],
             parts[0])
 
-    def assertEqualBencoded(self, expected_obj, bencoded_actual):
-        self.assertEqual(expected_obj, bdecode(bencoded_actual))
+    def checkDirText(self, expected_obj, (dir_hash, dir_bytes)):
+        self.assertEqual(expected_obj, bdecode(dir_bytes))
+        self.assertEqual(sha_string(dir_bytes), dir_hash)
 
     # TODO: test that the returned inventory parts have just exactly the
     # format that we expect

=== modified file 'bzrlib/tests/test_pack_repository.py'
--- a/bzrlib/tests/test_pack_repository.py	2007-08-29 06:47:42 +0000
+++ b/bzrlib/tests/test_pack_repository.py	2007-08-29 08:08:54 +0000
@@ -17,33 +17,40 @@
 
 """Tests specific to the packed repository format."""
 
-from bzrlib import symbol_versioning
-from bzrlib.errors import (NotBranchError,
-                           NoSuchFile,
-                           UnknownFormatError,
-                           UnsupportedFormatError,
-                           )
-from bzrlib.index import GraphIndex
-from bzrlib.repository import RepositoryFormat
-from bzrlib.tests import TestCase, TestCaseWithTransport
-from bzrlib.transport import get_transport
-from bzrlib.transport.memory import MemoryServer
+
 from bzrlib import (
     bzrdir,
     errors,
     repository,
-    upgrade,
     workingtree,
     )
+from bzrlib.inventory_split import SplitInventory
+from bzrlib.repository import RepositoryFormat
+from bzrlib.tests import TestCase, TestCaseWithTransport
+from bzrlib.transport import get_transport
 from bzrlib.repofmt import pack_repo
 
 
 class TestSplitInventory(TestCaseWithTransport):
 
     def get_format(self):
-        # TODO: Update this when a permanent name is allocated
+        # XXX: Update this when a permanent name is allocated, or construct
+        # the format by hand so the test does not depend on a registry name.
         return bzrdir.format_registry.make_bzrdir('experimental')
 
     def test_add_split_inventory(self):
         branch = self.make_branch('t1', format=self.get_format())
         repo = branch.repository
+        inv = SplitInventory('root-id')
+        inv.add_path('hello', 'file', 'hello-id')
+        # now try to add that inventory into the repository: normally this
+        # would be done from within commit
+        repo.lock_write()
+        repo.start_write_group()
+        root_hash = repo._add_split_inventory(inv)
+        repo.commit_write_group()
+        # TODO: check that it was stored
+        repo.unlock()
+        # check it's a plausible hash name
+        self.assertEquals(40, len(root_hash))
+        # TODO: check we can retrieve it