Rev 5079: Add bootstrapping code so that we can start with a file on disk and extract the information from its tail. In http://bzr.arbash-meinel.com/branches/bzr/lp/2.2.0b2-contained-pack

John Arbash Meinel john at arbash-meinel.com
Thu Mar 4 22:43:50 GMT 2010


At http://bzr.arbash-meinel.com/branches/bzr/lp/2.2.0b2-contained-pack

------------------------------------------------------------
revno: 5079
revision-id: john at arbash-meinel.com-20100304224322-1y9ot2twh0tv9g0g
parent: john at arbash-meinel.com-20100304220843-say9gc02mpn9ty6w
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.2.0b2-contained-pack
timestamp: Thu 2010-03-04 16:43:22 -0600
message:
  Add bootstrapping code so that we can start with a file on disk and extract the
  information from its tail.
-------------- next part --------------
=== modified file 'bzrlib/sack.py'
--- a/bzrlib/sack.py	2010-03-04 22:08:43 +0000
+++ b/bzrlib/sack.py	2010-03-04 22:43:22 +0000
@@ -29,6 +29,9 @@
     )
 from bzrlib.transport import file_view
 
+_HEADER_BASE = '\nBazaar Sack v'
+_VERSION = 1
+
 
 class TrailingIndexBuilder(object):
     """This is the final bit that gets written to the sack content.
@@ -44,12 +47,10 @@
 
     # Note that the header intentionally starts with '\n' so that it separates
     # from the rest of the data if you open it in a text editor
-    _HEADER_BASE = '\nBazaar Sack v'
-    _VERSION = 1
 
     def __init__(self, start_offset):
         self.start_offset = start_offset
-        self.version = self.__class__._VERSION
+        self.version = _VERSION
         self._index_builder = btree_index.BTreeBuilder(reference_lists=0,
                                                        key_elements=1)
 
@@ -60,8 +61,13 @@
         self._index_builder.add_node((index_type,), '%d %d' % (start, length))
 
     def finish(self):
+        # TODO: Perhaps this should be more like BTreeBuilder and return a
+        #       file-like object, alternatively it should take a 'writer'
+        #       function so that we can stream the data into it. For now,
+        #       we don't expect the TrailingIndex to be big enough to worry
+        #       about memory pressure, etc.
         chunks = []
-        chunks.append('%s%d\n' % (self._HEADER_BASE, self.version))
+        chunks.append('%s%d\n' % (_HEADER_BASE, self.version))
         chunks.append(self._index_builder.finish().read())
         self._index_builder = None
         chunks.append(struct.pack('!QI', self.start_offset, self.version))
@@ -69,25 +75,72 @@
 
 
 
+# Note: one downside of abstracting via the FileView is that even though all of
+#       the content exists in a single file, we aren't able to make a single
+#       'readv()' across all of the sub-indexes, because we fake that it is
+#       actually separate files.
+
 class TrailingIndex(object):
     """Track the root-structure for the Sack.
 
     See TrailingIndexBuilder for the structure of this tail.
+
+    :ivar start_offset: The offset in the sack file where the trailing index
+        starts.
+    :ivar version: The recorded version of the content
     """
+
     def __init__(self, version, start_offset):
         self.start_offset = start_offset
         self.version = version
+        self._end_of_file = None
+        self._base_transport = None
+        self._base_filename = None
+        # This is a BTreeGraphIndex mapping names => their offsets in the sack.
+        self._named_sections = None
+        # This is a transport.file_view.FileView which allows us to map names
+        # into offsets in the backing transport
+        self._file_view = None
+        self._section_file_map = {}
+
+    def _ensure_named_sections(self):
+        expected_header = '%s%d\n' % (_HEADER_BASE, self.version)
+        _, start = self._base_transport.readv(self._base_filename,
+            [(self.start_offset, len(expected_header))]).next()
+        assert start == expected_header
+        root_start = self.start_offset + len(expected_header)
+        root_end = self._end_of_file - 12
+        self._section_file_map = {'root-index': (root_start, root_end)}
+        self._file_view = file_view.FileView(self._base_transport,
+            self._base_filename, self._section_file_map)
+        self._named_sections = btree_index.BTreeGraphIndex(
+            self._file_view, 'root-index', root_end - root_start)
+        # Ensure that we have entries
+        for _, key, value in self._named_sections.iter_all_entries():
+            start, length = map(int, value.split())
+            assert len(key) == 1
+            name, = key
+            self._section_file_map[name] = (start, start+length)
 
     @staticmethod
-    def from_tail(bytes):
+    def parse_tail_bytes(bytes):
         """Get the meta-info out of the last 12 bytes of content."""
         offset, version = struct.unpack('!QI', bytes[-12:])
-        tie = TrailingIndex(version, offset)
-        return tie
+        ti = TrailingIndex(version, offset)
+        return ti
 
     @classmethod
-    def from_bytes(cls, bytes):
-        pass
+    def from_transport(cls, transport, filename):
+        # This is a bootstrap method, if all you have is the sack file, read
+        # its tail and figure out where the internal indices are.
+        file_st = transport.stat(filename)
+        _,tail = transport.readv(filename, [(file_st.st_size-12, 12)]).next()
+        ti = TrailingIndex.parse_tail_bytes(tail)
+        ti._base_transport = transport
+        ti._base_filename = filename
+        ti._end_of_file = file_st.st_size
+        ti._ensure_named_sections()
+        return ti
 
 
 class Sack(object):
@@ -104,6 +157,9 @@
 
       | blob-content | index1 | index2 | tail-index |
 
-    The individual indexes are likely to just be a BTreeIndex, the tail-index
-    is a simplified descriptor defining where to find the other indices.
+    The individual indexes are likely to just be a BTreeGraphIndex, the
+    tail-index is a simplified descriptor defining where to find the other
+    indices.
     """
+
+    # TODO: Rename this to IndexedPack or something like that

=== modified file 'bzrlib/tests/test_sack.py'
--- a/bzrlib/tests/test_sack.py	2010-03-04 22:08:43 +0000
+++ b/bzrlib/tests/test_sack.py	2010-03-04 22:43:22 +0000
@@ -81,16 +81,30 @@
 
 class TestTrailingIndex(tests.TestCase):
 
-    def assertFromTail(self, start_offset, version, bytes):
-        ti = sack.TrailingIndex.from_tail(bytes)
+    def assertTailBytes(self, start_offset, version, bytes):
+        ti = sack.TrailingIndex.parse_tail_bytes(bytes)
         self.assertIsInstance(ti, sack.TrailingIndex)
         self.assertEqual(start_offset, ti.start_offset)
         self.assertEqual(version, ti.version)
 
-    def test_from_tail(self):
-        self.assertFromTail(12345, 1,
-                            '\x00\x00\x00\x00\x00\x00\x30\x39'
-                            '\x00\x00\x00\x01')
-        self.assertFromTail(12345, 123,
-                            '\x00\x00\x00\x00\x00\x00\x30\x39'
-                            '\x00\x00\x00\x7b')
+    def test_parse_tail_bytes(self):
+        self.assertTailBytes(12345, 1,
+                             '\x00\x00\x00\x00\x00\x00\x30\x39'
+                             '\x00\x00\x00\x01')
+        self.assertTailBytes(12345, 123,
+                             '\x00\x00\x00\x00\x00\x00\x30\x39'
+                             '\x00\x00\x00\x7b')
+
+    def test_from_transport(self):
+        # We should be able to bootstrap all info starting with just a path on
+        # disk
+        builder = sack.TrailingIndexBuilder(start_offset=500)
+        builder.add_index_info('texts', 150, 350)
+        content = builder.finish()
+        t = memory.MemoryTransport('')
+        t.put_bytes('test.sack', ' '*500 + content)
+        ti = sack.TrailingIndex.from_transport(t, 'test.sack')
+        # We skip the 16-byte header at the beginning, and the 12-byte tail
+        self.assertEqual({'root-index': (516, 500+len(content)-12),
+                          'texts': (150, 500),
+                         }, ti._section_file_map)



More information about the bazaar-commits mailing list