Rev 5085: Move the actual index memo into a separate class. in http://bzr.arbash-meinel.com/branches/bzr/lp/2.2.0b2-contained-pack

Fri Mar 5 20:00:00 GMT 2010

At http://bzr.arbash-meinel.com/branches/bzr/lp/2.2.0b2-contained-pack

------------------------------------------------------------
revno: 5085
revision-id: john at arbash-meinel.com-20100305195926-ehqikk194yyn0v6s
parent: john at arbash-meinel.com-20100305194041-psk80jrojuznuzrt
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.2.0b2-contained-pack
timestamp: Fri 2010-03-05 13:59:26 -0600
message:
  Move the actual index memo into a separate class.
  
  We want to be able to construct the meta-index from either the 'pack-names'
  file or from the tail of the actual pack file. This also moves the
  serialization and deserialization into a nice collection.
-------------- next part --------------
=== modified file 'bzrlib/sack.py'

--- a/bzrlib/sack.py	2010-03-05 19:40:41 +0000
+++ b/bzrlib/sack.py	2010-03-05 19:59:26 +0000
@@ -33,6 +33,28 @@
 _VERSION = 1
 
 
+class SectionInfo(object):
+    """Track the information about the extra indexes."""
+
+    def __init__(self):
+        self._sections = {}
+
+    def to_bytes(self):
+        # TODO: Should this bencode chunk be zlib compressed? I don't expect
+        #       it will be particularly long, but it is ascii, and might
+        #       compress well. (quick testing only showed 67b => 63b, which
+        #       isn't worthwhile)
+        return bencode.bencode(self._sections)
+
+    @classmethod
+    def from_bytes(cls, bytes):
+        sections = bencode.bdecode_as_tuple(bytes)
+        assert type(sections) is dict
+        res = cls()
+        res._sections = sections
+        return res
+
+
 class TrailingIndexBuilder(object):
     """This is the final bit that gets written to the sack content.
 
@@ -51,14 +73,14 @@
     def __init__(self, start_offset):
         self.start_offset = start_offset
         self.version = _VERSION
-        self._index_info = {}
+        self._section_info = SectionInfo()
 
     def add_index_info(self, index_type, start, length):
         # Note: bzr-search uses a ContainerWriter to write out the bytes, and
         # then adjusts the offsets so that it skips the 'Pack' overhead bytes.
         # I guess I don't really see the benefit versus the crufty overhead...
-        assert index_type not in self._index_info
-        self._index_info[index_type] = (start, length)
+        assert index_type not in self._section_info._sections
+        self._section_info._sections[index_type] = (start, length)
 
     def finish(self):
         # TODO: Perhaps this should be more like BTreeBuilder and return a
@@ -66,68 +88,53 @@
         #       function so that we can stream the data into it. For now,
         #       we don't expect the TrailingIndex to be big enough to worry
         #       about memory pressure, etc.
+        #       Testing has shown it to be in the ~100bytes category, which is
+        #       quite tiny
         chunks = []
         chunks.append('%s%d\n' % (_HEADER_BASE, self.version))
-        # TODO: Should this bencode chunk be zlib compressed? I don't expect
-        #       it will be particularly long, but it is ascii, and probably
-        #       will compress well.
-        chunks.append(bencode.bencode(self._index_info))
+        chunks.append(self._section_info.to_bytes())
         chunks.append(struct.pack('!QI', self.start_offset, self.version))
         return ''.join(chunks)
 
 
 
-# Note: one downside of abstracting via the FileView is that even though all of
-#       the content exists in a single file, we aren't able to make a single
-#       'readv()' across all of the sub-indexes, because we fake that it is
-#       actually separate files.
-
 class TrailingIndex(object):
     """Track the root-structure for the Sack.
 
     See TrailingIndexBuilder for the structure of this tail.
-
-    :ivar start_offset: The offset in the sack file where the trailing index
-        starts.
-    :ivar version: The recorded version of the content
     """
 
-    def __init__(self, version, start_offset):
-        self.start_offset = start_offset
-        self.version = version
+    def __init__(self, transport, filename):
         # Map the name to the start and size of each section
-        self._section_file_map = {}
-        self._transport = None
-        self._filename = None
-
-    def _read_named_sections(self, end_of_file):
-        expected_header = '%s%d\n' % (_HEADER_BASE, self.version)
-        _, tail = self._transport.readv(self._filename,
-            [(self.start_offset, end_of_file - self.start_offset)]).next()
-        assert tail.startswith(expected_header)
-        index_info_bytes = tail[len(expected_header):-12]
-        index_info = bencode.bdecode_as_tuple(index_info_bytes)
-        assert type(index_info) is dict
-        # Ensure that we have entries
-        self._section_file_map.update(index_info)
+        self._section_info = None
+        self._transport = transport
+        self._filename = filename
 
     @staticmethod
     def parse_tail_bytes(bytes):
         """Get the meta-info out of the last 12 bytes of content."""
-        offset, version = struct.unpack('!QI', bytes[-12:])
-        ti = TrailingIndex(version, offset)
-        return ti
+        return struct.unpack('!QI', bytes[-12:])
 
     @classmethod
     def from_transport(cls, transport, filename):
         # This is a bootstrap method, if all you have is the sack file, read
         # its tail and figure out where the internal indices are.
         file_st = transport.stat(filename)
+        # TODO: We could predict the length of the tail, and read extra bytes,
+        #       rather than wasting a round-trip...
         _, tail = transport.readv(filename, [(file_st.st_size-12, 12)]).next()
-        ti = TrailingIndex.parse_tail_bytes(tail)
-        ti._transport = transport
-        ti._filename = filename
-        ti._read_named_sections(file_st.st_size)
+        start_offset, version = TrailingIndex.parse_tail_bytes(tail)
+        expected_header = '%s%d\n' % (_HEADER_BASE, _VERSION)
+        _, tail = transport.readv(filename,
+            [(start_offset, file_st.st_size - start_offset)]).next()
+        assert tail.startswith(expected_header)
+        index_info_bytes = tail[len(expected_header):-12]
+        return cls.from_indicies_memo(transport, filename, index_info_bytes)
+
+    @classmethod
+    def from_indicies_memo(cls, transport, filename, memo_bytes):
+        ti = cls(transport, filename)
+        ti._section_info = SectionInfo.from_bytes(memo_bytes)
         return ti
 
     def get_named_index(self, name, index_class, **kwargs):
@@ -138,7 +145,7 @@
         :param **kwargs: Any other named arguments will be passed to the index
             constructor
         """
-        start, length = self._section_file_map[name]
+        start, length = self._section_info._sections[name]
         return index_class(self._transport, self._filename, size=length,
                            **kwargs)
 

=== modified file 'bzrlib/tests/test_sack.py'
--- a/bzrlib/tests/test_sack.py	2010-03-05 19:40:41 +0000
+++ b/bzrlib/tests/test_sack.py	2010-03-05 19:59:26 +0000
@@ -88,10 +88,8 @@
 class TestTrailingIndex(tests.TestCase):
 
     def assertTailBytes(self, start_offset, version, bytes):
-        ti = sack.TrailingIndex.parse_tail_bytes(bytes)
-        self.assertIsInstance(ti, sack.TrailingIndex)
-        self.assertEqual(start_offset, ti.start_offset)
-        self.assertEqual(version, ti.version)
+        self.assertEqual((start_offset, version),
+                         sack.TrailingIndex.parse_tail_bytes(bytes))
 
     def test_parse_tail_bytes(self):
         self.assertTailBytes(12345, 1,
@@ -111,7 +109,7 @@
         t.put_bytes('test.sack', ' '*500 + content)
         ti = sack.TrailingIndex.from_transport(t, 'test.sack')
         # We skip the 16-byte header at the beginning, and the 12-byte tail
-        self.assertEqual({'texts': (150, 350)}, ti._section_file_map)
+        self.assertEqual({'texts': (150, 350)}, ti._section_info._sections)
 
     def test_get_named_index(self):
         index_builder = btree_index.BTreeBuilder(0, 1)
@@ -127,7 +125,8 @@
         t = memory.MemoryTransport('')
         t.put_bytes('test.sack', content)
         ti = sack.TrailingIndex.from_transport(t, 'test.sack')
-        self.assertEqual({'texts': (0, trail_start)}, ti._section_file_map)
+        self.assertEqual({'texts': (0, trail_start)},
+                         ti._section_info._sections)
         text_index = ti.get_named_index('texts', btree_index.BTreeGraphIndex)
         assert_index_content(self, {('key1',): ('value1',),
                                     ('key2',): ('value2',),