Rev 5085: Move the actual index memo into a separate class. in http://bzr.arbash-meinel.com/branches/bzr/lp/2.2.0b2-contained-pack
John Arbash Meinel
john at arbash-meinel.com
Fri Mar 5 20:00:00 GMT 2010
At http://bzr.arbash-meinel.com/branches/bzr/lp/2.2.0b2-contained-pack
------------------------------------------------------------
revno: 5085
revision-id: john at arbash-meinel.com-20100305195926-ehqikk194yyn0v6s
parent: john at arbash-meinel.com-20100305194041-psk80jrojuznuzrt
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.2.0b2-contained-pack
timestamp: Fri 2010-03-05 13:59:26 -0600
message:
Move the actual index memo into a separate class.
We want to be able to construct the meta-index from either the 'pack-names'
file or from the tail of the actual pack file. This also moves the
serialization and deserialization into one place (the new SectionInfo class).
-------------- next part --------------
=== modified file 'bzrlib/sack.py'
--- a/bzrlib/sack.py 2010-03-05 19:40:41 +0000
+++ b/bzrlib/sack.py 2010-03-05 19:59:26 +0000
@@ -33,6 +33,28 @@
_VERSION = 1
+class SectionInfo(object):
+ """Track the information about the extra indexes."""
+
+ def __init__(self):
+ self._sections = {}
+
+ def to_bytes(self):
+ # TODO: Should this bencode chunk be zlib compressed? I don't expect
+ # it will be particularly long, but it is ascii, and might
+ # compress well. (quick testing only showed 67b => 63b, which
+ # isn't worthwhile)
+ return bencode.bencode(self._sections)
+
+ @classmethod
+ def from_bytes(cls, bytes):
+ sections = bencode.bdecode_as_tuple(bytes)
+ assert type(sections) is dict
+ res = cls()
+ res._sections = sections
+ return res
+
+
class TrailingIndexBuilder(object):
"""This is the final bit that gets written to the sack content.
@@ -51,14 +73,14 @@
def __init__(self, start_offset):
self.start_offset = start_offset
self.version = _VERSION
- self._index_info = {}
+ self._section_info = SectionInfo()
def add_index_info(self, index_type, start, length):
# Note: bzr-search uses a ContainerWriter to write out the bytes, and
# then adjusts the offsets so that it skips the 'Pack' overhead bytes.
# I guess I don't really see the benefit versus the crufty overhead...
- assert index_type not in self._index_info
- self._index_info[index_type] = (start, length)
+ assert index_type not in self._section_info._sections
+ self._section_info._sections[index_type] = (start, length)
def finish(self):
# TODO: Perhaps this should be more like BTreeBuilder and return a
@@ -66,68 +88,53 @@
# function so that we can stream the data into it. For now,
# we don't expect the TrailingIndex to be big enough to worry
# about memory pressure, etc.
+ # Testing has shown it to be in the ~100 bytes category, which is
+ # quite tiny.
chunks = []
chunks.append('%s%d\n' % (_HEADER_BASE, self.version))
- # TODO: Should this bencode chunk be zlib compressed? I don't expect
- # it will be particularly long, but it is ascii, and probably
- # will compress well.
- chunks.append(bencode.bencode(self._index_info))
+ chunks.append(self._section_info.to_bytes())
chunks.append(struct.pack('!QI', self.start_offset, self.version))
return ''.join(chunks)
-# Note: one downside of abstracting via the FileView is that even though all of
-# the content exists in a single file, we aren't able to make a single
-# 'readv()' across all of the sub-indexes, because we fake that it is
-# actually separate files.
-
class TrailingIndex(object):
"""Track the root-structure for the Sack.
See TrailingIndexBuilder for the structure of this tail.
-
- :ivar start_offset: The offset in the sack file where the trailing index
- starts.
- :ivar version: The recorded version of the content
"""
- def __init__(self, version, start_offset):
- self.start_offset = start_offset
- self.version = version
+ def __init__(self, transport, filename):
# Map the name to the start and size of each section
- self._section_file_map = {}
- self._transport = None
- self._filename = None
-
- def _read_named_sections(self, end_of_file):
- expected_header = '%s%d\n' % (_HEADER_BASE, self.version)
- _, tail = self._transport.readv(self._filename,
- [(self.start_offset, end_of_file - self.start_offset)]).next()
- assert tail.startswith(expected_header)
- index_info_bytes = tail[len(expected_header):-12]
- index_info = bencode.bdecode_as_tuple(index_info_bytes)
- assert type(index_info) is dict
- # Ensure that we have entries
- self._section_file_map.update(index_info)
+ self._section_info = None
+ self._transport = transport
+ self._filename = filename
@staticmethod
def parse_tail_bytes(bytes):
"""Get the meta-info out of the last 12 bytes of content."""
- offset, version = struct.unpack('!QI', bytes[-12:])
- ti = TrailingIndex(version, offset)
- return ti
+ return struct.unpack('!QI', bytes[-12:])
@classmethod
def from_transport(cls, transport, filename):
# This is a bootstrap method, if all you have is the sack file, read
# its tail and figure out where the internal indices are.
file_st = transport.stat(filename)
+ # TODO: We could predict the length of the tail, and read extra bytes,
+ # rather than wasting a round-trip...
_, tail = transport.readv(filename, [(file_st.st_size-12, 12)]).next()
- ti = TrailingIndex.parse_tail_bytes(tail)
- ti._transport = transport
- ti._filename = filename
- ti._read_named_sections(file_st.st_size)
+ start_offset, version = TrailingIndex.parse_tail_bytes(tail)
+ expected_header = '%s%d\n' % (_HEADER_BASE, _VERSION)
+ _, tail = transport.readv(filename,
+ [(start_offset, file_st.st_size - start_offset)]).next()
+ assert tail.startswith(expected_header)
+ index_info_bytes = tail[len(expected_header):-12]
+ return cls.from_indicies_memo(transport, filename, index_info_bytes)
+
+ @classmethod
+ def from_indicies_memo(cls, transport, filename, memo_bytes):
+ ti = cls(transport, filename)
+ ti._section_info = SectionInfo.from_bytes(memo_bytes)
return ti
def get_named_index(self, name, index_class, **kwargs):
@@ -138,7 +145,7 @@
:param **kwargs: Any other named arguments will be passed to the index
constructor
"""
- start, length = self._section_file_map[name]
+ start, length = self._section_info._sections[name]
return index_class(self._transport, self._filename, size=length,
**kwargs)
=== modified file 'bzrlib/tests/test_sack.py'
--- a/bzrlib/tests/test_sack.py 2010-03-05 19:40:41 +0000
+++ b/bzrlib/tests/test_sack.py 2010-03-05 19:59:26 +0000
@@ -88,10 +88,8 @@
class TestTrailingIndex(tests.TestCase):
def assertTailBytes(self, start_offset, version, bytes):
- ti = sack.TrailingIndex.parse_tail_bytes(bytes)
- self.assertIsInstance(ti, sack.TrailingIndex)
- self.assertEqual(start_offset, ti.start_offset)
- self.assertEqual(version, ti.version)
+ self.assertEqual((start_offset, version),
+ sack.TrailingIndex.parse_tail_bytes(bytes))
def test_parse_tail_bytes(self):
self.assertTailBytes(12345, 1,
@@ -111,7 +109,7 @@
t.put_bytes('test.sack', ' '*500 + content)
ti = sack.TrailingIndex.from_transport(t, 'test.sack')
# We skip the 16-byte header at the beginning, and the 12-byte tail
- self.assertEqual({'texts': (150, 350)}, ti._section_file_map)
+ self.assertEqual({'texts': (150, 350)}, ti._section_info._sections)
def test_get_named_index(self):
index_builder = btree_index.BTreeBuilder(0, 1)
@@ -127,7 +125,8 @@
t = memory.MemoryTransport('')
t.put_bytes('test.sack', content)
ti = sack.TrailingIndex.from_transport(t, 'test.sack')
- self.assertEqual({'texts': (0, trail_start)}, ti._section_file_map)
+ self.assertEqual({'texts': (0, trail_start)},
+ ti._section_info._sections)
text_index = ti.get_named_index('texts', btree_index.BTreeGraphIndex)
assert_index_content(self, {('key1',): ('value1',),
('key2',): ('value2',),
More information about the bazaar-commits
mailing list