Rev 3918: Another disk format change. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/disk_format
John Arbash Meinel
john at arbash-meinel.com
Fri Mar 27 16:57:00 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/disk_format
------------------------------------------------------------
revno: 3918
revision-id: john at arbash-meinel.com-20090327165644-xno4eq1vvq6n2otz
parent: john at arbash-meinel.com-20090327161738-l6z2yuuso0mnr1ca
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: disk_format
timestamp: Fri 2009-03-27 11:56:44 -0500
message:
Another disk format change.
We have pretty much settled on no-labels, at least until we implement
a gc+chk index (and then we can decide whether we want a fatter index
or to have more content in the group).
As such, we don't need to track the size of the header in the disk
bytestream anymore.
I decided to stick with ascii lengths in the outer wrapper.
Even large repositories have a small number of groups, so there aren't
many bytes actually wasted at that level. And it means that if you
'vim foo.pack' there are bits that can actually be interpreted.
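For illustration, here is a minimal sketch of how the new label-less layout
can be read back. It only covers the zlib ('gcb1z') variant exercised by the
tests, and parse_gcb_block() is a hypothetical standalone helper, not part of
bzrlib; the real parsing lives in GroupCompressBlock.from_bytes /
_parse_bytes in the diff below.

    import zlib

    def parse_gcb_block(data):
        # Illustrative sketch only; see GroupCompressBlock._parse_bytes
        # for the real implementation.
        # New layout: 'gcb1z\n', then two ascii decimal lengths
        # (compressed and uncompressed content), each newline-terminated,
        # then the zlib-compressed content itself.
        if not data.startswith('gcb1z\n'):
            raise ValueError('unknown compressor: %r' % (data,))
        pos = 6
        # 14 ascii digits can represent > 1TB, so cap the newline search.
        pos2 = data.index('\n', pos, pos + 14)
        z_content_length = int(data[pos:pos2])
        pos = pos2 + 1
        pos2 = data.index('\n', pos, pos + 14)
        content_length = int(data[pos:pos2])
        pos = pos2 + 1
        z_content = data[pos:]
        assert len(z_content) == z_content_length
        content = zlib.decompress(z_content)
        assert len(content) == content_length
        return content

Writing is the mirror image: 'gcb1z\n' + '%d\n%d\n' % (len(z_content),
len(content)) + z_content, which is also what the updated test_to_bytes
below checks.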
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-27 16:07:44 +0000
+++ b/bzrlib/groupcompress.py 2009-03-27 16:56:44 +0000
@@ -56,7 +56,6 @@
)
_USE_LZMA = False and (pylzma is not None)
-_NO_LABELS = True
_FAST = False
# osutils.sha_string('')
@@ -160,9 +159,6 @@
# map by key? or just order in file?
self._entries = {}
self._compressor_name = None
- self._z_header_length = None
- self._header_length = None
- self._z_header = None
self._z_content = None
self._z_content_decompressor = None
self._z_content_length = None
@@ -170,39 +166,10 @@
self._content = None
def __len__(self):
- return self._content_length + self._header_length
-
- def _parse_header(self):
- """Parse the header part of the block."""
- assert self._z_header is not None
- if self._z_header == '':
- # Nothing to process
- self._z_header = None
- return
- if self._compressor_name == 'lzma':
- header = pylzma.decompress(self._z_header)
- else:
- assert self._compressor_name == 'zlib'
- header = zlib.decompress(self._z_header)
- self._z_header = None # We have consumed the header
- lines = header.split('\n')
- del header
- info_dict = {}
- for line in lines:
- if not line: #End of record
- if not info_dict:
- break
- self.add_entry(**info_dict)
- info_dict = {}
- continue
- key, value = line.split(':', 1)
- if key == 'key':
- value = tuple(map(intern, value.split('\x00')))
- elif key in ('start', 'length'):
- value = int(value)
- elif key == 'type':
- value = intern(value)
- info_dict[key] = value
+ # This is the maximum number of bytes this object will reference if
+ # everything is decompressed. However, if we decompress less than
+ # everything... (this would cause some problems for LRUSizeCache)
+ return self._content_length + self._z_content_length
def _ensure_content(self, num_bytes=None):
"""Make sure that content has been expanded enough.
@@ -277,48 +244,25 @@
# The stream is finished
self._z_content_decompressor = None
- def _parse_bytes(self, bytes):
+ def _parse_bytes(self, bytes, pos):
"""Read the various lengths from the header.
This also populates the various 'compressed' buffers.
:return: The position in bytes just after the last newline
"""
- # At present, there are 4 lengths to be read, we have 2 integers for
- # the length of the compressed and uncompressed header, and 2 integers
- # for the compressed and uncompressed content
- # 14 bytes can represent > 1TB, so to avoid checking too far, cap the
- # search to 14 bytes.
- pos = bytes.index('\n', 6, 20)
- self._z_header_length = int(bytes[6:pos])
- pos += 1
- pos2 = bytes.index('\n', pos, pos + 14)
- self._header_length = int(bytes[pos:pos2])
- end_of_z_lengths = pos2
- pos2 += 1
- # Older versions don't have the content lengths, if we want to preserve
- # backwards compatibility, we could try/except over these, and allow
- # them to be skipped
- try:
- pos = bytes.index('\n', pos2, pos2 + 14)
- self._z_content_length = int(bytes[pos2:pos])
- pos += 1
- pos2 = bytes.index('\n', pos, pos + 14)
- self._content_length = int(bytes[pos:pos2])
- pos = pos2 + 1
- assert len(bytes) == (pos + self._z_header_length +
- self._z_content_length)
- pos2 = pos + self._z_header_length
- self._z_header = bytes[pos:pos2]
- self._z_content = bytes[pos2:]
- assert len(self._z_content) == self._z_content_length
- except ValueError:
- # This is the older form, which did not encode its content length
- pos = end_of_z_lengths + 1
- pos2 = pos + self._z_header_length
- self._z_header = bytes[pos:pos2]
- self._z_content = bytes[pos2:]
- self._z_content_length = len(self._z_content)
+ # At present, we have 2 integers for the compressed and uncompressed
+ # content. In base10 (ascii) 14 bytes can represent > 1TB, so to avoid
+ # checking too far, cap the search to 14 bytes.
+ pos2 = bytes.index('\n', pos, pos + 14)
+ self._z_content_length = int(bytes[pos:pos2])
+ pos = pos2 + 1
+ pos2 = bytes.index('\n', pos, pos + 14)
+ self._content_length = int(bytes[pos:pos2])
+ pos = pos2 + 1
+ assert len(bytes) == (pos + self._z_content_length)
+ self._z_content = bytes[pos:]
+ assert len(self._z_content) == self._z_content_length
@classmethod
def from_bytes(cls, bytes):
@@ -331,9 +275,7 @@
out._compressor_name = 'lzma'
else:
raise ValueError('unknown compressor: %r' % (bytes,))
- out._parse_bytes(bytes)
- if not _NO_LABELS:
- out._parse_header()
+ out._parse_bytes(bytes, 6)
return out
def extract(self, key, start, end, sha1=None):
@@ -392,66 +334,24 @@
self._content_length = len(content)
self._content = content
self._z_content = None
- self._z_header_length = None
def to_bytes(self):
"""Encode the information into a byte stream."""
compress = zlib.compress
if _USE_LZMA:
compress = pylzma.compress
- chunks = []
- for key in sorted(self._entries):
- entry = self._entries[key]
- chunk = ('key:%s\n'
- 'sha1:%s\n'
- 'type:%s\n'
- 'start:%s\n'
- 'length:%s\n'
- '\n'
- ) % ('\x00'.join(entry.key),
- entry.sha1,
- entry.type,
- entry.start,
- entry.length,
- )
- chunks.append(chunk)
- bytes = ''.join(chunks)
- info_len = len(bytes)
- z_header_bytes = compress(bytes)
- del bytes, chunks
- z_header_len = len(z_header_bytes)
- # TODO: we may want to have the header compressed in the same chain
- # as the data, or we may not, evaulate it
- # having them compressed together is probably a win for
- # revisions and the 'inv' portion of chk inventories. As the
- # label in the header is duplicated in the text.
- # For chk pages and real bytes, I would guess this is not
- # true.
- if _NO_LABELS:
- z_header_bytes = ''
- z_header_len = 0
- info_len = 0
- if self._z_content is not None:
- content_len = self._content_length
- z_content_len = self._z_content_length
- z_content_bytes = self._z_content
- else:
+ if self._z_content is None:
assert self._content is not None
- content_len = self._content_length
- z_content_bytes = compress(self._content)
- self._z_content = z_content_bytes
- z_content_len = len(z_content_bytes)
- self._z_content_length = z_content_len
+ self._z_content = compress(self._content)
+ self._z_content_length = len(self._z_content)
if _USE_LZMA:
header = self.GCB_LZ_HEADER
else:
header = self.GCB_HEADER
chunks = [header,
- '%d\n%d\n%d\n%d\n' % (z_header_len, info_len,
- z_content_len, content_len)
+ '%d\n%d\n' % (self._z_content_length, self._content_length),
+ self._z_content,
]
- chunks.append(z_header_bytes)
- chunks.append(z_content_bytes)
return ''.join(chunks)
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-03-27 16:10:03 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-03-27 16:56:44 +0000
@@ -229,7 +229,7 @@
def test_from_minimal_bytes(self):
block = groupcompress.GroupCompressBlock.from_bytes(
- 'gcb1z\n0\n0\n0\n0\n')
+ 'gcb1z\n0\n0\n')
self.assertIsInstance(block, groupcompress.GroupCompressBlock)
self.assertEqual({}, block._entries)
self.assertIs(None, block._content)
@@ -239,70 +239,21 @@
self.assertEqual('', block._z_content)
block._ensure_content() # Ensure content is safe to call 2x
- def test_from_bytes_with_labels(self):
- header = ('key:bing\n'
- 'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
- 'type:fulltext\n'
- 'start:100\n'
- 'length:100\n'
- '\n'
- 'key:foo\x00bar\n'
- 'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
- 'type:fulltext\n'
- 'start:0\n'
- 'length:100\n'
- '\n')
- z_header = zlib.compress(header)
+ def test_from_bytes(self):
content = ('a tiny bit of content\n')
z_content = zlib.compress(content)
z_bytes = (
'gcb1z\n' # group compress block v1 plain
- '%d\n' # Length of zlib bytes
- '%d\n' # Length of all meta-info
'%d\n' # Length of compressed content
'%d\n' # Length of uncompressed content
- '%s' # Compressed header
'%s' # Compressed content
- ) % (len(z_header), len(header),
- len(z_content), len(content),
- z_header, z_content)
+ ) % (len(z_content), len(content), z_content)
block = groupcompress.GroupCompressBlock.from_bytes(
z_bytes)
- block._parse_header()
- self.assertIsInstance(block, groupcompress.GroupCompressBlock)
- self.assertEqual([('bing',), ('foo', 'bar')], sorted(block._entries))
- bing = block._entries[('bing',)]
- self.assertEqual(('bing',), bing.key)
- self.assertEqual('fulltext', bing.type)
- self.assertEqual('abcd'*10, bing.sha1)
- self.assertEqual(100, bing.start)
- self.assertEqual(100, bing.length)
- foobar = block._entries[('foo', 'bar')]
- self.assertEqual(('foo', 'bar'), foobar.key)
- self.assertEqual('fulltext', foobar.type)
- self.assertEqual('abcd'*10, foobar.sha1)
- self.assertEqual(0, foobar.start)
- self.assertEqual(100, foobar.length)
self.assertEqual(z_content, block._z_content)
self.assertIs(None, block._content)
- block._ensure_content()
- self.assertEqual(z_content, block._z_content)
- self.assertEqual(content, block._content)
-
- def test_from_old_bytes(self):
- # Backwards compatibility, with groups that didn't define content length
- content = ('a tiny bit of content\n')
- z_content = zlib.compress(content)
- z_bytes = (
- 'gcb1z\n' # group compress block v1 plain
- '0\n' # Length of zlib bytes
- '0\n' # Length of all meta-info
- '' # Compressed header
- '%s' # Compressed content
- ) % (z_content)
- block = groupcompress.GroupCompressBlock.from_bytes(
- z_bytes)
- self.assertIsInstance(block, groupcompress.GroupCompressBlock)
+ self.assertEqual(len(z_content), block._z_content_length)
+ self.assertEqual(len(content), block._content_length)
block._ensure_content()
self.assertEqual(z_content, block._z_content)
self.assertEqual(content, block._content)
@@ -318,38 +269,23 @@
self.assertEqual(100, e.length)
def test_to_bytes(self):
- no_labels = groupcompress._NO_LABELS
- def reset():
- groupcompress._NO_LABELS = no_labels
- self.addCleanup(reset)
- groupcompress._NO_LABELS = False
+ content = ('this is some content\n'
+ 'this content will be compressed\n')
gcb = groupcompress.GroupCompressBlock()
gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
gcb.add_entry(('bing',), 'fulltext', 'abcd'*10, 100, 100)
- gcb.set_content('this is some content\n'
- 'this content will be compressed\n')
+ gcb.set_content(content)
bytes = gcb.to_bytes()
+ self.assertEqual(gcb._z_content_length, len(gcb._z_content))
+ self.assertEqual(gcb._content_length, len(content))
expected_header =('gcb1z\n' # group compress block v1 zlib
- '76\n' # Length of compressed bytes
- '183\n' # Length of uncompressed meta-info
- '50\n' # Length of compressed content
- '53\n' # Length of uncompressed content
- )
+ '%d\n' # Length of compressed content
+ '%d\n' # Length of uncompressed content
+ ) % (gcb._z_content_length, gcb._content_length)
self.assertStartsWith(bytes, expected_header)
remaining_bytes = bytes[len(expected_header):]
raw_bytes = zlib.decompress(remaining_bytes)
- self.assertEqualDiff('key:bing\n'
- 'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
- 'type:fulltext\n'
- 'start:100\n'
- 'length:100\n'
- '\n'
- 'key:foo\x00bar\n'
- 'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
- 'type:fulltext\n'
- 'start:0\n'
- 'length:100\n'
- '\n', raw_bytes)
+ self.assertEqual(content, raw_bytes)
def test_partial_decomp(self):
content_chunks = []