Rev 4196: Start a pure python version. in file:///home/vila/src/bzr/experimental/gc-py-bbc/

Vincent Ladeuil v.ladeuil+lp at free.fr
Wed Mar 25 17:20:33 GMT 2009


At file:///home/vila/src/bzr/experimental/gc-py-bbc/

------------------------------------------------------------
revno: 4196
revision-id: v.ladeuil+lp at free.fr-20090325172033-cfwtfbl3kj2atvyw
parent: v.ladeuil+lp at free.fr-20090324093847-zwbjrv8awx73feiz
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: groupcompress-python-only
timestamp: Wed 2009-03-25 18:20:33 +0100
message:
  Start a pure python version.
  
  * bzrlib/groupcompress.py:
  (_CommonGroupCompressor): Factor out stuff common between pyrex
  and python versions.
  (PythonGroupCompressor): Starts a python version relying on the
  _groupcompress_py.py companion module.
  (PyrexGroupCompressor.__init__): Factor some stuff out to base
  class, fix some imports.
  
  * bzrlib/_groupcompress_py.py: 
  Start implementing the python-specific stuff outside the
  groupcompress.py file to avoid importing useless stuff.
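  
  The net effect is the usual compiled-extension fallback at import
  time; in outline (this just mirrors the hunk at the end of the
  groupcompress.py diff below, it is not additional API):
  
      try:
          from bzrlib._groupcompress_pyx import apply_delta, DeltaIndex
          GroupCompressor = PyrexGroupCompressor
      except ImportError:
          from bzrlib._groupcompress_py import apply_delta, EquivalenceTable
          GroupCompressor = PythonGroupCompressor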
-------------- next part --------------
=== modified file 'BRANCH.TODO'
--- a/BRANCH.TODO	2009-03-23 21:50:37 +0000
+++ b/BRANCH.TODO	2009-03-25 17:20:33 +0000
@@ -19,3 +19,6 @@
  * Investigate sha1 overhead. Currently when doing 'bzr pack' we sha once on
    extraction, and then one more time on insertion. Removing both calls saves
    6s out of 32s for 'bzr pack' (and 8s=>5.5s for 'repository-details')
+
+ * don't require the pyx.c version or ~600 tests error out;
+   write a pure python version instead

=== added file 'bzrlib/_groupcompress_py.py'
--- a/bzrlib/_groupcompress_py.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/_groupcompress_py.py	2009-03-25 17:20:33 +0000
@@ -0,0 +1,173 @@
+# Copyright (C) 2009 Canonical Limited.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+#
+
+"""Python version of compiled extensions for doing compression.
+
+We separate this implementation from groupcompress.py to avoid importing
+useless stuff.
+"""
+
+### v imported from gc plugin at revno30
+class EquivalenceTable(object):
+    """This class tracks equivalencies between lists of hashable objects.
+
+    :ivar lines: The 'static' lines that will be preserved between runs.
+    :ivar _matching_lines: A dict of {line:[matching offsets]}
+    """
+
+    def __init__(self, lines):
+        self.lines = lines
+        self._right_lines = None
+        # For each line in 'left' give the offset to the other lines which
+        # match it.
+        self._generate_matching_lines()
+
+    def _generate_matching_lines(self):
+        matches = {}
+        for idx, line in enumerate(self.lines):
+            matches.setdefault(line, []).append(idx)
+        self._matching_lines = matches
+
+    def _update_matching_lines(self, new_lines, index):
+        matches = self._matching_lines
+        start_idx = len(self.lines)
+        for idx, do_index in enumerate(index):
+            if not do_index:
+                continue
+            matches.setdefault(new_lines[idx], []).append(start_idx + idx)
+
+    def get_matches(self, line):
+        """Return the lines which match the line in right."""
+        try:
+            return self._matching_lines[line]
+        except KeyError:
+            return None
+
+    def _get_matching_lines(self):
+        """Return a dictionary showing matching lines."""
+        matching = {}
+        for line in self.lines:
+            matching[line] = self.get_matches(line)
+        return matching
+
+    def get_idx_matches(self, right_idx):
+        """Return the left lines matching the right line at the given offset."""
+        line = self._right_lines[right_idx]
+        try:
+            return self._matching_lines[line]
+        except KeyError:
+            return None
+
+    def extend_lines(self, lines, index):
+        """Add more lines to the left-lines list.
+
+        :param lines: A list of lines to add
+        :param index: A True/False value for each line, defining whether it
+            should be indexed.
+        """
+        self._update_matching_lines(lines, index)
+        self.lines.extend(lines)
+
+    def set_right_lines(self, lines):
+        """Set the lines we will be matching against."""
+        self._right_lines = lines
+
+
+def _get_longest_match(equivalence_table, pos, max_pos, locations):
+    """Get the longest possible match for the current position."""
+    range_start = pos
+    range_len = 0
+    copy_ends = None
+    while pos < max_pos:
+        if locations is None:
+            locations = equivalence_table.get_idx_matches(pos)
+        if locations is None:
+            # No more matches, just return whatever we have, but we know that
+            # this last position is not going to match anything
+            pos += 1
+            break
+        else:
+            if copy_ends is None:
+                # We are starting a new range
+                copy_ends = [loc + 1 for loc in locations]
+                range_len = 1
+                locations = None # Consumed
+            else:
+                # We are currently in the middle of a match
+                next_locations = set(copy_ends).intersection(locations)
+                if len(next_locations):
+                    # range continues
+                    copy_ends = [loc + 1 for loc in next_locations]
+                    range_len += 1
+                    locations = None # Consumed
+                else:
+                    # But we are done with this match, we should be
+                    # starting a new one, though. We will pass back 'locations'
+                    # so that we don't have to do another lookup.
+                    break
+        pos += 1
+    if copy_ends is None:
+        return None, pos, locations
+    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
+
+
+def parse(line_list):
+    result = []
+    lines = iter(line_list)
+    next = lines.next
+    label_line = next()
+    sha1_line = next()
+    if (not label_line.startswith('label: ') or
+        not sha1_line.startswith('sha1: ')):
+        raise AssertionError("bad text record %r" % line_list)
+    label = tuple(label_line[7:-1].split('\x00'))
+    sha1 = sha1_line[6:-1]
+    for header in lines:
+        op = header[0]
+        numbers = [int(n) for n in header[2:].split(',')]
+        if op == 'c':
+            result.append((op, numbers[0], numbers[1], None))
+        else:
+            contents = [next() for i in xrange(numbers[0])]
+            result.append((op, None, numbers[0], contents))
+    ## return result
+    return label, sha1, result
+
+
+def apply_delta(basis, delta):
+    """Apply delta to this object to become new_version_id."""
+    lines = []
+    last_offset = 0
+    # eq ranges occur where gaps occur
+    # start and count refer to byte offsets and lengths in basis
+    for op, start, count, delta_lines in delta:
+        if op == 'c':
+            lines.append(basis[start:start+count])
+        else:
+            lines.extend(delta_lines)
+    trim_encoding_newline(lines)
+    return lines
+
+
+def trim_encoding_newline(lines):
+    if lines[-1] == '\n':
+        del lines[-1]
+    else:
+        lines[-1] = lines[-1][:-1]
+
+
+### ^ imported from gc plugin at revno30
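
To make the delta text format above concrete, here is a hypothetical
round-trip through parse() and apply_delta() (the label, sha1 and text
values are invented for illustration; the copy offsets are byte offsets
into the basis, as emitted by flush_range() in groupcompress.py):

    from bzrlib._groupcompress_py import parse, apply_delta

    basis = 'hello world\nsecond line\n'
    delta_lines = [
        'label: file-id\x00rev-id\n',
        'sha1: 0123456789abcdef0123456789abcdef01234567\n',
        'c,0,12\n',           # copy 12 bytes starting at byte 0 of basis
        'i,1\n',              # insert 1 literal line, which follows
        'a brand new line\n',
        ]
    label, sha1, ops = parse(delta_lines)
    # label == ('file-id', 'rev-id'); ops == [('c', 0, 12, None),
    #                                         ('i', None, 1, [...])]
    new_lines = apply_delta(basis, ops)
    # == ['hello world\n', 'a brand new line']; the trailing newline is
    # trimmed by trim_encoding_newline(), undoing the padding added by
    # compress().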

=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-24 01:37:26 +0000
+++ b/bzrlib/groupcompress.py	2009-03-25 17:20:33 +0000
@@ -18,7 +18,6 @@
 
 from itertools import izip
 from cStringIO import StringIO
-import struct
 import time
 import zlib
 try:
@@ -39,11 +38,6 @@
     )
 from bzrlib.graph import Graph
 from bzrlib.knit import _DirectPackAccess
-from bzrlib.osutils import (
-    contains_whitespace,
-    sha_string,
-    split_lines,
-    )
 from bzrlib.btree_index import BTreeBuilder
 from bzrlib.lru_cache import LRUSizeCache
 from bzrlib.tsort import topo_sort
@@ -368,7 +362,7 @@
         if c == 'f':
             bytes = content
         elif c == 'd':
-            bytes = _groupcompress_pyx.apply_delta(self._content, content)
+            bytes = apply_delta(self._content, content)
         return bytes
 
     def add_entry(self, key, type, sha1, start, length):
@@ -733,7 +727,242 @@
     return manager.get_record_stream()
 
 
-class GroupCompressor(object):
+class _CommonGroupCompressor(object):
+
+    def __init__(self):
+        """Create a GroupCompressor."""
+        # Consider seeding the lines with some sort of GC Start flag, or
+        # putting it as part of the output stream, rather than in the
+        # compressed bytes.
+        self.lines = []
+        self.endpoint = 0
+        self.input_bytes = 0
+        self.labels_deltas = {}
+
+    def ratio(self):
+        """Return the overall compression ratio."""
+        return float(self.input_bytes) / float(self.endpoint)
+
+
+class PythonGroupCompressor(_CommonGroupCompressor):
+
+    def __init__(self, delta=True):
+        """Create a GroupCompressor.
+
+        :param delta: If False, do not compress records.
+        """
+        super(PythonGroupCompressor, self).__init__()
+        self._delta = delta
+        self.line_offsets = []
+        self.line_locations = EquivalenceTable([])
+        self.lines = self.line_locations.lines
+        self._present_prefixes = set()
+
+    def get_matching_blocks(self, lines, soft=False):
+        """Return the ranges in lines which match self.lines.
+
+        :param lines: lines to compress
+        :return: A list of (old_start, new_start, length) tuples which reflect
+            a region in self.lines that is present in lines.  The last element
+            of the list is always (old_len, new_len, 0) to provide an end point
+            for generating instructions from the matching blocks list.
+        """
+        result = []
+        pos = 0
+        line_locations = self.line_locations
+        line_locations.set_right_lines(lines)
+        # We either copy a range (while there are reusable lines) or we
+        # insert new lines. To find reusable lines we traverse the new lines
+        # and look them up in the equivalence table.
+        locations = None
+        max_pos = len(lines)
+        result_append = result.append
+        min_match_bytes = 10
+        if soft:
+            min_match_bytes = 200
+        while pos < max_pos:
+            block, pos, locations = _get_longest_match(line_locations, pos,
+                                                       max_pos, locations)
+            if block is not None:
+                # Check to see if we are matching fewer than min_match_bytes
+                # characters, in which case the match is turned into a simple
+                # 'insert' rather than a copy. If the block covers at least
+                # min_match_bytes lines, it necessarily covers at least that
+                # many characters, so we only count bytes for short blocks.
+                if block[-1] < min_match_bytes:
+                    # This block may be a 'short' block, check
+                    old_start, new_start, range_len = block
+                    matched_bytes = sum(map(len,
+                        lines[new_start:new_start + range_len]))
+                    if matched_bytes < min_match_bytes:
+                        block = None
+            if block is not None:
+                result_append(block)
+        result_append((len(self.lines), len(lines), 0))
+        return result
+
+    # FIXME: implement nostore_sha
+    def compress(self, key, lines, expected_sha, nostore_sha=False, soft=False):
+        """Compress lines with label key.
+
+        :param key: A key tuple. It is stored in the output
+            for identification of the text during decompression. If the last
+            element is 'None' it is replaced with the sha1 of the text -
+            e.g. sha1:xxxxxxx.
+        :param lines: The lines to be compressed. Must be split
+            on \n, with the \n preserved.
+        :param expected_sha: If non-None, the sha the lines are believed to
+            have. During compression the sha is calculated; a mismatch will
+            cause an error.
+        :param soft: Do a 'soft' compression. This means that we require
+            larger matching ranges before they are considered for a copy command.
+        :return: The sha1 of lines, and the number of bytes accumulated in
+            the group output so far.
+        """
+        lines = osutils.split_lines(lines)
+        sha1 = osutils.sha_strings(lines)
+        if key[-1] is None:
+            key = key[:-1] + ('sha1:' + sha1,)
+        label = '\x00'.join(key)
+        ## new_lines = []
+        new_lines = ['label: %s\n' % label,
+                     'sha1: %s\n' % sha1,
+                    ]
+        ## index_lines = []
+        index_lines = [False, False]
+        # setup good encoding for trailing \n support.
+        if not lines or lines[-1].endswith('\n'):
+            lines.append('\n')
+        else:
+            lines[-1] = lines[-1] + '\n'
+        pos = 0
+        range_len = 0
+        range_start = 0
+        flush_range = self.flush_range
+        copy_ends = None
+        blocks = self.get_matching_blocks(lines, soft=soft)
+        current_pos = 0
+        #copies_without_insertion = []
+        # We either copy a range (while there are reusable lines) or we
+        # insert new lines. To find reusable lines we traverse the matching
+        # blocks computed above.
+        for old_start, new_start, range_len in blocks:
+            if new_start != current_pos:
+                # if copies_without_insertion:
+                #     self.flush_multi(copies_without_insertion,
+                #                      lines, new_lines, index_lines)
+                #     copies_without_insertion = []
+                # non-matching region
+                flush_range(current_pos, None, new_start - current_pos,
+                    lines, new_lines, index_lines)
+            current_pos = new_start + range_len
+            if not range_len:
+                continue
+            # copies_without_insertion.append((new_start, old_start, range_len))
+            flush_range(new_start, old_start, range_len,
+                        lines, new_lines, index_lines)
+        # if copies_without_insertion:
+        #     self.flush_multi(copies_without_insertion,
+        #                      lines, new_lines, index_lines)
+        #     copies_without_insertion = []
+        start = self.endpoint # Keep it
+        delta_start = (self.endpoint, len(self.lines))
+        self.output_lines(new_lines, index_lines)
+        trim_encoding_newline(lines)
+        length = sum(map(len, lines))
+        self.input_bytes += length
+        delta_end = (self.endpoint, len(self.lines))
+        self.labels_deltas[key] = (delta_start, delta_end)
+        # FIXME: lot of guessing below
+        return sha1, start, self.endpoint, 'delta', length
+
+    def extract(self, key):
+        """Extract a key previously added to the compressor.
+        
+        :param key: The key to extract.
+        :return: An iterable over bytes and the sha1.
+        """
+        delta_details = self.labels_deltas[key]
+        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
+        label, sha1, delta = parse(delta_lines)
+        ## delta = parse(delta_lines)
+        if label != key:
+            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
+        # Perhaps we want to keep the line offsets too in memory at least?
+        chunks = apply_delta(''.join(self.lines), delta)
+        sha1 = osutils.sha_strings(chunks)
+        return chunks, sha1
+
+    def flush_multi(self, instructions, lines, new_lines, index_lines):
+        """Flush a bunch of different ranges out.
+
+        This should only be called with data that are "pure" copies.
+        """
+        flush_range = self.flush_range
+        if len(instructions) > 2:
+            # This is the number of lines to be copied
+            total_copy_range = sum(i[2] for i in instructions)
+            if len(instructions) > 0.5 * total_copy_range:
+                # We are copying N lines, but taking more than N/2
+                # copy instructions to do so. We will go ahead and expand this
+                # text so that other code is able to match against it
+                flush_range(instructions[0][0], None, total_copy_range,
+                            lines, new_lines, index_lines)
+                return
+        for ns, os, rl in instructions:
+            flush_range(ns, os, rl, lines, new_lines, index_lines)
+
+    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
+        insert_instruction = "i,%d\n" % range_len
+        if copy_start is not None:
+            # range stops, flush and start a new copy range
+            stop_byte = self.line_offsets[copy_start + range_len - 1]
+            if copy_start == 0:
+                start_byte = 0
+            else:
+                start_byte = self.line_offsets[copy_start - 1]
+            bytes = stop_byte - start_byte
+            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
+            if (bytes + len(insert_instruction) >
+                len(copy_control_instruction)):
+                new_lines.append(copy_control_instruction)
+                index_lines.append(False)
+                return
+        # not copying, or inserting is shorter than copying, so insert.
+        new_lines.append(insert_instruction)
+        new_lines.extend(lines[range_start:range_start+range_len])
+        index_lines.append(False)
+        index_lines.extend([copy_start is None]*range_len)
+
+    def flush(self):
+        # FIXME: ugly hack to masquerade ourselves as the pyrex version
+        class content(object):
+
+            def __init__(self, s):
+                self.s = s
+
+            def to_bytes(self):
+                return self.s
+
+        return content(zlib.compress(''.join(self.lines)))
+
+    def output_lines(self, new_lines, index_lines):
+        """Output some lines.
+
+        :param new_lines: The lines to output.
+        :param index_lines: A boolean flag for each line - when True, index
+            that line.
+        """
+        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
+        #                          if val and new_lines[idx] == '\n']
+        # if indexed_newlines:
+        #     import pdb; pdb.set_trace()
+        endpoint = self.endpoint
+        self.line_locations.extend_lines(new_lines, index_lines)
+        for line in new_lines:
+            endpoint += len(line)
+            self.line_offsets.append(endpoint)
+        self.endpoint = endpoint
+
+
+class PyrexGroupCompressor(_CommonGroupCompressor):
     """Produce a serialised group of compressed texts.
 
     It contains code very similar to SequenceMatcher because of having a similar
@@ -750,17 +979,10 @@
     """
 
     def __init__(self):
-        """Create a GroupCompressor."""
-        # Consider seeding the lines with some sort of GC Start flag, or
-        # putting it as part of the output stream, rather than in the
-        # compressed bytes.
-        self.lines = []
-        self.endpoint = 0
-        self.input_bytes = 0
+        super(PyrexGroupCompressor, self).__init__()
         self.num_keys = 0
-        self.labels_deltas = {}
         self._last = None
-        self._delta_index = _groupcompress_pyx.DeltaIndex()
+        self._delta_index = DeltaIndex()
         self._block = GroupCompressBlock()
 
     def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
@@ -883,9 +1105,8 @@
                                  ' claim %s != %s'
                                  % (len(stored_bytes),
                                     delta_len + 1 + offset))
-            bytes = _groupcompress_pyx.apply_delta(source,
-                                                   stored_bytes[offset + 1:])
-        bytes_sha1 = sha_string(bytes)
+            bytes = apply_delta(source, stored_bytes[offset + 1:])
+        bytes_sha1 = osutils.sha_string(bytes)
         if entry.sha1 != bytes_sha1:
             raise ValueError('Recorded sha1 != measured %s != %s'
                              % (entry.sha1, bytes_sha1))
@@ -920,10 +1141,6 @@
         self.endpoint = self._last[1]
         self._last = None
 
-    def ratio(self):
-        """Return the overall compression ratio."""
-        return float(self.input_bytes) / float(self.endpoint)
-
 
 def make_pack_factory(graph, delta, keylength):
     """Create a factory for creating a pack based groupcompress.
@@ -1075,7 +1292,7 @@
         """check that version_id and lines are safe to add."""
         version_id = key[-1]
         if version_id is not None:
-            if contains_whitespace(version_id):
+            if osutils.contains_whitespace(version_id):
                 raise errors.InvalidRevisionId(version_id, self)
         self.check_not_reserved_id(version_id)
         # TODO: If random_id==False and the key is already present, we should
@@ -1367,7 +1584,7 @@
                 result[record.key] = record.sha1
             else:
                 if record.storage_kind != 'absent':
-                    result[record.key] = sha_string(record.get_bytes_as(
+                    result[record.key] = osutils.sha_string(record.get_bytes_as(
                         'fulltext'))
         return result
 
@@ -1572,7 +1789,7 @@
             pb.update('Walking content', key_idx, total)
             if record.storage_kind == 'absent':
                 raise errors.RevisionNotPresent(key, self)
-            lines = split_lines(record.get_bytes_as('fulltext'))
+            lines = osutils.split_lines(record.get_bytes_as('fulltext'))
             for line in lines:
                 yield line, key
         pb.update('Walking content', total, total)
@@ -1766,6 +1983,17 @@
 
 
 try:
-    from bzrlib import _groupcompress_pyx
+    from bzrlib._groupcompress_pyx import (
+        apply_delta,
+        DeltaIndex,
+        )
+    GroupCompressor = PyrexGroupCompressor
 except ImportError:
-    pass
+    from bzrlib._groupcompress_py import (
+        apply_delta,
+        EquivalenceTable,
+        _get_longest_match,
+        trim_encoding_newline,
+        )
+    GroupCompressor = PythonGroupCompressor
+
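
As a quick sanity check of the new pure-python path, a compress/extract
round-trip would look roughly like this (a sketch against the signatures
in this revision; the key and text are invented, and note that compress()
currently calls osutils.split_lines() itself, so it takes the raw text):

    from bzrlib.groupcompress import PythonGroupCompressor

    compressor = PythonGroupCompressor()
    sha1, start, end, kind, length = compressor.compress(
        ('file-id', 'rev-id'), 'one\ntwo\n', None)
    lines, extracted_sha1 = compressor.extract(('file-id', 'rev-id'))
    # lines == ['one\n', 'two\n'] and extracted_sha1 == sha1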


