Rev 2669: Merge cleanups from python version in http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract

Fri Jul 27 20:59:08 BST 2007

At http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract

------------------------------------------------------------
revno: 2669
revision-id: john at arbash-meinel.com-20070727195903-u3jltnzqul6wyimf
parent: john at arbash-meinel.com-20070725201134-ebb1m46jn37o6qb6
parent: john at arbash-meinel.com-20070727155344-d2u7q1rynmcbi2rk
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: pyrex_knit_extract
timestamp: Fri 2007-07-27 14:59:03 -0500
message:
  Merge cleanups from python version
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
    ------------------------------------------------------------
    revno: 2653.1.2
    revision-id: john at arbash-meinel.com-20070727155344-d2u7q1rynmcbi2rk
    parent: john at arbash-meinel.com-20070727144311-8tf47rvp159k4yhl
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: faster_knit_extract
    timestamp: Fri 2007-07-27 10:53:44 -0500
    message:
      Special casing just get_lines() and using non-fully optimized python
      functions improves the 'bzr checkout' time approx 14%
    modified:
      NEWS                           NEWS-20050323055033-4e00b5db738777ff
    ------------------------------------------------------------
    revno: 2653.1.1
    revision-id: john at arbash-meinel.com-20070727144311-8tf47rvp159k4yhl
    parent: john at arbash-meinel.com-20070724133411-8b9q81pi6c0z9tvm
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: faster_knit_extract
    timestamp: Fri 2007-07-27 09:43:11 -0500
    message:
      Pull over the improvements from the Pyrex branch.
      Basically, streamline everything down, so we only special case 'get_lines()'
      Also, move the functions in, so that we are ready for the pyrex improvements
      but can get some of the benefit without actually having pyrex yet.
    modified:
      bzrlib/knit.py                 knit.py-20051212171256-f056ac8f0fbe1bd9
-------------- next part --------------
=== modified file 'NEWS'

--- a/NEWS	2007-07-24 13:34:11 +0000
+++ b/NEWS	2007-07-27 15:53:44 +0000
@@ -81,7 +81,7 @@
       entries. (Ian Clatworthy)
 
     * When extracting just texts from knits, don't process the
-      annotations. This improves 'bzr checkout' time by about 10%.
+      annotations. This improves 'bzr checkout' time by about 14%.
       (John Arbash Meinel)
 
   LIBRARY API BREAKS:

=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2007-07-25 20:11:34 +0000
+++ b/bzrlib/knit.py	2007-07-27 19:59:03 +0000
@@ -249,27 +249,6 @@
             result.append((start, end, count, contents))
         return result
 
-    def parse_line_delta_no_annotations(self, lines):
-        """Convert the delta lines into real lines, but ignore annotations.
-
-        line delta is in the form of:
-        intstart intend intcount
-        1..count lines:
-        revid(utf8) newline\n
-        internal representation is
-        (start, end, count, [1..count line)])
-        """
-        result = []
-        lines = iter(lines)
-        next = lines.next
-
-        # walk through the lines parsing.
-        for header in lines:
-            start, end, count = [int(n) for n in header.split(',')]
-            contents = [next().split(' ', 1)[1] for i in xrange(count)]
-            result.append((start, end, count, contents))
-        return result
-
     def get_fulltext_content(self, lines):
         """Extract just the content lines from a fulltext."""
         return (line.split(' ', 1)[1] for line in lines)
@@ -339,18 +318,6 @@
     def parse_line_delta(self, lines, version_id):
         return list(self.parse_line_delta_iter(lines, version_id))
 
-    def parse_line_delta_no_annotations(self, lines):
-        cur = 0
-        num_lines = len(lines)
-        result = []
-        while cur < num_lines:
-            header = lines[cur]
-            cur += 1
-            start, end, c = [int(n) for n in header.split(',')]
-            result.append((start, end, c, lines[cur:cur+c]))
-            cur += c
-        return result
-
     def get_fulltext_content(self, lines):
         """Extract just the content lines from a fulltext."""
         return iter(lines)
@@ -929,7 +896,7 @@
         version_ids = [osutils.safe_revision_id(v) for v in version_ids]
         for version_id in version_ids:
             self.check_not_reserved_id(version_id)
-        text_map = self._get_text_map(version_ids)
+        text_map, content_map = self._get_content_maps(version_ids)
         return [text_map[v] for v in version_ids]
 
     _get_lf_split_line_list = get_line_list
@@ -989,77 +956,6 @@
             text_map[version_id] = text 
         return text_map, final_content 
 
-    def _get_text_map(self, version_ids):
-        """Produce maps of text and KnitContents
-
-        :param version_ids: A list of version ids to extract.
-        :return: a text map containing version_id => [lines] representing that
-            version.
-        """
-        for version_id in version_ids:
-            if not self.has_version(version_id):
-                raise RevisionNotPresent(version_id, self.filename)
-
-        position_map = self._get_components_positions(version_ids)
-        # c = component_id, m = method, p = position, s = size, n = next
-        records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]
-        record_map = {}
-        for component_id, data in self._data._read_raw_data_iter(records):
-            method, position, size, next = position_map[component_id]
-            record_map[component_id] = method, data, next
-
-        # This is used to handle lines with no trailing \n.
-        # The delta objects are built up assuming everything has '\n', and the
-        # final output text is just stripped when there is no newline.
-        raw_text_map = {}
-        text_map = {}
-        digest_map = {}
-        for version_id in version_ids:
-            components = []
-            cursor = version_id
-            while cursor is not None:
-                method, data, next = record_map[cursor]
-                components.append((cursor, method, data))
-                if cursor in raw_text_map:
-                    break
-                cursor = next
-
-            raw_text = None
-            for component_id, method, data in reversed(components):
-                if component_id in raw_text_map:
-                    raw_text = raw_text_map[component_id]
-                else:
-                    if method == 'fulltext':
-                        assert raw_text is None
-                        digest, raw_text =\
-                            self.factory.parse_gzip_fulltext_no_annotation(
-                                self._data, component_id, data)
-                        digest_map[component_id] = digest
-                    elif method == 'line-delta':
-                        assert raw_text is not None
-                        digest, delta =\
-                            self.factory.parse_gzip_line_delta_no_annotation(
-                                self._data, component_id, data)
-                        digest_map[component_id] = digest
-                        raw_text = self._apply_delta(raw_text, delta)
-                    raw_text_map[component_id] = raw_text
-
-            if 'no-eol' in self._index.get_options(version_id):
-                text = raw_text[:] # Don't change the cached text
-                assert len(raw_text) > 0, ('We have no-eol on a text that'
-                    'has no actual lines for version_id %s in %s'
-                    % (version_id, self))
-                text[-1] = raw_text[-1].rstrip('\n')
-            else:
-                text = raw_text
-            text_map[version_id] = text
-
-            # digest here is the digest from the last applied component.
-            if sha_strings(text) != digest_map[version_id]:
-                raise KnitCorrupt(self.filename,
-                                  'sha-1 does not match %s' % version_id)
-        return text_map
-
     def iter_lines_added_or_present_in_versions(self, version_ids=None, 
                                                 pb=None):
         """See VersionedFile.iter_lines_added_or_present_in_versions()."""
@@ -1461,6 +1357,28 @@
     def get_options(self, version_id):
         return self._cache[version_id][1]
 
+    def get_build_chain(self, version_id):
+        """Get the chain that we need to get a fulltext for this version.
+
+        :return: [(version_id, start, size)] needed to extract this text.
+        """
+        cursor = version_id
+        chain = []
+        while cursor is not None:
+            v_id, options, pos, size, parents, idx = self._cache[cursor]
+            chain.append((v_id, pos, size))
+            if 'fulltext' in options:
+                cursor = None
+            elif 'line-delta' not in options:
+                raise errors.KnitIndexUnknownMethod(self._full_path(), options)
+            else:
+                # Line-delta is based on the first present parent
+                # Do we have to handle missing the primary parent?
+                cursor = parents[0]
+        # Put the fulltext first
+        chain.reverse()
+        return chain
+
     def get_parents(self, version_id):
         """Return parents of specified version ignoring ghosts."""
         return [parent for parent in self._cache[version_id][4] 
@@ -1603,24 +1521,14 @@
         """
         df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
         try:
-            rec = self._check_header(version_id, df.readline())
+            rec = _check_record_header(version_id, df.readline(),
+                                       self._filename)
         except Exception, e:
             raise KnitCorrupt(self._filename,
                               "While reading {%s} got %s(%s)"
                               % (version_id, e.__class__.__name__, str(e)))
         return df, rec
 
-    def _check_header(self, version_id, line):
-        rec = line.split()
-        if len(rec) != 4:
-            raise KnitCorrupt(self._filename,
-                              'unexpected number of elements in record header')
-        if rec[1] != version_id:
-            raise KnitCorrupt(self._filename,
-                              'unexpected version, wanted %r, got %r'
-                              % (version_id, rec[1]))
-        return rec
-
     def _parse_record(self, version_id, data):
         try:
             record_contents = _extract_lines_from_gzip(data)
@@ -1629,7 +1537,7 @@
                               "While reading {%s} got %s(%s)"
                               % (version_id, e.__class__.__name__, str(e)))
         header = record_contents.pop(0)
-        rec = self._check_header(version_id, header)
+        rec = _check_record_header(version_id, header, self._filename)
 
         last_line = record_contents.pop()
         if len(record_contents) != int(rec[2]):
@@ -1693,13 +1601,15 @@
             # Skip records we have alread seen
             yielded_records = set()
             needed_records = set()
-            for version_id, pos, size in records:
-                if version_id in self._cache:
-                    if version_id in yielded_records:
+            for record in records:
+                if record[0] in self._cache:
+                    if record[0] in yielded_records:
                         continue
-                    yielded_records.add(version_id)
-                    data = self._cache[version_id]
-                    yield (version_id, data)
+                    yielded_records.add(record[0])
+                    data = self._cache[record[0]]
+                    yield (record[0], data)
+                else:
+                    needed_records.add(record)
                 else:
                     needed_records.add((version_id, pos, size))
             needed_records = sorted(needed_records, key=operator.itemgetter(1))
@@ -2087,6 +1997,20 @@
         return besti, bestj, bestsize
 
 
+def _check_record_header(version_id, line, kd_name):
+    """Parse line as a _KnitData header line"""
+    rec = line.split()
+    if len(rec) != 4:
+        raise errors.KnitCorrupt(kd_name,
+            'unexpected number of elements in record header'
+            ' %r' % (line,))
+    if rec[1] != version_id:
+        raise errors.KnitCorrupt(kd_name,
+            'unexpected version, wanted %r, got %r'
+            % (version_id, rec[1]))
+    return rec
+
+
 try:
     from bzrlib._knit_helpers_c import (
         _load_data_c as _load_data,