Rev 2650: When extracting just text, use functions that ignore annotations. in http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/faster_knit_extract
John Arbash Meinel
john at arbash-meinel.com
Tue Jul 24 00:29:08 BST 2007
At http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/faster_knit_extract
------------------------------------------------------------
revno: 2650
revision-id: john at arbash-meinel.com-20070723232806-n0hcfp5c9zbyehe2
parent: pqm at pqm.ubuntu.com-20070723155512-ckelzx8u6m5vxyja
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: faster_knit_extract
timestamp: Mon 2007-07-23 18:28:06 -0500
message:
When extracting just text, use functions that ignore annotations.
Seems to improve 'bzr checkout' time by about 10%
modified:
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
-------------- next part --------------
=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py 2007-07-20 12:56:33 +0000
+++ b/bzrlib/knit.py 2007-07-23 23:28:06 +0000
@@ -226,11 +226,6 @@
lines = iter(lines)
next = lines.next
- cache = {}
- def cache_and_return(line):
- origin, text = line.split(' ', 1)
- return cache.setdefault(origin, origin), text
-
# walk through the lines parsing.
for header in lines:
start, end, count = [int(n) for n in header.split(',')]
@@ -238,6 +233,27 @@
result.append((start, end, count, contents))
return result
+ def parse_line_delta_no_annotations(self, lines):
+ """Convert the delta lines into real lines, but ignore annotations.
+
+ line delta is in the form of:
+ intstart intend intcount
+ 1..count lines:
+ revid(utf8) newline\n
+ internal representation is
+ (start, end, count, [1..count lines])
+ """
+ result = []
+ lines = iter(lines)
+ next = lines.next
+
+ # walk through the lines parsing.
+ for header in lines:
+ start, end, count = [int(n) for n in header.split(',')]
+ contents = [next().split(' ', 1)[1] for i in xrange(count)]
+ result.append((start, end, count, contents))
+ return result
+
def get_fulltext_content(self, lines):
"""Extract just the content lines from a fulltext."""
return (line.split(' ', 1)[1] for line in lines)
@@ -307,6 +323,18 @@
def parse_line_delta(self, lines, version_id):
return list(self.parse_line_delta_iter(lines, version_id))
+ def parse_line_delta_no_annotations(self, lines):
+ cur = 0
+ num_lines = len(lines)
+ result = []
+ while cur < num_lines:
+ header = lines[cur]
+ cur += 1
+ start, end, c = [int(n) for n in header.split(',')]
+ result.append((start, end, c, lines[cur:cur+c]))
+ cur += c
+ return result
+
def get_fulltext_content(self, lines):
"""Extract just the content lines from a fulltext."""
return iter(lines)
@@ -845,7 +873,7 @@
version_ids = [osutils.safe_revision_id(v) for v in version_ids]
for version_id in version_ids:
self.check_not_reserved_id(version_id)
- text_map, content_map = self._get_content_maps(version_ids)
+ text_map = self._get_text_map(version_ids)
return [text_map[v] for v in version_ids]
_get_lf_split_line_list = get_line_list
@@ -905,6 +933,64 @@
text_map[version_id] = text
return text_map, final_content
+ def _get_text_map(self, version_ids):
+ """Produce a map of version ids to their text lines.
+
+ :param version_ids: A list of version ids to extract.
+ :return: a text map containing version_id => [lines] representing that
+ version.
+ """
+ for version_id in version_ids:
+ if not self.has_version(version_id):
+ raise RevisionNotPresent(version_id, self.filename)
+ record_map = self._get_record_map(version_ids)
+
+ # This is used to handle lines with no trailing \n.
+ # The delta objects are built up assuming everything has '\n', and the
+ # final output text is just stripped when there is no newline.
+ no_newlines = []
+ raw_text_map = {}
+ text_map = {}
+ for version_id in version_ids:
+ components = []
+ cursor = version_id
+ while cursor is not None:
+ method, data, digest, next = record_map[cursor]
+ components.append((cursor, method, data, digest))
+ if cursor in raw_text_map:
+ break
+ cursor = next
+
+ raw_text = None
+ for component_id, method, data, digest in reversed(components):
+ if component_id in raw_text_map:
+ raw_text = raw_text_map[component_id]
+ else:
+ if method == 'fulltext':
+ assert raw_text is None
+ raw_text = list(self.factory.get_fulltext_content(data))
+ elif method == 'line-delta':
+ assert raw_text is not None
+ delta = self.factory.parse_line_delta_no_annotations(data)
+ raw_text = self._apply_delta(raw_text, delta)
+ raw_text_map[component_id] = raw_text
+
+ if 'no-eol' in self._index.get_options(version_id):
+ text = raw_text[:] # Don't change the cached text
+ assert len(raw_text) > 0, ('We have no-eol on a text that'
+ 'has no actual lines for version_id %s in %s'
+ % (version_id, self))
+ text[-1] = raw_text[-1].rstrip('\n')
+ else:
+ text = raw_text
+ text_map[version_id] = text
+
+ # digest here is the digest from the last applied component.
+ if sha_strings(text) != digest:
+ raise KnitCorrupt(self.filename,
+ 'sha-1 does not match %s' % version_id)
+ return text_map
+
def iter_lines_added_or_present_in_versions(self, version_ids=None,
pb=None):
"""See VersionedFile.iter_lines_added_or_present_in_versions()."""
More information about the bazaar-commits
mailing list