Rev 2669: Merge cleanups from python version in http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract
John Arbash Meinel
john at arbash-meinel.com
Fri Jul 27 21:08:11 BST 2007
At http://bzr.arbash-meinel.com/branches/bzr/0.19-dev/pyrex_knit_extract
------------------------------------------------------------
revno: 2669
revision-id: john at arbash-meinel.com-20070727200810-04uws4gejibr2lcc
parent: john at arbash-meinel.com-20070725201134-ebb1m46jn37o6qb6
parent: john at arbash-meinel.com-20070727155344-d2u7q1rynmcbi2rk
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: pyrex_knit_extract
timestamp: Fri 2007-07-27 15:08:10 -0500
message:
Merge cleanups from python version
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
------------------------------------------------------------
revno: 2653.1.2
revision-id: john at arbash-meinel.com-20070727155344-d2u7q1rynmcbi2rk
parent: john at arbash-meinel.com-20070727144311-8tf47rvp159k4yhl
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: faster_knit_extract
timestamp: Fri 2007-07-27 10:53:44 -0500
message:
Special-casing just get_lines() and using the not-yet-fully-optimized Python
functions improves 'bzr checkout' time by approximately 14%.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
------------------------------------------------------------
revno: 2653.1.1
revision-id: john at arbash-meinel.com-20070727144311-8tf47rvp159k4yhl
parent: john at arbash-meinel.com-20070724133411-8b9q81pi6c0z9tvm
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: faster_knit_extract
timestamp: Fri 2007-07-27 09:43:11 -0500
message:
Pull over the improvements from the Pyrex branch.
Basically, streamline everything down so that we only special-case 'get_lines()'.
Also, move the functions in so that we are ready for the Pyrex improvements,
but can get some of the benefit without actually having Pyrex yet (see the
sketch below).
modified:
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
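The streamlining described in the message above comes down to one observation:
annotated knit lines are stored as "<utf8-revid> <content>", so when only the
text is wanted, the revid token can simply be dropped instead of being parsed
into annotation tuples. A minimal sketch of that idea (the split(' ', 1)[1]
idiom is taken from the diff below; the function name is illustrative):

# Each annotated line is "<utf8-revid> <content line>\n"; plain text
# extraction just drops the leading revid token.
def extract_text_lines(annotated_lines):
    return [line.split(' ', 1)[1] for line in annotated_lines]

print(extract_text_lines(['rev-1 hello\n', 'rev-2 world\n']))
# ['hello\n', 'world\n']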
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS 2007-07-24 13:34:11 +0000
+++ b/NEWS 2007-07-27 15:53:44 +0000
@@ -81,7 +81,7 @@
entries. (Ian Clatworthy)
* When extracting just texts from knits, don't process the
- annotations. This improves 'bzr checkout' time by about 10%.
+ annotations. This improves 'bzr checkout' time by about 14%.
(John Arbash Meinel)
LIBRARY API BREAKS:
=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py 2007-07-25 20:11:34 +0000
+++ b/bzrlib/knit.py 2007-07-27 20:08:10 +0000
@@ -249,27 +249,6 @@
result.append((start, end, count, contents))
return result
- def parse_line_delta_no_annotations(self, lines):
- """Convert the delta lines into real lines, but ignore annotations.
-
- line delta is in the form of:
- intstart intend intcount
- 1..count lines:
- revid(utf8) newline\n
- internal representation is
- (start, end, count, [1..count line)])
- """
- result = []
- lines = iter(lines)
- next = lines.next
-
- # walk through the lines parsing.
- for header in lines:
- start, end, count = [int(n) for n in header.split(',')]
- contents = [next().split(' ', 1)[1] for i in xrange(count)]
- result.append((start, end, count, contents))
- return result
-
def get_fulltext_content(self, lines):
"""Extract just the content lines from a fulltext."""
return (line.split(' ', 1)[1] for line in lines)
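For reference, the line-delta wire format that the deleted helper parsed is a
header line "start,end,count" followed by count annotated content lines. A
self-contained sketch of the same parse in modern Python (next()/range()
standing in for the lines.next/xrange of the removed code):

def parse_line_delta_no_annotations(lines):
    # Each hunk: a "start,end,count" header, then `count` lines of the
    # form "<utf8-revid> <content>\n"; only the content is kept.
    result = []
    it = iter(lines)
    for header in it:
        start, end, count = (int(n) for n in header.split(','))
        contents = [next(it).split(' ', 1)[1] for _ in range(count)]
        result.append((start, end, count, contents))
    return result

print(parse_line_delta_no_annotations(
    ['0,1,2', 'rev-1 first\n', 'rev-2 second\n']))
# [(0, 1, 2, ['first\n', 'second\n'])]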
@@ -339,18 +318,6 @@
def parse_line_delta(self, lines, version_id):
return list(self.parse_line_delta_iter(lines, version_id))
- def parse_line_delta_no_annotations(self, lines):
- cur = 0
- num_lines = len(lines)
- result = []
- while cur < num_lines:
- header = lines[cur]
- cur += 1
- start, end, c = [int(n) for n in header.split(',')]
- result.append((start, end, c, lines[cur:cur+c]))
- cur += c
- return result
-
def get_fulltext_content(self, lines):
"""Extract just the content lines from a fulltext."""
return iter(lines)
@@ -929,7 +896,7 @@
version_ids = [osutils.safe_revision_id(v) for v in version_ids]
for version_id in version_ids:
self.check_not_reserved_id(version_id)
- text_map = self._get_text_map(version_ids)
+ text_map, content_map = self._get_content_maps(version_ids)
return [text_map[v] for v in version_ids]
_get_lf_split_line_list = get_line_list
@@ -989,77 +956,6 @@
text_map[version_id] = text
return text_map, final_content
- def _get_text_map(self, version_ids):
- """Produce maps of text and KnitContents
-
- :param version_ids: A list of version ids to extract.
- :return: a text map containing version_id => [lines] representing that
- version.
- """
- for version_id in version_ids:
- if not self.has_version(version_id):
- raise RevisionNotPresent(version_id, self.filename)
-
- position_map = self._get_components_positions(version_ids)
- # c = component_id, m = method, p = position, s = size, n = next
- records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]
- record_map = {}
- for component_id, data in self._data._read_raw_data_iter(records):
- method, position, size, next = position_map[component_id]
- record_map[component_id] = method, data, next
-
- # This is used to handle lines with no trailing \n.
- # The delta objects are built up assuming everything has '\n', and the
- # final output text is just stripped when there is no newline.
- raw_text_map = {}
- text_map = {}
- digest_map = {}
- for version_id in version_ids:
- components = []
- cursor = version_id
- while cursor is not None:
- method, data, next = record_map[cursor]
- components.append((cursor, method, data))
- if cursor in raw_text_map:
- break
- cursor = next
-
- raw_text = None
- for component_id, method, data in reversed(components):
- if component_id in raw_text_map:
- raw_text = raw_text_map[component_id]
- else:
- if method == 'fulltext':
- assert raw_text is None
- digest, raw_text =\
- self.factory.parse_gzip_fulltext_no_annotation(
- self._data, component_id, data)
- digest_map[component_id] = digest
- elif method == 'line-delta':
- assert raw_text is not None
- digest, delta =\
- self.factory.parse_gzip_line_delta_no_annotation(
- self._data, component_id, data)
- digest_map[component_id] = digest
- raw_text = self._apply_delta(raw_text, delta)
- raw_text_map[component_id] = raw_text
-
- if 'no-eol' in self._index.get_options(version_id):
- text = raw_text[:] # Don't change the cached text
- assert len(raw_text) > 0, ('We have no-eol on a text that'
- 'has no actual lines for version_id %s in %s'
- % (version_id, self))
- text[-1] = raw_text[-1].rstrip('\n')
- else:
- text = raw_text
- text_map[version_id] = text
-
- # digest here is the digest from the last applied component.
- if sha_strings(text) != digest_map[version_id]:
- raise KnitCorrupt(self.filename,
- 'sha-1 does not match %s' % version_id)
- return text_map
-
def iter_lines_added_or_present_in_versions(self, version_ids=None,
pb=None):
"""See VersionedFile.iter_lines_added_or_present_in_versions()."""
@@ -1461,6 +1357,28 @@
def get_options(self, version_id):
return self._cache[version_id][1]
+ def get_build_chain(self, version_id):
+ """Get the chain that we need to get a fulltext for this version.
+
+ :return: [(version_id, start, size)] needed to extract this text.
+ """
+ cursor = version_id
+ chain = []
+ while cursor is not None:
+ v_id, options, pos, size, parents, idx = self._cache[cursor]
+ chain.append((v_id, pos, size))
+ if 'fulltext' in options:
+ cursor = None
+ elif 'line-delta' not in options:
+ raise errors.KnitIndexUnknownMethod(self._full_path(), options)
+ else:
+ # Line-delta is based on the first present parent
+ # Do we have to handle missing the primary parent?
+ cursor = parents[0]
+ # Put the fulltext first
+ chain.reverse()
+ return chain
+
def get_parents(self, version_id):
"""Return parents of specified version ignoring ghosts."""
return [parent for parent in self._cache[version_id][4]
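The new get_build_chain() walks backwards from the requested version,
collecting (version_id, pos, size) tuples until it reaches a fulltext, then
reverses so the fulltext comes first. A toy illustration of that walk, using a
simplified {version_id: (options, pos, size, parents)} cache rather than the
six-field rows of the real _KnitIndex:

def build_chain(cache, version_id):
    cursor, chain = version_id, []
    while cursor is not None:
        options, pos, size, parents = cache[cursor]
        chain.append((cursor, pos, size))
        # A fulltext terminates the chain; a line-delta is based on
        # its first parent.
        cursor = None if 'fulltext' in options else parents[0]
    chain.reverse()  # fulltext first, most recent delta last
    return chain

cache = {
    'r1': (['fulltext'], 0, 100, []),
    'r2': (['line-delta'], 100, 30, ['r1']),
    'r3': (['line-delta'], 130, 20, ['r2']),
}
print(build_chain(cache, 'r3'))
# [('r1', 0, 100), ('r2', 100, 30), ('r3', 130, 20)]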
@@ -1603,24 +1521,14 @@
"""
df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
try:
- rec = self._check_header(version_id, df.readline())
+ rec = _check_record_header(version_id, df.readline(),
+ self._filename)
except Exception, e:
raise KnitCorrupt(self._filename,
"While reading {%s} got %s(%s)"
% (version_id, e.__class__.__name__, str(e)))
return df, rec
- def _check_header(self, version_id, line):
- rec = line.split()
- if len(rec) != 4:
- raise KnitCorrupt(self._filename,
- 'unexpected number of elements in record header')
- if rec[1] != version_id:
- raise KnitCorrupt(self._filename,
- 'unexpected version, wanted %r, got %r'
- % (version_id, rec[1]))
- return rec
-
def _parse_record(self, version_id, data):
try:
record_contents = _extract_lines_from_gzip(data)
@@ -1629,7 +1537,7 @@
"While reading {%s} got %s(%s)"
% (version_id, e.__class__.__name__, str(e)))
header = record_contents.pop(0)
- rec = self._check_header(version_id, header)
+ rec = _check_record_header(version_id, header, self._filename)
last_line = record_contents.pop()
if len(record_contents) != int(rec[2]):
@@ -1693,15 +1601,15 @@
# Skip records we have already seen
yielded_records = set()
needed_records = set()
- for version_id, pos, size in records:
- if version_id in self._cache:
- if version_id in yielded_records:
+ for record in records:
+ if record[0] in self._cache:
+ if record[0] in yielded_records:
continue
- yielded_records.add(version_id)
- data = self._cache[version_id]
- yield (version_id, data)
+ yielded_records.add(record[0])
+ data = self._cache[record[0]]
+ yield (record[0], data)
else:
- needed_records.add((version_id, pos, size))
+ needed_records.add(record)
needed_records = sorted(needed_records, key=operator.itemgetter(1))
else:
needed_records = sorted(set(records), key=operator.itemgetter(1))
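Switching from tuple unpacking to record[0] indexing presumably lets this loop
accept record tuples carrying more than (version_id, pos, size); that
motivation is inferred, not stated in the diff. The shape of the lookup: yield
each cached version at most once, deduplicate the misses in a set, then sort
them by byte position (itemgetter(1)) so they can be read from disk
sequentially:

import operator

cache = {'r1': 'cached-data-1'}
records = [('r2', 200, 10), ('r1', 0, 20), ('r2', 200, 10)]

yielded, needed = set(), set()
for record in records:                # record[0] is the version_id
    if record[0] in cache:
        if record[0] not in yielded:  # yield each cached id only once
            yielded.add(record[0])
            print('hit:', record[0], cache[record[0]])
    else:
        needed.add(record)            # the set deduplicates misses
print('to read:', sorted(needed, key=operator.itemgetter(1)))
# hit: r1 cached-data-1
# to read: [('r2', 200, 10)]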
@@ -2087,6 +1995,20 @@
return besti, bestj, bestsize
+def _check_record_header(version_id, line, kd_name):
+ """Parse line as a _KnitData header line"""
+ rec = line.split()
+ if len(rec) != 4:
+ raise errors.KnitCorrupt(kd_name,
+ 'unexpected number of elements in record header'
+ ' %r' % (line,))
+ if rec[1] != version_id:
+ raise errors.KnitCorrupt(kd_name,
+ 'unexpected version, wanted %r, got %r'
+ % (version_id, rec[1]))
+ return rec
+
+
try:
from bzrlib._knit_helpers_c import (
_load_data_c as _load_data,
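For reference, the method-turned-function _check_record_header validates the
four-token header line that precedes each gzipped knit record. A standalone
sketch plus usage, with ValueError standing in for errors.KnitCorrupt and the
sample header values purely illustrative (the checks above only pin down that
the line has four tokens and that rec[1] is the version id):

def check_record_header(version_id, line, kd_name):
    # Same checks as the helper above, with ValueError standing in
    # for bzrlib's KnitCorrupt.
    rec = line.split()
    if len(rec) != 4:
        raise ValueError('%s: unexpected number of elements in record'
                         ' header %r' % (kd_name, line))
    if rec[1] != version_id:
        raise ValueError('%s: unexpected version, wanted %r, got %r'
                         % (kd_name, version_id, rec[1]))
    return rec

rec = check_record_header('rev-1', 'version rev-1 3 abc123\n', 'test.knit')
assert rec[1] == 'rev-1'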