Rev 5675: (jelmer) Move Repository._find_text_key_references_from_xml_inventory_lines in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Mon Feb 21 22:12:01 UTC 2011
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 5675 [merge]
revision-id: pqm at pqm.ubuntu.com-20110221221157-3dtpgotjz8ktfviu
parent: pqm at pqm.ubuntu.com-20110221205918-g6r1qo36btstg4r7
parent: jelmer at samba.org-20110221195827-fd627hir7bsqywt5
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2011-02-21 22:11:57 +0000
message:
(jelmer) Move Repository._find_text_key_references_from_xml_inventory_lines
onto the serializer. (Jelmer Vernooij)
modified:
bzrlib/repofmt/pack_repo.py pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
bzrlib/repository.py rev_storage.py-20051111201905-119e9401e46257e3
bzrlib/tests/test_repository.py test_repository.py-20060131075918-65c555b881612f4d
bzrlib/tests/test_xml.py test_xml.py-20050905091053-80b45588931a9b35
bzrlib/xml8.py xml5.py-20050907032657-aac8f960815b66b1
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2011-01-20 21:15:10 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2011-02-21 15:09:19 +0000
@@ -1222,8 +1222,7 @@
def _process_inventory_lines(self, inv_lines):
"""Generate a text key reference map rather for reconciling with."""
repo = self._pack_collection.repo
- refs = repo._find_text_key_references_from_xml_inventory_lines(
- inv_lines)
+ refs = repo._serializer._find_text_key_references(inv_lines)
self._text_refs = refs
# during reconcile we:
# - convert unreferenced texts to full texts
@@ -2476,7 +2475,7 @@
from_repo = self.from_repository
parent_ids = from_repo._find_parent_ids_of_revisions(revision_ids)
parent_keys = [(p,) for p in parent_ids]
- find_text_keys = from_repo._find_text_key_references_from_xml_inventory_lines
+ find_text_keys = from_repo._serializer._find_text_key_references
parent_text_keys = set(find_text_keys(
from_repo._inventory_xml_lines_for_keys(parent_keys)))
content_text_keys = set()
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py 2011-02-19 17:37:45 +0000
+++ b/bzrlib/repository.py 2011-02-21 22:11:57 +0000
@@ -34,7 +34,6 @@
graph,
inventory,
inventory_delta,
- lazy_regex,
lockable_files,
lockdir,
lru_cache,
@@ -988,12 +987,6 @@
# in a Repository class subclass rather than to override
# get_commit_builder.
_commit_builder_class = CommitBuilder
- # The search regex used by xml based repositories to determine what things
- # where changed in a single commit.
- _file_ids_altered_regex = lazy_regex.lazy_compile(
- r'file_id="(?P<file_id>[^"]+)"'
- r'.* revision="(?P<revision_id>[^"]+)"'
- )
def abort_write_group(self, suppress_errors=False):
"""Commit the contents accrued within the current write group.
@@ -2063,91 +2056,11 @@
w = self.inventories
pb = ui.ui_factory.nested_progress_bar()
try:
- return self._find_text_key_references_from_xml_inventory_lines(
+ return self._serializer._find_text_key_references(
w.iter_lines_added_or_present_in_keys(revision_keys, pb=pb))
finally:
pb.finished()
- def _find_text_key_references_from_xml_inventory_lines(self,
- line_iterator):
- """Core routine for extracting references to texts from inventories.
-
- This performs the translation of xml lines to revision ids.
-
- :param line_iterator: An iterator of lines, origin_version_id
- :return: A dictionary mapping text keys ((fileid, revision_id) tuples)
- to whether they were referred to by the inventory of the
- revision_id that they contain. Note that if that revision_id was
- not part of the line_iterator's output then False will be given -
- even though it may actually refer to that key.
- """
- if not self._serializer.support_altered_by_hack:
- raise AssertionError(
- "_find_text_key_references_from_xml_inventory_lines only "
- "supported for branches which store inventory as unnested xml"
- ", not on %r" % self)
- result = {}
-
- # this code needs to read every new line in every inventory for the
- # inventories [revision_ids]. Seeing a line twice is ok. Seeing a line
- # not present in one of those inventories is unnecessary but not
- # harmful because we are filtering by the revision id marker in the
- # inventory lines : we only select file ids altered in one of those
- # revisions. We don't need to see all lines in the inventory because
- # only those added in an inventory in rev X can contain a revision=X
- # line.
- unescape_revid_cache = {}
- unescape_fileid_cache = {}
-
- # jam 20061218 In a big fetch, this handles hundreds of thousands
- # of lines, so it has had a lot of inlining and optimizing done.
- # Sorry that it is a little bit messy.
- # Move several functions to be local variables, since this is a long
- # running loop.
- search = self._file_ids_altered_regex.search
- unescape = _unescape_xml
- setdefault = result.setdefault
- for line, line_key in line_iterator:
- match = search(line)
- if match is None:
- continue
- # One call to match.group() returning multiple items is quite a
- # bit faster than 2 calls to match.group() each returning 1
- file_id, revision_id = match.group('file_id', 'revision_id')
-
- # Inlining the cache lookups helps a lot when you make 170,000
- # lines and 350k ids, versus 8.4 unique ids.
- # Using a cache helps in 2 ways:
- # 1) Avoids unnecessary decoding calls
- # 2) Re-uses cached strings, which helps in future set and
- # equality checks.
- # (2) is enough that removing encoding entirely along with
- # the cache (so we are using plain strings) results in no
- # performance improvement.
- try:
- revision_id = unescape_revid_cache[revision_id]
- except KeyError:
- unescaped = unescape(revision_id)
- unescape_revid_cache[revision_id] = unescaped
- revision_id = unescaped
-
- # Note that unconditionally unescaping means that we deserialise
- # every fileid, which for general 'pull' is not great, but we don't
- # really want to have some many fulltexts that this matters anyway.
- # RBC 20071114.
- try:
- file_id = unescape_fileid_cache[file_id]
- except KeyError:
- unescaped = unescape(file_id)
- unescape_fileid_cache[file_id] = unescaped
- file_id = unescaped
-
- key = (file_id, revision_id)
- setdefault(key, False)
- if revision_id == line_key[-1]:
- result[key] = True
- return result
-
def _inventory_xml_lines_for_keys(self, keys):
"""Get a line iterator of the sort needed for findind references.
@@ -2183,10 +2096,10 @@
revision_ids. Each altered file-ids has the exact revision_ids that
altered it listed explicitly.
"""
- seen = set(self._find_text_key_references_from_xml_inventory_lines(
+ seen = set(self._serializer._find_text_key_references(
line_iterator).iterkeys())
parent_keys = self._find_parent_keys_of_revisions(revision_keys)
- parent_seen = set(self._find_text_key_references_from_xml_inventory_lines(
+ parent_seen = set(self._serializer._find_text_key_references(
self._inventory_xml_lines_for_keys(parent_keys)))
new_keys = seen - parent_seen
result = {}
@@ -4079,36 +3992,6 @@
pb.finished()
-_unescape_map = {
- 'apos':"'",
- 'quot':'"',
- 'amp':'&',
- 'lt':'<',
- 'gt':'>'
-}
-
-
-def _unescaper(match, _map=_unescape_map):
- code = match.group(1)
- try:
- return _map[code]
- except KeyError:
- if not code.startswith('#'):
- raise
- return unichr(int(code[1:])).encode('utf8')
-
-
-_unescape_re = None
-
-
-def _unescape_xml(data):
- """Unescape predefined XML entities in a string of data."""
- global _unescape_re
- if _unescape_re is None:
- _unescape_re = re.compile('\&([^;]*);')
- return _unescape_re.sub(_unescaper, data)
-
-
class _VersionedFileChecker(object):
def __init__(self, repository, text_key_references=None, ancestors=None):
=== modified file 'bzrlib/tests/test_repository.py'
--- a/bzrlib/tests/test_repository.py 2011-02-08 16:08:23 +0000
+++ b/bzrlib/tests/test_repository.py 2011-02-21 19:58:27 +0000
@@ -668,13 +668,6 @@
self.assertTrue(isinstance(target_format, repo._format.__class__))
-class TestMisc(TestCase):
-
- def test_unescape_xml(self):
- """We get some kind of error when malformed entities are passed"""
- self.assertRaises(KeyError, repository._unescape_xml, 'foo&bar;')
-
-
class TestRepositoryFormatKnit3(TestCaseWithTransport):
def test_attribute__fetch_order(self):
=== modified file 'bzrlib/tests/test_xml.py'
--- a/bzrlib/tests/test_xml.py 2011-01-12 01:01:53 +0000
+++ b/bzrlib/tests/test_xml.py 2011-02-21 19:58:27 +0000
@@ -579,3 +579,10 @@
uni_str = u'\xb5\xe5&\u062c'
self.assertEqual('µå&amp;ج"',
bzrlib.xml8._encode_and_escape(uni_str))
+
+
+class TestMisc(TestCase):
+
+ def test_unescape_xml(self):
+ """We get some kind of error when malformed entities are passed"""
+ self.assertRaises(KeyError, bzrlib.xml8._unescape_xml, 'foo&bar;')
=== modified file 'bzrlib/xml8.py'
--- a/bzrlib/xml8.py 2010-04-30 11:03:59 +0000
+++ b/bzrlib/xml8.py 2011-02-21 15:09:19 +0000
@@ -21,6 +21,7 @@
cache_utf8,
errors,
inventory,
+ lazy_regex,
revision as _mod_revision,
trace,
)
@@ -45,6 +46,35 @@
">":">",
}
+_xml_unescape_map = {
+ 'apos':"'",
+ 'quot':'"',
+ 'amp':'&',
+ 'lt':'<',
+ 'gt':'>'
+}
+
+
+def _unescaper(match, _map=_xml_unescape_map):
+ code = match.group(1)
+ try:
+ return _map[code]
+ except KeyError:
+ if not code.startswith('#'):
+ raise
+ return unichr(int(code[1:])).encode('utf8')
+
+
+_unescape_re = None
+
+
+def _unescape_xml(data):
+ """Unescape predefined XML entities in a string of data."""
+ global _unescape_re
+ if _unescape_re is None:
+ _unescape_re = re.compile('\&([^;]*);')
+ return _unescape_re.sub(_unescaper, data)
+
def _ensure_utf8_re():
"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
@@ -161,6 +191,13 @@
format_num = '8'
revision_format_num = None
+ # The search regex used by xml based repositories to determine what things
+ # where changed in a single commit.
+ _file_ids_altered_regex = lazy_regex.lazy_compile(
+ r'file_id="(?P<file_id>[^"]+)"'
+ r'.* revision="(?P<revision_id>[^"]+)"'
+ )
+
def _check_revisions(self, inv):
"""Extension point for subclasses to check during serialisation.
@@ -532,5 +569,84 @@
raise AssertionError("repeated property %r" % name)
rev.properties[name] = value
+ def _find_text_key_references(self, line_iterator):
+ """Core routine for extracting references to texts from inventories.
+
+ This performs the translation of xml lines to revision ids.
+
+ :param line_iterator: An iterator of lines, origin_version_id
+ :return: A dictionary mapping text keys ((fileid, revision_id) tuples)
+ to whether they were referred to by the inventory of the
+ revision_id that they contain. Note that if that revision_id was
+ not part of the line_iterator's output then False will be given -
+ even though it may actually refer to that key.
+ """
+ if not self.support_altered_by_hack:
+ raise AssertionError(
+ "_find_text_key_references only "
+ "supported for branches which store inventory as unnested xml"
+ ", not on %r" % self)
+ result = {}
+
+ # this code needs to read every new line in every inventory for the
+ # inventories [revision_ids]. Seeing a line twice is ok. Seeing a line
+ # not present in one of those inventories is unnecessary but not
+ # harmful because we are filtering by the revision id marker in the
+ # inventory lines : we only select file ids altered in one of those
+ # revisions. We don't need to see all lines in the inventory because
+ # only those added in an inventory in rev X can contain a revision=X
+ # line.
+ unescape_revid_cache = {}
+ unescape_fileid_cache = {}
+
+ # jam 20061218 In a big fetch, this handles hundreds of thousands
+ # of lines, so it has had a lot of inlining and optimizing done.
+ # Sorry that it is a little bit messy.
+ # Move several functions to be local variables, since this is a long
+ # running loop.
+ search = self._file_ids_altered_regex.search
+ unescape = _unescape_xml
+ setdefault = result.setdefault
+ for line, line_key in line_iterator:
+ match = search(line)
+ if match is None:
+ continue
+ # One call to match.group() returning multiple items is quite a
+ # bit faster than 2 calls to match.group() each returning 1
+ file_id, revision_id = match.group('file_id', 'revision_id')
+
+ # Inlining the cache lookups helps a lot when you make 170,000
+ # lines and 350k ids, versus 8.4 unique ids.
+ # Using a cache helps in 2 ways:
+ # 1) Avoids unnecessary decoding calls
+ # 2) Re-uses cached strings, which helps in future set and
+ # equality checks.
+ # (2) is enough that removing encoding entirely along with
+ # the cache (so we are using plain strings) results in no
+ # performance improvement.
+ try:
+ revision_id = unescape_revid_cache[revision_id]
+ except KeyError:
+ unescaped = unescape(revision_id)
+ unescape_revid_cache[revision_id] = unescaped
+ revision_id = unescaped
+
+ # Note that unconditionally unescaping means that we deserialise
+ # every fileid, which for general 'pull' is not great, but we don't
+ # really want to have some many fulltexts that this matters anyway.
+ # RBC 20071114.
+ try:
+ file_id = unescape_fileid_cache[file_id]
+ except KeyError:
+ unescaped = unescape(file_id)
+ unescape_fileid_cache[file_id] = unescaped
+ file_id = unescaped
+
+ key = (file_id, revision_id)
+ setdefault(key, False)
+ if revision_id == line_key[-1]:
+ result[key] = True
+ return result
+
serializer_v8 = Serializer_v8()
More information about the bazaar-commits
mailing list