Rev 5675: (jelmer) Move Repository._find_text_key_references_from_xml_inventory_lines in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Mon Feb 21 22:12:01 UTC 2011

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5675 [merge]
revision-id: pqm at pqm.ubuntu.com-20110221221157-3dtpgotjz8ktfviu
parent: pqm at pqm.ubuntu.com-20110221205918-g6r1qo36btstg4r7
parent: jelmer at samba.org-20110221195827-fd627hir7bsqywt5
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2011-02-21 22:11:57 +0000
message:
  (jelmer) Move Repository._find_text_key_references_from_xml_inventory_lines
   onto the serializer. (Jelmer Vernooij)
modified:
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/repository.py           rev_storage.py-20051111201905-119e9401e46257e3
  bzrlib/tests/test_repository.py test_repository.py-20060131075918-65c555b881612f4d
  bzrlib/tests/test_xml.py       test_xml.py-20050905091053-80b45588931a9b35
  bzrlib/xml8.py                 xml5.py-20050907032657-aac8f960815b66b1
=== modified file 'bzrlib/repofmt/pack_repo.py'

--- a/bzrlib/repofmt/pack_repo.py	2011-01-20 21:15:10 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2011-02-21 15:09:19 +0000
@@ -1222,8 +1222,7 @@
     def _process_inventory_lines(self, inv_lines):
         """Generate a text key reference map rather for reconciling with."""
         repo = self._pack_collection.repo
-        refs = repo._find_text_key_references_from_xml_inventory_lines(
-            inv_lines)
+        refs = repo._serializer._find_text_key_references(inv_lines)
         self._text_refs = refs
         # during reconcile we:
         #  - convert unreferenced texts to full texts
@@ -2476,7 +2475,7 @@
         from_repo = self.from_repository
         parent_ids = from_repo._find_parent_ids_of_revisions(revision_ids)
         parent_keys = [(p,) for p in parent_ids]
-        find_text_keys = from_repo._find_text_key_references_from_xml_inventory_lines
+        find_text_keys = from_repo._serializer._find_text_key_references
         parent_text_keys = set(find_text_keys(
             from_repo._inventory_xml_lines_for_keys(parent_keys)))
         content_text_keys = set()

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2011-02-19 17:37:45 +0000
+++ b/bzrlib/repository.py	2011-02-21 22:11:57 +0000
@@ -34,7 +34,6 @@
     graph,
     inventory,
     inventory_delta,
-    lazy_regex,
     lockable_files,
     lockdir,
     lru_cache,
@@ -988,12 +987,6 @@
     # in a Repository class subclass rather than to override
     # get_commit_builder.
     _commit_builder_class = CommitBuilder
-    # The search regex used by xml based repositories to determine what things
-    # where changed in a single commit.
-    _file_ids_altered_regex = lazy_regex.lazy_compile(
-        r'file_id="(?P<file_id>[^"]+)"'
-        r'.* revision="(?P<revision_id>[^"]+)"'
-        )
 
     def abort_write_group(self, suppress_errors=False):
         """Commit the contents accrued within the current write group.
@@ -2063,91 +2056,11 @@
         w = self.inventories
         pb = ui.ui_factory.nested_progress_bar()
         try:
-            return self._find_text_key_references_from_xml_inventory_lines(
+            return self._serializer._find_text_key_references(
                 w.iter_lines_added_or_present_in_keys(revision_keys, pb=pb))
         finally:
             pb.finished()
 
-    def _find_text_key_references_from_xml_inventory_lines(self,
-        line_iterator):
-        """Core routine for extracting references to texts from inventories.
-
-        This performs the translation of xml lines to revision ids.
-
-        :param line_iterator: An iterator of lines, origin_version_id
-        :return: A dictionary mapping text keys ((fileid, revision_id) tuples)
-            to whether they were referred to by the inventory of the
-            revision_id that they contain. Note that if that revision_id was
-            not part of the line_iterator's output then False will be given -
-            even though it may actually refer to that key.
-        """
-        if not self._serializer.support_altered_by_hack:
-            raise AssertionError(
-                "_find_text_key_references_from_xml_inventory_lines only "
-                "supported for branches which store inventory as unnested xml"
-                ", not on %r" % self)
-        result = {}
-
-        # this code needs to read every new line in every inventory for the
-        # inventories [revision_ids]. Seeing a line twice is ok. Seeing a line
-        # not present in one of those inventories is unnecessary but not
-        # harmful because we are filtering by the revision id marker in the
-        # inventory lines : we only select file ids altered in one of those
-        # revisions. We don't need to see all lines in the inventory because
-        # only those added in an inventory in rev X can contain a revision=X
-        # line.
-        unescape_revid_cache = {}
-        unescape_fileid_cache = {}
-
-        # jam 20061218 In a big fetch, this handles hundreds of thousands
-        # of lines, so it has had a lot of inlining and optimizing done.
-        # Sorry that it is a little bit messy.
-        # Move several functions to be local variables, since this is a long
-        # running loop.
-        search = self._file_ids_altered_regex.search
-        unescape = _unescape_xml
-        setdefault = result.setdefault
-        for line, line_key in line_iterator:
-            match = search(line)
-            if match is None:
-                continue
-            # One call to match.group() returning multiple items is quite a
-            # bit faster than 2 calls to match.group() each returning 1
-            file_id, revision_id = match.group('file_id', 'revision_id')
-
-            # Inlining the cache lookups helps a lot when you make 170,000
-            # lines and 350k ids, versus 8.4 unique ids.
-            # Using a cache helps in 2 ways:
-            #   1) Avoids unnecessary decoding calls
-            #   2) Re-uses cached strings, which helps in future set and
-            #      equality checks.
-            # (2) is enough that removing encoding entirely along with
-            # the cache (so we are using plain strings) results in no
-            # performance improvement.
-            try:
-                revision_id = unescape_revid_cache[revision_id]
-            except KeyError:
-                unescaped = unescape(revision_id)
-                unescape_revid_cache[revision_id] = unescaped
-                revision_id = unescaped
-
-            # Note that unconditionally unescaping means that we deserialise
-            # every fileid, which for general 'pull' is not great, but we don't
-            # really want to have some many fulltexts that this matters anyway.
-            # RBC 20071114.
-            try:
-                file_id = unescape_fileid_cache[file_id]
-            except KeyError:
-                unescaped = unescape(file_id)
-                unescape_fileid_cache[file_id] = unescaped
-                file_id = unescaped
-
-            key = (file_id, revision_id)
-            setdefault(key, False)
-            if revision_id == line_key[-1]:
-                result[key] = True
-        return result
-
     def _inventory_xml_lines_for_keys(self, keys):
         """Get a line iterator of the sort needed for findind references.
 
@@ -2183,10 +2096,10 @@
         revision_ids. Each altered file-ids has the exact revision_ids that
         altered it listed explicitly.
         """
-        seen = set(self._find_text_key_references_from_xml_inventory_lines(
+        seen = set(self._serializer._find_text_key_references(
                 line_iterator).iterkeys())
         parent_keys = self._find_parent_keys_of_revisions(revision_keys)
-        parent_seen = set(self._find_text_key_references_from_xml_inventory_lines(
+        parent_seen = set(self._serializer._find_text_key_references(
             self._inventory_xml_lines_for_keys(parent_keys)))
         new_keys = seen - parent_seen
         result = {}
@@ -4079,36 +3992,6 @@
         pb.finished()
 
 
-_unescape_map = {
-    'apos':"'",
-    'quot':'"',
-    'amp':'&',
-    'lt':'<',
-    'gt':'>'
-}
-
-
-def _unescaper(match, _map=_unescape_map):
-    code = match.group(1)
-    try:
-        return _map[code]
-    except KeyError:
-        if not code.startswith('#'):
-            raise
-        return unichr(int(code[1:])).encode('utf8')
-
-
-_unescape_re = None
-
-
-def _unescape_xml(data):
-    """Unescape predefined XML entities in a string of data."""
-    global _unescape_re
-    if _unescape_re is None:
-        _unescape_re = re.compile('\&([^;]*);')
-    return _unescape_re.sub(_unescaper, data)
-
-
 class _VersionedFileChecker(object):
 
     def __init__(self, repository, text_key_references=None, ancestors=None):

=== modified file 'bzrlib/tests/test_repository.py'
--- a/bzrlib/tests/test_repository.py	2011-02-08 16:08:23 +0000
+++ b/bzrlib/tests/test_repository.py	2011-02-21 19:58:27 +0000
@@ -668,13 +668,6 @@
         self.assertTrue(isinstance(target_format, repo._format.__class__))
 
 
-class TestMisc(TestCase):
-
-    def test_unescape_xml(self):
-        """We get some kind of error when malformed entities are passed"""
-        self.assertRaises(KeyError, repository._unescape_xml, 'foo&bar;')
-
-
 class TestRepositoryFormatKnit3(TestCaseWithTransport):
 
     def test_attribute__fetch_order(self):

=== modified file 'bzrlib/tests/test_xml.py'
--- a/bzrlib/tests/test_xml.py	2011-01-12 01:01:53 +0000
+++ b/bzrlib/tests/test_xml.py	2011-02-21 19:58:27 +0000
@@ -579,3 +579,10 @@
         uni_str = u'\xb5\xe5&\u062c'
         self.assertEqual('µå&ج"',
                          bzrlib.xml8._encode_and_escape(uni_str))
+
+
+class TestMisc(TestCase):
+
+    def test_unescape_xml(self):
+        """We get some kind of error when malformed entities are passed"""
+        self.assertRaises(KeyError, bzrlib.xml8._unescape_xml, 'foo&bar;')

=== modified file 'bzrlib/xml8.py'
--- a/bzrlib/xml8.py	2010-04-30 11:03:59 +0000
+++ b/bzrlib/xml8.py	2011-02-21 15:09:19 +0000
@@ -21,6 +21,7 @@
     cache_utf8,
     errors,
     inventory,
+    lazy_regex,
     revision as _mod_revision,
     trace,
     )
@@ -45,6 +46,35 @@
     ">":">",
     }
 
+_xml_unescape_map = {
+    'apos':"'",
+    'quot':'"',
+    'amp':'&',
+    'lt':'<',
+    'gt':'>'
+}
+
+
+def _unescaper(match, _map=_xml_unescape_map):
+    code = match.group(1)
+    try:
+        return _map[code]
+    except KeyError:
+        if not code.startswith('#'):
+            raise
+        return unichr(int(code[1:])).encode('utf8')
+
+
+_unescape_re = None
+
+
+def _unescape_xml(data):
+    """Unescape predefined XML entities in a string of data."""
+    global _unescape_re
+    if _unescape_re is None:
+        _unescape_re = re.compile('\&([^;]*);')
+    return _unescape_re.sub(_unescaper, data)
+
 
 def _ensure_utf8_re():
     """Make sure the _utf8_re and _unicode_re regexes have been compiled."""
@@ -161,6 +191,13 @@
     format_num = '8'
     revision_format_num = None
 
+    # The search regex used by xml based repositories to determine what things
+    # were changed in a single commit.
+    _file_ids_altered_regex = lazy_regex.lazy_compile(
+        r'file_id="(?P<file_id>[^"]+)"'
+        r'.* revision="(?P<revision_id>[^"]+)"'
+        )
+
     def _check_revisions(self, inv):
         """Extension point for subclasses to check during serialisation.
 
@@ -532,5 +569,84 @@
                 raise AssertionError("repeated property %r" % name)
             rev.properties[name] = value
 
+    def _find_text_key_references(self, line_iterator):
+        """Core routine for extracting references to texts from inventories.
+
+        This performs the translation of xml lines to revision ids.
+
+        :param line_iterator: An iterator of (line, origin_version_id) tuples
+        :return: A dictionary mapping text keys ((fileid, revision_id) tuples)
+            to whether they were referred to by the inventory of the
+            revision_id that they contain. Note that if that revision_id was
+            not part of the line_iterator's output then False will be given -
+            even though it may actually refer to that key.
+        """
+        if not self.support_altered_by_hack:
+            raise AssertionError(
+                "_find_text_key_references only "
+                "supported for branches which store inventory as unnested xml"
+                ", not on %r" % self)
+        result = {}
+
+        # this code needs to read every new line in every inventory for the
+        # inventories [revision_ids]. Seeing a line twice is ok. Seeing a line
+        # not present in one of those inventories is unnecessary but not
+        # harmful because we are filtering by the revision id marker in the
+        # inventory lines : we only select file ids altered in one of those
+        # revisions. We don't need to see all lines in the inventory because
+        # only those added in an inventory in rev X can contain a revision=X
+        # line.
+        unescape_revid_cache = {}
+        unescape_fileid_cache = {}
+
+        # jam 20061218 In a big fetch, this handles hundreds of thousands
+        # of lines, so it has had a lot of inlining and optimizing done.
+        # Sorry that it is a little bit messy.
+        # Move several functions to be local variables, since this is a long
+        # running loop.
+        search = self._file_ids_altered_regex.search
+        unescape = _unescape_xml
+        setdefault = result.setdefault
+        for line, line_key in line_iterator:
+            match = search(line)
+            if match is None:
+                continue
+            # One call to match.group() returning multiple items is quite a
+            # bit faster than 2 calls to match.group() each returning 1
+            file_id, revision_id = match.group('file_id', 'revision_id')
+
+            # Inlining the cache lookups helps a lot when you make 170,000
+            # lines and 350k ids, versus 8.4 unique ids.
+            # Using a cache helps in 2 ways:
+            #   1) Avoids unnecessary decoding calls
+            #   2) Re-uses cached strings, which helps in future set and
+            #      equality checks.
+            # (2) is enough that removing encoding entirely along with
+            # the cache (so we are using plain strings) results in no
+            # performance improvement.
+            try:
+                revision_id = unescape_revid_cache[revision_id]
+            except KeyError:
+                unescaped = unescape(revision_id)
+                unescape_revid_cache[revision_id] = unescaped
+                revision_id = unescaped
+
+            # Note that unconditionally unescaping means that we deserialise
+            # every fileid, which for general 'pull' is not great, but we don't
+            # really want to have so many fulltexts that this matters anyway.
+            # RBC 20071114.
+            try:
+                file_id = unescape_fileid_cache[file_id]
+            except KeyError:
+                unescaped = unescape(file_id)
+                unescape_fileid_cache[file_id] = unescaped
+                file_id = unescaped
+
+            key = (file_id, revision_id)
+            setdefault(key, False)
+            if revision_id == line_key[-1]:
+                result[key] = True
+        return result
+
 
 serializer_v8 = Serializer_v8()