Rev 41: Handle xml escaped characters that previously failed to index. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Sat Jun 21 14:03:10 BST 2008

At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 41
revision-id: robertc at robertcollins.net-20080621130301-hkrps8jew9h1d0v3
parent: robertc at robertcollins.net-20080618130915-c20vmim4ks0j6wt9
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-21 23:03:01 +1000
message:
  Handle xml escaped characters that previously failed to index. (Robert Collins)
modified:
  NEWS                           news-20080608052041-z5bahsl8kwl0uf4x-2
  inventory.py                   inventory.py-20080614043027-u57i4xjlc5ginuft-1
  setup.py                       setup.py-20080608052041-z5bahsl8kwl0uf4x-6
  tests/test_inventory.py        test_inventory.py-20080614043027-u57i4xjlc5ginuft-2
=== modified file 'NEWS'

--- a/NEWS	2008-06-18 13:09:15 +0000
+++ b/NEWS	2008-06-21 13:03:01 +0000
@@ -26,6 +26,10 @@
 
   BUGFIXES:
 
+    * Handles file ids and paths containing any of '"&<> - characters that were
+      xml escaped to place in the xml atttributes of serialised inventories.
+      (Robert Collins)
+
   API BREAKS:
 
   TESTING:

=== modified file 'inventory.py'
--- a/inventory.py	2008-06-14 07:51:17 +0000
+++ b/inventory.py	2008-06-21 13:03:01 +0000
@@ -17,8 +17,13 @@
 
 """Inventory related helpers for indexing."""
 
+import re
 
 from bzrlib import lazy_regex
+from bzrlib.lazy_import import lazy_import
+lazy_import(globals(), """
+from bzrlib import xml_serializer
+""")
 
 _file_ids_name_regex = lazy_regex.lazy_compile(
         r'file_id="(?P<file_id>[^"]+)"'
@@ -26,12 +31,30 @@
         r'(?:.* parent_id="(?P<parent_id>[^"]+)")?'
         )
 
+_unescape_re = lazy_regex.lazy_compile("&amp;|&apos;|&quot;|&lt;|&gt;")
+_unescape_map = {
+    '&amp;': '&',
+    "&apos;": "'",
+    "&quot;": '"',
+    "&lt;": '<',
+    "&gt;": '>',
+    }
+def _unescape_replace(match, map=_unescape_map):
+    return map[match.group()]
+
+
 def paths_from_ids(xml_inventory, serializer, file_ids):
     """Extract the paths for some file_ids from xml_inventory."""
     if not serializer.support_altered_by_hack:
         raise ValueError("Cannot process with serializer %r" % serializer)
     search = _file_ids_name_regex.search
-    unresolved_ids = set(file_ids)
+    # escaped ids to match against the xml:
+    escape_re = xml_serializer.escape_re
+    _escape_replace = xml_serializer._escape_replace
+    escaped_to_raw_ids = {}
+    for file_id in file_ids:
+        escaped_to_raw_ids[escape_re.sub(_escape_replace, file_id)] = file_id
+    unresolved_ids = set(escaped_to_raw_ids)
     # TODO: only examine lines we need to, break early, track unprocessed
     found_ids = {}
     id_paths = {}
@@ -48,9 +71,10 @@
         if parent_id is None:
             # no parent, stash its name now to avoid special casing
             # later.
-            id_paths[file_id] = name
+            path = _unescape_re.sub(_unescape_replace, name)
+            id_paths[file_id] = path
             if file_id in unresolved_ids:
-                result[file_id] = name
+                result[escaped_to_raw_ids[file_id]] = path
     needed_ids = set(unresolved_ids)
     while needed_ids:
         # ---
@@ -77,7 +101,7 @@
         wanted_file_id = unresolved_ids.pop()
         path = id_paths.get(wanted_file_id)
         if path is not None:
-            result[wanted_file_id] = path
+            result[escaped_to_raw_ids[wanted_file_id]] = path
             continue
         lookup_stack = [wanted_file_id]
         lookup_names = []
@@ -92,21 +116,24 @@
                 lookup_names.append(name)
             else:
                 # resolve:
+                path = _unescape_re.sub(_unescape_replace, name)
                 if parent_path:
-                    parent_path = parent_path + '/' + name
+                    parent_path = parent_path + '/' + path
                 else:
-                    parent_path = name
+                    parent_path = path
                 id_paths[file_id] = parent_path
                 if file_id == wanted_file_id:
-                    result[file_id] = parent_path
+                    result[escaped_to_raw_ids[file_id]] = parent_path
                 lookup_stack.pop(-1)
                 while lookup_stack:
                     file_id = lookup_stack.pop(-1)
+                    path = _unescape_re.sub(_unescape_replace,
+                        lookup_names.pop(-1))
                     if parent_path:
-                        parent_path = parent_path + '/' + lookup_names.pop(-1)
+                        parent_path = parent_path + '/' + path
                     else:
-                        parent_path = lookup_names.pop(-1)
+                        parent_path = path
                     id_paths[file_id] = parent_path
                     if file_id == wanted_file_id:
-                        result[file_id] = parent_path
+                        result[escaped_to_raw_ids[file_id]] = parent_path
     return result

=== modified file 'setup.py'
--- a/setup.py	2008-06-08 08:37:10 +0000
+++ b/setup.py	2008-06-21 13:03:01 +0000
@@ -3,7 +3,7 @@
 
 bzr_plugin_name = 'search'
 
-bzr_plugin_version = (1, 6, 0, 'dev', 0)
+bzr_plugin_version = (1, 6, 0, 'dev', 1)
 bzr_commands = ['index', 'search']
 
 if __name__ == 'main':

=== modified file 'tests/test_inventory.py'
--- a/tests/test_inventory.py	2008-06-14 07:51:17 +0000
+++ b/tests/test_inventory.py	2008-06-21 13:03:01 +0000
@@ -105,3 +105,21 @@
             inventory.paths_from_ids(xml, serializer, ['tests']))
         self.assertEqual({'test.py':'tests/test.py'},
             inventory.paths_from_ids(xml, serializer, ['test.py']))
+
+    def test_escaped_chars(self):
+        """Inventories with escaping attributes (&'"<>) are matched ok."""
+        from bzrlib.xml5 import Serializer_v5
+        serializer = Serializer_v5()
+        xml = [
+            '<inventory file_id="root" format="5" revision_id="rev1">\n',
+            '<file file_id="&amp;&apos;&quot;&lt;&gt;" name="__init__.py" parent_id="root" revision="rev1" />\n',
+            '<directory file_id="tests" name="&amp;&apos;&quot;&lt;&gt;" parent_id="root" revision="rev1" />\n',
+            '<file file_id="test.py" name="&gt;&lt;&quot;&apos;&amp;" parent_id="tests" revision="rev1"  />\n',
+            '</inventory>\n'
+            ]
+        # Lookup an id that has every escape
+        self.assertEqual({'&\'"<>':'__init__.py'},
+            inventory.paths_from_ids(xml, serializer, ['&\'"<>']))
+        # Get the path for a name which is escaped
+        self.assertEqual({'test.py':'&\'"<>/><"\'&'},
+            inventory.paths_from_ids(xml, serializer, ['test.py']))