Rev 41: Handle xml escaped characters that previously failed to index. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Sat Jun 21 14:03:10 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 41
revision-id: robertc at robertcollins.net-20080621130301-hkrps8jew9h1d0v3
parent: robertc at robertcollins.net-20080618130915-c20vmim4ks0j6wt9
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-21 23:03:01 +1000
message:
Handle xml escaped characters that previously failed to index. (Robert Collins)
modified:
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
inventory.py inventory.py-20080614043027-u57i4xjlc5ginuft-1
setup.py setup.py-20080608052041-z5bahsl8kwl0uf4x-6
tests/test_inventory.py test_inventory.py-20080614043027-u57i4xjlc5ginuft-2
=== modified file 'NEWS'
--- a/NEWS 2008-06-18 13:09:15 +0000
+++ b/NEWS 2008-06-21 13:03:01 +0000
@@ -26,6 +26,10 @@
BUGFIXES:
+ * Handles file ids and paths containing any of '"&<> - characters that were
+ xml escaped to place in the xml attributes of serialised inventories.
+ (Robert Collins)
+
API BREAKS:
TESTING:
=== modified file 'inventory.py'
--- a/inventory.py 2008-06-14 07:51:17 +0000
+++ b/inventory.py 2008-06-21 13:03:01 +0000
@@ -17,8 +17,13 @@
"""Inventory related helpers for indexing."""
+import re
from bzrlib import lazy_regex
+from bzrlib.lazy_import import lazy_import
+lazy_import(globals(), """
+from bzrlib import xml_serializer
+""")
_file_ids_name_regex = lazy_regex.lazy_compile(
r'file_id="(?P<file_id>[^"]+)"'
@@ -26,12 +31,30 @@
r'(?:.* parent_id="(?P<parent_id>[^"]+)")?'
)
+_unescape_re = lazy_regex.lazy_compile("&amp;|&apos;|&quot;|&lt;|&gt;")
+_unescape_map = {
+ '&amp;': '&',
+ "&apos;": "'",
+ "&quot;": '"',
+ "&lt;": '<',
+ "&gt;": '>',
+ }
+def _unescape_replace(match, map=_unescape_map):
+ return map[match.group()]
+
+
def paths_from_ids(xml_inventory, serializer, file_ids):
"""Extract the paths for some file_ids from xml_inventory."""
if not serializer.support_altered_by_hack:
raise ValueError("Cannot process with serializer %r" % serializer)
search = _file_ids_name_regex.search
- unresolved_ids = set(file_ids)
+ # escaped ids to match against the xml:
+ escape_re = xml_serializer.escape_re
+ _escape_replace = xml_serializer._escape_replace
+ escaped_to_raw_ids = {}
+ for file_id in file_ids:
+ escaped_to_raw_ids[escape_re.sub(_escape_replace, file_id)] = file_id
+ unresolved_ids = set(escaped_to_raw_ids)
# TODO: only examine lines we need to, break early, track unprocessed
found_ids = {}
id_paths = {}
@@ -48,9 +71,10 @@
if parent_id is None:
# no parent, stash its name now to avoid special casing
# later.
- id_paths[file_id] = name
+ path = _unescape_re.sub(_unescape_replace, name)
+ id_paths[file_id] = path
if file_id in unresolved_ids:
- result[file_id] = name
+ result[escaped_to_raw_ids[file_id]] = path
needed_ids = set(unresolved_ids)
while needed_ids:
# ---
@@ -77,7 +101,7 @@
wanted_file_id = unresolved_ids.pop()
path = id_paths.get(wanted_file_id)
if path is not None:
- result[wanted_file_id] = path
+ result[escaped_to_raw_ids[wanted_file_id]] = path
continue
lookup_stack = [wanted_file_id]
lookup_names = []
@@ -92,21 +116,24 @@
lookup_names.append(name)
else:
# resolve:
+ path = _unescape_re.sub(_unescape_replace, name)
if parent_path:
- parent_path = parent_path + '/' + name
+ parent_path = parent_path + '/' + path
else:
- parent_path = name
+ parent_path = path
id_paths[file_id] = parent_path
if file_id == wanted_file_id:
- result[file_id] = parent_path
+ result[escaped_to_raw_ids[file_id]] = parent_path
lookup_stack.pop(-1)
while lookup_stack:
file_id = lookup_stack.pop(-1)
+ path = _unescape_re.sub(_unescape_replace,
+ lookup_names.pop(-1))
if parent_path:
- parent_path = parent_path + '/' + lookup_names.pop(-1)
+ parent_path = parent_path + '/' + path
else:
- parent_path = lookup_names.pop(-1)
+ parent_path = path
id_paths[file_id] = parent_path
if file_id == wanted_file_id:
- result[file_id] = parent_path
+ result[escaped_to_raw_ids[file_id]] = parent_path
return result
=== modified file 'setup.py'
--- a/setup.py 2008-06-08 08:37:10 +0000
+++ b/setup.py 2008-06-21 13:03:01 +0000
@@ -3,7 +3,7 @@
bzr_plugin_name = 'search'
-bzr_plugin_version = (1, 6, 0, 'dev', 0)
+bzr_plugin_version = (1, 6, 0, 'dev', 1)
bzr_commands = ['index', 'search']
if __name__ == 'main':
=== modified file 'tests/test_inventory.py'
--- a/tests/test_inventory.py 2008-06-14 07:51:17 +0000
+++ b/tests/test_inventory.py 2008-06-21 13:03:01 +0000
@@ -105,3 +105,21 @@
inventory.paths_from_ids(xml, serializer, ['tests']))
self.assertEqual({'test.py':'tests/test.py'},
inventory.paths_from_ids(xml, serializer, ['test.py']))
+
+ def test_escaped_chars(self):
+ """Inventories with escaping attributes (&'"<>) are matched ok."""
+ from bzrlib.xml5 import Serializer_v5
+ serializer = Serializer_v5()
+ xml = [
+ '<inventory file_id="root" format="5" revision_id="rev1">\n',
+ '<file file_id="&amp;&apos;&quot;&lt;&gt;" name="__init__.py" parent_id="root" revision="rev1" />\n',
+ '<directory file_id="tests" name="&amp;&apos;&quot;&lt;&gt;" parent_id="root" revision="rev1" />\n',
+ '<file file_id="test.py" name="&gt;&lt;&quot;&apos;&amp;" parent_id="tests" revision="rev1" />\n',
+ '</inventory>\n'
+ ]
+ # Lookup an id that has every escape
+ self.assertEqual({'&\'"<>':'__init__.py'},
+ inventory.paths_from_ids(xml, serializer, ['&\'"<>']))
+ # Get the path for a name which is escaped
+ self.assertEqual({'test.py':'&\'"<>/><"\'&'},
+ inventory.paths_from_ids(xml, serializer, ['test.py']))
More information about the bazaar-commits
mailing list