Rev 3180: Add a find_text_key_references method to the inventory journal. in http://people.ubuntu.com/~robertc/baz2.0/inventory.journalled
Robert Collins
robertc at robertcollins.net
Mon Jan 7 21:32:35 GMT 2008
At http://people.ubuntu.com/~robertc/baz2.0/inventory.journalled
------------------------------------------------------------
revno: 3180
revision-id: robertc at robertcollins.net-20080107213228-xke579q733x8z40g
parent: robertc at robertcollins.net-20080107025955-6xb183leqq2eais7
committer: Robert Collins <robertc at robertcollins.net>
branch nick: inventory.journalled
timestamp: Tue 2008-01-08 08:32:28 +1100
message:
Add a find_text_key_references method to the inventory journal.
modified:
bzrlib/journalled_inventory.py journalled_inventory-20080103020931-0ht5n40kwc0p7fy1-1
=== modified file 'bzrlib/journalled_inventory.py'
--- a/bzrlib/journalled_inventory.py 2008-01-06 23:31:19 +0000
+++ b/bzrlib/journalled_inventory.py 2008-01-07 21:32:28 +0000
@@ -25,7 +25,7 @@
__all__ = ['EntryAccess', 'InventoryJournal']
-from bzrlib import errors
+from bzrlib import errors, lazy_regex
from bzrlib.osutils import basename, sha_string, sha_strings
from bzrlib import inventory
from bzrlib.revision import NULL_REVISION
@@ -243,6 +243,10 @@
"""Serialise and deserialise inventories using a journal."""
FORMAT_1 = 'bzr journalled inventory v1 (bzr 1.2)'
+ _file_ids_altered_regex = lazy_regex.lazy_compile(
+ '^(?P<path_utf8>[^\x00]+)\x00(?P<file_id>[^\x00]+)\x00[^\x00]*\x00'
+ '(?P<revision_id>[^\x00]+)\x00'
+ )
def __init__(self, versioned_root, tree_references):
"""Create an InventoryJournal.
@@ -329,6 +333,77 @@
return ("%s\x00%s\x00%s\x00%s\x00%s\n" %
(newpath_utf8, file_id, parent_id, last_modified, content))
+    def find_text_key_references(self, line_iterator):
+        """Core routine for extracting references to texts from journals.
+
+        This performs the translation of journal lines to revision ids.
+
+        :param line_iterator: An iterator of (line, origin_version_id)
+            pairs.
+        :return: A dictionary mapping text keys ((fileid, revision_id)
+            tuples) to a boolean indicating whether the key was referred to
+            by the inventory of the revision_id it contains. Note that if
+            that revision_id was not part of the line_iterator's output then
+            False is recorded, even though the inventory may actually refer
+            to that key.
+        """
+        result = {}
+
+        # This code needs to read every journal line for every journal added
+        # for a number of revisions. As delta chains are limited we may see
+        # duplicates, which is fine. We generate the full set of referenced
+        # revisions, and calling code can filter down to any subset of
+        # revisions it wants to consider. A variation of this function will
+        # be required to report on deleted items; this may become a layer
+        # over that at that point.
+
+        # jam 20061218 In a big fetch, this handles hundreds of thousands
+        # of lines, so it has had a lot of inlining and optimizing done.
+        # Sorry that it is a little bit messy.
+        # Move several functions to be local variables, since this is a
+        # long running loop.
+        search = self._file_ids_altered_regex.search
+        string_cache = {}
+        setdefault = result.setdefault
+        for line, version_id in line_iterator:
+            match = search(line)
+            if match is None:
+                continue
+            # One call to match.group() returning multiple items is quite a
+            # bit faster than three calls to match.group() each returning 1.
+            path_utf8, file_id, revision_id = match.group(
+                'path_utf8', 'file_id', 'revision_id')
+            if path_utf8 == 'None':
+                # a delete.
+                continue
+
+            # Using a cache lets repeated ids share a single string object,
+            # which speeds up later set and equality checks; dropping the
+            # cache and using plain strings throughout showed no performance
+            # improvement.
+            try:
+                revision_id = string_cache[revision_id]
+            except KeyError:
+                string_cache[revision_id] = revision_id
+
+            # Note that unconditionally caching means that we touch every
+            # file id, which for a general 'pull' is not great, but we don't
+            # really want to have so many fulltext inventories that this
+            # matters anyway. RBC 20071114.
+            try:
+                file_id = string_cache[file_id]
+            except KeyError:
+                string_cache[file_id] = file_id
+
+            key = (file_id, revision_id)
+            setdefault(key, False)
+            if revision_id == version_id:
+                result[key] = True
+        return result
+
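The string_cache idiom in the loop above is plain interning: the first occurrence of an id is stored, and each later occurrence is swapped for the stored object, so subsequent dict and set lookups can short-circuit on object identity. A standalone sketch of the same idiom (the helper name is invented):

    cache = {}

    def intern_id(s, _cache=cache):
        # Return the canonical instance of s, storing it on first sight.
        try:
            return _cache[s]
        except KeyError:
            _cache[s] = s
            return s

    a = ''.join(['rev', '-1'])
    b = ''.join(['rev', '-1'])
    # a and b are equal but distinct objects; interning unifies them.
    assert intern_id(a) is intern_id(b)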
+    def parse_text_bytes(self, bytes):
+        """Parse the text bytes of a journal entry.