Rev 3180: Add a find_text_references method to the inventory journal. in http://people.ubuntu.com/~robertc/baz2.0/inventory.journalled

Robert Collins robertc at robertcollins.net
Mon Jan 7 21:32:35 GMT 2008


At http://people.ubuntu.com/~robertc/baz2.0/inventory.journalled

------------------------------------------------------------
revno: 3180
revision-id: robertc at robertcollins.net-20080107213228-xke579q733x8z40g
parent: robertc at robertcollins.net-20080107025955-6xb183leqq2eais7
committer: Robert Collins <robertc at robertcollins.net>
branch nick: inventory.journalled
timestamp: Tue 2008-01-08 08:32:28 +1100
message:
  Add a find_text_references method to the inventory journal.
modified:
  bzrlib/journalled_inventory.py journalled_inventory-20080103020931-0ht5n40kwc0p7fy1-1
=== modified file 'bzrlib/journalled_inventory.py'
--- a/bzrlib/journalled_inventory.py	2008-01-06 23:31:19 +0000
+++ b/bzrlib/journalled_inventory.py	2008-01-07 21:32:28 +0000
@@ -25,7 +25,7 @@
 
 __all__ = ['EntryAccess', 'InventoryJournal']
 
-from bzrlib import errors
+from bzrlib import errors, lazy_regex
 from bzrlib.osutils import basename, sha_string, sha_strings
 from bzrlib import inventory
 from bzrlib.revision import NULL_REVISION
@@ -243,6 +243,10 @@
     """Serialise and deserialise inventories using a journal."""
 
     FORMAT_1 = 'bzr journalled inventory v1 (bzr 1.2)'
+    _file_ids_altered_regex = lazy_regex.lazy_compile(
+        '^(?P<path_utf8>[^\x00]+)\x00(?P<file_id>[^\x00]+)\x00[^\x00]*\x00'
+        '(?P<revision_id>[^\x00]+)\x00'
+        )
 
     def __init__(self, versioned_root, tree_references):
         """Create an InventoryJournal.
@@ -329,6 +333,71 @@
         return ("%s\x00%s\x00%s\x00%s\x00%s\n" %
             (newpath_utf8, file_id, parent_id, last_modified, content))
 
+    def find_text_key_references(self, line_iterator):
+        """Core routine for extracting references to texts from journals.
+
+        This translates journal lines into (file_id, revision_id) text keys.
+
+        :param line_iterator: An iterator of (line, version_id) tuples.
+        :return: A dictionary mapping text keys ((file_id, revision_id)
+            tuples) to a boolean: True if the key was referred to by the
+            journal of the revision_id that the key itself names. If that
+            revision_id was not part of the line_iterator's output, False
+            is recorded even though it may actually refer to that key.
+        """
+        result = {}
+
+        # This code needs to read every journal line of every journal added
+        # for a number of revisions. As delta chains are bounded we may see
+        # duplicate lines, which is fine. We generate the full set of
+        # referenced text keys, and calling code can filter down to any
+        # subset of revisions it wants to consider. A variation of this
+        # function will be required to report on deleted items; this
+        # function may then become a layer on top of that one.
+
+        # jam 20061218 In a big fetch, this handles hundreds of thousands
+        # of lines, so it has had a lot of inlining and optimizing done.
+        # Sorry that it is a little bit messy.
+        # Bind several attribute lookups to local variables, since this is
+        # a long-running loop.
+        search = self._file_ids_altered_regex.search
+        string_cache = {}
+        setdefault = result.setdefault
+        for line, version_id in line_iterator:
+            match = search(line)
+            if match is None:
+                continue
+            # One call to match.group() returning multiple items is quite a
+            # bit faster than three calls each returning one item.
+            path_utf8, file_id, revision_id = match.group(
+                'path_utf8', 'file_id', 'revision_id')
+            if path_utf8 == 'None':
+                # A delete; there is no text to reference.
+                continue
+
+            # Caching means later occurrences reuse the first-seen string
+            # instance, which helps future set and equality checks: they
+            # can short-circuit on object identity.
+            try:
+                revision_id = string_cache[revision_id]
+            except KeyError:
+                string_cache[revision_id] = revision_id
+
+            # Note that unconditionally consulting the cache means a dict
+            # lookup for every file id, which for general 'pull' is not
+            # great, but we don't really want to have so many fulltext
+            # inventories that this matters anyway. RBC 20071114.
+            try:
+                file_id = string_cache[file_id]
+            except KeyError:
+                string_cache[file_id] = file_id
+
+            key = (file_id, revision_id)
+            setdefault(key, False)
+            if revision_id == version_id:
+                result[key] = True
+        return result
+
     def parse_text_bytes(self, bytes):
         """Parse the text bytes of a journal entry.
 

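For reference, a minimal sketch of how find_text_key_references consumes
journal lines. The lines are hand-built here in the
newpath\x00file_id\x00parent_id\x00last_modified\x00content layout that
entry_to_bytes emits and _file_ids_altered_regex matches; every path and
id below is invented, the content portion after the fourth \x00 is a
stand-in (the method never parses it), and the constructor arguments are
the two shown in the diff above:

    from bzrlib.journalled_inventory import InventoryJournal

    journal = InventoryJournal(versioned_root=True, tree_references=False)
    lines = [
        # 'a' last modified in rev-1, seen while reading rev-1's journal.
        ('a\x00file-a\x00root-id\x00rev-1\x00file\x00sha1\x005\n', 'rev-1'),
        # The same entry seen again while reading rev-2's journal.
        ('a\x00file-a\x00root-id\x00rev-1\x00file\x00sha1\x005\n', 'rev-2'),
        # A delete records the path as 'None' and is skipped entirely.
        ('None\x00file-b\x00\x00rev-2\x00\n', 'rev-2'),
    ]
    refs = journal.find_text_key_references(iter(lines))
    # The key was seen in rev-1's own journal, hence True; had it only
    # appeared under rev-2, the value would have stayed False.
    assert refs == {('file-a', 'rev-1'): True}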

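The try/except KeyError blocks above are a small string-interning idiom:
the first occurrence of each revision id or file id is kept in
string_cache, and later equal strings are replaced by that first
instance, so set membership and tuple equality checks on the resulting
keys can short-circuit on object identity. The same idiom in isolation:

    string_cache = {}

    def intern_string(s, _cache=string_cache):
        # Return the first-seen instance equal to s, storing s on a miss.
        try:
            return _cache[s]
        except KeyError:
            _cache[s] = s
            return s

    a = intern_string(''.join(['rev', '-1']))
    b = intern_string(''.join(['rev-', '1']))
    assert a == b and a is b  # one shared object after interning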
