Rev 4477: Initial attempt at refactoring _KnitAnnotator to derive from Annotator. in http://bazaar.launchpad.net/~jameinel/bzr/1.17-rework-annotate

John Arbash Meinel john at arbash-meinel.com
Thu Jun 18 23:08:02 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/1.17-rework-annotate

------------------------------------------------------------
revno: 4477
revision-id: john at arbash-meinel.com-20090618220742-10czy03gxzh337fl
parent: john at arbash-meinel.com-20090618210934-804pslw0ii5vsh0j
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-rework-annotate
timestamp: Thu 2009-06-18 17:07:42 -0500
message:
  Initial attempt at refactoring _KnitAnnotator to derive from Annotator.
  
  So far it seems possible, by just overriding _get_needed_texts.
  However, we'll also want to override the get_matching_blocks stuff based on the delta.
-------------- next part --------------
=== modified file 'bzrlib/_annotator_py.py'
--- a/bzrlib/_annotator_py.py	2009-06-18 21:09:34 +0000
+++ b/bzrlib/_annotator_py.py	2009-06-18 22:07:42 +0000
@@ -17,7 +17,6 @@
 """Functionality for doing annotations in the 'optimal' way"""
 
 from bzrlib import (
-    annotate,
     errors,
     graph as _mod_graph,
     osutils,

=== modified file 'bzrlib/annotate.py'
--- a/bzrlib/annotate.py	2009-06-17 19:38:58 +0000
+++ b/bzrlib/annotate.py	2009-06-18 22:07:42 +0000
@@ -444,3 +444,6 @@
         # If left and right agree on a range, just push that into the output
         lines_extend(annotated_lines[left_idx:left_idx + match_len])
     return lines
+
+
+from bzrlib._annotator_py import Annotator

=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2009-06-18 18:58:55 +0000
+++ b/bzrlib/knit.py	2009-06-18 22:07:42 +0000
@@ -3305,100 +3305,24 @@
     return iter(annotator.annotate(revision_id))
 
 
-class _KnitAnnotator(object):
+class _KnitAnnotator(annotate.Annotator):
     """Build up the annotations for a text."""
 
-    def __init__(self, knit):
-        self._knit = knit
-
-        # Content objects, differs from fulltexts because of how final newlines
-        # are treated by knits. the content objects here will always have a
-        # final newline
-        self._fulltext_contents = {}
-
-        # Annotated lines of specific revisions
-        self._annotated_lines = {}
-
-        # Track the raw data for nodes that we could not process yet.
-        # This maps the revision_id of the base to a list of children that will
-        # annotated from it.
-        self._pending_children = {}
-
-        # Nodes which cannot be extracted
-        self._ghosts = set()
-
-        # Track how many children this node has, so we know if we need to keep
-        # it
-        self._annotate_children = {}
-        self._compression_children = {}
+    def __init__(self, vf):
+        annotate.Annotator.__init__(self, vf)
+
+        # TODO: handle Nodes which cannot be extracted
+        # self._ghosts = set()
+
+        # Map from revision_id => left_matching_blocks, should be 'use once'
+        self._left_matching_blocks = {}
+
+        # KnitContent objects
+        self._content_objects = {}
+        # The number of children that depend on this fulltext content object
+        self._num_compression_children = {}
 
         self._all_build_details = {}
-        # The children => parent revision_id graph
-        self._revision_id_graph = {}
-
-        self._heads_provider = None
-
-        self._nodes_to_keep_annotations = set()
-        self._generations_until_keep = 100
-
-    def set_generations_until_keep(self, value):
-        """Set the number of generations before caching a node.
-
-        Setting this to -1 will cache every merge node, setting this higher
-        will cache fewer nodes.
-        """
-        self._generations_until_keep = value
-
-    def _add_fulltext_content(self, revision_id, content_obj):
-        self._fulltext_contents[revision_id] = content_obj
-        # TODO: jam 20080305 It might be good to check the sha1digest here
-        return content_obj.text()
-
-    def _check_parents(self, child, nodes_to_annotate):
-        """Check if all parents have been processed.
-
-        :param child: A tuple of (rev_id, parents, raw_content)
-        :param nodes_to_annotate: If child is ready, add it to
-            nodes_to_annotate, otherwise put it back in self._pending_children
-        """
-        for parent_id in child[1]:
-            if (parent_id not in self._annotated_lines):
-                # This parent is present, but another parent is missing
-                self._pending_children.setdefault(parent_id,
-                                                  []).append(child)
-                break
-        else:
-            # This one is ready to be processed
-            nodes_to_annotate.append(child)
-
-    def _add_annotation(self, revision_id, fulltext, parent_ids,
-                        left_matching_blocks=None):
-        """Add an annotation entry.
-
-        All parents should already have been annotated.
-        :return: A list of children that now have their parents satisfied.
-        """
-        a = self._annotated_lines
-        annotated_parent_lines = [a[p] for p in parent_ids]
-        annotated_lines = list(annotate.reannotate(annotated_parent_lines,
-            fulltext, revision_id, left_matching_blocks,
-            heads_provider=self._get_heads_provider()))
-        self._annotated_lines[revision_id] = annotated_lines
-        for p in parent_ids:
-            ann_children = self._annotate_children[p]
-            ann_children.remove(revision_id)
-            if (not ann_children
-                and p not in self._nodes_to_keep_annotations):
-                del self._annotated_lines[p]
-                del self._all_build_details[p]
-                if p in self._fulltext_contents:
-                    del self._fulltext_contents[p]
-        # Now that we've added this one, see if there are any pending
-        # deltas to be done, certainly this parent is finished
-        nodes_to_annotate = []
-        for child in self._pending_children.pop(revision_id, []):
-            self._check_parents(child, nodes_to_annotate)
-        return nodes_to_annotate
 
     def _get_build_graph(self, key):
         """Get the graphs for building texts and annotations.
@@ -3412,9 +3336,6 @@
             passing to read_records_iter to start reading in the raw data from
             the pack file.
         """
-        if key in self._annotated_lines:
-            # Nothing to do
-            return []
         pending = set([key])
         records = []
         generation = 0
@@ -3423,191 +3344,138 @@
             # get all pending nodes
             generation += 1
             this_iteration = pending
-            build_details = self._knit._index.get_build_details(this_iteration)
+            build_details = self._vf._index.get_build_details(this_iteration)
             self._all_build_details.update(build_details)
-            # new_nodes = self._knit._index._get_entries(this_iteration)
+            # new_nodes = self._vf._index._get_entries(this_iteration)
             pending = set()
             for key, details in build_details.iteritems():
-                (index_memo, compression_parent, parents,
+                (index_memo, compression_parent, parent_keys,
                  record_details) = details
-                self._revision_id_graph[key] = parents
+                self._parent_map[key] = parent_keys
                 records.append((key, index_memo))
                 # Do we actually need to check _annotated_lines?
-                pending.update(p for p in parents
+                pending.update(p for p in parent_keys
                                  if p not in self._all_build_details)
-                if compression_parent:
-                    self._compression_children.setdefault(compression_parent,
-                        []).append(key)
-                if parents:
-                    for parent in parents:
-                        self._annotate_children.setdefault(parent,
-                            []).append(key)
-                    num_gens = generation - kept_generation
-                    if ((num_gens >= self._generations_until_keep)
-                        and len(parents) > 1):
-                        kept_generation = generation
-                        self._nodes_to_keep_annotations.add(key)
+                if parent_keys:
+                    for parent_key in parent_keys:
+                        if parent_key in self._num_needed_children:
+                            self._num_needed_children[parent_key] += 1
+                        else:
+                            self._num_needed_children[parent_key] = 1
 
             missing_versions = this_iteration.difference(build_details.keys())
-            self._ghosts.update(missing_versions)
-            for missing_version in missing_versions:
-                # add a key, no parents
-                self._revision_id_graph[missing_version] = ()
-                pending.discard(missing_version) # don't look for it
-        if self._ghosts.intersection(self._compression_children):
-            raise KnitCorrupt(
-                "We cannot have nodes which have a ghost compression parent:\n"
-                "ghosts: %r\n"
-                "compression children: %r"
-                % (self._ghosts, self._compression_children))
-        # Cleanout anything that depends on a ghost so that we don't wait for
-        # the ghost to show up
-        for node in self._ghosts:
-            if node in self._annotate_children:
-                # We won't be building this node
-                del self._annotate_children[node]
+            if missing_versions:
+                raise ValueError('i dont handle ghosts')
         # Generally we will want to read the records in reverse order, because
         # we find the parent nodes after the children
         records.reverse()
         return records
 
-    def _annotate_records(self, records):
-        """Build the annotations for the listed records."""
+    def _get_needed_texts(self, key, pb=None):
+        if len(self._vf._fallback_vfs) > 0:
+            # If we have fallbacks, go to the generic path
+            for v in super(_KnitAnnotator, self)._get_needed_texts(key, pb=pb):
+                yield v
+            # Don't fall through into the knit-specific path below
+            return
+        while True:
+            try:
+                records = self._get_build_graph(key)
+                for key, text, num_lines in self._extract_texts(records):
+                    yield key, text, num_lines
+                # All texts extracted successfully; stop retrying
+                return
+            except errors.RetryWithNewPacks, e:
+                self._vf._access.reload_or_raise(e)
+                # The cached build_details are no longer valid
+                self._all_build_details.clear()
+
+    def _extract_texts(self, records):
+        """Extract the various texts needed based on records"""
         # We iterate in the order read, rather than a strict order requested
         # However, process what we can, and put off to the side things that
         # still need parents, cleaning them up when those parents are
         # processed.
-        for (rev_id, record,
-             digest) in self._knit._read_records_iter(records):
-            if rev_id in self._annotated_lines:
-                continue
-            parent_ids = self._revision_id_graph[rev_id]
-            parent_ids = [p for p in parent_ids if p not in self._ghosts]
-            details = self._all_build_details[rev_id]
-            (index_memo, compression_parent, parents,
+        # Children that we want to annotate as soon as we get the parent text
+        # Map from parent_key => [child_key]
+        pending_annotation = {}
+        # Children that are missing their compression parent
+        pending_deltas = {}
+        for (key, record,
+             digest) in self._vf._read_records_iter(records):
+            # parent_ids = [p for p in parent_ids if p not in self._ghosts]
+            details = self._all_build_details[key]
+            (index_memo, compression_parent, parent_keys,
              record_details) = details
-            nodes_to_annotate = []
+            assert parent_keys == self._parent_map[key]
             # TODO: Remove the punning between compression parents, and
             #       parent_ids, we should be able to do this without assuming
             #       the build order
-            if len(parent_ids) == 0:
-                # There are no parents for this node, so just add it
-                # TODO: This probably needs to be decoupled
-                fulltext_content, delta = self._knit._factory.parse_record(
+            if compression_parent:
+                # This is a delta, do we have the parent fulltext?
+                if compression_parent not in self._fulltext_contents:
+                    # Waiting for the parent
+                    pending_deltas.setdefault(compression_parent, []).append(
+                        (key, parent_keys, record, record_details))
+                    continue
+                # We can build this as a fulltext
+                parent_content = self._content_objects[compression_parent]
+                content, delta = self._vf._factory.parse_record(
+                    key, record, record_details,
+                    parent_content,
+                    # TODO: track when we can copy the base
+                    copy_base_content=True)
+            else:
+                # No compression parent means we have a fulltext
+                content, delta = self._vf._factory.parse_record(
-                    rev_id, record, record_details, None)
+                    key, record, record_details, None)
-                fulltext = self._add_fulltext_content(rev_id, fulltext_content)
-                nodes_to_annotate.extend(self._add_annotation(rev_id, fulltext,
-                    parent_ids, left_matching_blocks=None))
-            else:
-                child = (rev_id, parent_ids, record)
-                # Check if all the parents are present
-                self._check_parents(child, nodes_to_annotate)
-            while nodes_to_annotate:
-                # Should we use a queue here instead of a stack?
-                (rev_id, parent_ids, record) = nodes_to_annotate.pop()
-                (index_memo, compression_parent, parents,
-                 record_details) = self._all_build_details[rev_id]
-                blocks = None
-                if compression_parent is not None:
-                    comp_children = self._compression_children[compression_parent]
-                    if rev_id not in comp_children:
-                        raise AssertionError("%r not in compression children %r"
-                            % (rev_id, comp_children))
-                    # If there is only 1 child, it is safe to reuse this
-                    # content
-                    reuse_content = (len(comp_children) == 1
-                        and compression_parent not in
-                            self._nodes_to_keep_annotations)
-                    if reuse_content:
-                        # Remove it from the cache since it will be changing
-                        parent_fulltext_content = self._fulltext_contents.pop(compression_parent)
-                        # Make sure to copy the fulltext since it might be
-                        # modified
-                        parent_fulltext = list(parent_fulltext_content.text())
+            self._content_objects[key] = content
+            to_process = [(key, parent_keys, content)]
+            # Check for compression children that we can expand
+            if key in pending_deltas:
+                children = pending_deltas.pop(key)
+                for child_key, child_parent_keys, child_record, child_details in children:
+                    child_content, child_delta = self._vf._factory.parse_record(
+                        child_key, child_record, child_details,
+                        content,
+                        # TODO: track when we can copy the base
+                        copy_base_content=True)
+                    self._content_objects[child_key] = child_content
+                    to_process.append((child_key, child_parent_keys, child_content))
+            while to_process:
+                cur = to_process
+                to_process = []
+                for key, parent_keys, content in cur:
+                    # Check if all parents are present
+                    for parent_key in parent_keys:
+                        if parent_key not in self._annotations_cache:
+                            # still waiting on a parent text
+                            pending_annotation.setdefault(parent_key,
+                                []).append((key, parent_keys, content))
+                            break
                     else:
-                        parent_fulltext_content = self._fulltext_contents[compression_parent]
-                        parent_fulltext = parent_fulltext_content.text()
-                    comp_children.remove(rev_id)
-                    fulltext_content, delta = self._knit._factory.parse_record(
-                        rev_id, record, record_details,
-                        parent_fulltext_content,
-                        copy_base_content=(not reuse_content))
-                    fulltext = self._add_fulltext_content(rev_id,
-                                                          fulltext_content)
-                    if compression_parent == parent_ids[0]:
-                        # the compression_parent is the left parent, so we can
-                        # re-use the delta
-                        blocks = KnitContent.get_line_delta_blocks(delta,
-                                parent_fulltext, fulltext)
-                else:
-                    fulltext_content = self._knit._factory.parse_fulltext(
-                        record, rev_id)
-                    fulltext = self._add_fulltext_content(rev_id,
-                        fulltext_content)
-                nodes_to_annotate.extend(
-                    self._add_annotation(rev_id, fulltext, parent_ids,
-                                     left_matching_blocks=blocks))
-
-    def _get_heads_provider(self):
-        """Create a heads provider for resolving ancestry issues."""
-        if self._heads_provider is not None:
-            return self._heads_provider
-        self._heads_provider = _mod_graph.KnownGraph(self._revision_id_graph)
-        return self._heads_provider
+                        # All parents present
+                        lines = content.text()
+                        yield key, lines, len(lines)
+                        if key in pending_annotation:
+                            to_process.extend(pending_annotation.pop(key))
 
     def annotate(self, key):
         """Return the annotated fulltext at the given key.
 
         :param key: The key to annotate.
         """
-        if len(self._knit._fallback_vfs) > 0:
+        if len(self._vf._fallback_vfs) > 0:
             # stacked knits can't use the fast path at present.
             return self._simple_annotate(key)
         while True:
             try:
                 records = self._get_build_graph(key)
                 if key in self._ghosts:
-                    raise errors.RevisionNotPresent(key, self._knit)
+                    raise errors.RevisionNotPresent(key, self._vf)
                 self._annotate_records(records)
                 return self._annotated_lines[key]
             except errors.RetryWithNewPacks, e:
-                self._knit._access.reload_or_raise(e)
+                self._vf._access.reload_or_raise(e)
                 # The cached build_details are no longer valid
                 self._all_build_details.clear()
 
-    def _simple_annotate(self, key):
-        """Return annotated fulltext, rediffing from the full texts.
-
-        This is slow but makes no assumptions about the repository
-        being able to produce line deltas.
-        """
-        # TODO: this code generates a parent maps of present ancestors; it
-        #       could be split out into a separate method
-        #       -- mbp and robertc 20080704
-        graph = _mod_graph.Graph(self._knit)
-        parent_map = dict((k, v) for k, v in graph.iter_ancestry([key])
-                          if v is not None)
-        if not parent_map:
-            raise errors.RevisionNotPresent(key, self)
-        keys = parent_map.keys()
-        heads_provider = _mod_graph.KnownGraph(parent_map)
-        parent_cache = {}
-        reannotate = annotate.reannotate
-        for record in self._knit.get_record_stream(keys, 'topological', True):
-            key = record.key
-            fulltext = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
-            parents = parent_map[key]
-            if parents is not None:
-                parent_lines = [parent_cache[parent] for parent in parent_map[key]]
-            else:
-                parent_lines = []
-            parent_cache[key] = list(
-                reannotate(parent_lines, fulltext, key, None, heads_provider))
-        try:
-            return parent_cache[key]
-        except KeyError, e:
-            raise errors.RevisionNotPresent(key, self._knit)
-
 
 try:
     from bzrlib._knit_load_data_c import _load_data_c as _load_data



More information about the bazaar-commits mailing list