Rev 4477: Initial attempt at refactoring _KnitAnnotator to derive from Annotator. in http://bazaar.launchpad.net/~jameinel/bzr/1.17-rework-annotate
John Arbash Meinel
john at arbash-meinel.com
Thu Jun 18 23:08:02 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/1.17-rework-annotate
------------------------------------------------------------
revno: 4477
revision-id: john at arbash-meinel.com-20090618220742-10czy03gxzh337fl
parent: john at arbash-meinel.com-20090618210934-804pslw0ii5vsh0j
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-rework-annotate
timestamp: Thu 2009-06-18 17:07:42 -0500
message:
Initial attempt at refactoring _KnitAnnotator to derive from Annotator.
So far it seems possible, by just overriding _get_needed_texts.
However, we'll also want to override the get_matching_blocks stuff based on the delta.
-------------- next part --------------
=== modified file 'bzrlib/_annotator_py.py'
--- a/bzrlib/_annotator_py.py 2009-06-18 21:09:34 +0000
+++ b/bzrlib/_annotator_py.py 2009-06-18 22:07:42 +0000
@@ -17,7 +17,6 @@
"""Functionality for doing annotations in the 'optimal' way"""
from bzrlib import (
- annotate,
errors,
graph as _mod_graph,
osutils,
=== modified file 'bzrlib/annotate.py'
--- a/bzrlib/annotate.py 2009-06-17 19:38:58 +0000
+++ b/bzrlib/annotate.py 2009-06-18 22:07:42 +0000
@@ -444,3 +444,6 @@
# If left and right agree on a range, just push that into the output
lines_extend(annotated_lines[left_idx:left_idx + match_len])
return lines
+
+
+from bzrlib._annotator_py import Annotator
=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py 2009-06-18 18:58:55 +0000
+++ b/bzrlib/knit.py 2009-06-18 22:07:42 +0000
@@ -3305,100 +3305,24 @@
return iter(annotator.annotate(revision_id))
-class _KnitAnnotator(object):
+class _KnitAnnotator(annotate.Annotator):
"""Build up the annotations for a text."""
- def __init__(self, knit):
- self._knit = knit
-
- # Content objects, differs from fulltexts because of how final newlines
- # are treated by knits. the content objects here will always have a
- # final newline
- self._fulltext_contents = {}
-
- # Annotated lines of specific revisions
- self._annotated_lines = {}
-
- # Track the raw data for nodes that we could not process yet.
- # This maps the revision_id of the base to a list of children that will
- # annotated from it.
- self._pending_children = {}
-
- # Nodes which cannot be extracted
- self._ghosts = set()
-
- # Track how many children this node has, so we know if we need to keep
- # it
- self._annotate_children = {}
- self._compression_children = {}
+ def __init__(self, vf):
+ annotate.Annotator.__init__(self, vf)
+
+ # TODO: handle Nodes which cannot be extracted
+ # self._ghosts = set()
+
+ # Map from revision_id => left_matching_blocks, should be 'use once'
+ self._left_matching_blocks = {}
+
+ # KnitContent objects
+ self._content_objects = {}
+ # The number of children that depend on this fulltext content object
+ self._num_compression_children = {}
self._all_build_details = {}
- # The children => parent revision_id graph
- self._revision_id_graph = {}
-
- self._heads_provider = None
-
- self._nodes_to_keep_annotations = set()
- self._generations_until_keep = 100
-
- def set_generations_until_keep(self, value):
- """Set the number of generations before caching a node.
-
- Setting this to -1 will cache every merge node, setting this higher
- will cache fewer nodes.
- """
- self._generations_until_keep = value
-
- def _add_fulltext_content(self, revision_id, content_obj):
- self._fulltext_contents[revision_id] = content_obj
- # TODO: jam 20080305 It might be good to check the sha1digest here
- return content_obj.text()
-
- def _check_parents(self, child, nodes_to_annotate):
- """Check if all parents have been processed.
-
- :param child: A tuple of (rev_id, parents, raw_content)
- :param nodes_to_annotate: If child is ready, add it to
- nodes_to_annotate, otherwise put it back in self._pending_children
- """
- for parent_id in child[1]:
- if (parent_id not in self._annotated_lines):
- # This parent is present, but another parent is missing
- self._pending_children.setdefault(parent_id,
- []).append(child)
- break
- else:
- # This one is ready to be processed
- nodes_to_annotate.append(child)
-
- def _add_annotation(self, revision_id, fulltext, parent_ids,
- left_matching_blocks=None):
- """Add an annotation entry.
-
- All parents should already have been annotated.
- :return: A list of children that now have their parents satisfied.
- """
- a = self._annotated_lines
- annotated_parent_lines = [a[p] for p in parent_ids]
- annotated_lines = list(annotate.reannotate(annotated_parent_lines,
- fulltext, revision_id, left_matching_blocks,
- heads_provider=self._get_heads_provider()))
- self._annotated_lines[revision_id] = annotated_lines
- for p in parent_ids:
- ann_children = self._annotate_children[p]
- ann_children.remove(revision_id)
- if (not ann_children
- and p not in self._nodes_to_keep_annotations):
- del self._annotated_lines[p]
- del self._all_build_details[p]
- if p in self._fulltext_contents:
- del self._fulltext_contents[p]
- # Now that we've added this one, see if there are any pending
- # deltas to be done, certainly this parent is finished
- nodes_to_annotate = []
- for child in self._pending_children.pop(revision_id, []):
- self._check_parents(child, nodes_to_annotate)
- return nodes_to_annotate
def _get_build_graph(self, key):
"""Get the graphs for building texts and annotations.
@@ -3412,9 +3336,6 @@
passing to read_records_iter to start reading in the raw data from
the pack file.
"""
- if key in self._annotated_lines:
- # Nothing to do
- return []
pending = set([key])
records = []
generation = 0
@@ -3423,191 +3344,138 @@
# get all pending nodes
generation += 1
this_iteration = pending
- build_details = self._knit._index.get_build_details(this_iteration)
+ build_details = self._vf._index.get_build_details(this_iteration)
self._all_build_details.update(build_details)
- # new_nodes = self._knit._index._get_entries(this_iteration)
+ # new_nodes = self._vf._index._get_entries(this_iteration)
pending = set()
for key, details in build_details.iteritems():
- (index_memo, compression_parent, parents,
+ (index_memo, compression_parent, parent_keys,
record_details) = details
- self._revision_id_graph[key] = parents
+ self._parent_map[key] = parent_keys
records.append((key, index_memo))
# Do we actually need to check _annotated_lines?
- pending.update(p for p in parents
+ pending.update(p for p in parent_keys
if p not in self._all_build_details)
- if compression_parent:
- self._compression_children.setdefault(compression_parent,
- []).append(key)
- if parents:
- for parent in parents:
- self._annotate_children.setdefault(parent,
- []).append(key)
- num_gens = generation - kept_generation
- if ((num_gens >= self._generations_until_keep)
- and len(parents) > 1):
- kept_generation = generation
- self._nodes_to_keep_annotations.add(key)
+ if parent_keys:
+ for parent_key in parent_keys:
+ if parent_key in self._num_needed_children:
+ self._num_needed_children[parent_key] += 1
+ else:
+ self._num_needed_children[parent_key] = 1
missing_versions = this_iteration.difference(build_details.keys())
- self._ghosts.update(missing_versions)
- for missing_version in missing_versions:
- # add a key, no parents
- self._revision_id_graph[missing_version] = ()
- pending.discard(missing_version) # don't look for it
- if self._ghosts.intersection(self._compression_children):
- raise KnitCorrupt(
- "We cannot have nodes which have a ghost compression parent:\n"
- "ghosts: %r\n"
- "compression children: %r"
- % (self._ghosts, self._compression_children))
- # Cleanout anything that depends on a ghost so that we don't wait for
- # the ghost to show up
- for node in self._ghosts:
- if node in self._annotate_children:
- # We won't be building this node
- del self._annotate_children[node]
+ if missing_versions:
+ raise ValueError('i dont handle ghosts')
# Generally we will want to read the records in reverse order, because
# we find the parent nodes after the children
records.reverse()
return records
- def _annotate_records(self, records):
- """Build the annotations for the listed records."""
+ def _get_needed_texts(self, key, pb=None):
+ if len(self._vf._fallback_vfs) > 0:
+ # If we have fallbacks, go to the generic path
+ for v in super(_KnitAnnotator, self)._get_needed_texts(key, pb=pb):
+ yield v
+ while True:
+ try:
+ records = self._get_build_graph(key)
+ for key, text, num_lines in self._extract_texts(records):
+ yield key, text, num_lines
+ except errors.RetryWithNewPacks, e:
+ self._vf._access.reload_or_raise(e)
+ # The cached build_details are no longer valid
+ self._all_build_details.clear()
+
+ def _extract_texts(self, records):
+ """Extract the various texts needed based on records"""
# We iterate in the order read, rather than a strict order requested
# However, process what we can, and put off to the side things that
# still need parents, cleaning them up when those parents are
# processed.
- for (rev_id, record,
- digest) in self._knit._read_records_iter(records):
- if rev_id in self._annotated_lines:
- continue
- parent_ids = self._revision_id_graph[rev_id]
- parent_ids = [p for p in parent_ids if p not in self._ghosts]
- details = self._all_build_details[rev_id]
- (index_memo, compression_parent, parents,
+ # Children that we want to annotate as soon as we get the parent text
+ # Map from parent_key => [child_key]
+ pending_annotation = {}
+ # Children that are missing their compression parent
+ pending_deltas = {}
+ for (key, record,
+ digest) in self._vf._read_records_iter(records):
+ # parent_ids = [p for p in parent_ids if p not in self._ghosts]
+ details = self._all_build_details[keys]
+ (index_memo, compression_parent, parent_keys,
record_details) = details
- nodes_to_annotate = []
+ assert parent_keys == self._parent_map[key]
# TODO: Remove the punning between compression parents, and
# parent_ids, we should be able to do this without assuming
# the build order
- if len(parent_ids) == 0:
- # There are no parents for this node, so just add it
- # TODO: This probably needs to be decoupled
- fulltext_content, delta = self._knit._factory.parse_record(
+ if compression_parent:
+ # This is a delta, do we have the parent fulltext?
+ if compression_parent not in self._fulltext_contents:
+ # Waiting for the parent
+ pending_deltas.setdefault(compression_parent, []).append(
+ (key, parent_keys, record, record_details))
+ continue
+ # We can build this as a fulltext
+ parent_content = self._content_objects[compression_parent]
+ content, delta = self._vf._factory.parse_record(
+ key, record, record_details,
+ parent_content,
+ # TODO: track when we can copy the base
+ copy_base_content=True)
+ else:
+ # No compression parent means we have a fulltext
+ content, delta = self._vf._factory.parse_record(
rev_id, record, record_details, None)
- fulltext = self._add_fulltext_content(rev_id, fulltext_content)
- nodes_to_annotate.extend(self._add_annotation(rev_id, fulltext,
- parent_ids, left_matching_blocks=None))
- else:
- child = (rev_id, parent_ids, record)
- # Check if all the parents are present
- self._check_parents(child, nodes_to_annotate)
- while nodes_to_annotate:
- # Should we use a queue here instead of a stack?
- (rev_id, parent_ids, record) = nodes_to_annotate.pop()
- (index_memo, compression_parent, parents,
- record_details) = self._all_build_details[rev_id]
- blocks = None
- if compression_parent is not None:
- comp_children = self._compression_children[compression_parent]
- if rev_id not in comp_children:
- raise AssertionError("%r not in compression children %r"
- % (rev_id, comp_children))
- # If there is only 1 child, it is safe to reuse this
- # content
- reuse_content = (len(comp_children) == 1
- and compression_parent not in
- self._nodes_to_keep_annotations)
- if reuse_content:
- # Remove it from the cache since it will be changing
- parent_fulltext_content = self._fulltext_contents.pop(compression_parent)
- # Make sure to copy the fulltext since it might be
- # modified
- parent_fulltext = list(parent_fulltext_content.text())
+ self._content_objects[key] = content
+ to_process = [(key, parent_keys, content)]
+ # Check for compression children that we can expand
+ if key in pending_deltas:
+ children = pending_deltas.pop(key)
+ for child_key, child_parent_keys, child_record, child_details in children:
+ child_content, child_delta = self._vf._factory.parse_record(
+ child_key, child_record, child_details,
+ content,
+ # TODO: track when we can copy the base
+ copy_base_content=True)
+ self._content_objects[child_key] = child_content
+ to_process.append((child_key, child_parent_keys, child_content))
+ while to_process:
+ cur = to_process
+ to_process = []
+ for key, parent_keys, content in cur:
+ # Check if all parents are present
+ for parent_key in parent_keys:
+ if parent_key not in self._annotations_cache:
+ # still waiting on a parent text
+ pending_annotation.setdefault(parent_key,
+ []).append((key, parent_keys, content))
+ break
else:
- parent_fulltext_content = self._fulltext_contents[compression_parent]
- parent_fulltext = parent_fulltext_content.text()
- comp_children.remove(rev_id)
- fulltext_content, delta = self._knit._factory.parse_record(
- rev_id, record, record_details,
- parent_fulltext_content,
- copy_base_content=(not reuse_content))
- fulltext = self._add_fulltext_content(rev_id,
- fulltext_content)
- if compression_parent == parent_ids[0]:
- # the compression_parent is the left parent, so we can
- # re-use the delta
- blocks = KnitContent.get_line_delta_blocks(delta,
- parent_fulltext, fulltext)
- else:
- fulltext_content = self._knit._factory.parse_fulltext(
- record, rev_id)
- fulltext = self._add_fulltext_content(rev_id,
- fulltext_content)
- nodes_to_annotate.extend(
- self._add_annotation(rev_id, fulltext, parent_ids,
- left_matching_blocks=blocks))
-
- def _get_heads_provider(self):
- """Create a heads provider for resolving ancestry issues."""
- if self._heads_provider is not None:
- return self._heads_provider
- self._heads_provider = _mod_graph.KnownGraph(self._revision_id_graph)
- return self._heads_provider
+ # All parents present
+ lines = content.text()
+ yield key, lines, len(lines)
+ if key in pending_annotation:
+ to_process.extend(pending_annotation.pop(key))
def annotate(self, key):
"""Return the annotated fulltext at the given key.
:param key: The key to annotate.
"""
- if len(self._knit._fallback_vfs) > 0:
+ if len(self._vf._fallback_vfs) > 0:
# stacked knits can't use the fast path at present.
return self._simple_annotate(key)
while True:
try:
records = self._get_build_graph(key)
if key in self._ghosts:
- raise errors.RevisionNotPresent(key, self._knit)
+ raise errors.RevisionNotPresent(key, self._vf)
self._annotate_records(records)
return self._annotated_lines[key]
except errors.RetryWithNewPacks, e:
- self._knit._access.reload_or_raise(e)
+ self._vf._access.reload_or_raise(e)
# The cached build_details are no longer valid
self._all_build_details.clear()
- def _simple_annotate(self, key):
- """Return annotated fulltext, rediffing from the full texts.
-
- This is slow but makes no assumptions about the repository
- being able to produce line deltas.
- """
- # TODO: this code generates a parent maps of present ancestors; it
- # could be split out into a separate method
- # -- mbp and robertc 20080704
- graph = _mod_graph.Graph(self._knit)
- parent_map = dict((k, v) for k, v in graph.iter_ancestry([key])
- if v is not None)
- if not parent_map:
- raise errors.RevisionNotPresent(key, self)
- keys = parent_map.keys()
- heads_provider = _mod_graph.KnownGraph(parent_map)
- parent_cache = {}
- reannotate = annotate.reannotate
- for record in self._knit.get_record_stream(keys, 'topological', True):
- key = record.key
- fulltext = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
- parents = parent_map[key]
- if parents is not None:
- parent_lines = [parent_cache[parent] for parent in parent_map[key]]
- else:
- parent_lines = []
- parent_cache[key] = list(
- reannotate(parent_lines, fulltext, key, None, heads_provider))
- try:
- return parent_cache[key]
- except KeyError, e:
- raise errors.RevisionNotPresent(key, self._knit)
-
try:
from bzrlib._knit_load_data_c import _load_data_c as _load_data
More information about the bazaar-commits
mailing list