Rev 7: Add --btree-plain repository format for testing. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

Robert Collins robertc at robertcollins.net
Tue Jul 1 12:37:43 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

------------------------------------------------------------
revno: 7
revision-id: robertc at robertcollins.net-20080701113743-n0pc6x42arqd4zhd
parent: robertc at robertcollins.net-20080701095116-ge9gvqt09zie8x42
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Tue 2008-07-01 21:37:43 +1000
message:
  Add --btree-plain repository format for testing.
added:
  repofmt.py                     repofmt.py-20080701113732-m1iu3n94ikbxdelb-1
modified:
  __init__.py                    __init__.py-20080624222253-p0x5f92uyh5hw734-5
  btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
=== modified file '__init__.py'
--- a/__init__.py	2008-06-30 22:10:08 +0000
+++ b/__init__.py	2008-07-01 11:37:43 +0000
@@ -21,8 +21,33 @@
 =============
 
 See `pydoc bzrlib.plugins.index2.index`.
+
+This plugin also provides an experimental repository format making use of the
+new index classes. Run `bzr init --btree-plain` to test it. Warning: if it
+eats your data, you get to do the stomach searching.
 """
 
+from bzrlib.bzrdir import format_registry
+format_registry.register_metadir('btree-plain',
+    'bzrlib.plugins.index2.repofmt.RepositoryFormatPackBTreePlain',
+    help='Trivial rename of pack-0.92 with B+Tree based index. '
+        'Please read '
+        'http://doc.bazaar-vcs.org/latest/developers/development-repo.html '
+        'before use.',
+    branch_format='bzrlib.branch.BzrBranchFormat6',
+    tree_format='bzrlib.workingtree.WorkingTreeFormat4',
+    hidden=False,
+    experimental=True,
+    )
+
+from bzrlib.repository import format_registry as repo_registry
+repo_registry.register_lazy(
+    'Bazaar development format - btree (needs bzr.dev from 1.6)\n',
+    'bzrlib.plugins.index2.repofmt',
+    'RepositoryFormatPackBTreePlain',
+    )
+
+
 def test_suite():
     # Thunk across to load_tests for niceness with older bzr versions
     from bzrlib.tests import TestLoader
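
For context, a minimal sketch of exercising the newly registered format from
Python once the plugin is loaded; the target path below is a placeholder
rather than anything from this commit:

    # Minimal sketch, assuming the plugin is installed so the 'btree-plain'
    # alias above has been registered; /tmp/btree-test is a placeholder path.
    # Command line equivalent: bzr init --btree-plain /tmp/btree-test
    from bzrlib import bzrdir

    format = bzrdir.format_registry.make_bzrdir('btree-plain')
    tree = bzrdir.BzrDir.create_standalone_workingtree(
        '/tmp/btree-test', format=format)
    print tree.branch.repository._format.get_format_string()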

=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-01 09:51:16 +0000
+++ b/btree_index.py	2008-07-01 11:37:43 +0000
@@ -363,6 +363,8 @@
             return
         if self._key_count is None:
             self._cache_nodes([0])
+        if not self._key_count:
+            return
         for key in keys:
             # repeated single-bisection : 'make it work'
             # FUTURE: look within the current node for the next key,
@@ -437,7 +439,7 @@
         """
         signature = bytes[0:len(self._signature())]
         if not signature == self._signature():
-            raise bzrerrors.BadIndexFormatSignature(self._name, GraphIndex)
+            raise bzrerrors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
         lines = bytes[len(self._signature()):].splitlines()
         options_line = lines[0]
         if not options_line.startswith(_OPTION_NODE_REFS):
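
Besides the corrected exception class in the signature check, the new early
return above covers the zero-key case: a freshly serialised index with no
entries now answers key lookups with an empty result instead of bisecting into
nodes that do not exist. A rough sketch of that case, assuming BTreeBuilder
offers the same finish()/put_file round-trip as bzrlib's GraphIndexBuilder;
the transport and file name are illustrative only:

    # Rough sketch of the zero-key lookup the new guard handles; the
    # MemoryTransport and the 'empty.btree' name are illustrative only.
    from bzrlib.transport.memory import MemoryTransport
    from bzrlib.plugins.index2.btree_index import BTreeBuilder, BTreeGraphIndex

    transport = MemoryTransport()
    builder = BTreeBuilder(reference_lists=0)   # no keys added
    size = transport.put_file('empty.btree', builder.finish())
    index = BTreeGraphIndex(transport, 'empty.btree', size)
    assert list(index.iter_entries([('some-key',)])) == []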

=== added file 'repofmt.py'
--- a/repofmt.py	1970-01-01 00:00:00 +0000
+++ b/repofmt.py	2008-07-01 11:37:43 +0000
@@ -0,0 +1,394 @@
+# index2, a bzr plugin providing experimental index types.
+# Copyright (C) 2008 Canonical Limited.
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+# 
+
+"""Repository formats using B+Tree indices.
+
+Currently only a plain format is present; this is enough to do basic
+testing on.
+"""
+
+import md5
+import time
+
+from bzrlib import debug, errors, pack, trace
+from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder
+from bzrlib.knit import (
+    _KnitGraphIndex,
+    KnitVersionedFiles,
+    )
+from bzrlib.osutils import rand_chars
+from bzrlib.repofmt.pack_repo import (
+    Pack,
+    NewPack,
+    KnitPackRepository,
+    RepositoryPackCollection,
+    RepositoryFormatPackDevelopment0,
+    Packer,
+    ReconcilePacker,
+    OptimisingPacker,
+    )
+
+
+def open_pack(self):
+    return self._pack_collection.pack_factory(self._pack_collection._upload_transport,
+        self._pack_collection._index_transport,
+        self._pack_collection._pack_transport, upload_suffix=self.suffix,
+        file_mode=self._pack_collection.repo.bzrdir._get_file_mode())
+
+Packer.open_pack = open_pack
+
+
+class InMemoryBTree(BTreeBuilder):
+    """TODO - make InMemoryGraphIndex more reusable.
+    
+    (not that all-in-mem is a good idea).
+    """
+
+    def add_nodes(self, nodes):
+        """Add nodes to the index.
+
+        :param nodes: An iterable of (key, node_refs, value) entries to add.
+        """
+        if self.reference_lists:
+            for (key, value, node_refs) in nodes:
+                self.add_node(key, value, node_refs)
+        else:
+            for (key, value) in nodes:
+                self.add_node(key, value)
+
+    def iter_all_entries(self):
+        """Iterate over all keys within the index
+
+        :return: An iterable of (index, key, value, reference_lists). There is
+            no defined order for the result iteration - it will be in the most
+            efficient order for the index (in this case dictionary hash order).
+        """
+        if 'evil' in debug.debug_flags:
+            trace.mutter_callsite(3,
+                "iter_all_entries scales with size of history.")
+        if self.reference_lists:
+            for key, (absent, references, value) in self._nodes.iteritems():
+                if not absent:
+                    yield self, key, value, references
+        else:
+            for key, (absent, references, value) in self._nodes.iteritems():
+                if not absent:
+                    yield self, key, value
+
+    def iter_entries(self, keys):
+        """Iterate over keys within the index.
+
+        :param keys: An iterable providing the keys to be retrieved.
+        :return: An iterable of (index, key, value, reference_lists). There is no
+            defined order for the result iteration - it will be in the most
+            efficient order for the index (keys iteration order in this case).
+        """
+        keys = set(keys)
+        if self.reference_lists:
+            for key in keys.intersection(self._keys):
+                node = self._nodes[key]
+                if not node[0]:
+                    yield self, key, node[2], node[1]
+        else:
+            for key in keys.intersection(self._keys):
+                node = self._nodes[key]
+                if not node[0]:
+                    yield self, key, node[2]
+
+    def iter_entries_prefix(self, keys):
+        """Iterate over keys within the index using prefix matching.
+
+        Prefix matching is applied within the tuple of a key, not within
+        the bytestring of each key element. e.g. if you have the keys ('foo',
+        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
+        only the former key is returned.
+
+        :param keys: An iterable providing the key prefixes to be retrieved.
+            Each key prefix takes the form of a tuple the length of a key, but
+            with the last N elements 'None' rather than a regular bytestring.
+            The first element cannot be 'None'.
+        :return: An iterable as per iter_all_entries, but restricted to the
+            keys with a matching prefix to those supplied. No additional keys
+            will be returned, and every match that is in the index will be
+            returned.
+        """
+        # XXX: Too much duplication with the GraphIndex class; consider finding
+        # a good place to pull out the actual common logic.
+        keys = set(keys)
+        if not keys:
+            return
+        if self._key_length == 1:
+            for key in keys:
+                # sanity check
+                if key[0] is None:
+                    raise errors.BadIndexKey(key)
+                if len(key) != self._key_length:
+                    raise errors.BadIndexKey(key)
+                node = self._nodes[key]
+                if node[0]:
+                    continue 
+                if self.reference_lists:
+                    yield self, key, node[2], node[1]
+                else:
+                    yield self, key, node[2]
+            return
+        for key in keys:
+            # sanity check
+            if key[0] is None:
+                raise errors.BadIndexKey(key)
+            if len(key) != self._key_length:
+                raise errors.BadIndexKey(key)
+            # find what it refers to:
+            key_dict = self._nodes_by_key
+            elements = list(key)
+            # find the subdict to return
+            try:
+                while len(elements) and elements[0] is not None:
+                    key_dict = key_dict[elements[0]]
+                    elements.pop(0)
+            except KeyError:
+                # a non-existent lookup.
+                continue
+            if len(elements):
+                dicts = [key_dict]
+                while dicts:
+                    key_dict = dicts.pop(-1)
+                    # can't be empty or would not exist
+                    item, value = key_dict.iteritems().next()
+                    if type(value) == dict:
+                        # push keys 
+                        dicts.extend(key_dict.itervalues())
+                    else:
+                        # yield keys
+                        for value in key_dict.itervalues():
+                            yield (self, ) + value
+            else:
+                yield (self, ) + key_dict
+
+    def key_count(self):
+        """Return an estimate of the number of keys in this index.
+        
+        For InMemoryBTree the estimate is exact.
+        """
+        return len(self._keys)
+
+    def validate(self):
+        """In-memory indices have no known corruption at the moment."""
+
+
+class BTreePack(NewPack):
+
+    def __init__(self, upload_transport, index_transport, pack_transport,
+        upload_suffix='', file_mode=None):
+        """Create a BTreePack instance.
+
+        :param upload_transport: A writable transport for the pack to be
+            incrementally uploaded to.
+        :param index_transport: A writable transport for the pack's indices to
+            be written to when the pack is finished.
+        :param pack_transport: A writable transport for the pack to be renamed
+            to when the upload is complete. This *must* be the same as
+            upload_transport.clone('../packs').
+        :param upload_suffix: An optional suffix to be given to any temporary
+            files created during the pack creation. e.g. '.autopack'
+        :param file_mode: An optional file mode to create the new files with.
+        """
+        # The relative locations of the packs are constrained, but all are
+        # passed in because the caller has them, so as to avoid object churn.
+        Pack.__init__(self,
+            # Revisions: parents list, no text compression.
+            InMemoryBTree(reference_lists=1),
+            # Inventory: We want to map compression only, but currently the
+            # knit code hasn't been updated enough to understand that, so we
+            # have a regular 2-list index giving parents and compression
+            # source.
+            InMemoryBTree(reference_lists=2),
+            # Texts: compression and per file graph, for all fileids - so two
+            # reference lists and two elements in the key tuple.
+            InMemoryBTree(reference_lists=2, key_elements=2),
+            # Signatures: Just blobs to store, no compression, no parents
+            # listing.
+            InMemoryBTree(reference_lists=0),
+            )
+        # where should the new pack be opened
+        self.upload_transport = upload_transport
+        # where are indices written out to
+        self.index_transport = index_transport
+        # where is the pack renamed to when it is finished?
+        self.pack_transport = pack_transport
+        # What file mode to upload the pack and indices with.
+        self._file_mode = file_mode
+        # tracks the content written to the .pack file.
+        self._hash = md5.new()
+        # a four-tuple with the length in bytes of the indices, once the pack
+        # is finalised. (rev, inv, text, sigs)
+        self.index_sizes = None
+        # How much data to cache when writing packs. Note that this is not
+        # synchronised with reads, because it's not in the transport layer, so
+        # is not safe unless the client knows it won't be reading from the pack
+        # under creation.
+        self._cache_limit = 0
+        # the temporary pack file name.
+        self.random_name = rand_chars(20) + upload_suffix
+        # when was this pack started ?
+        self.start_time = time.time()
+        # open an output stream for the data added to the pack.
+        self.write_stream = self.upload_transport.open_write_stream(
+            self.random_name, mode=self._file_mode)
+        if 'pack' in debug.debug_flags:
+            trace.mutter('%s: create_pack: pack stream open: %s%s t+%6.3fs',
+                time.ctime(), self.upload_transport.base, self.random_name,
+                time.time() - self.start_time)
+        # A list of byte sequences to be written to the new pack, and the 
+        # aggregate size of them.  Stored as a list rather than separate 
+        # variables so that the _write_data closure below can update them.
+        self._buffer = [[], 0]
+        # create a callable for adding data 
+        #
+        # robertc says- this is a closure rather than a method on the object
+        # so that the variables are locals, and faster than accessing object
+        # members.
+        def _write_data(bytes, flush=False, _buffer=self._buffer,
+            _write=self.write_stream.write, _update=self._hash.update):
+            _buffer[0].append(bytes)
+            _buffer[1] += len(bytes)
+            # buffer cap
+            if _buffer[1] > self._cache_limit or flush:
+                bytes = ''.join(_buffer[0])
+                _write(bytes)
+                _update(bytes)
+                _buffer[:] = [[], 0]
+        # expose this on self, for the occasion when clients want to add data.
+        self._write_data = _write_data
+        # a pack writer object to serialise pack records.
+        self._writer = pack.ContainerWriter(self._write_data)
+        self._writer.begin()
+        # what state is the pack in? (open, finished, aborted)
+        self._state = 'open'
+
+    def _replace_index_with_readonly(self, index_type):
+        setattr(self, index_type + '_index',
+            BTreeGraphIndex(self.index_transport,
+                self.index_name(index_type, self.name),
+                self.index_sizes[self.index_offset(index_type)]))
+
+
+RepositoryPackCollection.pack_factory = NewPack
+
+class BTreeRepositoryPackCollection(RepositoryPackCollection):
+
+    pack_factory = BTreePack
+
+    def _make_index(self, name, suffix):
+        """Overridden to use BTreeGraphIndex objects."""
+        size_offset = self._suffix_offsets[suffix]
+        index_name = name + suffix
+        index_size = self._names[name][size_offset]
+        return BTreeGraphIndex(
+            self._index_transport, index_name, index_size)
+
+    def _start_write_group(self):
+        # Do not permit preparation for writing if we're not in a 'write lock'.
+        if not self.repo.is_write_locked():
+            raise errors.NotWriteLocked(self)
+        self._new_pack = self.pack_factory(self._upload_transport, self._index_transport,
+            self._pack_transport, upload_suffix='.pack',
+            file_mode=self.repo.bzrdir._get_file_mode())
+        # allow writing: queue writes to a new index
+        self.revision_index.add_writable_index(self._new_pack.revision_index,
+            self._new_pack)
+        self.inventory_index.add_writable_index(self._new_pack.inventory_index,
+            self._new_pack)
+        self.text_index.add_writable_index(self._new_pack.text_index,
+            self._new_pack)
+        self.signature_index.add_writable_index(self._new_pack.signature_index,
+            self._new_pack)
+
+        self.repo.inventories._index._add_callback = self.inventory_index.add_callback
+        self.repo.revisions._index._add_callback = self.revision_index.add_callback
+        self.repo.signatures._index._add_callback = self.signature_index.add_callback
+        self.repo.texts._index._add_callback = self.text_index.add_callback
+
+
+
+class BTreePackRepository(KnitPackRepository):
+    """BTree customisation of KnitPackRepository."""
+
+    def __init__(self, _format, a_bzrdir, control_files, _commit_builder_class,
+        _serializer):
+        """Overridden to change pack collection class."""
+        KnitPackRepository.__init__(self, _format, a_bzrdir, control_files,
+            _commit_builder_class, _serializer)
+        # and now replace everything it did :)
+        index_transport = self._transport.clone('indices')
+        self._pack_collection = BTreeRepositoryPackCollection(self,
+            self._transport, index_transport,
+            self._transport.clone('upload'),
+            self._transport.clone('packs'))
+        self.inventories = KnitVersionedFiles(
+            _KnitGraphIndex(self._pack_collection.inventory_index.combined_index,
+                add_callback=self._pack_collection.inventory_index.add_callback,
+                deltas=True, parents=True, is_locked=self.is_locked),
+            data_access=self._pack_collection.inventory_index.data_access,
+            max_delta_chain=200)
+        self.revisions = KnitVersionedFiles(
+            _KnitGraphIndex(self._pack_collection.revision_index.combined_index,
+                add_callback=self._pack_collection.revision_index.add_callback,
+                deltas=False, parents=True, is_locked=self.is_locked),
+            data_access=self._pack_collection.revision_index.data_access,
+            max_delta_chain=0)
+        self.signatures = KnitVersionedFiles(
+            _KnitGraphIndex(self._pack_collection.signature_index.combined_index,
+                add_callback=self._pack_collection.signature_index.add_callback,
+                deltas=False, parents=False, is_locked=self.is_locked),
+            data_access=self._pack_collection.signature_index.data_access,
+            max_delta_chain=0)
+        self.texts = KnitVersionedFiles(
+            _KnitGraphIndex(self._pack_collection.text_index.combined_index,
+                add_callback=self._pack_collection.text_index.add_callback,
+                deltas=True, parents=True, is_locked=self.is_locked),
+            data_access=self._pack_collection.text_index.data_access,
+            max_delta_chain=200)
+        # True when the repository object is 'write locked' (as opposed to the
+        # physical lock only taken out around changes to the pack-names list.) 
+        # Another way to represent this would be a decorator around the control
+        # files object that presents logical locks as physical ones - if this
+        # gets ugly consider that alternative design. RBC 20071011
+        self._write_lock_count = 0
+        self._transaction = None
+        # for tests
+        self._reconcile_does_inventory_gc = True
+        self._reconcile_fixes_text_parents = True
+        self._reconcile_backsup_inventory = False
+
+
+class RepositoryFormatPackBTreePlain(RepositoryFormatPackDevelopment0):
+    """A pack repository format using B+Tree indices."""
+
+    repository_class = BTreePackRepository
+
+    def get_format_string(self):
+        """See RepositoryFormat.get_format_string()."""
+        return ("Bazaar development format - btree "
+            "(needs bzr.dev from 1.6)\n")
+
+    def get_format_description(self):
+        """See RepositoryFormat.get_format_description()."""
+        return ("Development repository format - btree, interoperates with "
+            "pack-0.92\n")
+
+
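
One non-obvious piece in repofmt.py is the module-level monkey-patching:
Packer.open_pack is replaced so pack creation goes through the collection's
pack_factory attribute, NewPack is installed as that factory on the stock
RepositoryPackCollection, and BTreeRepositoryPackCollection overrides it with
BTreePack. A small sketch of the resulting dispatch; the asserts only restate
what the module sets up at import time:

    # Sketch of the dispatch the monkey-patch establishes: stock pack
    # collections keep NewPack as their factory, the BTree collection
    # uses BTreePack, and the patched Packer.open_pack defers to
    # whichever factory its collection carries.
    from bzrlib.repofmt.pack_repo import NewPack, RepositoryPackCollection
    from bzrlib.plugins.index2.repofmt import (
        BTreePack,
        BTreeRepositoryPackCollection,
        )

    assert RepositoryPackCollection.pack_factory is NewPack
    assert BTreeRepositoryPackCollection.pack_factory is BTreePack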



