Rev 32: Prototype fixed memory implementation of GraphIndex using disk B+Trees. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

Robert Collins robertc at robertcollins.net
Sun Jul 13 21:34:34 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk

------------------------------------------------------------
revno: 32
revision-id: robertc at robertcollins.net-20080713203432-v3c9er9b7j8x6awy
parent: robertc at robertcollins.net-20080713173905-dxz6v9ke8iynar4l
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-07-14 06:34:32 +1000
message:
  Prototype fixed memory implementation of GraphIndex using disk B+Trees.
modified:
  btree_index.py                 index.py-20080624222253-p0x5f92uyh5hw734-7
  repofmt.py                     repofmt.py-20080701113732-m1iu3n94ikbxdelb-1
=== modified file 'btree_index.py'
--- a/btree_index.py	2008-07-13 17:39:05 +0000
+++ b/btree_index.py	2008-07-13 20:34:32 +0000
@@ -1307,6 +1307,103 @@
             pass
 
 
+class FixedMemoryGraphIndex(index.GraphIndex):
+    """A variant of GraphIndex which spools to disk during parsing.
+
+    This is useful for avoiding particularly high memory use which can occur
+    with very large GraphIndices. As it imposes a performance overhead due
+    to generating disk-based b+tree's, it is not suitable for general use.
+
+    Also, only iter_all_entries is implemented.
+    """
+
+    def __init__(self, transport, name, size):
+        index.GraphIndex.__init__(self, transport, name, size)
+        self._locs = None
+        self._key_details = None
+
+    def _resolve_locs(self, references):
+        """Return the resolved key references for references.
+        
+        References are resolved by looking up the location of the key in the
+        _keys_by_offset map and substituting the key name, preserving ordering.
+
+        :param references: An iterable of iterables of key locations. e.g. 
+            [[123, 456], [123]]
+        :return: A tuple of tuples of keys.
+        """
+        key_offsets = {}
+        needed_refs = set()
+        node_refs = []
+        for ref_list in references:
+            for ref in ref_list:
+                needed_refs.add(ref[:1])
+        for node in self._locs.iter_entries(needed_refs):
+            key_offsets[node[1][0]] = tuple(node[2].split(' '))
+        for ref_list in references:
+            node_refs.append(tuple([key_offsets[ref[0]] for ref in ref_list]))
+        return tuple(node_refs)
+
+    def iter_all_entries(self):
+        if self._locs is None:
+            self._scan_index()
+        if self.node_ref_lists:
+            for node in self._key_details.iter_all_entries():
+                refs = self._resolve_locs(node[3])
+                yield (self,) + node[1:3] + (refs,)
+        else:
+            for node in self._key_details.iter_all_entries():
+                yield (self,) + node[1:3]
+
+    def _scan_index(self):
+        stream = self._transport.get(self._name)
+        self._read_prefix(stream)
+        self._expected_elements = 3 + self._key_length
+        line_count = 0
+        self._locs = BTreeBuilder(key_elements=1, reference_lists=0)
+        self._key_details = BTreeBuilder(key_elements=self._key_length,
+            reference_lists=self.node_ref_lists)
+        # raw data keyed by offset
+        self._keys_by_offset = {}
+        # ready-to-return key:value or key:value, node_ref_lists
+        self._nodes = {}
+        self._nodes_by_key = {}
+        trailers = 0
+        pos = stream.tell()
+        lines = stream.read().split('\n')
+        del lines[-1]
+        key = None
+        first_key = None
+        trailers = 0
+        for line in lines:
+            if line == '':
+                # must be at the end
+                if self._size:
+                    if not (self._size == pos + 1):
+                        raise AssertionError("%s %s" % (self._size, pos))
+                trailers += 1
+                continue
+            elements = line.split('\0')
+            if len(elements) != self._expected_elements:
+                raise errors.BadIndexData(self)
+            # keys are tuples
+            key = tuple(elements[:self._key_length])
+            if first_key is None:
+                first_key = key
+            absent, references, value = elements[-3:]
+            ref_lists = []
+            for ref_string in references.split('\t'):
+                ref_lists.append(tuple([
+                    (str(int(ref)),) + ('X',) * (self._key_length - 1) for 
+                    ref in ref_string.split('\r') if ref]))
+            ref_lists = tuple(ref_lists)
+            self._locs.add_node((str(pos),), ' '.join(key))
+            pos += len(line) + 1 # +1 for the \n
+            if absent:
+                continue
+            self._key_details.add_node(key, value, ref_lists)
+
+
 try:
     from bzrlib.plugins.index2 import _parse_btree_c as _parse_btree
 except ImportError:

=== modified file 'repofmt.py'
--- a/repofmt.py	2008-07-13 11:51:55 +0000
+++ b/repofmt.py	2008-07-13 20:34:32 +0000
@@ -25,9 +25,10 @@
 import time
 
 from bzrlib import debug, errors, pack, repository
-from bzrlib.index import GraphIndexBuilder
+from bzrlib.index import GraphIndex, GraphIndexBuilder
 from bzrlib.inter import InterObject
-from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder
+from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder,
+    FixedMemoryGraphIndex
 from bzrlib.knit import (
     _KnitGraphIndex,
     KnitVersionedFiles,
@@ -404,6 +405,10 @@
         index_sizes = {}
         for index in indices:
             keys = index.key_count()
+            if len(keys) > 100000 and type(index) == GraphIndex:
+                # Avoid pathological memory scaling
+                index = FixedMemoryGraphIndex(index._transport, index._name,
+                    index._size)
             key_length = index._key_length
             ref_lists = index.node_ref_lists
             builder = BTreeBuilder(reference_lists=ref_lists, key_elements=key_length)




More information about the bazaar-commits mailing list