Rev 32: Prototype fixed memory implementation of GraphIndex using disk B+Trees. in http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
Robert Collins
robertc at robertcollins.net
Sun Jul 13 21:34:34 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/index2/trunk
------------------------------------------------------------
revno: 32
revision-id: robertc at robertcollins.net-20080713203432-v3c9er9b7j8x6awy
parent: robertc at robertcollins.net-20080713173905-dxz6v9ke8iynar4l
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Mon 2008-07-14 06:34:32 +1000
message:
Prototype fixed memory implementation of GraphIndex using disk B+Trees.
modified:
btree_index.py index.py-20080624222253-p0x5f92uyh5hw734-7
repofmt.py repofmt.py-20080701113732-m1iu3n94ikbxdelb-1
=== modified file 'btree_index.py'
--- a/btree_index.py 2008-07-13 17:39:05 +0000
+++ b/btree_index.py 2008-07-13 20:34:32 +0000
@@ -1307,6 +1307,103 @@
pass
+class FixedMemoryGraphIndex(index.GraphIndex):
+ """A variant of GraphIndex which spools to disk during parsing.
+
+ This is useful for avoiding particularly high memory use which can occur
+ with very large GraphIndices. As it imposes a performance overhead due
+ to generating disk-based b+tree's, it is not suitable for general use.
+
+ Also, only iter_all_entries is implemented.
+ """
+
+ def __init__(self, transport, name, size):
+ index.GraphIndex.__init__(self, transport, name, size)
+ self._locs = None
+ self._key_details = None
+
+ def _resolve_locs(self, references):
+ """Return the resolved key references for references.
+
+ References are resolved by looking up the location of the key in the
+ _keys_by_offset map and substituting the key name, preserving ordering.
+
+ :param references: An iterable of iterables of key locations. e.g.
+ [[123, 456], [123]]
+ :return: A tuple of tuples of keys.
+ """
+ key_offsets = {}
+ needed_refs = set()
+ node_refs = []
+ for ref_list in references:
+ for ref in ref_list:
+ needed_refs.add(ref[:1])
+ for node in self._locs.iter_entries(needed_refs):
+ key_offsets[node[1][0]] = tuple(node[2].split(' '))
+ for ref_list in references:
+ node_refs.append(tuple([key_offsets[ref[0]] for ref in ref_list]))
+ return tuple(node_refs)
+
+ def iter_all_entries(self):
+ if self._locs is None:
+ self._scan_index()
+ if self.node_ref_lists:
+ for node in self._key_details.iter_all_entries():
+ refs = self._resolve_locs(node[3])
+ yield (self,) + node[1:3] + (refs,)
+ else:
+ for node in self._key_details.iter_all_entries():
+ yield (self,) + node[1:3]
+
+ def _scan_index(self):
+ stream = self._transport.get(self._name)
+ self._read_prefix(stream)
+ self._expected_elements = 3 + self._key_length
+ line_count = 0
+ self._locs = BTreeBuilder(key_elements=1, reference_lists=0)
+ self._key_details = BTreeBuilder(key_elements=self._key_length,
+ reference_lists=self.node_ref_lists)
+ # raw data keyed by offset
+ self._keys_by_offset = {}
+ # ready-to-return key:value or key:value, node_ref_lists
+ self._nodes = {}
+ self._nodes_by_key = {}
+ trailers = 0
+ pos = stream.tell()
+ lines = stream.read().split('\n')
+ del lines[-1]
+ key = None
+ first_key = None
+ trailers = 0
+ for line in lines:
+ if line == '':
+ # must be at the end
+ if self._size:
+ if not (self._size == pos + 1):
+ raise AssertionError("%s %s" % (self._size, pos))
+ trailers += 1
+ continue
+ elements = line.split('\0')
+ if len(elements) != self._expected_elements:
+ raise errors.BadIndexData(self)
+ # keys are tuples
+ key = tuple(elements[:self._key_length])
+ if first_key is None:
+ first_key = key
+ absent, references, value = elements[-3:]
+ ref_lists = []
+ for ref_string in references.split('\t'):
+ ref_lists.append(tuple([
+ (str(int(ref)),) + ('X',) * (self._key_length - 1) for
+ ref in ref_string.split('\r') if ref]))
+ ref_lists = tuple(ref_lists)
+ self._locs.add_node((str(pos),), ' '.join(key))
+ pos += len(line) + 1 # +1 for the \n
+ if absent:
+ continue
+ self._key_details.add_node(key, value, ref_lists)
+
+
try:
from bzrlib.plugins.index2 import _parse_btree_c as _parse_btree
except ImportError:
=== modified file 'repofmt.py'
--- a/repofmt.py 2008-07-13 11:51:55 +0000
+++ b/repofmt.py 2008-07-13 20:34:32 +0000
@@ -25,9 +25,10 @@
import time
from bzrlib import debug, errors, pack, repository
-from bzrlib.index import GraphIndexBuilder
+from bzrlib.index import GraphIndex, GraphIndexBuilder
from bzrlib.inter import InterObject
-from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder
+from bzrlib.plugins.index2.btree_index import BTreeGraphIndex, BTreeBuilder,
+ FixedMemoryGraphIndex
from bzrlib.knit import (
_KnitGraphIndex,
KnitVersionedFiles,
@@ -404,6 +405,10 @@
index_sizes = {}
for index in indices:
keys = index.key_count()
+ if len(keys) > 100000 and type(index) == GraphIndex:
+ # Avoid pathological memory scaling
+ index = FixedMemoryGraphIndex(index._transport, index._name,
+ index._size)
key_length = index._key_length
ref_lists = index.node_ref_lists
builder = BTreeBuilder(reference_lists=ref_lists, key_elements=key_length)
More information about the bazaar-commits
mailing list