Rev 3201: Start working on a new index serialiser. in http://people.ubuntu.com/~robertc/baz2.0/index.hashed
Robert Collins
robertc at robertcollins.net
Fri Jan 25 17:36:58 GMT 2008
At http://people.ubuntu.com/~robertc/baz2.0/index.hashed
------------------------------------------------------------
revno: 3201
revision-id:robertc at robertcollins.net-20080125173648-w4gxwn2gc9ofzi92
parent: pqm at pqm.ubuntu.com-20080124050323-gsgsp2em7v1ugtnz
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.hashed
timestamp: Sat 2008-01-26 04:36:48 +1100
message:
Start working on a new index serialiser.
modified:
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
doc/developers/indices.txt indices.txt-20070713142939-m5cdnp31u8ape0td-1
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2007-12-18 19:42:10 +0000
+++ b/bzrlib/index.py 2008-01-25 17:36:48 +0000
@@ -22,6 +22,7 @@
'GraphIndexBuilder',
'GraphIndexPrefixAdapter',
'InMemoryGraphIndex',
+ 'SortedGraphIndexBuilder',
]
from bisect import bisect_right
@@ -45,31 +46,19 @@
_OPTION_KEY_ELEMENTS = "key_elements="
_OPTION_LEN = "len="
_OPTION_NODE_REFS = "node_ref_lists="
-_SIGNATURE = "Bazaar Graph Index 1\n"
+_OPTION_HASH_BYTES = "hash_bytes="
+_OPTION_OFFSET_BYTES = "offset_bytes="
+_OPTION_NODE_OFFSET = "node_offset="
+_SIGNATURE_1 = "Bazaar Graph Index 1\n"
+_SIGNATURE_2 = "Bazaar Graph Index 2\n"
_whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')
_newline_null_re = re.compile('[\n\0]')
-class GraphIndexBuilder(object):
- """A builder that can build a GraphIndex.
-
- The resulting graph has the structure:
-
- _SIGNATURE OPTIONS NODES NEWLINE
- _SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
- OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
- NODES := NODE*
- NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
- KEY := Not-whitespace-utf8
- ABSENT := 'a'
- REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
- REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
- REFERENCE := DIGITS ; digits is the byte offset in the index of the
- ; referenced key.
- VALUE := no-newline-no-null-bytes
- """
+class _GraphIndexBuilder(object):
+ """Common logic for graph index building."""
def __init__(self, reference_lists=0, key_elements=1):
"""Create a GraphIndex builder.
@@ -94,6 +83,10 @@
if not element or _whitespace_re.search(element) is not None:
raise errors.BadIndexKey(element)
+ def _add_custom_headers(self, lines):
+ """Output custom headers for this index."""
+ raise NotImplementedError(self._add_custom_headers)
+
def add_node(self, key, value, references=()):
"""Add a node to the index.
@@ -136,10 +129,11 @@
key_dict[key[-1]] = key_value
def finish(self):
- lines = [_SIGNATURE]
+ lines = [self._SIGNATURE]
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
+ self._add_custom_headers(lines)
prefix_length = sum(len(x) for x in lines)
# references are byte offsets. To avoid having to do nasty
# polynomial work to resolve offsets (references to later in the
@@ -222,6 +216,77 @@
return StringIO(''.join(lines))
+class GraphIndexBuilder(_GraphIndexBuilder):
+ """A builder that can build a GraphIndex.
+
+ The resulting index has the structure:
+
+ _SIGNATURE OPTIONS NODES NEWLINE
+ _SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
+ OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
+ 'key_elements=' DIGITS NEWLINE
+ 'len=' DIGITS NEWLINE
+ NODES := NODE*
+ NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
+ KEY := Not-whitespace-utf8
+ ABSENT := 'a'
+ REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
+ REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
+ REFERENCE := DIGITS ; digits is the byte offset in the index of the
+ ; referenced key.
+ VALUE := no-newline-no-null-bytes
+ """
+
+ _SIGNATURE = _SIGNATURE_1
+
+ def _add_custom_headers(self, lines):
+ """Output custom headers for this index."""
+
+
+class SortedGraphIndexBuilder(_GraphIndexBuilder):
+ """A builder that can build a SortedGraphIndex.
+
+ The resulting index has the structure:
+ _SIGNATURE OPTIONS KEYHASH COLLISIONLIST NODES NEWLINE
+ _SIGNATURE := 'Bazaar Graph Index 2' NEWLINE
+ OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
+ 'key_elements=' DIGITS NEWLINE
+ 'len=' DIGITS NEWLINE
+ 'hash_bytes=' DIGITS NEWLINE
+ 'offset_bytes=' DIGITS NEWLINE
+ 'node_offset=' DIGITS NEWLINE
+ KEYHASH := An array of offset_byte length integers in network byte
+ order. Each entry in the array is the location of the key
+ that had that hash prefix in the file, and the array is
+ complete (that is, if hash_bytes=1, there are 256 entries
+ in the array). In pathological cases we may need to
+ handle collisions (so that we don't have many gigabytes
+ of index when two entries hash closely: in those cases
+ the location pointed at will be in the collision-handling
+ list. NEWLINE follows the keyhash for debugging ease.
+ COLLISIONLIST := COLLISION * (in lexographic order)
+ COLLISION := KEY NULL REFERENCE NEWLINE
+ NODES := NODE*
+ NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
+ KEY := Not-whitespace-utf8
+ ABSENT := 'a'
+ REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
+ REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
+ REFERENCE := DIGITS ; digits is the byte offset in the index of the
+ ; referenced key.
+ VALUE := no-newline-no-null-bytes
+ """
+
+ _SIGNATURE = _SIGNATURE_2
+
+ def _add_custom_headers(self, lines):
+ """Output custom headers for this index."""
+ lines.append(_OPTION_HASH_BYTES + str(0) + '\n')
+ lines.append(_OPTION_OFFSET_BYTES + str(0) + '\n')
+ lines.append(_OPTION_NODE_OFFSET + str(0) + '\n')
+ lines.append('\n')
+
+
class GraphIndex(object):
"""An index for data with embedded graphs.
@@ -965,7 +1030,7 @@
def _signature(self):
"""The file signature for this index type."""
- return _SIGNATURE
+ return _SIGNATURE_1
def validate(self):
"""Validate that everything in the index can be accessed."""
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2007-10-15 07:56:04 +0000
+++ b/bzrlib/tests/test_index.py 2008-01-25 17:36:48 +0000
@@ -22,15 +22,14 @@
from bzrlib.transport import get_transport
-class TestGraphIndexBuilder(TestCaseWithMemoryTransport):
+class BaseBuilderTests(object):
+ """Base tests for index building tests."""
def test_build_index_empty(self):
- builder = GraphIndexBuilder()
+ builder = self.make_builder()
stream = builder.finish()
contents = stream.read()
- self.assertEqual(
- "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=0\n\n",
- contents)
+ self.assertEqual(self.build_index_empty_bytes(), contents)
def test_build_index_empty_two_element_keys(self):
builder = GraphIndexBuilder(key_elements=2)
@@ -351,6 +350,38 @@
builder.add_node(('reference', 'tokey'), 'data', ([],))
+class TestGraphIndexBuilder(TestCaseWithMemoryTransport, BaseBuilderTests):
+
+ def make_builder(self):
+ return GraphIndexBuilder()
+
+ def build_index_empty_bytes(self):
+ return (
+ "Bazaar Graph Index 1\n"
+ "node_ref_lists=0\n"
+ "key_elements=1\n"
+ "len=0\n"
+ "\n")
+
+
+class TestSortedGraphIndexBuilder(TestCaseWithMemoryTransport, BaseBuilderTests):
+
+ def make_builder(self):
+ return SortedGraphIndexBuilder()
+
+ def build_index_empty_bytes(self):
+ return (
+ "Bazaar Graph Index 2\n"
+ "node_ref_lists=0\n"
+ "key_elements=1\n"
+ "len=0\n"
+ "hash_bytes=0\n"
+ "offset_bytes=0\n"
+ "node_offset=0\n"
+ "\n"
+ "\n")
+
+
class TestGraphIndex(TestCaseWithMemoryTransport):
def make_index(self, ref_lists=0, key_elements=1, nodes=[]):
=== modified file 'doc/developers/indices.txt'
--- a/doc/developers/indices.txt 2007-07-15 15:40:37 +0000
+++ b/doc/developers/indices.txt 2008-01-25 17:36:48 +0000
@@ -103,7 +103,13 @@
``finish`` to obtain a file stream containing the index data. Multiple
indices may be queried using the ``CombinedGraphIndex`` class.
+SortedGraphIndex
+----------------
+``SortedGraphIndex`` supports graph based lookups as ``GraphIndex`` does.
+By decoupling the disk sort order and key lookup a ``SortedGraphIndex``
+can sort the disk contents into topological order allowing
+mainly-forward-io and less round trips during index access.
..
More information about the bazaar-commits
mailing list