Rev 3201: Start working on a new index serialiser. in http://people.ubuntu.com/~robertc/baz2.0/index.hashed

Robert Collins robertc at robertcollins.net
Fri Jan 25 17:36:58 GMT 2008


At http://people.ubuntu.com/~robertc/baz2.0/index.hashed

------------------------------------------------------------
revno: 3201
revision-id:robertc at robertcollins.net-20080125173648-w4gxwn2gc9ofzi92
parent: pqm at pqm.ubuntu.com-20080124050323-gsgsp2em7v1ugtnz
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index.hashed
timestamp: Sat 2008-01-26 04:36:48 +1100
message:
  Start working on a new index serialiser.
modified:
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
  doc/developers/indices.txt     indices.txt-20070713142939-m5cdnp31u8ape0td-1
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2007-12-18 19:42:10 +0000
+++ b/bzrlib/index.py	2008-01-25 17:36:48 +0000
@@ -22,6 +22,7 @@
     'GraphIndexBuilder',
     'GraphIndexPrefixAdapter',
     'InMemoryGraphIndex',
+    'SortedGraphIndexBuilder',
     ]
 
 from bisect import bisect_right
@@ -45,31 +46,19 @@
 _OPTION_KEY_ELEMENTS = "key_elements="
 _OPTION_LEN = "len="
 _OPTION_NODE_REFS = "node_ref_lists="
-_SIGNATURE = "Bazaar Graph Index 1\n"
+_OPTION_HASH_BYTES = "hash_bytes="
+_OPTION_OFFSET_BYTES = "offset_bytes="
+_OPTION_NODE_OFFSET = "node_offset="
+_SIGNATURE_1 = "Bazaar Graph Index 1\n"
+_SIGNATURE_2 = "Bazaar Graph Index 2\n"
 
 
 _whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')
 _newline_null_re = re.compile('[\n\0]')
 
 
-class GraphIndexBuilder(object):
-    """A builder that can build a GraphIndex.
-    
-    The resulting graph has the structure:
-    
-    _SIGNATURE OPTIONS NODES NEWLINE
-    _SIGNATURE     := 'Bazaar Graph Index 1' NEWLINE
-    OPTIONS        := 'node_ref_lists=' DIGITS NEWLINE
-    NODES          := NODE*
-    NODE           := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
-    KEY            := Not-whitespace-utf8
-    ABSENT         := 'a'
-    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
-    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
-    REFERENCE      := DIGITS  ; digits is the byte offset in the index of the
-                              ; referenced key.
-    VALUE          := no-newline-no-null-bytes
-    """
+class _GraphIndexBuilder(object):
+    """Common logic for graph index building."""
 
     def __init__(self, reference_lists=0, key_elements=1):
         """Create a GraphIndex builder.
@@ -94,6 +83,10 @@
             if not element or _whitespace_re.search(element) is not None:
                 raise errors.BadIndexKey(element)
 
+    def _add_custom_headers(self, lines):
+        """Output custom headers for this index."""
+        raise NotImplementedError(self._add_custom_headers)
+
     def add_node(self, key, value, references=()):
         """Add a node to the index.
 
@@ -136,10 +129,11 @@
             key_dict[key[-1]] = key_value
 
     def finish(self):
-        lines = [_SIGNATURE]
+        lines = [self._SIGNATURE]
         lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
         lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
         lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
+        self._add_custom_headers(lines)
         prefix_length = sum(len(x) for x in lines)
         # references are byte offsets. To avoid having to do nasty
         # polynomial work to resolve offsets (references to later in the 
@@ -222,6 +216,77 @@
         return StringIO(''.join(lines))
 
 
+class GraphIndexBuilder(_GraphIndexBuilder):
+    """A builder that can build a GraphIndex.
+    
+    The resulting index has the structure:
+    
+    _SIGNATURE OPTIONS NODES NEWLINE
+    _SIGNATURE     := 'Bazaar Graph Index 1' NEWLINE
+    OPTIONS        := 'node_ref_lists=' DIGITS NEWLINE
+                      'key_elements=' DIGITS NEWLINE
+                      'len=' DIGITS NEWLINE
+    NODES          := NODE*
+    NODE           := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
+    KEY            := Not-whitespace-utf8
+    ABSENT         := 'a'
+    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
+    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
+    REFERENCE      := DIGITS  ; digits is the byte offset in the index of the
+                              ; referenced key.
+    VALUE          := no-newline-no-null-bytes
+    """
+
+    _SIGNATURE = _SIGNATURE_1
+
+    def _add_custom_headers(self, lines):
+        """Output custom headers for this index."""
+
+
+class SortedGraphIndexBuilder(_GraphIndexBuilder):
+    """A builder that can build a SortedGraphIndex.
+
+    The resulting index has the structure:
+    _SIGNATURE OPTIONS KEYHASH COLLISIONLIST NODES NEWLINE
+    _SIGNATURE     := 'Bazaar Graph Index 2' NEWLINE
+    OPTIONS        := 'node_ref_lists=' DIGITS NEWLINE
+                      'key_elements=' DIGITS NEWLINE
+                      'len=' DIGITS NEWLINE
+                      'hash_bytes=' DIGITS NEWLINE
+                      'offset_bytes=' DIGITS NEWLINE
+                      'node_offset=' DIGITS NEWLINE
+    KEYHASH        := An array of offset_byte length integers in network byte
+                      order. Each entry in the array is the location of the key
+                      that had that hash prefix in the file, and the array is
+                      complete (that is, if hash_bytes=1, there are 256 entries
+                      in the array). In pathological cases we may need to
+                      handle collisions (so that we don't have many gigabytes
+                      of index when two entries hash closely: in those cases
+                      the location pointed at will be in the collision-handling
+                      list. NEWLINE follows the keyhash for debugging ease.
+    COLLISIONLIST  := COLLISION * (in lexographic order)
+    COLLISION      := KEY NULL REFERENCE NEWLINE
+    NODES          := NODE*
+    NODE           := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
+    KEY            := Not-whitespace-utf8
+    ABSENT         := 'a'
+    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
+    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
+    REFERENCE      := DIGITS  ; digits is the byte offset in the index of the
+                              ; referenced key.
+    VALUE          := no-newline-no-null-bytes
+    """
+
+    _SIGNATURE = _SIGNATURE_2
+
+    def _add_custom_headers(self, lines):
+        """Output custom headers for this index."""
+        lines.append(_OPTION_HASH_BYTES + str(0) + '\n')
+        lines.append(_OPTION_OFFSET_BYTES + str(0) + '\n')
+        lines.append(_OPTION_NODE_OFFSET + str(0) + '\n')
+        lines.append('\n')
+
+
 class GraphIndex(object):
     """An index for data with embedded graphs.
  
@@ -965,7 +1030,7 @@
 
     def _signature(self):
         """The file signature for this index type."""
-        return _SIGNATURE
+        return _SIGNATURE_1
 
     def validate(self):
         """Validate that everything in the index can be accessed."""

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2007-10-15 07:56:04 +0000
+++ b/bzrlib/tests/test_index.py	2008-01-25 17:36:48 +0000
@@ -22,15 +22,14 @@
 from bzrlib.transport import get_transport
 
 
-class TestGraphIndexBuilder(TestCaseWithMemoryTransport):
+class BaseBuilderTests(object):
+    """Base tests for index building tests."""
 
     def test_build_index_empty(self):
-        builder = GraphIndexBuilder()
+        builder = self.make_builder()
         stream = builder.finish()
         contents = stream.read()
-        self.assertEqual(
-            "Bazaar Graph Index 1\nnode_ref_lists=0\nkey_elements=1\nlen=0\n\n",
-            contents)
+        self.assertEqual(self.build_index_empty_bytes(), contents)
 
     def test_build_index_empty_two_element_keys(self):
         builder = GraphIndexBuilder(key_elements=2)
@@ -351,6 +350,38 @@
         builder.add_node(('reference', 'tokey'), 'data', ([],))
 
 
+class TestGraphIndexBuilder(TestCaseWithMemoryTransport, BaseBuilderTests):
+
+    def make_builder(self):
+        return GraphIndexBuilder()
+
+    def build_index_empty_bytes(self):
+        return (
+            "Bazaar Graph Index 1\n"
+            "node_ref_lists=0\n"
+            "key_elements=1\n"
+            "len=0\n"
+            "\n")
+
+
+class TestSortedGraphIndexBuilder(TestCaseWithMemoryTransport, BaseBuilderTests):
+
+    def make_builder(self):
+        return SortedGraphIndexBuilder()
+
+    def build_index_empty_bytes(self):
+        return (
+            "Bazaar Graph Index 2\n"
+            "node_ref_lists=0\n"
+            "key_elements=1\n"
+            "len=0\n"
+            "hash_bytes=0\n"
+            "offset_bytes=0\n"
+            "node_offset=0\n"
+            "\n"
+            "\n")
+
+
 class TestGraphIndex(TestCaseWithMemoryTransport):
 
     def make_index(self, ref_lists=0, key_elements=1, nodes=[]):

=== modified file 'doc/developers/indices.txt'
--- a/doc/developers/indices.txt	2007-07-15 15:40:37 +0000
+++ b/doc/developers/indices.txt	2008-01-25 17:36:48 +0000
@@ -103,7 +103,13 @@
 ``finish`` to obtain a file stream containing the index data. Multiple
 indices may be queried using the ``CombinedGraphIndex`` class.
 
+SortedGraphIndex
+----------------
 
+``SortedGraphIndex`` supports graph based lookups as ``GraphIndex`` does.
+By decoupling the disk sort order and key lookup a ``SortedGraphIndex``
+can sort the disk contents into topological order allowing
+mainly-forward-io and less round trips during index access.
 
 
 ..



More information about the bazaar-commits mailing list