Rev 3666: Updates to GraphIndex processing. in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer

Fri Aug 29 18:13:33 BST 2008

At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer

------------------------------------------------------------
revno: 3666
revision-id: john at arbash-meinel.com-20080829171330-mrui49146o15ak98
parent: pqm at pqm.ubuntu.com-20080829062746-ny482m2f2pukdhqt
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: graph_index_autobuffer
timestamp: Fri 2008-08-29 12:13:30 -0500
message:
  Updates to GraphIndex processing.
  
  If _read_and_parse() ends up reading the whole index, pass that
  on to _buffer_all() rather than doing the normal bisect code.
  This happens a lot when reading indexes over remote transports
  which read a 64kB page at a time. The old code would end up
  reading the whole file, parsing it into bisect regions, and
  then reading the whole file again in the next call.
  
  This also includes refactoring of the graph index test suite
  to pull out a lot of redundant code into helper functions.
-------------- next part --------------
=== modified file 'bzrlib/index.py'

--- a/bzrlib/index.py	2008-08-14 19:22:59 +0000
+++ b/bzrlib/index.py	2008-08-29 17:13:30 +0000
@@ -288,14 +288,18 @@
         return "%s(%r)" % (self.__class__.__name__,
             self._transport.abspath(self._name))
 
-    def _buffer_all(self):
+    def _buffer_all(self, stream=None):
         """Buffer all the index data.
 
         Mutates self._nodes and self.keys_by_offset.
         """
+        if self._nodes is not None:
+            # We already did this
+            return
         if 'index' in debug.debug_flags:
             mutter('Reading entire index %s', self._transport.abspath(self._name))
-        stream = self._transport.get(self._name)
+        if stream is None:
+            stream = self._transport.get(self._name)
         self._read_prefix(stream)
         self._expected_elements = 3 + self._key_length
         line_count = 0
@@ -629,10 +633,24 @@
         if self._bisect_nodes is None:
             readv_ranges.append(_HEADER_READV)
         self._read_and_parse(readv_ranges)
+        result = []
+        if self._nodes is not None:
+            # _read_and_parse triggered a _buffer_all because we requested the
+            # whole data range
+            for location, key in location_keys:
+                if key not in self._nodes: # not present
+                    result.append(((location, key), False))
+                elif self.node_ref_lists:
+                    value, refs = self._nodes[key]
+                    result.append(((location, key),
+                        (self, key, value, refs)))
+                else:
+                    result.append(((location, key),
+                        (self, key, self._nodes[key])))
+            return result
         # generate results:
         #  - figure out <, >, missing, present
         #  - result present references so we can return them.
-        result = []
         # keys that we cannot answer until we resolve references
         pending_references = []
         pending_locations = set()
@@ -971,6 +989,17 @@
                 self._size)
             # parse
             for offset, data in readv_data:
+                if offset == 0 and len(data) == self._size:
+                    # We 'accidentally' read the whole range, go straight into
+                    # '_buffer_all'. This could happen because the transport
+                    # upcast our readv request, or because we actually need
+                    # most of the file.
+                    # TODO: This could have been triggered by
+                    # _lookup_keys_via_location, so we can't just return
+                    # unconditionally here, we need to still fill out the
+                    # _bisect_nodes list.
+                    self._buffer_all(StringIO(data))
+                    return
                 if self._bisect_nodes is None:
                     # this must be the start
                     if not (offset == 0):

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2007-10-15 07:56:04 +0000
+++ b/bzrlib/tests/test_index.py	2008-08-29 17:13:30 +0000
@@ -353,6 +353,20 @@
 
 class TestGraphIndex(TestCaseWithMemoryTransport):
 
+    def make_key(self, number):
+        return (str(number) + 'X'*100,)
+
+    def make_value(self, number):
+            return str(number) + 'Y'*100
+
+    def make_nodes(self, count=64):
+        # generate a big enough index that we only read some of it on a typical
+        # bisection lookup.
+        nodes = []
+        for counter in range(count):
+            nodes.append((self.make_key(counter), self.make_value(counter), ()))
+        return nodes
+
     def make_index(self, ref_lists=0, key_elements=1, nodes=[]):
         builder = GraphIndexBuilder(ref_lists, key_elements=key_elements)
         for key, value, references in nodes:
@@ -372,62 +386,80 @@
         self.assertEqual([], index._parsed_byte_map)
         self.assertEqual([], index._parsed_key_map)
 
+    def test_key_count_buffers(self):
+        index = self.make_index(nodes=self.make_nodes(2))
+        # reset the transport log
+        del index._transport._activity[:]
+        self.assertEqual(2, index.key_count())
+        # We should have requested reading the header bytes
+        self.assertEqual([
+            ('readv', 'index', [(0, 200)], True, index._size),
+            ],
+            index._transport._activity)
+        # And that should have been enough to trigger reading the whole index
+        # with buffering
+        self.assertIsNot(None, index._nodes)
+
+    def test_lookup_key_via_location_buffers(self):
+        index = self.make_index()
+        # reset the transport log
+        del index._transport._activity[:]
+        # do a _lookup_keys_via_location call for the middle of the file, which
+        # is what bisection uses.
+        result = index._lookup_keys_via_location(
+            [(index._size // 2, ('missing', ))])
+        # this should have asked for a readv request, with adjust_for_latency,
+        # and two regions: the header, and half-way into the file.
+        self.assertEqual([
+            ('readv', 'index', [(30, 30), (0, 200)], True, 60),
+            ],
+            index._transport._activity)
+        # and the result should be that the key cannot be present, because this
+        # is a trivial index.
+        self.assertEqual([((index._size // 2, ('missing', )), False)],
+            result)
+        # And this should have caused the file to be fully buffered
+        self.assertIsNot(None, index._nodes)
+        self.assertEqual([], index._parsed_byte_map)
+
     def test_first_lookup_key_via_location(self):
-        index = self.make_index()
+        # We need enough data so that the _HEADER_READV doesn't consume the
+        # whole file. We always read 800 bytes for every key, and the local
+        # transport natural expansion is 4096 bytes. So we have to have >8192
+        # bytes or we will trigger "buffer_all".
+        # We also want the 'missing' key to fall within the range that *did*
+        # read
+        nodes = []
+        index = self.make_index(nodes=self.make_nodes(64))
         # reset the transport log
         del index._transport._activity[:]
         # do a _lookup_keys_via_location call for the middle of the file, which
         # is what bisection uses.
+        start_lookup = index._size // 2
         result = index._lookup_keys_via_location(
-            [(index._size // 2, ('missing', ))])
+            [(start_lookup, ('40missing', ))])
         # this should have asked for a readv request, with adjust_for_latency,
         # and two regions: the header, and half-way into the file.
         self.assertEqual([
-            ('readv', 'index', [(30, 30), (0, 200)], True, 60),
+            ('readv', 'index',
+             [(start_lookup, 800), (0, 200)], True, index._size),
             ],
             index._transport._activity)
         # and the result should be that the key cannot be present, because this
         # is a trivial index.
-        self.assertEqual([((index._size // 2, ('missing', )), False)],
-            result)
-        # And the regions of the file that have been parsed - in this case the
-        # entire file - should be in the parsed region map.
-        self.assertEqual([(0, 60)], index._parsed_byte_map)
-        self.assertEqual([(None, None)], index._parsed_key_map)
-
-    def test_parsing_parses_data_adjacent_to_parsed_regions(self):
-        # we trim data we recieve to remove the first and trailing
-        # partial lines, except when they start at the end/finish at the start
-        # of a region we've alread parsed/ the end of the file. The trivial
-        # test for this is an index with 1 key.
-        index = self.make_index(nodes=[(('name', ), 'data', ())])
-        # reset the transport log
-        del index._transport._activity[:]
-        result = index._lookup_keys_via_location(
-            [(index._size // 2, ('missing', ))])
-        # this should have asked for a readv request, with adjust_for_latency,
-        # and two regions: the header, and half-way into the file.
-        self.assertEqual([
-            ('readv', 'index', [(36, 36), (0, 200)], True, 72),
-            ],
-            index._transport._activity)
-        # and the result should be that the key cannot be present, because this
-        # is a trivial index and we should not have to do more round trips.
-        self.assertEqual([((index._size // 2, ('missing', )), False)],
-            result)
-        # The whole file should be parsed at this point.
-        self.assertEqual([(0, 72)], index._parsed_byte_map)
-        self.assertEqual([(None, ('name',))], index._parsed_key_map)
+        self.assertEqual([((start_lookup, ('40missing', )), False)],
+            result)
+        # And this should not have caused the file to be fully buffered
+        self.assertIs(None, index._nodes)
+        # And the regions of the file that have been parsed should be in the
+        # parsed_byte_map and the parsed_key_map
+        self.assertEqual([(0, 4008), (5046, 8996)], index._parsed_byte_map)
+        self.assertEqual([(None, self.make_key(26)),
+                          (self.make_key(31), self.make_key(48))],
+                         index._parsed_key_map)
 
     def test_parsing_non_adjacent_data_trims(self):
-        # generate a big enough index that we only read some of it on a typical
-        # bisection lookup.
-        nodes = []
-        def make_key(number):
-            return (str(number) + 'X'*100,)
-        for counter in range(64):
-            nodes.append((make_key(counter), 'Y'*100, ()))
-        index = self.make_index(nodes=nodes)
+        index = self.make_index(nodes=self.make_nodes(64))
         result = index._lookup_keys_via_location(
             [(index._size // 2, ('40', ))])
         # and the result should be that the key cannot be present, because key is
@@ -437,8 +469,9 @@
             result)
         # and we should have a parse map that includes the header and the
         # region that was parsed after trimming.
-        self.assertEqual([(0, 3972), (5001, 8914)], index._parsed_byte_map)
-        self.assertEqual([(None, make_key(26)), (make_key(31), make_key(48))],
+        self.assertEqual([(0, 4008), (5046, 8996)], index._parsed_byte_map)
+        self.assertEqual([(None, self.make_key(26)),
+                          (self.make_key(31), self.make_key(48))],
             index._parsed_key_map)
 
     def test_parsing_data_handles_parsed_contained_regions(self):
@@ -448,53 +481,45 @@
         # which then trims the start and end so the parsed size is < readv
         # miniumum.
         # then a dual lookup (or a reference lookup for that matter) which
-        # abuts or overlaps the parsed region on both sides will need to 
+        # abuts or overlaps the parsed region on both sides will need to
         # discard the data in the middle, but parse the end as well.
         #
-        # we test this by doing a single lookup to seed the data, then 
-        # a lookup for two keys that are present, and adjacent - 
+        # we test this by doing a single lookup to seed the data, then
+        # a lookup for two keys that are present, and adjacent -
         # we except both to be found, and the parsed byte map to include the
         # locations of both keys.
-        nodes = []
-        def make_key(number):
-            return (str(number) + 'X'*100,)
-        def make_value(number):
-            return 'Y'*100
-        for counter in range(128):
-            nodes.append((make_key(counter), make_value(counter), ()))
-        index = self.make_index(nodes=nodes)
+        index = self.make_index(nodes=self.make_nodes(128))
         result = index._lookup_keys_via_location(
             [(index._size // 2, ('40', ))])
         # and we should have a parse map that includes the header and the
         # region that was parsed after trimming.
-        self.assertEqual([(0, 3991), (11622, 15534)], index._parsed_byte_map)
-        self.assertEqual([(None, make_key(116)), (make_key(35), make_key(51))],
+        self.assertEqual([(0, 4045), (11759, 15707)], index._parsed_byte_map)
+        self.assertEqual([(None, self.make_key(116)),
+                          (self.make_key(35), self.make_key(51))],
             index._parsed_key_map)
         # now ask for two keys, right before and after the parsed region
         result = index._lookup_keys_via_location(
-            [(11450, make_key(34)), (15534, make_key(52))])
+            [(11450, self.make_key(34)), (15707, self.make_key(52))])
         self.assertEqual([
-            ((11450, make_key(34)), (index, make_key(34), make_value(34))),
-            ((15534, make_key(52)), (index, make_key(52), make_value(52))),
+            ((11450, self.make_key(34)),
+             (index, self.make_key(34), self.make_value(34))),
+            ((15707, self.make_key(52)),
+             (index, self.make_key(52), self.make_value(52))),
             ],
             result)
-        self.assertEqual([(0, 3991), (9975, 17799)], index._parsed_byte_map)
+        self.assertEqual([(0, 4045), (9889, 17993)], index._parsed_byte_map)
 
     def test_lookup_missing_key_answers_without_io_when_map_permits(self):
         # generate a big enough index that we only read some of it on a typical
         # bisection lookup.
-        nodes = []
-        def make_key(number):
-            return (str(number) + 'X'*100,)
-        for counter in range(64):
-            nodes.append((make_key(counter), 'Y'*100, ()))
-        index = self.make_index(nodes=nodes)
+        index = self.make_index(nodes=self.make_nodes(64))
         # lookup the keys in the middle of the file
         result =index._lookup_keys_via_location(
             [(index._size // 2, ('40', ))])
         # check the parse map, this determines the test validity
-        self.assertEqual([(0, 3972), (5001, 8914)], index._parsed_byte_map)
-        self.assertEqual([(None, make_key(26)), (make_key(31), make_key(48))],
+        self.assertEqual([(0, 4008), (5046, 8996)], index._parsed_byte_map)
+        self.assertEqual([(None, self.make_key(26)),
+                          (self.make_key(31), self.make_key(48))],
             index._parsed_key_map)
         # reset the transport log
         del index._transport._activity[:]
@@ -502,7 +527,6 @@
         # not create a new transport request, and should return False (cannot
         # be in the index) - even when the byte location we ask for is outside
         # the parsed region
-        # 
         result = index._lookup_keys_via_location(
             [(4000, ('40', ))])
         self.assertEqual([((4000, ('40', )), False)],
@@ -512,20 +536,14 @@
     def test_lookup_present_key_answers_without_io_when_map_permits(self):
         # generate a big enough index that we only read some of it on a typical
         # bisection lookup.
-        nodes = []
-        def make_key(number):
-            return (str(number) + 'X'*100,)
-        def make_value(number):
-            return str(number) + 'Y'*100
-        for counter in range(64):
-            nodes.append((make_key(counter), make_value(counter), ()))
-        index = self.make_index(nodes=nodes)
+        index = self.make_index(nodes=self.make_nodes(64))
         # lookup the keys in the middle of the file
         result =index._lookup_keys_via_location(
             [(index._size // 2, ('40', ))])
         # check the parse map, this determines the test validity
         self.assertEqual([(0, 4008), (5046, 8996)], index._parsed_byte_map)
-        self.assertEqual([(None, make_key(26)), (make_key(31), make_key(48))],
+        self.assertEqual([(None, self.make_key(26)),
+                          (self.make_key(31), self.make_key(48))],
             index._parsed_key_map)
         # reset the transport log
         del index._transport._activity[:]
@@ -534,28 +552,25 @@
         # be in the index) - even when the byte location we ask for is outside
         # the parsed region
         # 
-        result = index._lookup_keys_via_location([(4000, make_key(40))])
+        result = index._lookup_keys_via_location([(4000, self.make_key(40))])
         self.assertEqual(
-            [((4000, make_key(40)), (index, make_key(40), make_value(40)))],
+            [((4000, self.make_key(40)),
+              (index, self.make_key(40), self.make_value(40)))],
             result)
         self.assertEqual([], index._transport._activity)
 
     def test_lookup_key_below_probed_area(self):
         # generate a big enough index that we only read some of it on a typical
         # bisection lookup.
-        nodes = []
-        def make_key(number):
-            return (str(number) + 'X'*100,)
-        for counter in range(64):
-            nodes.append((make_key(counter), 'Y'*100, ()))
-        index = self.make_index(nodes=nodes)
+        index = self.make_index(nodes=self.make_nodes(64))
         # ask for the key in the middle, but a key that is located in the
         # unparsed region before the middle.
         result =index._lookup_keys_via_location(
             [(index._size // 2, ('30', ))])
         # check the parse map, this determines the test validity
-        self.assertEqual([(0, 3972), (5001, 8914)], index._parsed_byte_map)
-        self.assertEqual([(None, make_key(26)), (make_key(31), make_key(48))],
+        self.assertEqual([(0, 4008), (5046, 8996)], index._parsed_byte_map)
+        self.assertEqual([(None, self.make_key(26)),
+                          (self.make_key(31), self.make_key(48))],
             index._parsed_key_map)
         self.assertEqual([((index._size // 2, ('30', )), -1)],
             result)
@@ -563,19 +578,15 @@
     def test_lookup_key_above_probed_area(self):
         # generate a big enough index that we only read some of it on a typical
         # bisection lookup.
-        nodes = []
-        def make_key(number):
-            return (str(number) + 'X'*100,)
-        for counter in range(64):
-            nodes.append((make_key(counter), 'Y'*100, ()))
-        index = self.make_index(nodes=nodes)
+        index = self.make_index(nodes=self.make_nodes(64))
         # ask for the key in the middle, but a key that is located in the
         # unparsed region after the middle.
         result =index._lookup_keys_via_location(
             [(index._size // 2, ('50', ))])
         # check the parse map, this determines the test validity
-        self.assertEqual([(0, 3972), (5001, 8914)], index._parsed_byte_map)
-        self.assertEqual([(None, make_key(26)), (make_key(31), make_key(48))],
+        self.assertEqual([(0, 4008), (5046, 8996)], index._parsed_byte_map)
+        self.assertEqual([(None, self.make_key(26)),
+                          (self.make_key(31), self.make_key(48))],
             index._parsed_key_map)
         self.assertEqual([((index._size // 2, ('50', )), +1)],
             result)
@@ -584,13 +595,9 @@
         # generate a big enough index that we only read some of it on a typical
         # bisection lookup.
         nodes = []
-        def make_key(number):
-            return (str(number) + 'X'*100,)
-        def make_value(number):
-            return str(number) + 'Y'*100
         for counter in range(64):
-            nodes.append((make_key(counter), make_value(counter),
-                ((make_key(counter + 20),),)  ))
+            nodes.append((self.make_key(counter), self.make_value(counter),
+                ((self.make_key(counter + 20),),)  ))
         index = self.make_index(ref_lists=1, nodes=nodes)
         # lookup a key in the middle that does not exist, so that when we can
         # check that the referred-to-keys are not accessed automatically.
@@ -599,7 +606,8 @@
         # check the parse map - only the start and middle should have been
         # parsed.
         self.assertEqual([(0, 3890), (6444, 10274)], index._parsed_byte_map)
-        self.assertEqual([(None, make_key(25)), (make_key(37), make_key(52))],
+        self.assertEqual([(None, self.make_key(25)),
+                          (self.make_key(37), self.make_key(52))],
             index._parsed_key_map)
         # and check the transport activity likewise.
         self.assertEqual(
@@ -609,10 +617,11 @@
         del index._transport._activity[:]
         # now looking up a key in the portion of the file already parsed should
         # only perform IO to resolve its key references.
-        result = index._lookup_keys_via_location([(4000, make_key(40))])
+        result = index._lookup_keys_via_location([(4000, self.make_key(40))])
         self.assertEqual(
-            [((4000, make_key(40)),
-              (index, make_key(40), make_value(40), ((make_key(60),),)))],
+            [((4000, self.make_key(40)),
+              (index, self.make_key(40), self.make_value(40),
+               ((self.make_key(60),),)))],
             result)
         self.assertEqual([('readv', 'index', [(11976, 800)], True, 15813)],
             index._transport._activity)
@@ -640,6 +649,23 @@
             (index, ('ref', ), 'refdata', ((), ))]),
             set(index.iter_all_entries()))
 
+    def test_iter_entries_buffers_once(self):
+        index = self.make_index(nodes=self.make_nodes(2))
+        # reset the transport log
+        del index._transport._activity[:]
+        self.assertEqual(set([(index, self.make_key(1), self.make_value(1))]),
+                         set(index.iter_entries([self.make_key(1)])))
+        # We should have requested reading the header bytes
+        # But not needed any more than that because it would have triggered a
+        # buffer all
+        self.assertEqual([
+            ('readv', 'index', [(0, 200)], True, index._size),
+            ],
+            index._transport._activity)
+        # And that should have been enough to trigger reading the whole index
+        # with buffering
+        self.assertIsNot(None, index._nodes)
+
     def test_iter_entries_references_resolved(self):
         index = self.make_index(1, nodes=[
             (('name', ), 'data', ([('ref', ), ('ref', )], )),
@@ -763,6 +789,16 @@
             (('name', ), '', ()), (('foo', ), '', ())])
         self.assertEqual(2, index.key_count())
 
+    def test_readv_all_triggers_buffer_all(self):
+        index = self.make_index(key_elements=2, nodes=[
+            (('name', 'fin1'), 'data', ()),
+            (('name', 'fin2'), 'beta', ()),
+            (('ref', 'erence'), 'refdata', ())])
+        self.assertTrue(index._size > 0)
+        self.assertIs(None, index._nodes)
+        index._read_and_parse([(0, index._size)])
+        self.assertIsNot(None, index._nodes)
+
     def test_validate_bad_index_errors(self):
         trans = self.get_transport()
         trans.put_bytes('name', "not an index\n")