Rev 2667: Merge index improvements. in http://people.ubuntu.com/~robertc/baz2.0/repository

Robert Collins robertc at robertcollins.net
Wed Jul 18 07:16:24 BST 2007


At http://people.ubuntu.com/~robertc/baz2.0/repository

------------------------------------------------------------
revno: 2667
revision-id: robertc at robertcollins.net-20070718061621-xz2w9x9zj5cwwnnk
parent: robertc at robertcollins.net-20070718060739-2l2y7bjd1rf121fo
parent: robertc at robertcollins.net-20070718061559-36wb5mc52pp26qqq
committer: Robert Collins <robertc at robertcollins.net>
branch nick: repository
timestamp: Wed 2007-07-18 16:16:21 +1000
message:
  Merge index improvements.
modified:
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
    ------------------------------------------------------------
    revno: 2592.1.25.2.10
    revision-id: robertc at robertcollins.net-20070718061559-36wb5mc52pp26qqq
    parent: robertc at robertcollins.net-20070718060649-k1c0mh6bmra497n2
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: index
    timestamp: Wed 2007-07-18 16:15:59 +1000
    message:
      Make GraphIndex.iter_entries do hash lookups rather than table scans.
    modified:
      bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
    ------------------------------------------------------------
    revno: 2592.1.25.2.9
    revision-id: robertc at robertcollins.net-20070718060649-k1c0mh6bmra497n2
    parent: robertc at robertcollins.net-20070718044624-kb7pmne2pd96ekum
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: index
    timestamp: Wed 2007-07-18 16:06:49 +1000
    message:
      Temporary performance hack for GraphIndex : load the entire index once and only once into ram.
    modified:
      bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
    ------------------------------------------------------------
    revno: 2592.1.25.2.8
    revision-id: robertc at robertcollins.net-20070718044624-kb7pmne2pd96ekum
    parent: pqm at pqm.ubuntu.com-20070717110203-zzmtp28nunhsoz12
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: index
    timestamp: Wed 2007-07-18 14:46:24 +1000
    message:
      InMemoryGraphIndex.add_nodes was inconsistent with other metods for non-node-reference indices.
    modified:
      bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
      bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2007-07-18 04:31:29 +0000
+++ b/bzrlib/index.py	2007-07-18 06:16:21 +0000
@@ -196,20 +196,21 @@
         """
         self._transport = transport
         self._name = name
-
-    def iter_all_entries(self):
-        """Iterate over all keys within the index.
-
-        :return: An iterable of (key, value) or (key, value, reference_lists).
-            The former tuple is used when there are no reference lists in the
-            index, making the API compatible with simple key:value index types.
-            There is no defined order for the result iteration - it will be in
-            the most efficient order for the index.
+        self._nodes = None
+        self._keys_by_offset = None
+
+    def _buffer_all(self):
+        """Buffer all the index data.
+
+        Mutates self._nodes and self.keys_by_offset.
         """
         stream = self._transport.get(self._name)
         self._read_prefix(stream)
         line_count = 0
-        self.keys_by_offset = {}
+        # raw data keyed by offset
+        self._keys_by_offset = {}
+        # ready-to-return key:value or key:value, node_ref_lists
+        self._nodes = {}
         trailers = 0
         pos = stream.tell()
         for line in stream.readlines():
@@ -224,23 +225,41 @@
                     int(ref) for ref in ref_string.split('\r') if ref
                     ]))
             ref_lists = tuple(ref_lists)
-            self.keys_by_offset[pos] = (key, absent, ref_lists, value)
+            self._keys_by_offset[pos] = (key, absent, ref_lists, value)
             pos += len(line)
-        for key, absent, references, value in self.keys_by_offset.itervalues():
+        for key, absent, references, value in self._keys_by_offset.itervalues():
             if absent:
                 continue
             # resolve references:
             if self.node_ref_lists:
                 node_refs = []
                 for ref_list in references:
-                    node_refs.append(tuple([self.keys_by_offset[ref][0] for ref in ref_list]))
-                yield (key, value, tuple(node_refs))
+                    node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
+                self._nodes[key] = (value, tuple(node_refs))
             else:
-                yield (key, value)
+                self._nodes[key] = value
         if trailers != 1:
             # there must be one line - the empty trailer line.
             raise errors.BadIndexData(self)
 
+    def iter_all_entries(self):
+        """Iterate over all keys within the index.
+
+        :return: An iterable of (key, value) or (key, value, reference_lists).
+            The former tuple is used when there are no reference lists in the
+            index, making the API compatible with simple key:value index types.
+            There is no defined order for the result iteration - it will be in
+            the most efficient order for the index.
+        """
+        if self._nodes is None:
+            self._buffer_all()
+        if self.node_ref_lists:
+            for key, (value, node_ref_lists) in self._nodes.iteritems():
+                yield key, value, node_ref_lists
+        else:
+            for key, value in self._nodes.iteritems():
+                yield key, value
+
     def _read_prefix(self, stream):
         signature = stream.read(len(self._signature()))
         if not signature == self._signature():
@@ -264,12 +283,16 @@
         keys = set(keys)
         if not keys:
             return
-        for node in self.iter_all_entries():
-            if not keys:
-                return
-            if node[0] in keys:
-                yield node
-                keys.remove(node[0])
+        if self._nodes is None:
+            self._buffer_all()
+        keys = keys.intersection(self._nodes)
+        if self.node_ref_lists:
+            for key in keys:
+                value, node_refs = self._nodes[key]
+                yield key, value, node_refs
+        else:
+            for key in keys:
+                yield key, self._nodes[key]
 
     def _signature(self):
         """The file signature for this index type."""

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2007-07-15 15:40:37 +0000
+++ b/bzrlib/tests/test_index.py	2007-07-18 04:46:24 +0000
@@ -471,6 +471,16 @@
         result.add_nodes(nodes)
         return result
 
+    def test_add_nodes_no_refs(self):
+        index = self.make_index(0)
+        index.add_nodes([('name', 'data')])
+        index.add_nodes([('name2', ''), ('name3', '')])
+        self.assertEqual(set([
+            ('name', 'data'),
+            ('name2', ''),
+            ('name3', ''),
+            ]), set(index.iter_all_entries()))
+
     def test_add_nodes(self):
         index = self.make_index(1)
         index.add_nodes([('name', 'data', ([],))])
@@ -486,7 +496,7 @@
         self.assertEqual([], list(index.iter_all_entries()))
 
     def test_iter_all_entries_simple(self):
-        index = self.make_index(nodes=[('name', 'data', ())])
+        index = self.make_index(nodes=[('name', 'data')])
         self.assertEqual([('name', 'data')],
             list(index.iter_all_entries()))
 
@@ -528,7 +538,7 @@
         index.validate()
 
     def test_validate_no_refs_content(self):
-        index = self.make_index(nodes=[('key', 'value', ())])
+        index = self.make_index(nodes=[('key', 'value')])
         index.validate()
 
 



More information about the bazaar-commits mailing list