Rev 5363: Change the DirState id_index structure to use tuples instead of sets. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-dirstate-index

Mon Aug 2 22:43:24 BST 2010

At http://bazaar.launchpad.net/~jameinel/bzr/2.3-dirstate-index

------------------------------------------------------------
revno: 5363
revision-id: john at arbash-meinel.com-20100802214315-r0henszv95y5uo1c
parent: john at arbash-meinel.com-20100802214007-7wdfs47bss26ksfb
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-dirstate-index
timestamp: Mon 2010-08-02 16:43:15 -0500
message:
  Change the DirState id_index structure to use tuples instead of sets.
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'

--- a/bzrlib/dirstate.py	2010-08-02 21:40:07 +0000
+++ b/bzrlib/dirstate.py	2010-08-02 21:43:15 +0000
@@ -548,7 +548,7 @@
            self._ensure_block(block_index, entry_index, utf8path)
         self._dirblock_state = DirState.IN_MEMORY_MODIFIED
         if self._id_index:
-            self._id_index.setdefault(entry_key[2], set()).add(entry_key)
+            self._add_to_id_index(self._id_index, entry_key)
 
     def _bisect(self, paths):
         """Bisect through the disk structure for specific rows.
@@ -2143,14 +2143,47 @@
                 yield entry
 
     def _get_id_index(self):
-        """Get an id index of self._dirblocks."""
+        """Get an id index of self._dirblocks.
+        
+        This maps from file_id => [(directory, name, file_id)] entries where
+        that file_id appears in one of the trees.
+        """
         if self._id_index is None:
             id_index = {}
             for key, tree_details in self._iter_entries():
-                id_index.setdefault(key[2], set()).add(key)
+                self._add_to_id_index(id_index, key)
             self._id_index = id_index
         return self._id_index
 
+    def _add_to_id_index(self, id_index, entry_key):
+        """Add this entry to the _id_index mapping."""
+        # This code used to use a set for every entry in the id_index. However,
+        # it is *rare* to have more than one entry. So a set is a large
+        # overkill. And even when we do, we won't ever have more than the
+        # number of parent trees. Which is still a small number (rarely >2). As
+        # such, we use a simple tuple, and do our own uniqueness checks. While
+        # the 'in' check is O(N) since N is nicely bounded it shouldn't ever
+        # cause quadratic failure.
+        # TODO: This should use StaticTuple
+        file_id = entry_key[2]
+        if file_id not in id_index:
+            id_index[file_id] = (entry_key,)
+        else:
+            entry_keys = id_index[file_id]
+            if entry_key not in entry_keys:
+                id_index[file_id] = entry_keys + (entry_key,)
+
+    def _remove_from_id_index(self, id_index, entry_key):
+        """Remove this entry from the _id_index mapping.
+
+        It is an programming error to call this when the entry_key is not
+        already present.
+        """
+        file_id = entry_key[2]
+        entry_keys = id_index[file_id]
+        idx = entry_keys.index(entry_key)
+        id_index[file_id] = entry_keys[:idx] + entry_keys[idx+1:]
+
     def _get_output_lines(self, lines):
         """Format lines for final output.
 
@@ -2414,7 +2447,9 @@
                 continue
             by_path[entry[0]] = [entry[1][0]] + \
                 [DirState.NULL_PARENT_DETAILS] * parent_count
-            id_index[entry[0][2]] = set([entry[0]])
+            # TODO: Possibly inline this, since we know it isn't present yet
+            #       id_index[entry[0][2]] = (entry[0],)
+            self._add_to_id_index(id_index, entry[0])
 
         # now the parent trees:
         for tree_index, tree in enumerate(parent_trees):
@@ -2442,7 +2477,7 @@
                 new_entry_key = (dirname, basename, file_id)
                 # tree index consistency: All other paths for this id in this tree
                 # index must point to the correct path.
-                for entry_key in id_index.setdefault(file_id, set()):
+                for entry_key in id_index.get(file_id, ()):
                     # TODO:PROFILING: It might be faster to just update
                     # rather than checking if we need to, and then overwrite
                     # the one we are located at.
@@ -2454,7 +2489,8 @@
                         by_path[entry_key][tree_index] = ('r', path_utf8, 0, False, '')
                 # by path consistency: Insert into an existing path record (trivial), or
                 # add a new one with relocation pointers for the other tree indexes.
-                if new_entry_key in id_index[file_id]:
+                entry_keys = id_index.get(file_id, ())
+                if new_entry_key in entry_keys:
                     # there is already an entry where this data belongs, just insert it.
                     by_path[new_entry_key][tree_index] = \
                         self._inv_entry_to_details(entry)
@@ -2465,16 +2501,16 @@
                     new_details = []
                     for lookup_index in xrange(tree_index):
                         # boundary case: this is the first occurence of file_id
-                        # so there are no id_indexs, possibly take this out of
+                        # so there are no id_indexes, possibly take this out of
                         # the loop?
-                        if not len(id_index[file_id]):
+                        if not len(entry_keys):
                             new_details.append(DirState.NULL_PARENT_DETAILS)
                         else:
                             # grab any one entry, use it to find the right path.
                             # TODO: optimise this to reduce memory use in highly
                             # fragmented situations by reusing the relocation
                             # records.
-                            a_key = iter(id_index[file_id]).next()
+                            a_key = iter(entry_keys).next()
                             if by_path[a_key][lookup_index][0] in ('r', 'a'):
                                 # its a pointer or missing statement, use it as is.
                                 new_details.append(by_path[a_key][lookup_index])
@@ -2485,7 +2521,7 @@
                     new_details.append(self._inv_entry_to_details(entry))
                     new_details.extend(new_location_suffix)
                     by_path[new_entry_key] = new_details
-                    id_index[file_id].add(new_entry_key)
+                    self._add_to_id_index(id_index, new_entry_key)
         # --- end generation of full tree mappings
 
         # sort and output all the entries
@@ -2673,7 +2709,7 @@
             block[1].pop(entry_index)
             # if we have an id_index in use, remove this key from it for this id.
             if self._id_index is not None:
-                self._id_index[current_old[0][2]].remove(current_old[0])
+                self._remove_from_id_index(self._id_index, current_old[0])
         # update all remaining keys for this id to record it as absent. The
         # existing details may either be the record we are marking as deleted
         # (if there were other trees with the id present at this path), or may
@@ -2748,7 +2784,7 @@
                     else:
                         break
             # new entry, synthesis cross reference here,
-            existing_keys = id_index.setdefault(key[2], set())
+            existing_keys = id_index.get(key[2], ())
             if not existing_keys:
                 # not currently in the state, simplest case
                 new_entry = key, [new_details] + self._empty_parent_info()
@@ -2797,6 +2833,8 @@
                 #  - or by creating a new pointer to the right row inside that column
                 num_present_parents = self._num_present_parents()
                 if num_present_parents:
+                    # TODO: This re-evaluates the existing_keys set, do we need
+                    #       to do that ourselves?
                     other_key = list(existing_keys)[0]
                 for lookup_index in xrange(1, num_present_parents + 1):
                     # grab any one entry, use it to find the right path.
@@ -2821,7 +2859,7 @@
                         pointer_path = osutils.pathjoin(*other_key[0:2])
                         new_entry[1].append(('r', pointer_path, 0, False, ''))
             block.insert(entry_index, new_entry)
-            existing_keys.add(key)
+            self._add_to_id_index(id_index, key)
         else:
             # Does the new state matter?
             block[entry_index][1][0] = new_details
@@ -2836,8 +2874,12 @@
             # converted to relocated.
             if path_utf8 is None:
                 raise AssertionError('no path')
-            assert key in id_index[key[2]]
-            for entry_key in id_index.get(key[2], ()):
+            existing_keys = id_index[key[2]]
+            if key not in existing_keys:
+                raise AssertionError('We found the entry in the blocks, but'
+                    ' the key is not in the id_index.'
+                    ' key: %s, existing_keys: %s' % (key, existing_keys))
+            for entry_key in id_index[key[2]]:
                 # TODO:PROFILING: It might be faster to just update
                 # rather than checking if we need to, and then overwrite
                 # the one we are located at.
@@ -2854,7 +2896,6 @@
                         raise AssertionError('not present: %r', entry_key)
                     self._dirblocks[block_index][1][entry_index][1][0] = \
                         ('r', path_utf8, 0, False, '')
-            assert key in id_index[key[2]]
         # add a containing dirblock if needed.
         if new_details[0] == 'd':
             subdir_key = (osutils.pathjoin(*key[0:2]), '', '')
@@ -2878,7 +2919,7 @@
                 break
         if not present_in_row:
             block.pop(index)
-            id_index[entry[0][2]].remove(entry[0])
+            self._remove_from_id_index(id_index, entry[0])
             return True
         return False
 
@@ -3028,6 +3069,10 @@
                         raise AssertionError(
                             'file_id %r did not match entry key %s'
                             % (file_id, entry_key))
+                if len(entry_keys) != len(set(entry_keys)):
+                    raise AssertionError(
+                        'id_index contained non-unique data for %s'
+                        % (entry_keys,))
 
     def _wipe_state(self):
         """Forget all state information about the dirstate."""