Rev 5393: (jameinel) use StaticTuple for dirstate and don't use sets in _id_index for in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Fri Aug 27 23:09:40 BST 2010

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5393 [merge]
revision-id: pqm at pqm.ubuntu.com-20100827220935-gwk3320p99ggl80n
parent: pqm at pqm.ubuntu.com-20100827204731-42zzp1f2yyg3jowf
parent: john at arbash-meinel.com-20100827180222-wdq7ac9cdai76mk4
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2010-08-27 23:09:35 +0100
message:
  (jameinel) use StaticTuple for dirstate and don't use sets in _id_index for
   improved memory use (John A Meinel)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/_dirstate_helpers_pyx.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/per_controldir/__init__.py __init__.py-20060131065642-34c39b54f42dd048
  bzrlib/tests/per_controldir/test_push.py test_push.py-20090403142358-xnn0wtsk3gx238ot-1
  bzrlib/tests/per_repository/__init__.py __init__.py-20060131092037-9564957a7d4a841b
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
  bzrlib/tests/test_options.py   testoptions.py-20051014093702-96457cfc86319a8f
=== modified file 'NEWS'

--- a/NEWS	2010-08-25 10:21:17 +0000
+++ b/NEWS	2010-08-27 22:09:35 +0000
@@ -134,6 +134,10 @@
   down to 247MB, expect even more significant savings on 64-bit platforms.
   (John Arbash Meinel)
 
+* ``DirState`` internals use a little bit less memory. For bzr.dev it
+  drops the memory from 1MB down to about 800kB. And replaces a few
+  thousand tuples and sets with StaticTuple.  (John Arbash Meinel)
+
 * Inventory entries now consume less memory (on 32-bit Ubuntu file entries
   have dropped from 68 bytes to 40, and directory entries from 120 bytes
   to 48).  (Andrew Bennetts)

=== modified file 'bzrlib/_dirstate_helpers_pyx.pyx'
--- a/bzrlib/_dirstate_helpers_pyx.pyx	2010-05-20 02:57:52 +0000
+++ b/bzrlib/_dirstate_helpers_pyx.pyx	2010-08-27 18:02:22 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007, 2008, 2010 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -118,6 +118,11 @@
     # ??? memrchr is a GNU extension :(
     # void *memrchr(void *s, int c, size_t len)
 
+# cimport all of the definitions we will need to access
+from _static_tuple_c cimport import_static_tuple_c, StaticTuple, \
+    StaticTuple_New, StaticTuple_SET_ITEM
+
+import_static_tuple_c()
 
 cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise
     # memrchr seems to be a GNU extension, so we have to implement it ourselves
@@ -610,7 +615,8 @@
         :param new_block: This is to let the caller know that it needs to
             create a new directory block to store the next entry.
         """
-        cdef object path_name_file_id_key
+        cdef StaticTuple path_name_file_id_key
+        cdef StaticTuple tmp
         cdef char *entry_size_cstr
         cdef unsigned long int entry_size
         cdef char* executable_cstr
@@ -650,10 +656,20 @@
         # Build up the key that will be used.
         # By using <object>(void *) Pyrex will automatically handle the
         # Py_INCREF that we need.
-        path_name_file_id_key = (<object>p_current_dirname[0],
-                                 self.get_next_str(),
-                                 self.get_next_str(),
-                                )
+        cur_dirname = <object>p_current_dirname[0]
+        # Use StaticTuple_New to pre-allocate, rather than creating a regular
+        # tuple and passing it to the StaticTuple constructor.
+        # path_name_file_id_key = StaticTuple(<object>p_current_dirname[0],
+        #                          self.get_next_str(),
+        #                          self.get_next_str(),
+        #                         )
+        tmp = StaticTuple_New(3)
+        Py_INCREF(cur_dirname); StaticTuple_SET_ITEM(tmp, 0, cur_dirname)
+        cur_basename = self.get_next_str()
+        cur_file_id = self.get_next_str()
+        Py_INCREF(cur_basename); StaticTuple_SET_ITEM(tmp, 1, cur_basename)
+        Py_INCREF(cur_file_id); StaticTuple_SET_ITEM(tmp, 2, cur_file_id)
+        path_name_file_id_key = tmp
 
         # Parse all of the per-tree information. current has the information in
         # the same location as parent trees. The only difference is that 'info'
@@ -677,7 +693,20 @@
             executable_cstr = self.get_next(&cur_size)
             is_executable = (executable_cstr[0] == c'y')
             info = self.get_next_str()
-            PyList_Append(trees, (
+            # TODO: If we want to use StaticTuple_New here we need to be pretty
+            #       careful. We are relying on a bit of Pyrex
+            #       automatic-conversion from 'int' to PyInt, and that doesn't
+            #       play well with the StaticTuple_SET_ITEM macro.
+            #       Timing doesn't (yet) show a worthwile improvement in speed
+            #       versus complexity and maintainability.
+            # tmp = StaticTuple_New(5)
+            # Py_INCREF(minikind); StaticTuple_SET_ITEM(tmp, 0, minikind)
+            # Py_INCREF(fingerprint); StaticTuple_SET_ITEM(tmp, 1, fingerprint)
+            # Py_INCREF(entry_size); StaticTuple_SET_ITEM(tmp, 2, entry_size)
+            # Py_INCREF(is_executable); StaticTuple_SET_ITEM(tmp, 3, is_executable)
+            # Py_INCREF(info); StaticTuple_SET_ITEM(tmp, 4, info)
+            # PyList_Append(trees, tmp)
+            PyList_Append(trees, StaticTuple(
                 minikind,     # minikind
                 fingerprint,  # fingerprint
                 entry_size,   # size

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2010-01-13 23:06:42 +0000
+++ b/bzrlib/dirstate.py	2010-08-02 22:07:54 +0000
@@ -220,6 +220,7 @@
     inventory,
     lock,
     osutils,
+    static_tuple,
     trace,
     )
 
@@ -548,7 +549,7 @@
            self._ensure_block(block_index, entry_index, utf8path)
         self._dirblock_state = DirState.IN_MEMORY_MODIFIED
         if self._id_index:
-            self._id_index.setdefault(entry_key[2], set()).add(entry_key)
+            self._add_to_id_index(self._id_index, entry_key)
 
     def _bisect(self, paths):
         """Bisect through the disk structure for specific rows.
@@ -1566,7 +1567,7 @@
             return
         id_index = self._get_id_index()
         for file_id in new_ids:
-            for key in id_index.get(file_id, []):
+            for key in id_index.get(file_id, ()):
                 block_i, entry_i, d_present, f_present = \
                     self._get_block_entry_index(key[0], key[1], tree_index)
                 if not f_present:
@@ -1980,7 +1981,7 @@
                                           ' tree_index, file_id and path')
             return entry
         else:
-            possible_keys = self._get_id_index().get(fileid_utf8, None)
+            possible_keys = self._get_id_index().get(fileid_utf8, ())
             if not possible_keys:
                 return None, None
             for key in possible_keys:
@@ -2143,14 +2144,48 @@
                 yield entry
 
     def _get_id_index(self):
-        """Get an id index of self._dirblocks."""
+        """Get an id index of self._dirblocks.
+        
+        This maps from file_id => [(directory, name, file_id)] entries where
+        that file_id appears in one of the trees.
+        """
         if self._id_index is None:
             id_index = {}
             for key, tree_details in self._iter_entries():
-                id_index.setdefault(key[2], set()).add(key)
+                self._add_to_id_index(id_index, key)
             self._id_index = id_index
         return self._id_index
 
+    def _add_to_id_index(self, id_index, entry_key):
+        """Add this entry to the _id_index mapping."""
+        # This code used to use a set for every entry in the id_index. However,
+        # it is *rare* to have more than one entry. So a set is a large
+        # overkill. And even when we do, we won't ever have more than the
+        # number of parent trees. Which is still a small number (rarely >2). As
+        # such, we use a simple tuple, and do our own uniqueness checks. While
+        # the 'in' check is O(N) since N is nicely bounded it shouldn't ever
+        # cause quadratic failure.
+        # TODO: This should use StaticTuple
+        file_id = entry_key[2]
+        entry_key = static_tuple.StaticTuple.from_sequence(entry_key)
+        if file_id not in id_index:
+            id_index[file_id] = static_tuple.StaticTuple(entry_key,)
+        else:
+            entry_keys = id_index[file_id]
+            if entry_key not in entry_keys:
+                id_index[file_id] = entry_keys + (entry_key,)
+
+    def _remove_from_id_index(self, id_index, entry_key):
+        """Remove this entry from the _id_index mapping.
+
+        It is an programming error to call this when the entry_key is not
+        already present.
+        """
+        file_id = entry_key[2]
+        entry_keys = list(id_index[file_id])
+        entry_keys.remove(entry_key)
+        id_index[file_id] = static_tuple.StaticTuple.from_sequence(entry_keys)
+
     def _get_output_lines(self, lines):
         """Format lines for final output.
 
@@ -2414,7 +2449,9 @@
                 continue
             by_path[entry[0]] = [entry[1][0]] + \
                 [DirState.NULL_PARENT_DETAILS] * parent_count
-            id_index[entry[0][2]] = set([entry[0]])
+            # TODO: Possibly inline this, since we know it isn't present yet
+            #       id_index[entry[0][2]] = (entry[0],)
+            self._add_to_id_index(id_index, entry[0])
 
         # now the parent trees:
         for tree_index, tree in enumerate(parent_trees):
@@ -2442,7 +2479,7 @@
                 new_entry_key = (dirname, basename, file_id)
                 # tree index consistency: All other paths for this id in this tree
                 # index must point to the correct path.
-                for entry_key in id_index.setdefault(file_id, set()):
+                for entry_key in id_index.get(file_id, ()):
                     # TODO:PROFILING: It might be faster to just update
                     # rather than checking if we need to, and then overwrite
                     # the one we are located at.
@@ -2454,7 +2491,8 @@
                         by_path[entry_key][tree_index] = ('r', path_utf8, 0, False, '')
                 # by path consistency: Insert into an existing path record (trivial), or
                 # add a new one with relocation pointers for the other tree indexes.
-                if new_entry_key in id_index[file_id]:
+                entry_keys = id_index.get(file_id, ())
+                if new_entry_key in entry_keys:
                     # there is already an entry where this data belongs, just insert it.
                     by_path[new_entry_key][tree_index] = \
                         self._inv_entry_to_details(entry)
@@ -2465,16 +2503,16 @@
                     new_details = []
                     for lookup_index in xrange(tree_index):
                         # boundary case: this is the first occurence of file_id
-                        # so there are no id_indexs, possibly take this out of
+                        # so there are no id_indexes, possibly take this out of
                         # the loop?
-                        if not len(id_index[file_id]):
+                        if not len(entry_keys):
                             new_details.append(DirState.NULL_PARENT_DETAILS)
                         else:
                             # grab any one entry, use it to find the right path.
                             # TODO: optimise this to reduce memory use in highly
                             # fragmented situations by reusing the relocation
                             # records.
-                            a_key = iter(id_index[file_id]).next()
+                            a_key = iter(entry_keys).next()
                             if by_path[a_key][lookup_index][0] in ('r', 'a'):
                                 # its a pointer or missing statement, use it as is.
                                 new_details.append(by_path[a_key][lookup_index])
@@ -2485,7 +2523,7 @@
                     new_details.append(self._inv_entry_to_details(entry))
                     new_details.extend(new_location_suffix)
                     by_path[new_entry_key] = new_details
-                    id_index[file_id].add(new_entry_key)
+                    self._add_to_id_index(id_index, new_entry_key)
         # --- end generation of full tree mappings
 
         # sort and output all the entries
@@ -2673,7 +2711,7 @@
             block[1].pop(entry_index)
             # if we have an id_index in use, remove this key from it for this id.
             if self._id_index is not None:
-                self._id_index[current_old[0][2]].remove(current_old[0])
+                self._remove_from_id_index(self._id_index, current_old[0])
         # update all remaining keys for this id to record it as absent. The
         # existing details may either be the record we are marking as deleted
         # (if there were other trees with the id present at this path), or may
@@ -2748,7 +2786,7 @@
                     else:
                         break
             # new entry, synthesis cross reference here,
-            existing_keys = id_index.setdefault(key[2], set())
+            existing_keys = id_index.get(key[2], ())
             if not existing_keys:
                 # not currently in the state, simplest case
                 new_entry = key, [new_details] + self._empty_parent_info()
@@ -2785,8 +2823,11 @@
                     # loop.
                     other_entry = other_block[other_entry_index]
                     other_entry[1][0] = ('r', path_utf8, 0, False, '')
-                    self._maybe_remove_row(other_block, other_entry_index,
-                        id_index)
+                    if self._maybe_remove_row(other_block, other_entry_index,
+                                              id_index):
+                        # If the row holding this was removed, we need to
+                        # recompute where this entry goes
+                        entry_index, _ = self._find_entry_index(key, block)
 
                 # This loop:
                 # adds a tuple to the new details for each column
@@ -2794,6 +2835,8 @@
                 #  - or by creating a new pointer to the right row inside that column
                 num_present_parents = self._num_present_parents()
                 if num_present_parents:
+                    # TODO: This re-evaluates the existing_keys set, do we need
+                    #       to do that ourselves?
                     other_key = list(existing_keys)[0]
                 for lookup_index in xrange(1, num_present_parents + 1):
                     # grab any one entry, use it to find the right path.
@@ -2818,7 +2861,7 @@
                         pointer_path = osutils.pathjoin(*other_key[0:2])
                         new_entry[1].append(('r', pointer_path, 0, False, ''))
             block.insert(entry_index, new_entry)
-            existing_keys.add(key)
+            self._add_to_id_index(id_index, key)
         else:
             # Does the new state matter?
             block[entry_index][1][0] = new_details
@@ -2833,7 +2876,12 @@
             # converted to relocated.
             if path_utf8 is None:
                 raise AssertionError('no path')
-            for entry_key in id_index.setdefault(key[2], set()):
+            existing_keys = id_index[key[2]]
+            if key not in existing_keys:
+                raise AssertionError('We found the entry in the blocks, but'
+                    ' the key is not in the id_index.'
+                    ' key: %s, existing_keys: %s' % (key, existing_keys))
+            for entry_key in id_index[key[2]]:
                 # TODO:PROFILING: It might be faster to just update
                 # rather than checking if we need to, and then overwrite
                 # the one we are located at.
@@ -2863,6 +2911,7 @@
         """Remove index if it is absent or relocated across the row.
         
         id_index is updated accordingly.
+        :return: True if we removed the row, False otherwise
         """
         present_in_row = False
         entry = block[index]
@@ -2872,7 +2921,9 @@
                 break
         if not present_in_row:
             block.pop(index)
-            id_index[entry[0][2]].remove(entry[0])
+            self._remove_from_id_index(id_index, entry[0])
+            return True
+        return False
 
     def _validate(self):
         """Check that invariants on the dirblock are correct.
@@ -3020,6 +3071,10 @@
                         raise AssertionError(
                             'file_id %r did not match entry key %s'
                             % (file_id, entry_key))
+                if len(entry_keys) != len(set(entry_keys)):
+                    raise AssertionError(
+                        'id_index contained non-unique data for %s'
+                        % (entry_keys,))
 
     def _wipe_state(self):
         """Forget all state information about the dirstate."""

=== modified file 'bzrlib/tests/per_controldir/__init__.py'
--- a/bzrlib/tests/per_controldir/__init__.py	2010-08-21 16:06:24 +0000
+++ b/bzrlib/tests/per_controldir/__init__.py	2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006 Canonical Ltd
+# Copyright (C) 2006-2010 Canonical Ltd
 # Authors: Robert Collins <robert.collins at canonical.com>
 # -*- coding: utf-8 -*-
 #

=== modified file 'bzrlib/tests/per_controldir/test_push.py'
--- a/bzrlib/tests/per_controldir/test_push.py	2010-08-21 16:06:24 +0000
+++ b/bzrlib/tests/per_controldir/test_push.py	2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2009 Canonical Ltd
+# Copyright (C) 2009, 2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by

=== modified file 'bzrlib/tests/per_repository/__init__.py'
--- a/bzrlib/tests/per_repository/__init__.py	2010-08-21 16:06:24 +0000
+++ b/bzrlib/tests/per_repository/__init__.py	2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006, 2007, 2008, 2009 Canonical Ltd
+# Copyright (C) 2006-2010 Canonical Ltd
 # Authors: Robert Collins <robert.collins at canonical.com>
 #          and others
 #

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2010-01-25 17:48:22 +0000
+++ b/bzrlib/tests/test_dirstate.py	2010-08-02 21:40:07 +0000
@@ -730,6 +730,22 @@
 
 class TestDirStateManipulations(TestCaseWithDirState):
 
+    def test_update_minimal_updates_id_index(self):
+        state = self.create_dirstate_with_root_and_subdir()
+        self.addCleanup(state.unlock)
+        id_index = state._get_id_index()
+        self.assertEqual(['a-root-value', 'subdir-id'], sorted(id_index))
+        state.add('file-name', 'file-id', 'file', None, '')
+        self.assertEqual(['a-root-value', 'file-id', 'subdir-id'],
+                         sorted(id_index))
+        state.update_minimal(('', 'new-name', 'file-id'), 'f',
+                             path_utf8='new-name')
+        self.assertEqual(['a-root-value', 'file-id', 'subdir-id'],
+                         sorted(id_index))
+        self.assertEqual([('', 'new-name', 'file-id')],
+                         sorted(id_index['file-id']))
+        state._validate()
+
     def test_set_state_from_inventory_no_content_no_parents(self):
         # setting the current inventory is a slow but important api to support.
         tree1 = self.make_branch_and_memory_tree('tree1')

=== modified file 'bzrlib/tests/test_options.py'
--- a/bzrlib/tests/test_options.py	2010-08-02 18:36:31 +0000
+++ b/bzrlib/tests/test_options.py	2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005, 2006, 2007 Canonical Ltd
+# Copyright (C) 2005-2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by