Rev 5393: (jameinel) use StaticTuple for dirstate and don't use sets in _id_index for in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Fri Aug 27 23:09:40 BST 2010
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 5393 [merge]
revision-id: pqm at pqm.ubuntu.com-20100827220935-gwk3320p99ggl80n
parent: pqm at pqm.ubuntu.com-20100827204731-42zzp1f2yyg3jowf
parent: john at arbash-meinel.com-20100827180222-wdq7ac9cdai76mk4
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2010-08-27 23:09:35 +0100
message:
(jameinel) use StaticTuple for dirstate and don't use sets in _id_index for
improved memory use (John A Meinel)
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/_dirstate_helpers_pyx.pyx dirstate_helpers.pyx-20070503201057-u425eni465q4idwn-3
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/per_controldir/__init__.py __init__.py-20060131065642-34c39b54f42dd048
bzrlib/tests/per_controldir/test_push.py test_push.py-20090403142358-xnn0wtsk3gx238ot-1
bzrlib/tests/per_repository/__init__.py __init__.py-20060131092037-9564957a7d4a841b
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
bzrlib/tests/test_options.py testoptions.py-20051014093702-96457cfc86319a8f
=== modified file 'NEWS'
--- a/NEWS 2010-08-25 10:21:17 +0000
+++ b/NEWS 2010-08-27 22:09:35 +0000
@@ -134,6 +134,10 @@
down to 247MB, expect even more significant savings on 64-bit platforms.
(John Arbash Meinel)
+* ``DirState`` internals use a little bit less memory. For bzr.dev it
+ drops the memory from 1MB down to about 800kB. And replaces a few
+ thousand tuples and sets with StaticTuple. (John Arbash Meinel)
+
* Inventory entries now consume less memory (on 32-bit Ubuntu file entries
have dropped from 68 bytes to 40, and directory entries from 120 bytes
to 48). (Andrew Bennetts)
=== modified file 'bzrlib/_dirstate_helpers_pyx.pyx'
--- a/bzrlib/_dirstate_helpers_pyx.pyx 2010-05-20 02:57:52 +0000
+++ b/bzrlib/_dirstate_helpers_pyx.pyx 2010-08-27 18:02:22 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007, 2008, 2010 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -118,6 +118,11 @@
# ??? memrchr is a GNU extension :(
# void *memrchr(void *s, int c, size_t len)
+# cimport all of the definitions we will need to access
+from _static_tuple_c cimport import_static_tuple_c, StaticTuple, \
+ StaticTuple_New, StaticTuple_SET_ITEM
+
+import_static_tuple_c()
cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise
# memrchr seems to be a GNU extension, so we have to implement it ourselves
@@ -610,7 +615,8 @@
:param new_block: This is to let the caller know that it needs to
create a new directory block to store the next entry.
"""
- cdef object path_name_file_id_key
+ cdef StaticTuple path_name_file_id_key
+ cdef StaticTuple tmp
cdef char *entry_size_cstr
cdef unsigned long int entry_size
cdef char* executable_cstr
@@ -650,10 +656,20 @@
# Build up the key that will be used.
# By using <object>(void *) Pyrex will automatically handle the
# Py_INCREF that we need.
- path_name_file_id_key = (<object>p_current_dirname[0],
- self.get_next_str(),
- self.get_next_str(),
- )
+ cur_dirname = <object>p_current_dirname[0]
+ # Use StaticTuple_New to pre-allocate, rather than creating a regular
+ # tuple and passing it to the StaticTuple constructor.
+ # path_name_file_id_key = StaticTuple(<object>p_current_dirname[0],
+ # self.get_next_str(),
+ # self.get_next_str(),
+ # )
+ tmp = StaticTuple_New(3)
+ Py_INCREF(cur_dirname); StaticTuple_SET_ITEM(tmp, 0, cur_dirname)
+ cur_basename = self.get_next_str()
+ cur_file_id = self.get_next_str()
+ Py_INCREF(cur_basename); StaticTuple_SET_ITEM(tmp, 1, cur_basename)
+ Py_INCREF(cur_file_id); StaticTuple_SET_ITEM(tmp, 2, cur_file_id)
+ path_name_file_id_key = tmp
# Parse all of the per-tree information. current has the information in
# the same location as parent trees. The only difference is that 'info'
@@ -677,7 +693,20 @@
executable_cstr = self.get_next(&cur_size)
is_executable = (executable_cstr[0] == c'y')
info = self.get_next_str()
- PyList_Append(trees, (
+ # TODO: If we want to use StaticTuple_New here we need to be pretty
+ # careful. We are relying on a bit of Pyrex
+ # automatic-conversion from 'int' to PyInt, and that doesn't
+ # play well with the StaticTuple_SET_ITEM macro.
+ # Timing doesn't (yet) show a worthwile improvement in speed
+ # versus complexity and maintainability.
+ # tmp = StaticTuple_New(5)
+ # Py_INCREF(minikind); StaticTuple_SET_ITEM(tmp, 0, minikind)
+ # Py_INCREF(fingerprint); StaticTuple_SET_ITEM(tmp, 1, fingerprint)
+ # Py_INCREF(entry_size); StaticTuple_SET_ITEM(tmp, 2, entry_size)
+ # Py_INCREF(is_executable); StaticTuple_SET_ITEM(tmp, 3, is_executable)
+ # Py_INCREF(info); StaticTuple_SET_ITEM(tmp, 4, info)
+ # PyList_Append(trees, tmp)
+ PyList_Append(trees, StaticTuple(
minikind, # minikind
fingerprint, # fingerprint
entry_size, # size
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2010-01-13 23:06:42 +0000
+++ b/bzrlib/dirstate.py 2010-08-02 22:07:54 +0000
@@ -220,6 +220,7 @@
inventory,
lock,
osutils,
+ static_tuple,
trace,
)
@@ -548,7 +549,7 @@
self._ensure_block(block_index, entry_index, utf8path)
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
if self._id_index:
- self._id_index.setdefault(entry_key[2], set()).add(entry_key)
+ self._add_to_id_index(self._id_index, entry_key)
def _bisect(self, paths):
"""Bisect through the disk structure for specific rows.
@@ -1566,7 +1567,7 @@
return
id_index = self._get_id_index()
for file_id in new_ids:
- for key in id_index.get(file_id, []):
+ for key in id_index.get(file_id, ()):
block_i, entry_i, d_present, f_present = \
self._get_block_entry_index(key[0], key[1], tree_index)
if not f_present:
@@ -1980,7 +1981,7 @@
' tree_index, file_id and path')
return entry
else:
- possible_keys = self._get_id_index().get(fileid_utf8, None)
+ possible_keys = self._get_id_index().get(fileid_utf8, ())
if not possible_keys:
return None, None
for key in possible_keys:
@@ -2143,14 +2144,48 @@
yield entry
def _get_id_index(self):
- """Get an id index of self._dirblocks."""
+ """Get an id index of self._dirblocks.
+
+ This maps from file_id => [(directory, name, file_id)] entries where
+ that file_id appears in one of the trees.
+ """
if self._id_index is None:
id_index = {}
for key, tree_details in self._iter_entries():
- id_index.setdefault(key[2], set()).add(key)
+ self._add_to_id_index(id_index, key)
self._id_index = id_index
return self._id_index
+ def _add_to_id_index(self, id_index, entry_key):
+ """Add this entry to the _id_index mapping."""
+ # This code used to use a set for every entry in the id_index. However,
+ # it is *rare* to have more than one entry. So a set is a large
+ # overkill. And even when we do, we won't ever have more than the
+ # number of parent trees. Which is still a small number (rarely >2). As
+ # such, we use a simple tuple, and do our own uniqueness checks. While
+ # the 'in' check is O(N) since N is nicely bounded it shouldn't ever
+ # cause quadratic failure.
+ # TODO: This should use StaticTuple
+ file_id = entry_key[2]
+ entry_key = static_tuple.StaticTuple.from_sequence(entry_key)
+ if file_id not in id_index:
+ id_index[file_id] = static_tuple.StaticTuple(entry_key,)
+ else:
+ entry_keys = id_index[file_id]
+ if entry_key not in entry_keys:
+ id_index[file_id] = entry_keys + (entry_key,)
+
+ def _remove_from_id_index(self, id_index, entry_key):
+ """Remove this entry from the _id_index mapping.
+
+ It is an programming error to call this when the entry_key is not
+ already present.
+ """
+ file_id = entry_key[2]
+ entry_keys = list(id_index[file_id])
+ entry_keys.remove(entry_key)
+ id_index[file_id] = static_tuple.StaticTuple.from_sequence(entry_keys)
+
def _get_output_lines(self, lines):
"""Format lines for final output.
@@ -2414,7 +2449,9 @@
continue
by_path[entry[0]] = [entry[1][0]] + \
[DirState.NULL_PARENT_DETAILS] * parent_count
- id_index[entry[0][2]] = set([entry[0]])
+ # TODO: Possibly inline this, since we know it isn't present yet
+ # id_index[entry[0][2]] = (entry[0],)
+ self._add_to_id_index(id_index, entry[0])
# now the parent trees:
for tree_index, tree in enumerate(parent_trees):
@@ -2442,7 +2479,7 @@
new_entry_key = (dirname, basename, file_id)
# tree index consistency: All other paths for this id in this tree
# index must point to the correct path.
- for entry_key in id_index.setdefault(file_id, set()):
+ for entry_key in id_index.get(file_id, ()):
# TODO:PROFILING: It might be faster to just update
# rather than checking if we need to, and then overwrite
# the one we are located at.
@@ -2454,7 +2491,8 @@
by_path[entry_key][tree_index] = ('r', path_utf8, 0, False, '')
# by path consistency: Insert into an existing path record (trivial), or
# add a new one with relocation pointers for the other tree indexes.
- if new_entry_key in id_index[file_id]:
+ entry_keys = id_index.get(file_id, ())
+ if new_entry_key in entry_keys:
# there is already an entry where this data belongs, just insert it.
by_path[new_entry_key][tree_index] = \
self._inv_entry_to_details(entry)
@@ -2465,16 +2503,16 @@
new_details = []
for lookup_index in xrange(tree_index):
# boundary case: this is the first occurence of file_id
- # so there are no id_indexs, possibly take this out of
+ # so there are no id_indexes, possibly take this out of
# the loop?
- if not len(id_index[file_id]):
+ if not len(entry_keys):
new_details.append(DirState.NULL_PARENT_DETAILS)
else:
# grab any one entry, use it to find the right path.
# TODO: optimise this to reduce memory use in highly
# fragmented situations by reusing the relocation
# records.
- a_key = iter(id_index[file_id]).next()
+ a_key = iter(entry_keys).next()
if by_path[a_key][lookup_index][0] in ('r', 'a'):
# its a pointer or missing statement, use it as is.
new_details.append(by_path[a_key][lookup_index])
@@ -2485,7 +2523,7 @@
new_details.append(self._inv_entry_to_details(entry))
new_details.extend(new_location_suffix)
by_path[new_entry_key] = new_details
- id_index[file_id].add(new_entry_key)
+ self._add_to_id_index(id_index, new_entry_key)
# --- end generation of full tree mappings
# sort and output all the entries
@@ -2673,7 +2711,7 @@
block[1].pop(entry_index)
# if we have an id_index in use, remove this key from it for this id.
if self._id_index is not None:
- self._id_index[current_old[0][2]].remove(current_old[0])
+ self._remove_from_id_index(self._id_index, current_old[0])
# update all remaining keys for this id to record it as absent. The
# existing details may either be the record we are marking as deleted
# (if there were other trees with the id present at this path), or may
@@ -2748,7 +2786,7 @@
else:
break
# new entry, synthesis cross reference here,
- existing_keys = id_index.setdefault(key[2], set())
+ existing_keys = id_index.get(key[2], ())
if not existing_keys:
# not currently in the state, simplest case
new_entry = key, [new_details] + self._empty_parent_info()
@@ -2785,8 +2823,11 @@
# loop.
other_entry = other_block[other_entry_index]
other_entry[1][0] = ('r', path_utf8, 0, False, '')
- self._maybe_remove_row(other_block, other_entry_index,
- id_index)
+ if self._maybe_remove_row(other_block, other_entry_index,
+ id_index):
+ # If the row holding this was removed, we need to
+ # recompute where this entry goes
+ entry_index, _ = self._find_entry_index(key, block)
# This loop:
# adds a tuple to the new details for each column
@@ -2794,6 +2835,8 @@
# - or by creating a new pointer to the right row inside that column
num_present_parents = self._num_present_parents()
if num_present_parents:
+ # TODO: This re-evaluates the existing_keys set, do we need
+ # to do that ourselves?
other_key = list(existing_keys)[0]
for lookup_index in xrange(1, num_present_parents + 1):
# grab any one entry, use it to find the right path.
@@ -2818,7 +2861,7 @@
pointer_path = osutils.pathjoin(*other_key[0:2])
new_entry[1].append(('r', pointer_path, 0, False, ''))
block.insert(entry_index, new_entry)
- existing_keys.add(key)
+ self._add_to_id_index(id_index, key)
else:
# Does the new state matter?
block[entry_index][1][0] = new_details
@@ -2833,7 +2876,12 @@
# converted to relocated.
if path_utf8 is None:
raise AssertionError('no path')
- for entry_key in id_index.setdefault(key[2], set()):
+ existing_keys = id_index[key[2]]
+ if key not in existing_keys:
+ raise AssertionError('We found the entry in the blocks, but'
+ ' the key is not in the id_index.'
+ ' key: %s, existing_keys: %s' % (key, existing_keys))
+ for entry_key in id_index[key[2]]:
# TODO:PROFILING: It might be faster to just update
# rather than checking if we need to, and then overwrite
# the one we are located at.
@@ -2863,6 +2911,7 @@
"""Remove index if it is absent or relocated across the row.
id_index is updated accordingly.
+ :return: True if we removed the row, False otherwise
"""
present_in_row = False
entry = block[index]
@@ -2872,7 +2921,9 @@
break
if not present_in_row:
block.pop(index)
- id_index[entry[0][2]].remove(entry[0])
+ self._remove_from_id_index(id_index, entry[0])
+ return True
+ return False
def _validate(self):
"""Check that invariants on the dirblock are correct.
@@ -3020,6 +3071,10 @@
raise AssertionError(
'file_id %r did not match entry key %s'
% (file_id, entry_key))
+ if len(entry_keys) != len(set(entry_keys)):
+ raise AssertionError(
+ 'id_index contained non-unique data for %s'
+ % (entry_keys,))
def _wipe_state(self):
"""Forget all state information about the dirstate."""
=== modified file 'bzrlib/tests/per_controldir/__init__.py'
--- a/bzrlib/tests/per_controldir/__init__.py 2010-08-21 16:06:24 +0000
+++ b/bzrlib/tests/per_controldir/__init__.py 2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006 Canonical Ltd
+# Copyright (C) 2006-2010 Canonical Ltd
# Authors: Robert Collins <robert.collins at canonical.com>
# -*- coding: utf-8 -*-
#
=== modified file 'bzrlib/tests/per_controldir/test_push.py'
--- a/bzrlib/tests/per_controldir/test_push.py 2010-08-21 16:06:24 +0000
+++ b/bzrlib/tests/per_controldir/test_push.py 2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2009 Canonical Ltd
+# Copyright (C) 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=== modified file 'bzrlib/tests/per_repository/__init__.py'
--- a/bzrlib/tests/per_repository/__init__.py 2010-08-21 16:06:24 +0000
+++ b/bzrlib/tests/per_repository/__init__.py 2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006, 2007, 2008, 2009 Canonical Ltd
+# Copyright (C) 2006-2010 Canonical Ltd
# Authors: Robert Collins <robert.collins at canonical.com>
# and others
#
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2010-01-25 17:48:22 +0000
+++ b/bzrlib/tests/test_dirstate.py 2010-08-02 21:40:07 +0000
@@ -730,6 +730,22 @@
class TestDirStateManipulations(TestCaseWithDirState):
+ def test_update_minimal_updates_id_index(self):
+ state = self.create_dirstate_with_root_and_subdir()
+ self.addCleanup(state.unlock)
+ id_index = state._get_id_index()
+ self.assertEqual(['a-root-value', 'subdir-id'], sorted(id_index))
+ state.add('file-name', 'file-id', 'file', None, '')
+ self.assertEqual(['a-root-value', 'file-id', 'subdir-id'],
+ sorted(id_index))
+ state.update_minimal(('', 'new-name', 'file-id'), 'f',
+ path_utf8='new-name')
+ self.assertEqual(['a-root-value', 'file-id', 'subdir-id'],
+ sorted(id_index))
+ self.assertEqual([('', 'new-name', 'file-id')],
+ sorted(id_index['file-id']))
+ state._validate()
+
def test_set_state_from_inventory_no_content_no_parents(self):
# setting the current inventory is a slow but important api to support.
tree1 = self.make_branch_and_memory_tree('tree1')
=== modified file 'bzrlib/tests/test_options.py'
--- a/bzrlib/tests/test_options.py 2010-08-02 18:36:31 +0000
+++ b/bzrlib/tests/test_options.py 2010-08-27 17:53:08 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2005, 2006, 2007 Canonical Ltd
+# Copyright (C) 2005-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
More information about the bazaar-commits
mailing list