Rev 4788: Some prototyping work, which shaves ~10MB off of peak while 'streaming'. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-st-chk-map-using-simpleset
John Arbash Meinel
john at arbash-meinel.com
Fri Oct 23 18:59:53 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.1-st-chk-map-using-simpleset
------------------------------------------------------------
revno: 4788
revision-id: john at arbash-meinel.com-20091023175941-3bcppq5fx41r73df
parent: john at arbash-meinel.com-20091023172745-71kq3uh3t0m9fgah
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-st-chk-map-using-simpleset
timestamp: Fri 2009-10-23 12:59:41 -0500
message:
Some prototyping work, which shaves ~10MB off of peak while 'streaming'.
Basically, new_refs becomes a very large set. So instead we use a SimpleSet
and update it in place, saving a reasonable amount of memory.
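
To illustrate the pattern (a sketch only, with made-up 'refs'/'processed'
names, not the committed code): set.difference() builds a second large set,
so at peak both copies are alive, while discarding updates the existing
table in place:

    refs = set(range(400000))
    processed = set(range(0, 400000, 2))

    # difference() allocates a new set alongside the old one:
    next_refs = refs.difference(processed)

    # discarding mutates the existing table and allocates nothing extra:
    for r in processed:
        refs.discard(r)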
=== modified file 'bzrlib/_simple_set_pyx.pyx'
--- a/bzrlib/_simple_set_pyx.pyx 2009-10-14 15:57:06 +0000
+++ b/bzrlib/_simple_set_pyx.pyx 2009-10-23 17:59:41 +0000
@@ -272,6 +272,51 @@
"""
return self._add(key)
+    def discard_all(self, keys):
+        """Discard every key in 'keys' ('keys' must support len())."""
+        cdef Py_ssize_t pos
+        cdef PyObject *internal_key, **slot
+        cdef SimpleSet other
+
+        if len(self) > len(keys):
+            # 'keys' is the smaller side: hash each key and discard it. (We
+            # could avoid some PyObject_Hash calls if we knew 'keys' was a
+            # dict or set whose stored hashes we could reuse.)
+            for key in keys:
+                self._discard(key)
+        elif isinstance(keys, SimpleSet):
+            # Walk self and probe the other SimpleSet directly.
+            other = keys
+            pos = 0
+            while SimpleSet_Next(self, &pos, &internal_key):
+                key = <object>internal_key
+                slot = _lookup(other, key)
+                if slot[0] == _dummy or slot[0] == NULL:
+                    continue  # not present in 'other', keep it
+                # Discard. SimpleSet_Next follows the PyDict_Next convention,
+                # so pos is already one past this entry; the table owns a
+                # reference to the key, which the dummy replaces.
+                self._used -= 1
+                Py_DECREF(self._table[pos - 1])
+                self._table[pos - 1] = _dummy
+        else:
+            # Generic sized container: walk self and use 'in' to test
+            # membership in 'keys'.
+            pos = 0
+            while SimpleSet_Next(self, &pos, &internal_key):
+                key = <object>internal_key
+                if key in keys:
+                    # Discard (as above, pos is already one past this entry)
+                    self._used -= 1
+                    Py_DECREF(self._table[pos - 1])
+                    self._table[pos - 1] = _dummy
+
+    def update(self, keys):
+        """Similar to set.update(): add all of keys to self."""
+        # Having this in the pyx file means the for loop is compiled to C.
+        for key in keys:
+            self._add(key)
+

     cdef object _add(self, key):
         cdef PyObject **slot, *py_key
         cdef int added
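
For readers without the compiled extension, here is a rough pure-Python
model of what discard_all is meant to do (PySimpleSet is a hypothetical
stand-in; the pyx version additionally special-cases SimpleSet arguments so
it can probe the other table directly):

    class PySimpleSet(set):
        def discard_all(self, keys):
            # Discard every key in 'keys'; 'keys' must support len().
            if len(self) > len(keys):
                # Few keys to drop: discard each one directly.
                for key in keys:
                    self.discard(key)
            else:
                # 'keys' is the larger side: walk self instead, collecting
                # matches first so we do not mutate while iterating.
                for key in [k for k in self if k in keys]:
                    self.discard(key)

    s = PySimpleSet('abcdef')
    s.discard_all(set('defxyz'))
    # s is now set(['a', 'b', 'c'])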
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2009-10-23 17:27:45 +0000
+++ b/bzrlib/chk_map.py 2009-10-23 17:59:41 +0000
@@ -47,6 +47,7 @@
     )
 """)
 from bzrlib import (
+    _simple_set_pyx,
     lru_cache,
     osutils,
     registry,
@@ -1477,7 +1478,7 @@
         # interesting queue (so we don't need to walk them again)
-        # TODO: processed_new_refs becomes O(all_chks), consider switching to
-        #       SimpleSet here.
-        self._processed_new_refs = set()
+        # processed_new_refs becomes O(all_chks), so use a SimpleSet to keep
+        # its memory footprint down.
+        self._processed_new_refs = _simple_set_pyx.SimpleSet()
         self._search_key_func = search_key_func

         # The uninteresting and interesting nodes to be searched
@@ -1629,8 +1630,17 @@
         refs = refs.difference(all_old_chks)
         processed_new_refs.update(refs)
         while refs:
-            next_refs = set()
-            next_refs_update = next_refs.update
+            # Use a SimpleSet for next_refs: on a 2-layer tree, next_refs
+            # can reference a significant fraction of the ancestry. (In LP
+            # it is 67k nodes at the root and 394k in the second layer, so
+            # while reading the root this becomes a set with 394k entries.)
+            # A builtin set that size costs ~8MB: 394k entries * 2
+            # [overhead] rounds up to 2**20 slots, and 2**20 slots * 8
+            # bytes each (4-byte pointer + 4-byte hash) = 8MB. I see ~10MB
+            # peak savings from a SimpleSet plus in-place discard, rather
+            # than a builtin set and difference().
+            next_refs = _simple_set_pyx.SimpleSet()
+            next_refs_add = next_refs.add
             # Inlining _read_nodes_from_store improves 'bzr branch bzr.dev'
             # from 1m54s to 1m51s. Consider it.
             for record, _, p_refs, items in self._read_nodes_from_store(refs):
@@ -1640,8 +1650,8 @@
                     items = [item for item in items
                              if item not in all_old_items]
                 yield record, items
-                next_refs_update([p_r[1] for p_r in p_refs])
-                del p_refs
+                for _, r in p_refs:
+                    next_refs_add(r)
             # set1.difference(set/dict) walks all of set1, and checks if it
             # exists in 'other'.
             # set1.difference(iterable) walks all of iterable, and does a
@@ -1649,8 +1659,14 @@
             #       expected sizes of objects.
             # in our case it is expected that 'new_refs' will always be quite
             # small.
-            next_refs = next_refs.difference(all_old_chks)
-            next_refs = next_refs.difference(processed_new_refs)
+            if all_old_chks:
+                next_refs.discard_all(all_old_chks)
+            if len(next_refs) > len(processed_new_refs):
+                to_discard = processed_new_refs
+            else:
+                to_discard = [r for r in next_refs
+                              if r in processed_new_refs]
+            next_refs.discard_all(to_discard)
             processed_new_refs.update(next_refs)
             refs = next_refs
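
As a sanity check on the 8MB figure in the comment above (this assumes a
32-bit build, where a builtin-set slot is a 4-byte pointer plus a 4-byte
cached hash, and follows the comment's own 2x-overhead rule of thumb):

    entries = 394000
    slots = 1
    while slots < entries * 2:   # sets stay sparse: ~2x entries,
        slots *= 2               # rounded up to a power of two
    print(slots)                                # 1048576 == 2**20
    print(slots * (4 + 4) // (1024 * 1024))     # 8 (MB) for the table alone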