Rev 183: Default to only showing 100 parent references. in http://bazaar.launchpad.net/~meliae-dev/meliae/trunk
John Arbash Meinel
john at arbash-meinel.com
Mon Aug 9 17:13:14 BST 2010
At http://bazaar.launchpad.net/~meliae-dev/meliae/trunk
------------------------------------------------------------
revno: 183
revision-id: john at arbash-meinel.com-20100809161308-oa1wlnk33w6hvg23
parent: john at arbash-meinel.com-20100809154752-ah3vho249qprf4ky
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Mon 2010-08-09 11:13:08 -0500
message:
Default to only showing 100 parent references.
This should decrease memory consumption for some really big dumps, without
diminishing utility. (At least until we have a gui or something.)
-------------- next part --------------
=== modified file 'CHANGES.txt'
--- a/CHANGES.txt 2010-08-09 15:35:21 +0000
+++ b/CHANGES.txt 2010-08-09 16:13:08 +0000
@@ -14,6 +14,17 @@
I don't want to work around. Namely sizeof(Class) doesn't work under
even pyrex 0.9.9. (John Arbash Meinel)
+* Duplicate parent entries are filtered out. (eg the intern dict refers
+ to the same string 2x, you'll see 1 parent ref in the child, not 2.)
+ (John Arbash Meinel)
+
+* We now default to limiting the maximum length of the parents list
+ (default 100). I had some dumps where a single object was referenced
+ 50k times. Anything over about 10 is at the point where you won't
+ really walk them. This can be disabled with ``load(max_parents=-1)``.
+ The main win is lowering memory consumption. A 50k parent list takes
+ 200kB by itself (on 32-bit). (John Arbash Meinel)
+
Meliae 0.3
##########
=== modified file 'meliae/loader.py'
--- a/meliae/loader.py 2010-08-09 15:47:52 +0000
+++ b/meliae/loader.py 2010-08-09 16:13:08 +0000
@@ -204,9 +204,21 @@
This is the interface for doing queries, etc.
"""
- def __init__(self, objs, show_progress=True):
+ def __init__(self, objs, show_progress=True, max_parents=None):
+ """Create a new ObjManager
+
+ :param show_progress: If True, as content is loading, write progress
+ information to stderr.
+ :param max_parents: When running compute_parents(), cap the maximum
+ parents tracked to a fixed number, since knowing there are 50k
+ references is only informative, you won't actually track into them.
+ If 0 we will not compute parents, if < 0 we will show all parents.
+ """
self.objs = objs
self.show_progress = show_progress
+ self.max_parents = max_parents
+ if self.max_parents is None:
+ self.max_parents = 100
def __getitem__(self, address):
return self.objs[address]
@@ -219,6 +231,8 @@
def compute_parents(self):
"""For each object, figure out who is referencing it."""
+ if self.max_parents == 0:
+ return
parents = {}
get_refs = parents.get
total = len(self.objs)
@@ -256,13 +270,30 @@
elif t in (int, long):
refs = (refs, address)
elif t is tuple:
- if len(refs) >= 10:
+ if len(refs) >= 5:
refs = list(refs)
refs.append(address)
else:
refs = refs + (address,)
elif t is list:
# if we are close to the maximum number of entries, put
# it through a set() to make sure all the
# duplicates are filtered out
+ if (self.max_parents > 0):
+ if (len(refs) >= self.max_parents):
+ # Our list has been filled, all done
+ continue
+ elif (len(refs) == self.max_parents - 1):
+ # We are one step away from being full. We put
+ # the content into a set() so that we are sure
+ # any duplicates will get filtered out, leaving
+ # space for the new ref.
+ refs.append(address)
+ refs[:] = set(refs)
+ continue
refs.append(address)
+ # We don't need to set it, because we modify-in-place
+ continue
else:
raise TypeError('unknown refs type: %s\n' % (t,))
parents[ref] = refs
@@ -355,6 +386,8 @@
So we collapse those references back into the object, and grow its
'size' at the same time.
+
+ :return: True if some data was collapsed
"""
# The instances I'm focusing on have a custom type name, and every
# instance has 2 pointers. The first is to __dict__, and the second is
@@ -423,6 +456,7 @@
% (item_idx, total, collapsed))
if collapsed:
self.compute_parents()
+ return collapsed
def refs_as_dict(self, obj):
"""Expand the ref list considering it to be a 'dict' structure.
@@ -473,7 +507,8 @@
return o
-def load(source, using_json=None, show_prog=True, collapse=True):
+def load(source, using_json=None, show_prog=True, collapse=True,
+ max_parents=None):
"""Load objects from the given source.
:param source: If this is a string, we will open it as a file and read all
@@ -486,6 +521,8 @@
is available, and use it if it is. (With _speedups built, simplejson
parses faster and more accurately than the regex.)
:param show_prog: If True, display the progress as we read in data
+ :param collapse: If True, run collapse_instance_dicts() after loading.
+ :param max_parents: See ObjManager.__init__(max_parents)
"""
cleanup = None
if isinstance(source, str):
@@ -501,13 +538,15 @@
if using_json is None:
using_json = (simplejson is not None)
try:
- manager = _load(source, using_json, show_prog, input_size)
+ manager = _load(source, using_json, show_prog, input_size,
+ max_parents=max_parents)
finally:
if cleanup is not None:
cleanup()
if collapse:
tstart = time.time()
- manager.collapse_instance_dicts()
+ if not manager.collapse_instance_dicts():
+ manager.compute_parents()
if show_prog:
tend = time.time()
sys.stderr.write('collapsed in %.1fs\n'
@@ -576,13 +615,13 @@
% (line_num, len(objs), mb_read, input_mb, tdelta))
-def _load(source, using_json, show_prog, input_size):
+def _load(source, using_json, show_prog, input_size, max_parents=None):
objs = _loader.MemObjectCollection()
for memobj in iter_objs(source, using_json, show_prog, input_size, objs,
factory=objs.add):
# objs.add automatically adds the object as it is created
pass
- return ObjManager(objs, show_progress=show_prog)
+ return ObjManager(objs, show_progress=show_prog, max_parents=max_parents)
def remove_expensive_references(source, total_objs=0, show_progress=False):
=== modified file 'meliae/tests/test_loader.py'
--- a/meliae/tests/test_loader.py 2010-08-09 15:47:52 +0000
+++ b/meliae/tests/test_loader.py 2010-08-09 16:13:08 +0000
@@ -279,11 +279,34 @@
def test_compute_parents_ignore_repeated(self):
manager = loader.load(_intern_dict_dump, show_prog=False)
str_5 = manager[5]
- manager.compute_parents()
# Each of these refers to str_5 multiple times, but they should only
# show up 1 time in the parent list.
self.assertEqual([6, 7, 8], sorted(str_5.parents))
+ def test_compute_parents_no_parents(self):
+ manager = loader.load(_intern_dict_dump, show_prog=False, max_parents=0)
+ str_5 = manager[5]
+ # With max_parents=0 we skip computing parents entirely, so
+ # str_5 should end up with no parent references at all.
+ self.assertEqual([], sorted(str_5.parents))
+
+ def test_compute_parents_many_parents(self):
+ content = [
+'{"address": 2, "type": "str", "size": 25, "len": 1, "value": "a", "refs": []}',
+]
+ for x in xrange(200):
+ content.append('{"address": %d, "type": "tuple", "size": 20,'
+ ' "len": 2, "refs": [2, 2]}' % (x+100))
+ # By default, we only track 100 parents
+ manager = loader.load(content, show_prog=False)
+ self.assertEqual(100, manager[2].num_parents)
+ manager = loader.load(content, show_prog=False, max_parents=0)
+ self.assertEqual(0, manager[2].num_parents)
+ manager = loader.load(content, show_prog=False, max_parents=-1)
+ self.assertEqual(200, manager[2].num_parents)
+ manager = loader.load(content, show_prog=False, max_parents=10)
+ self.assertEqual(10, manager[2].num_parents)
+
def test_compute_total_size(self):
manager = loader.load(_example_dump, show_prog=False)
objs = manager.objs
More information about the bazaar-commits
mailing list