Rev 183: Default to only showing 100 parent references. in http://bazaar.launchpad.net/~meliae-dev/meliae/trunk

John Arbash Meinel john at arbash-meinel.com
Mon Aug 9 17:13:14 BST 2010


At http://bazaar.launchpad.net/~meliae-dev/meliae/trunk

------------------------------------------------------------
revno: 183
revision-id: john at arbash-meinel.com-20100809161308-oa1wlnk33w6hvg23
parent: john at arbash-meinel.com-20100809154752-ah3vho249qprf4ky
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Mon 2010-08-09 11:13:08 -0500
message:
  Default to only showing 100 parent references.
  
  This should decrease memory consumption for some really big dumps, without
  diminishing utility. (At least until we have a gui or something.)
-------------- next part --------------
=== modified file 'CHANGES.txt'
--- a/CHANGES.txt	2010-08-09 15:35:21 +0000
+++ b/CHANGES.txt	2010-08-09 16:13:08 +0000
@@ -14,6 +14,17 @@
   I don't want to work around. Namely sizeof(Class) doesn't work under
   even pyrex 0.9.9. (John Arbash Meinel)
 
+* Duplicate parent entries are filtered out. (e.g. if the intern dict
+  refers to the same string 2x, you'll see 1 parent ref in the child,
+  not 2.) (John Arbash Meinel)
+
+* We now default to limiting the maximum length of the parents list
+  (default 100). I had some dumps where a single object was referenced
+  50k times. Anything over about 10 is at the point where you won't
+  really walk them. This can be disabled with ``load(max_parents=-1)``.
+  The main win is lowering memory consumption. A 50k parent list takes
+  200kB by itself (on 32-bit).  (John Arbash Meinel)
+
 Meliae 0.3
 ##########
 

=== modified file 'meliae/loader.py'
--- a/meliae/loader.py	2010-08-09 15:47:52 +0000
+++ b/meliae/loader.py	2010-08-09 16:13:08 +0000
@@ -204,9 +204,21 @@
     This is the interface for doing queries, etc.
     """
 
-    def __init__(self, objs, show_progress=True):
+    def __init__(self, objs, show_progress=True, max_parents=None):
+        """Create a new ObjManager
+
+        :param show_progress: If True, as content is loading, write progress
+            information to stderr.
+        :param max_parents: When running compute_parents(), cap the parents
+            tracked per object at this number (None means 100). Knowing there
+            are 50k references is only informative; you won't walk them all.
+            If 0 we will not compute parents, if < 0 we will show all parents.
+        """
         self.objs = objs
         self.show_progress = show_progress
+        self.max_parents = max_parents
+        if self.max_parents is None:
+            self.max_parents = 100
 
     def __getitem__(self, address):
         return self.objs[address]
@@ -219,6 +231,8 @@
 
     def compute_parents(self):
         """For each object, figure out who is referencing it."""
+        if self.max_parents == 0:
+            return
         parents = {}
         get_refs = parents.get
         total = len(self.objs)
@@ -256,13 +270,30 @@
                     elif t in (int, long):
                         refs = (refs, address)
                     elif t is tuple:
-                        if len(refs) >= 10:
+                        if len(refs) >= 5:
                             refs = list(refs)
                             refs.append(address)
                         else:
                             refs = refs + (address,)
                     elif t is list:
+                        # if we are close to the maximum number of entries, put
+                        # it through a set() to make sure we get all the
+                        # duplicates
+                        if (self.max_parents > 0):
+                            if (len(refs) >= self.max_parents):
+                                # Our list has been filled, all done
+                                continue
+                            elif (len(refs) == self.max_parents - 1):
+                                # We are one step away from being full. We put
+                                # the content into a set() so that we are sure
+                                # any duplicates will get filtered out, leaving
+                                # space for the new ref.
+                                refs.append(address)
+                                refs[:] = set(refs)
+                                continue
                         refs.append(address)
+                        # We don't need to set it, because we modify-in-place
+                        continue
                     else:
                         raise TypeError('unknown refs type: %s\n' % (t,))
                     parents[ref] = refs
@@ -355,6 +386,8 @@
 
         So we collapse those references back into the object, and grow its
         'size' at the same time.
+
+        :return: True if some data was collapsed
         """
         # The instances I'm focusing on have a custom type name, and every
         # instance has 2 pointers. The first is to __dict__, and the second is
@@ -423,6 +456,7 @@
                              % (item_idx, total, collapsed))
         if collapsed:
             self.compute_parents()
+        return collapsed
 
     def refs_as_dict(self, obj):
         """Expand the ref list considering it to be a 'dict' structure.
@@ -473,7 +507,8 @@
                 return o
 
 
-def load(source, using_json=None, show_prog=True, collapse=True):
+def load(source, using_json=None, show_prog=True, collapse=True,
+         max_parents=None):
     """Load objects from the given source.
 
     :param source: If this is a string, we will open it as a file and read all
@@ -486,6 +521,8 @@
         is available, and use it if it is. (With _speedups built, simplejson
         parses faster and more accurately than the regex.)
     :param show_prog: If True, display the progress as we read in data
+    :param collapse: If True, run collapse_instance_dicts() after loading.
+    :param max_parents: See ObjManager.__init__(max_parents)
     """
     cleanup = None
     if isinstance(source, str):
@@ -501,13 +538,15 @@
     if using_json is None:
         using_json = (simplejson is not None)
     try:
-        manager = _load(source, using_json, show_prog, input_size)
+        manager = _load(source, using_json, show_prog, input_size,
+                        max_parents=max_parents)
     finally:
         if cleanup is not None:
             cleanup()
     if collapse:
         tstart = time.time()
-        manager.collapse_instance_dicts()
+        if not manager.collapse_instance_dicts():
+            manager.compute_parents()
         if show_prog:
             tend = time.time()
             sys.stderr.write('collapsed in %.1fs\n'
@@ -576,13 +615,13 @@
             % (line_num, len(objs), mb_read, input_mb, tdelta))
 
 
-def _load(source, using_json, show_prog, input_size):
+def _load(source, using_json, show_prog, input_size, max_parents=None):
     objs = _loader.MemObjectCollection()
     for memobj in iter_objs(source, using_json, show_prog, input_size, objs,
                             factory=objs.add):
         # objs.add automatically adds the object as it is created
         pass
-    return ObjManager(objs, show_progress=show_prog)
+    return ObjManager(objs, show_progress=show_prog, max_parents=max_parents)
 
 
 def remove_expensive_references(source, total_objs=0, show_progress=False):

=== modified file 'meliae/tests/test_loader.py'
--- a/meliae/tests/test_loader.py	2010-08-09 15:47:52 +0000
+++ b/meliae/tests/test_loader.py	2010-08-09 16:13:08 +0000
@@ -279,11 +279,34 @@
     def test_compute_parents_ignore_repeated(self):
         manager = loader.load(_intern_dict_dump, show_prog=False)
         str_5 = manager[5]
-        manager.compute_parents()
         # Each of these refers to str_5 multiple times, but they should only
         # show up 1 time in the parent list.
         self.assertEqual([6, 7, 8], sorted(str_5.parents))
 
+    def test_compute_parents_no_parents(self):
+        manager = loader.load(_intern_dict_dump, show_prog=False, max_parents=0)
+        str_5 = manager[5]
+        # With max_parents=0, compute_parents() returns without doing any
+        # work, so no parents are recorded for str_5.
+        self.assertEqual([], sorted(str_5.parents))
+
+    def test_compute_parents_many_parents(self):
+        content = [
+'{"address": 2, "type": "str", "size": 25, "len": 1, "value": "a", "refs": []}',
+]
+        for x in xrange(200):
+            content.append('{"address": %d, "type": "tuple", "size": 20,'
+                           ' "len": 2, "refs": [2, 2]}' % (x+100))
+        # By default, we only track 100 parents
+        manager = loader.load(content, show_prog=False)
+        self.assertEqual(100, manager[2].num_parents)
+        manager = loader.load(content, show_prog=False, max_parents=0)
+        self.assertEqual(0, manager[2].num_parents)
+        manager = loader.load(content, show_prog=False, max_parents=-1)
+        self.assertEqual(200, manager[2].num_parents)
+        manager = loader.load(content, show_prog=False, max_parents=10)
+        self.assertEqual(10, manager[2].num_parents)
+
     def test_compute_total_size(self):
         manager = loader.load(_example_dump, show_prog=False)
         objs = manager.objs



More information about the bazaar-commits mailing list