Rev 184: We now support custom sizeof methods registered at runtime. in http://bazaar.launchpad.net/~meliae-dev/meliae/trunk

John Arbash Meinel john at arbash-meinel.com
Tue Aug 10 17:22:00 BST 2010


At http://bazaar.launchpad.net/~meliae-dev/meliae/trunk

------------------------------------------------------------
revno: 184 [merge]
revision-id: john at arbash-meinel.com-20100810162152-kwinrlv78flsdox5
parent: john at arbash-meinel.com-20100809161308-oa1wlnk33w6hvg23
parent: john at arbash-meinel.com-20100810162009-62a3o1k0j1x8did3
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Tue 2010-08-10 11:21:52 -0500
message:
  We now support custom sizeof methods registered at runtime.
modified:
  CHANGES.txt                    changes.txt-20100104131503-ipkk7tyh2bnv0lu4-1
  meliae/_scanner.pyx            _scanner.pyx-20090401185718-094vrprmymne09r1-2
  meliae/_scanner_core.c         _scanner_core.c-20090402012435-66bb6fp08v4begco-1
  meliae/_scanner_core.h         _scanner_core.h-20090402012435-66bb6fp08v4begco-2
  meliae/scanner.py              scanner.py-20090402040945-idv8d14z8re05gvg-1
  meliae/tests/test__scanner.py  test__scanner.py-20090401185718-094vrprmymne09r1-3
-------------- next part --------------
=== modified file 'CHANGES.txt'
--- a/CHANGES.txt	2010-08-09 16:13:08 +0000
+++ b/CHANGES.txt	2010-08-10 16:20:09 +0000
@@ -25,6 +25,15 @@
   The main win is lowering memory consumption. A 50k parent list takes
   200kB by itself (on 32-bit).  (John Arbash Meinel)
 
+* Fix a PyInt memory leak. We were calling __sizeof__, which returns a
+  PyInt, and not DECREFing it. (John Arbash Meinel)
+
+* Initial support for overriding ``__sizeof__`` definitions. It turns
+  out that ``object`` provides a basic definition, so all new-style
+  classes inherit it. We now provide ``meliae.scanner.add_special_size``,
+  which takes a type name and callbacks that determine the size of an
+  object. Matching on the type name lets you register a handler without
+  importing the module that defines the type.
+
 Meliae 0.3
 ##########
 

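As a usage sketch of the add_special_size API introduced above (the type
name 'mymod.MyExt' and the byte counts are hypothetical; only
add_special_size itself comes from this commit):

    from meliae import scanner

    def _myext_size_32(obj):
        # rough fixed estimate of the C-side buffers on 32-bit builds
        return 800

    def _myext_size_64(obj):
        # the same estimate for 64-bit builds
        return 1600

    scanner.add_special_size('mymod.MyExt', _myext_size_32, _myext_size_64)
    # Passing None removes the registration again:
    # scanner.add_special_size('mymod.MyExt', None, None)
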
=== modified file 'meliae/_scanner.pyx'
--- a/meliae/_scanner.pyx	2010-05-20 14:08:53 +0000
+++ b/meliae/_scanner.pyx	2010-08-10 16:14:19 +0000
@@ -39,6 +39,7 @@
     void _dump_object_info(write_callback write, void *callee_data,
                            object c_obj, object nodump, int recurse)
     object _get_referents(object c_obj)
+    object _get_special_case_dict()
 
 
 _word_size = sizeof(Py_ssize_t)
@@ -111,3 +112,133 @@
     tp_traverse.
     """
     return _get_referents(obj)
+
+
+def add_special_size(object tp_name, object size_of_32, object size_of_64):
+    """Special case a given object size.
+
+    This is only meant for objects we don't already handle directly, or
+    whose __sizeof__ is missing or misleading. (The direct type checks run
+    first; registered callbacks are then consulted before __sizeof__,
+    since object provides a default definition that all new-style classes
+    inherit.)
+
+    This is meant for things like zlib.Compress, which allocates a lot of
+    internal buffers that are not easily accessible (but can be
+    approximated).  The gc header should not be included in this size; it
+    will be added at runtime.
+
+    Passing None for the callback removes any existing registration.
+
+    (We only distinguish size_of_32 from size_of_64 for the implementer's
+    benefit, since C-level sizeof() values are not generally accessible
+    from Python.)
+
+    :param tp_name: The type name we care about (such as 'zlib.Compress').
+        This will be matched against object->type->tp_name.
+    :param size_of_32: Callback used on 32-bit builds (_word_size == 4)
+    :param size_of_64: Callback used on 64-bit builds (_word_size == 8)
+    :return: None
+    """
+    special_dict = _get_special_case_dict()
+    if _word_size == 4:
+        sz = size_of_32
+    elif _word_size == 8:
+        sz = size_of_64
+    else:
+        raise RuntimeError('Unknown word size: %s' % (_word_size,))
+    if sz is None:
+        if tp_name in special_dict:
+            del special_dict[tp_name]
+    else:
+        special_dict[tp_name] = sz
+
+
+def _zlib_size_of_32(zlib_obj):
+    """Return a __sizeof__ for a zlib object."""
+    cdef Py_ssize_t size
+
+    t = type(zlib_obj)
+    name = t.__name__
+    # Size of the zlib 'compobject' (PyObject_HEAD + z_stream + misc)
+    size = 56
+    if name.endswith('Decompress'):
+        # _get_referents doesn't track into these attributes, so we just
+        # attribute the size to the object itself.
+        size += _size_of(zlib_obj.unused_data)
+        size += _size_of(zlib_obj.unconsumed_tail)
+        # sizeof(inflate_state)
+        size += 7116
+        # sizeof(buffers allocated for inflate)
+        # (1 << state->wbits)
+        # However, we don't have access to wbits, so we assume the default (and
+        # largest) of 15 wbits
+        size += (1 << 15)
+        # Empirically 42kB / object during decompression, and this gives 39998
+    elif name.endswith('Compress'):
+        # compress objects have a reference to unused_data, etc, but it always
+        # points to the empty string.
+        # sizeof(deflate_state)
+        size += 5828
+        # We don't have access to the stream C attributes, so we assume the
+        # standard values and go with it
+        # Pos == unsigned short
+        # Byte == unsigned char
+        # w_size = 1 << s->w_bits, default 15 => (1<<15)
+        # memLevel default is 8 (maybe 9?)
+        # s->w_size * 2*sizeof(Byte) = (1<<15) * 2 * 1 = 65536
+        size += 65536
+        # s->w_size * sizeof(Pos) = (1<<15) * 2 = 65536
+        size += 65536
+        # s->hash_size * sizeof(Pos) = (1 << (8+7)) * 2 = 65536
+        size += 65536
+        # s->lit_bufsize = 1 << (8 + 6) = (1 << 14) = 16384
+        # s->pending_buf = lit_bufsize * (sizeof(ush)+2) = 4*16384 = 65536
+        size += 65536
+        # empirically, I got ~96378 bytes/object after allocating a lot of them
+        # After sending a bunch of compression data to all of them, I got
+        # ~270127 bytes/object. (according to WorkingMem)
+        # This gives 268028, which is pretty close
+    else:
+        return -1
+    # We assume that everything is at least aligned to word boundary
+    if size % _word_size != 0:
+        size += _word_size - (size % _word_size)
+    return size
+
+
+def _zlib_size_of_64(zlib_obj):
+    """Return a __sizeof__ for a zlib object."""
+    t = type(zlib_obj)
+    name = t.__name__
+    # Size of the zlib 'compobject' (PyObject_HEAD + z_stream + misc)
+    # All the 64-bit numbers here are 'made up'
+    size = (56 * 2)
+    if name.endswith('Decompress'):
+        size += _size_of(zlib_obj.unused_data)
+        size += _size_of(zlib_obj.unconsumed_tail)
+        # sizeof(inflate_state)
+        size += (7116 * 2)
+        # sizeof(buffers allocated for inflate)
+        # (1 << state->wbits)
+        # However, we don't have access to wbits, so we assume the default (and
+        # largest) of 15 wbits
+        size += (1 << 15)
+    elif name.endswith('Compress'):
+        # sizeof(deflate_state)
+        size += (5828 * 2)
+        # We don't have access to the stream C attributes, so we assume the
+        # standard values and go with it
+        # s->w_size * 2*sizeof(Byte) = (1<<15) * 2 * 1 = 65536
+        size += 65536
+        # s->w_size * sizeof(Pos) = (1<<15) * 2 = 65536
+        size += 65536
+        # s->hash_size * sizeof(Pos) = (1 << (8+7)) * 2 = 65536
+        size += 65536
+        # s->lit_bufsize = 1 << (8 + 6) = (1 << 14) = 16384
+        # s->pending_buf = lit_bufsize * (sizeof(ush)+2) = 4*16384 = 65536
+        size += 65536
+    else:
+        return -1
+    if size % _word_size != 0:
+        size += _word_size - (size % _word_size)
+    return size
+
+add_special_size('zlib.Compress', _zlib_size_of_32, _zlib_size_of_64)
+add_special_size('zlib.Decompress', _zlib_size_of_32, _zlib_size_of_64)

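As a sanity check on the 32-bit compress estimate above, the constants
from the comments in _zlib_size_of_32 add up as follows (plain Python,
just restating the arithmetic):

    size = 56            # compobject header + z_stream + misc
    size += 5828         # sizeof(deflate_state)
    size += 65536 * 4    # window, prev, head, and pending_buf buffers
    assert size == 268028
    # 268028 is already a multiple of 4, so the word-boundary rounding at
    # the end of _zlib_size_of_32 leaves it unchanged on 32-bit builds.
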
=== modified file 'meliae/_scanner_core.c'
--- a/meliae/_scanner_core.c	2010-07-12 17:14:01 +0000
+++ b/meliae/_scanner_core.c	2010-08-10 00:58:56 +0000
@@ -60,12 +60,14 @@
 #else
 static void _write_to_ref_info(struct ref_info *info, const char *fmt_string, ...);
 #endif
+static PyObject * _get_specials();
 
 /* The address of the last thing we dumped. Stuff like dumping the string
  * interned dictionary will dump the same string 2x in a row. This helps
  * prevent that.
  */
 static PyObject *_last_dumped = NULL;
+static PyObject *_special_case_dict = NULL;
 
 void
 _clear_last_dumped()
@@ -73,7 +75,7 @@
     _last_dumped = NULL;
 }
 
-Py_ssize_t
+static Py_ssize_t
 _basic_object_size(PyObject *c_obj)
 {
     Py_ssize_t size;
@@ -85,7 +87,7 @@
 }
 
 
-Py_ssize_t
+static Py_ssize_t
 _var_object_size(PyVarObject *c_obj)
 {
     Py_ssize_t num_entries;
@@ -99,7 +101,28 @@
             + num_entries * c_obj->ob_type->tp_itemsize;
 }
 
-Py_ssize_t
+static Py_ssize_t
+_object_to_size_with_gc(PyObject *size_obj, PyObject *c_obj)
+{
+    Py_ssize_t size = -1;
+
+    size = PyInt_AsSsize_t(size_obj);
+    if (size == -1) {
+        // Probably an error occurred, we don't know for sure, but we might as
+        // well just claim that we don't know the size. We *could* check
+        // PyErr_Occurred(), but if we are just clearing it anyway...
+        PyErr_Clear();
+        return -1;
+    }
+    // There is one trick left. Namely, __sizeof__ doesn't include the
+    // GC overhead, so let's add that back in
+    if (PyType_HasFeature(Py_TYPE(c_obj), Py_TPFLAGS_HAVE_GC)) {
+        size += sizeof(PyGC_Head);
+    }
+    return size;
+}
+
+static Py_ssize_t
 _size_of_from__sizeof__(PyObject *c_obj)
 {
     PyObject *size_obj = NULL;
@@ -117,24 +140,13 @@
         PyErr_Clear();
         return -1;
     }
-    size = PyInt_AsSsize_t(size_obj);
-    if (size == -1) {
-        // Probably an error occurred, we don't know for sure, but we might as
-        // well just claim that we don't know the size. We *could* check
-        // PyErr_Occurred(), but if we are just clearing it anyway...
-        PyErr_Clear();
-        return -1;
-    }
-    // There is one trick left. Namely, __sizeof__ doesn't seem to include the
-    // GC overhead, so let's add that back in
-    if (PyType_HasFeature(c_obj->ob_type, Py_TPFLAGS_HAVE_GC)) {
-        size += sizeof(PyGC_Head);
-    }
+    size = _object_to_size_with_gc(size_obj, c_obj);
+    Py_DECREF(size_obj);
     return size;
 }
 
 
-Py_ssize_t
+static Py_ssize_t
 _size_of_list(PyListObject *c_obj)
 {
     Py_ssize_t size;
@@ -144,7 +156,7 @@
 }
 
 
-Py_ssize_t
+static Py_ssize_t
 _size_of_set(PySetObject *c_obj)
 {
     Py_ssize_t size;
@@ -156,7 +168,7 @@
 }
 
 
-Py_ssize_t
+static Py_ssize_t
 _size_of_dict(PyDictObject *c_obj)
 {
     Py_ssize_t size;
@@ -168,7 +180,7 @@
 }
 
 
-Py_ssize_t
+static Py_ssize_t
 _size_of_unicode(PyUnicodeObject *c_obj)
 {
     Py_ssize_t size;
@@ -177,6 +189,51 @@
     return size;
 }
 
+static Py_ssize_t
+_size_of_from_specials(PyObject *c_obj)
+{
+    PyObject *special_dict;
+    PyObject *special_size_of;
+    PyObject *val;
+    Py_ssize_t size;
+
+    special_dict = _get_specials();
+    if (special_dict == NULL) {
+        PyErr_Clear(); // Not sure what happened, but don't propagate it
+        return -1;
+    }
+    special_size_of = PyDict_GetItemString(special_dict,
+                                           Py_TYPE(c_obj)->tp_name);
+    if (special_size_of == NULL) {
+        // if special_size_of is NULL, an exception is *not* set
+        return -1;
+    }
+    // special_size_of is a *borrowed reference*
+    val = PyObject_CallFunction(special_size_of, "O", c_obj);
+    if (val == NULL) {
+        return -1;
+    }
+    size = _object_to_size_with_gc(val, c_obj);
+    Py_DECREF(val);
+    return size;
+}
+
+static Py_ssize_t
+_size_of_from_var_or_basic_size(PyObject *c_obj)
+{
+    /* There are a bunch of types that we know we can check directly, without
+     * having to go through the __sizeof__ abstraction. This allows us to avoid
+     * the extra intermediate allocations. It is also our final fallback
+     * method.
+     */
+
+    if (c_obj->ob_type->tp_itemsize != 0) {
+        // Variable length object with inline storage
+        // total size is tp_itemsize * ob_size
+        return _var_object_size((PyVarObject *)c_obj);
+    }
+    return _basic_object_size(c_obj);
+}
 
 Py_ssize_t
 _size_of(PyObject *c_obj)
@@ -191,19 +248,27 @@
         return _size_of_dict((PyDictObject *)c_obj);
     } else if PyUnicode_Check(c_obj) {
         return _size_of_unicode((PyUnicodeObject *)c_obj);
+    } else if (PyTuple_CheckExact(c_obj)
+            || PyString_CheckExact(c_obj)
+            || PyInt_CheckExact(c_obj)
+            || PyBool_Check(c_obj)
+            || c_obj == Py_None
+            || PyModule_CheckExact(c_obj))
+    {
+        // All of these implement __sizeof__, but we don't need to use it
+        return _size_of_from_var_or_basic_size(c_obj);
     }
 
+    // object implements __sizeof__, so we have to check specials first
+    size = _size_of_from_specials(c_obj);
+    if (size != -1) {
+        return size;
+    }
     size = _size_of_from__sizeof__(c_obj);
     if (size != -1) {
         return size;
     }
-
-    if (c_obj->ob_type->tp_itemsize != 0) {
-        // Variable length object with inline storage
-        // total size is tp_itemsize * ob_size
-        return _var_object_size((PyVarObject *)c_obj);
-    }
-    return _basic_object_size(c_obj);
+    return _size_of_from_var_or_basic_size(c_obj);
 }
 
 
@@ -527,7 +592,8 @@
 /**
  * Return a PyList of all objects referenced via tp_traverse.
  */
-PyObject *_get_referents(PyObject *c_obj)
+PyObject *
+_get_referents(PyObject *c_obj)
 {
     PyObject *lst;
 
@@ -543,3 +609,22 @@
     }
     return lst;
 }
+
+static PyObject *
+_get_specials()
+{
+    if (_special_case_dict == NULL) {
+        _special_case_dict = PyDict_New();
+    }
+    return _special_case_dict;
+}
+
+PyObject *
+_get_special_case_dict()
+{
+    PyObject *ret;
+
+    ret = _get_specials();
+    Py_XINCREF(ret);
+    return ret;
+}

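The net effect of the _size_of() changes is a new lookup order, sketched
here in pure Python for illustration (the 'specials' dict stands in for
the module-level _special_case_dict; this is not the actual C code):

    def _size_of_sketch(obj, specials):
        # 1. A handful of known C types (list, set, dict, unicode, plus
        #    the tuple/str/int/bool/None/module fast path) are handled
        #    directly in C; elided here.
        # 2. Registered special cases are consulted before __sizeof__,
        #    because object provides a default that every new-style class
        #    inherits. The real key is Py_TYPE(obj)->tp_name, which for
        #    extension types includes the module (e.g. 'zlib.Compress').
        cb = specials.get(type(obj).__name__)
        if cb is not None:
            size = cb(obj)
            if size != -1:      # -1 means fall through to __sizeof__
                return size
        # 3. The object's own __sizeof__; the C code also adds the
        #    PyGC_Head overhead for GC-tracked objects here.
        try:
            size = obj.__sizeof__()
        except (AttributeError, TypeError):
            size = -1
        if size != -1:
            return size
        # 4. Final fallback: tp_basicsize (+ ob_size * tp_itemsize), which
        #    has no pure-Python equivalent, so just report unknown.
        return -1
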
=== modified file 'meliae/_scanner_core.h'
--- a/meliae/_scanner_core.h	2010-07-12 20:20:13 +0000
+++ b/meliae/_scanner_core.h	2010-08-09 22:16:14 +0000
@@ -57,6 +57,15 @@
  */
 extern PyObject *_get_referents(PyObject *c_obj);
 
+/**
+ * Return a (mutable) dict of known special cases.
+ *
+ * These are objects whose size is not reported properly, but which we have
+ * figured out via trial-and-error.
+ * The keys are tp_name strings; the values are callables that take the
+ * object and return its size.
+ */
+extern PyObject *_get_special_case_dict();
+
 
 #endif // _SCANNER_CORE_H_
 

=== modified file 'meliae/scanner.py'
--- a/meliae/scanner.py	2010-07-12 22:28:03 +0000
+++ b/meliae/scanner.py	2010-08-10 16:20:09 +0000
@@ -25,6 +25,7 @@
 
 size_of = _scanner.size_of
 get_referents = _scanner.get_referents
+add_special_size = _scanner.add_special_size
 
 
 def dump_all_referenced(outf, obj, is_pending=False):

=== modified file 'meliae/tests/test__scanner.py'
--- a/meliae/tests/test__scanner.py	2010-07-12 20:20:13 +0000
+++ b/meliae/tests/test__scanner.py	2010-08-10 16:14:19 +0000
@@ -18,6 +18,7 @@
 import sys
 import tempfile
 import types
+import zlib
 
 from meliae import (
     _scanner,
@@ -170,6 +171,60 @@
         # back to the original size
         self.assertSizeOf(4, CustomSize(-1), has_gc=True)
 
+    def test_size_of_special(self):
+        class CustomWithoutSizeof(object):
+            pass
+        log = []
+        def _size_32(obj):
+            log.append(obj)
+            return 800
+        def _size_64(obj):
+            log.append(obj)
+            return 1600
+
+        obj = CustomWithoutSizeof()
+        self.assertSizeOf(4, obj)
+        _scanner.add_special_size('CustomWithoutSizeof', _size_32, _size_64)
+        try:
+            self.assertSizeOf(200, obj)
+        finally:
+            _scanner.add_special_size('CustomWithoutSizeof', None, None)
+        self.assertEqual([obj], log)
+        del log[:]
+        self.assertSizeOf(4, obj)
+        self.assertEqual([], log)
+
+    def test_size_of_special_neg1(self):
+        # Returning -1 falls back to the regular __sizeof__, etc interface
+        class CustomWithoutSizeof(object):
+            pass
+        log = []
+        def _size_neg1(obj):
+            log.append(obj)
+            return -1
+        obj = CustomWithoutSizeof()
+        self.assertSizeOf(4, obj)
+        _scanner.add_special_size('CustomWithoutSizeof', _size_neg1, _size_neg1)
+        try:
+            self.assertSizeOf(4, obj)
+        finally:
+            _scanner.add_special_size('CustomWithoutSizeof', None, None)
+        self.assertEqual([obj], log)
+
+    def test_size_of_zlib_compress_obj(self):
+        # zlib compress objects allocate a lot of extra buffers; we want
+        # to track that. Note that we are approximating the size, because
+        # we don't actually inspect the C attributes. But it is a closer
+        # approximation than not doing this.
+        c = zlib.compressobj()
+        self.assertTrue(_scanner.size_of(c) > 256000)
+        self.assertEqual(0, _scanner.size_of(c) % _scanner._word_size)
+
+    def test_size_of_zlib_decompress_obj(self):
+        d = zlib.decompressobj()
+        self.assertTrue(_scanner.size_of(d) > 30000)
+        self.assertEqual(0, _scanner.size_of(d) % _scanner._word_size)
+
 
 def _string_to_json(s):
     out = ['"']


