Rev 184: We now support custom sizeof methods registered at runtime. in http://bazaar.launchpad.net/~meliae-dev/meliae/trunk
John Arbash Meinel
john at arbash-meinel.com
Tue Aug 10 17:22:00 BST 2010
At http://bazaar.launchpad.net/~meliae-dev/meliae/trunk
------------------------------------------------------------
revno: 184 [merge]
revision-id: john at arbash-meinel.com-20100810162152-kwinrlv78flsdox5
parent: john at arbash-meinel.com-20100809161308-oa1wlnk33w6hvg23
parent: john at arbash-meinel.com-20100810162009-62a3o1k0j1x8did3
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Tue 2010-08-10 11:21:52 -0500
message:
We now support custom sizeof methods registered at runtime.
modified:
CHANGES.txt changes.txt-20100104131503-ipkk7tyh2bnv0lu4-1
meliae/_scanner.pyx _scanner.pyx-20090401185718-094vrprmymne09r1-2
meliae/_scanner_core.c _scanner_core.c-20090402012435-66bb6fp08v4begco-1
meliae/_scanner_core.h _scanner_core.h-20090402012435-66bb6fp08v4begco-2
meliae/scanner.py scanner.py-20090402040945-idv8d14z8re05gvg-1
meliae/tests/test__scanner.py test__scanner.py-20090401185718-094vrprmymne09r1-3
-------------- next part --------------
=== modified file 'CHANGES.txt'
--- a/CHANGES.txt 2010-08-09 16:13:08 +0000
+++ b/CHANGES.txt 2010-08-10 16:20:09 +0000
@@ -25,6 +25,15 @@
The main win is lowering memory consumption. A 50k parent list takes
200kB by itself (on 32-bit). (John Arbash Meinel)
+* Fix a PyInt memory leak. We were calling __sizeof__ which returns a
+ PyInt and not DECREFing it. (John Arbash Meinel)
+
+* Initial support for overriding ``__sizeof__`` definitions. It turns
+ out that object has a basic definition, so all new style classes
+ inherit it. We now provide ``meliae.scanner.add_special_size``, which
+ takes a type name and some callbacks to determine the size of an
+ object. This lets you register them without importing the module.
+
Meliae 0.3
##########
=== modified file 'meliae/_scanner.pyx'
--- a/meliae/_scanner.pyx 2010-05-20 14:08:53 +0000
+++ b/meliae/_scanner.pyx 2010-08-10 16:14:19 +0000
@@ -39,6 +39,7 @@
void _dump_object_info(write_callback write, void *callee_data,
object c_obj, object nodump, int recurse)
object _get_referents(object c_obj)
+ object _get_special_case_dict()
_word_size = sizeof(Py_ssize_t)
@@ -111,3 +112,133 @@
tp_traverse.
"""
return _get_referents(obj)
+
+
+def add_special_size(object tp_name, object size_of_32, object size_of_64):
+ """Special case a given object size.
+
+ This is only meant to be used for objects we don't already handle or which
+ don't implement __sizeof__ (those are checked before this check happens).
+
+ This is meant for things like zlib.Compress which allocates a lot of
+ internal buffers, which are not easily accessible (but can be
+ approximated). The gc header should not be included in this size, it will
+ be added at runtime.
+
+ Setting the value to None will remove the value.
+
+ (We only distinguish size_of_32 from size_of_64 for the implementer's
+ benefit, since sizeof() is not generally accessible from Python.)
+
+ :param tp_name: The type string we care about (such as 'zlib.Compress').
+ This will be matched against object->type->tp_name.
+ :param size_of_32: Called when _word_size == 32-bits
+ :param size_of_64: Called when _word_size == 64-bits
+ :return: None
+ """
+ special_dict = _get_special_case_dict()
+ if _word_size == 4:
+ sz = size_of_32
+ elif _word_size == 8:
+ sz = size_of_64
+ else:
+ raise RuntimeError('Unknown word size: %s' % (_word_size,))
+ if sz is None:
+ if tp_name in special_dict:
+ del special_dict[tp_name]
+ else:
+ special_dict[tp_name] = sz
+
+
+def _zlib_size_of_32(zlib_obj):
+ """Return a __sizeof__ for a zlib object."""
+ cdef Py_ssize_t size
+
+ t = type(zlib_obj)
+ name = t.__name__
+ # Size of the zlib 'compobject', (PyObject_HEAD + z_stream, + misc)
+ size = 56
+ if name.endswith('Decompress'):
+ # _get_referents doesn't track into these attributes, so we just
+ # attribute the size to the object itself.
+ size += _size_of(zlib_obj.unused_data)
+ size += _size_of(zlib_obj.unconsumed_tail)
+ # sizeof(inflate_state)
+ size += 7116
+ # sizeof(buffers allocated for inflate)
+ # (1 << state->wbits)
+ # However, we don't have access to wbits, so we assume the default (and
+ # largest) of 15 wbits
+ size += (1 << 15)
+ # Empirically 42kB / object during decompression, and this gives 39998
+ elif name.endswith('Compress'):
+ # compress objects have a reference to unused_data, etc, but it always
+ # points to the empty string.
+ # sizeof(deflate_state)
+ size += 5828
+ # We don't have access to the stream C attributes, so we assume the
+ # standard values and go with it
+ # Pos == unsigned short
+ # Byte == unsigned char
+ # w_size = 1 << s->w_bits, default 15 => (1<<15)
+ # memLevel default is 8 (maybe 9?)
+ # s->w_size * 2*sizeof(Byte) = (1<<15) * 2 * 1 = 65536
+ size += 65536
+    # s->w_size * sizeof(Pos) = (1<<15) * 2 = 65536
+ size += 65536
+ # s->hash_size * sizeof(Pos) = (1 << (8+7)) * 2 = 65536
+ size += 65536
+ # s->lit_bufsize = 1 << (8 + 6) = (1 << 14) = 16384
+ # s->pending_buf = lit_bufsize * (sizeof(ush)+2) = 4*16384 = 65536
+ size += 65536
+ # empirically, I got ~96378 bytes/object after allocating a lot of them
+ # After sending a bunch of compression data to all of them, I got
+ # ~270127 bytes/object. (according to WorkingMem)
+ # This gives 268028, which is pretty close
+ else:
+ return -1
+ # We assume that everything is at least aligned to word boundary
+ if size % _word_size != 0:
+ size += _word_size - (size % _word_size)
+ return size
+
+
+def _zlib_size_of_64(zlib_obj):
+ """Return a __sizeof__ for a zlib object."""
+ t = type(zlib_obj)
+ name = t.__name__
+ # Size of the zlib 'compobject', (PyObject_HEAD + z_stream, + misc)
+ # All the 64-bit numbers here are 'made up'
+ size = (56 * 2)
+ if name.endswith('Decompress'):
+ size += _size_of(zlib_obj.unused_data)
+ size += _size_of(zlib_obj.unconsumed_tail)
+ # sizeof(inflate_state)
+ size += (7116 * 2)
+ # sizeof(buffers allocated for inflate)
+ # (1 << state->wbits)
+ # However, we don't have access to wbits, so we assume the default (and
+ # largest) of 15 wbits
+ size += (1 << 15)
+ elif name.endswith('Compress'):
+ # sizeof(deflate_state)
+ size += (5828 * 2)
+ # We don't have access to the stream C attributes, so we assume the
+ # standard values and go with it
+ # s->w_size * 2*sizeof(Byte) = (1<<15) * 2 * 1 = 65536
+ size += 65536
+        # s->w_size * sizeof(Pos) = (1<<15) * 2 = 65536
+ size += 65536
+ # s->hash_size * sizeof(Pos) = (1 << (8+7)) * 2 = 65536
+ size += 65536
+ # s->lit_bufsize = 1 << (8 + 6) = (1 << 14) = 16384
+ # s->pending_buf = lit_bufsize * (sizeof(ush)+2) = 4*16384 = 65536
+ size += 65536
+ else:
+ return -1
+ if size % _word_size != 0:
+ size += _word_size - (size % _word_size)
+ return size
+
+add_special_size('zlib.Compress', _zlib_size_of_32, _zlib_size_of_64)
+add_special_size('zlib.Decompress', _zlib_size_of_32, _zlib_size_of_64)
=== modified file 'meliae/_scanner_core.c'
--- a/meliae/_scanner_core.c 2010-07-12 17:14:01 +0000
+++ b/meliae/_scanner_core.c 2010-08-10 00:58:56 +0000
@@ -60,12 +60,14 @@
#else
static void _write_to_ref_info(struct ref_info *info, const char *fmt_string, ...);
#endif
+static PyObject * _get_specials();
/* The address of the last thing we dumped. Stuff like dumping the string
* interned dictionary will dump the same string 2x in a row. This helps
* prevent that.
*/
static PyObject *_last_dumped = NULL;
+static PyObject *_special_case_dict = NULL;
void
_clear_last_dumped()
@@ -73,7 +75,7 @@
_last_dumped = NULL;
}
-Py_ssize_t
+static Py_ssize_t
_basic_object_size(PyObject *c_obj)
{
Py_ssize_t size;
@@ -85,7 +87,7 @@
}
-Py_ssize_t
+static Py_ssize_t
_var_object_size(PyVarObject *c_obj)
{
Py_ssize_t num_entries;
@@ -99,7 +101,28 @@
+ num_entries * c_obj->ob_type->tp_itemsize;
}
-Py_ssize_t
+static Py_ssize_t
+_object_to_size_with_gc(PyObject *size_obj, PyObject *c_obj)
+{
+ Py_ssize_t size = -1;
+
+ size = PyInt_AsSsize_t(size_obj);
+ if (size == -1) {
+ // Probably an error occurred, we don't know for sure, but we might as
+ // well just claim that we don't know the size. We *could* check
+ // PyErr_Occurred(), but if we are just clearing it anyway...
+ PyErr_Clear();
+ return -1;
+ }
+ // There is one trick left. Namely, __sizeof__ doesn't include the
+ // GC overhead, so let's add that back in
+ if (PyType_HasFeature(Py_TYPE(c_obj), Py_TPFLAGS_HAVE_GC)) {
+ size += sizeof(PyGC_Head);
+ }
+ return size;
+}
+
+static Py_ssize_t
_size_of_from__sizeof__(PyObject *c_obj)
{
PyObject *size_obj = NULL;
@@ -117,24 +140,13 @@
PyErr_Clear();
return -1;
}
- size = PyInt_AsSsize_t(size_obj);
- if (size == -1) {
- // Probably an error occurred, we don't know for sure, but we might as
- // well just claim that we don't know the size. We *could* check
- // PyErr_Occurred(), but if we are just clearing it anyway...
- PyErr_Clear();
- return -1;
- }
- // There is one trick left. Namely, __sizeof__ doesn't seem to include the
- // GC overhead, so let's add that back in
- if (PyType_HasFeature(c_obj->ob_type, Py_TPFLAGS_HAVE_GC)) {
- size += sizeof(PyGC_Head);
- }
+ size = _object_to_size_with_gc(size_obj, c_obj);
+ Py_DECREF(size_obj);
return size;
}
-Py_ssize_t
+static Py_ssize_t
_size_of_list(PyListObject *c_obj)
{
Py_ssize_t size;
@@ -144,7 +156,7 @@
}
-Py_ssize_t
+static Py_ssize_t
_size_of_set(PySetObject *c_obj)
{
Py_ssize_t size;
@@ -156,7 +168,7 @@
}
-Py_ssize_t
+static Py_ssize_t
_size_of_dict(PyDictObject *c_obj)
{
Py_ssize_t size;
@@ -168,7 +180,7 @@
}
-Py_ssize_t
+static Py_ssize_t
_size_of_unicode(PyUnicodeObject *c_obj)
{
Py_ssize_t size;
@@ -177,6 +189,51 @@
return size;
}
+static Py_ssize_t
+_size_of_from_specials(PyObject *c_obj)
+{
+ PyObject *special_dict;
+ PyObject *special_size_of;
+ PyObject *val;
+ Py_ssize_t size;
+
+ special_dict = _get_specials();
+ if (special_dict == NULL) {
+        PyErr_Clear(); // Not sure what happened, but don't propagate it
+ return -1;
+ }
+ special_size_of = PyDict_GetItemString(special_dict,
+ Py_TYPE(c_obj)->tp_name);
+ if (special_size_of == NULL) {
+ // if special_size_of is NULL, an exception is *not* set
+ return -1;
+ }
+    // special_size_of is a *borrowed reference*
+ val = PyObject_CallFunction(special_size_of, "O", c_obj);
+ if (val == NULL) {
+ return -1;
+ }
+ size = _object_to_size_with_gc(val, c_obj);
+ Py_DECREF(val);
+ return size;
+}
+
+static Py_ssize_t
+_size_of_from_var_or_basic_size(PyObject *c_obj)
+{
+ /* There are a bunch of types that we know we can check directly, without
+ * having to go through the __sizeof__ abstraction. This allows us to avoid
+ * the extra intermediate allocations. It is also our final fallback
+ * method.
+ */
+
+ if (c_obj->ob_type->tp_itemsize != 0) {
+ // Variable length object with inline storage
+ // total size is tp_itemsize * ob_size
+ return _var_object_size((PyVarObject *)c_obj);
+ }
+ return _basic_object_size(c_obj);
+}
Py_ssize_t
_size_of(PyObject *c_obj)
@@ -191,19 +248,27 @@
return _size_of_dict((PyDictObject *)c_obj);
} else if PyUnicode_Check(c_obj) {
return _size_of_unicode((PyUnicodeObject *)c_obj);
+ } else if (PyTuple_CheckExact(c_obj)
+ || PyString_CheckExact(c_obj)
+ || PyInt_CheckExact(c_obj)
+ || PyBool_Check(c_obj)
+ || c_obj == Py_None
+ || PyModule_CheckExact(c_obj))
+ {
+ // All of these implement __sizeof__, but we don't need to use it
+ return _size_of_from_var_or_basic_size(c_obj);
}
+    // object implements __sizeof__ so we have to check the specials first
+ size = _size_of_from_specials(c_obj);
+ if (size != -1) {
+ return size;
+ }
size = _size_of_from__sizeof__(c_obj);
if (size != -1) {
return size;
}
-
- if (c_obj->ob_type->tp_itemsize != 0) {
- // Variable length object with inline storage
- // total size is tp_itemsize * ob_size
- return _var_object_size((PyVarObject *)c_obj);
- }
- return _basic_object_size(c_obj);
+ return _size_of_from_var_or_basic_size(c_obj);
}
@@ -527,7 +592,8 @@
/**
* Return a PyList of all objects referenced via tp_traverse.
*/
-PyObject *_get_referents(PyObject *c_obj)
+PyObject *
+_get_referents(PyObject *c_obj)
{
PyObject *lst;
@@ -543,3 +609,22 @@
}
return lst;
}
+
+static PyObject *
+_get_specials()
+{
+ if (_special_case_dict == NULL) {
+ _special_case_dict = PyDict_New();
+ }
+ return _special_case_dict;
+}
+
+PyObject *
+_get_special_case_dict()
+{
+ PyObject *ret;
+
+ ret = _get_specials();
+ Py_XINCREF(ret);
+ return ret;
+}
=== modified file 'meliae/_scanner_core.h'
--- a/meliae/_scanner_core.h 2010-07-12 20:20:13 +0000
+++ b/meliae/_scanner_core.h 2010-08-09 22:16:14 +0000
@@ -57,6 +57,15 @@
*/
extern PyObject *_get_referents(PyObject *c_obj);
+/**
+ * Return a (mutable) dict of known special cases.
+ *
+ * These are objects whose size is not reported properly, but which we have
+ * figured out via trial-and-error.
+ * The key is tp_name strings, the value is a callable that returns the size.
+ */
+extern PyObject *_get_special_case_dict();
+
#endif // _SCANNER_CORE_H_
=== modified file 'meliae/scanner.py'
--- a/meliae/scanner.py 2010-07-12 22:28:03 +0000
+++ b/meliae/scanner.py 2010-08-10 16:20:09 +0000
@@ -25,6 +25,7 @@
size_of = _scanner.size_of
get_referents = _scanner.get_referents
+add_special_size = _scanner.add_special_size
def dump_all_referenced(outf, obj, is_pending=False):
=== modified file 'meliae/tests/test__scanner.py'
--- a/meliae/tests/test__scanner.py 2010-07-12 20:20:13 +0000
+++ b/meliae/tests/test__scanner.py 2010-08-10 16:14:19 +0000
@@ -18,6 +18,7 @@
import sys
import tempfile
import types
+import zlib
from meliae import (
_scanner,
@@ -170,6 +171,60 @@
# back to the original size
self.assertSizeOf(4, CustomSize(-1), has_gc=True)
+ def test_size_of_special(self):
+ class CustomWithoutSizeof(object):
+ pass
+ log = []
+ def _size_32(obj):
+ log.append(obj)
+ return 800
+ def _size_64(obj):
+ log.append(obj)
+ return 1600
+
+ obj = CustomWithoutSizeof()
+ self.assertSizeOf(4, obj)
+ _scanner.add_special_size('CustomWithoutSizeof', _size_32, _size_64)
+ try:
+ self.assertSizeOf(200, obj)
+ finally:
+ _scanner.add_special_size('CustomWithoutSizeof', None, None)
+ self.assertEqual([obj], log)
+ del log[:]
+ self.assertSizeOf(4, obj)
+ self.assertEqual([], log)
+
+ def test_size_of_special_neg1(self):
+ # Returning -1 falls back to the regular __sizeof__, etc interface
+ class CustomWithoutSizeof(object):
+ pass
+ log = []
+ def _size_neg1(obj):
+ log.append(obj)
+ return -1
+ obj = CustomWithoutSizeof()
+ self.assertSizeOf(4, obj)
+ _scanner.add_special_size('CustomWithoutSizeof', _size_neg1, _size_neg1)
+ try:
+ self.assertSizeOf(4, obj)
+ finally:
+ _scanner.add_special_size('CustomWithoutSizeof', None, None)
+ self.assertEqual([obj], log)
+
+ def test_size_of_zlib_compress_obj(self):
+ # zlib compress objects allocate a lot of extra buffers, we want to
+ # track that. Note that we are approximating it, because we don't
+ # actually inspect the C attributes. But it is a closer approximation
+ # than not doing this.
+ c = zlib.compressobj()
+ self.assertTrue(_scanner.size_of(c) > 256000)
+ self.assertEqual(0, _scanner.size_of(c) % _scanner._word_size)
+
+ def test_size_of_zlib_decompress_obj(self):
+ d = zlib.decompressobj()
+ self.assertTrue(_scanner.size_of(d) > 30000)
+ self.assertEqual(0, _scanner.size_of(d) % _scanner._word_size)
+
def _string_to_json(s):
out = ['"']
More information about the bazaar-commits
mailing list