Rev 4777: Try out interning of some of the strings. in http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple-chk-map

John Arbash Meinel john at arbash-meinel.com
Wed Oct 21 21:50:38 BST 2009


At http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple-chk-map

------------------------------------------------------------
revno: 4777
revision-id: john at arbash-meinel.com-20091021205004-q81ipjjhdy2fl3xy
parent: john at arbash-meinel.com-20091021194305-eso7vfaetsvmhgz6
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-static-tuple-chk-map
timestamp: Wed 2009-10-21 15:50:04 -0500
message:
  Try out interning of some of the strings.
  
  Doesn't seem to have a serious impact.
  If we only have a 2-deep tree (1 internal, 1 leaf), all the internal nodes
  are 1-byte wide anyway, so they are already unique (single chars).
  Interning the values is a slight net loss. A bit suprising. At an average
  of 21 values per node, (and an average of 1 change per node) I would have
  thought we would have more overlap in values.
  I'll probably revert this as the meager to nil benefit doesn't outweigh
  the complexity.
-------------- next part --------------
=== modified file 'bzrlib/_chk_map_pyx.pyx'
--- a/bzrlib/_chk_map_pyx.pyx	2009-10-21 19:43:05 +0000
+++ b/bzrlib/_chk_map_pyx.pyx	2009-10-21 20:50:04 +0000
@@ -17,7 +17,14 @@
 
 #python2.4 support
 cdef extern from "python-compat.h":
-    pass
+    # PyString_InternInPlace takes a **, and mutate the pointer supplied.
+    # Neither Pyrex nor Cython handles that particularly well.
+    # We also cannot use intern() directly, because our strings have '\x00' in
+    # them, and pyrex and cython both translate intern() into
+    # PyString_InternFromString(PyString_AsString()) which assumes a regular
+    # null terminated C string
+    # 
+    void INTERN_STRING(object)
 
 cdef extern from *:
     ctypedef unsigned int size_t
@@ -50,6 +57,8 @@
     Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *s)
     char *PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *s)
     object PyString_FromStringAndSize(char*, Py_ssize_t)
+    void PyString_InternInPlace(PyObject **)
+    int PyString_CHECK_INTERNED(object)
 
 # It seems we need to import the definitions so that the pyrex compiler has
 # local names to access them.
@@ -292,7 +301,13 @@
             # SET_ITEM 'steals' a reference
             Py_INCREF(entry)
             StaticTuple_SET_ITEM(entry_bits, i, entry)
+        # TODO: Consider interning the value lines
+        #       Often there will be many leaf nodes that only have a single
+        #       line or two that is different among many (60+) lines
         value = PyString_FromStringAndSize(value_start, next_line - value_start)
+        INTERN_STRING(value)
+        if not PyString_CHECK_INTERNED(value):
+            raise AssertionError('i asked to intern, but it didn\'t work')
         # The next entry bit needs the 'tail' from the prefix, and first part
         # of the line
         entry_start = line_start
@@ -400,15 +415,21 @@
         next_null = <char *>_my_memrchr(cur, c'\0', next_line - cur)
         if next_null == NULL:
             raise ValueError('bad no null')
+        # TODO: intern item_prefix, it will almost always be redundant with all
+        #       of the other internal node instances at this depth.
         item_prefix = PyString_FromStringAndSize(NULL,
             prefix_length + next_null - cur)
         c_item_prefix = PyString_AS_STRING(item_prefix)
         if prefix_length:
             memcpy(c_item_prefix, prefix, prefix_length)
         memcpy(c_item_prefix + prefix_length, cur, next_null - cur)
+        c_item_prefix = NULL # Done with this
         flat_key = PyString_FromStringAndSize(next_null + 1,
                                               next_line - next_null - 1)
         flat_key = StaticTuple(flat_key).intern()
+        INTERN_STRING(item_prefix)
+        if not PyString_CHECK_INTERNED(item_prefix):
+            raise AssertionError('i asked to intern, but it didn\'t work')
         PyDict_SetItem(items, item_prefix, flat_key)
         cur = next_line + 1
     assert len(items) > 0

=== modified file 'bzrlib/python-compat.h'
--- a/bzrlib/python-compat.h	2009-10-12 21:44:27 +0000
+++ b/bzrlib/python-compat.h	2009-10-21 20:50:04 +0000
@@ -84,4 +84,7 @@
 #  define Py_REFCNT(o) ((o)->ob_refcnt)
 #endif
 
+
+#define INTERN_STRING(obj)  (PyString_InternInPlace(&(obj)))
+
 #endif /* _BZR_PYTHON_COMPAT_H */

=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py	2009-10-21 19:07:30 +0000
+++ b/bzrlib/tests/test_chk_map.py	2009-10-21 20:50:04 +0000
@@ -1139,6 +1139,10 @@
             {("a","a"):"content here", ("a", "b",):"more content",
              ("b", ""): 'boring content'},
             maximum_size=10, key_width=2)
+        print
+        print chkmap._dump_tree()
+        print repr(chk_map._page_cache[chkmap.key()])
+        print repr(chk_map._page_cache[chkmap._root_node._items['a']._key])
         self.assertEqual(
             {("a", "a"): "content here", ("a", "b"): 'more content'},
             self.to_dict(chkmap, [("a",)]))



More information about the bazaar-commits mailing list