Rev 5225: (jam) Some chk_map and CHKInventory tweaks to reduce CPU overhead for in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Tue May 11 18:22:17 BST 2010

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5225 [merge]
revision-id: pqm at pqm.ubuntu.com-20100511172212-1khjwgh7p1d9zxlu
parent: pqm at pqm.ubuntu.com-20100511150400-7euir3548ln55n7s
parent: john at arbash-meinel.com-20100511155912-wvmknzw9idwx826w
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2010-05-11 18:22:12 +0100
message:
  (jam) Some chk_map and CHKInventory tweaks to reduce CPU overhead for
  	full fetch.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/_chk_map_py.py          _chk_map_py.py-20090309114220-1kurz7oez2gwqtcf-1
  bzrlib/_chk_map_pyx.pyx        _chk_map_pyx.pyx-20090309111231-peyz7p2azr0dzdrb-1
  bzrlib/_static_tuple_c.h       _keys_type_c.h-20090913185138-f86v5xm1zlckguaj-1
  bzrlib/_static_tuple_c.pxd     _static_tuple_c.pxd-20091001185745-r6twbimalel9jetl-1
  bzrlib/chk_map.py              chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
  bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
  bzrlib/smart/branch.py         branch.py-20061124031907-mzh3pla28r83r97f-1
  bzrlib/tests/per_repository/test_refresh_data.py test_refresh_data.py-20090316045630-5sw0ipqwk7rvpn3h-1
  bzrlib/tests/per_workingtree/test_locking.py test_locking.py-20060707151933-tav3o2hpibwi53u4-3
  bzrlib/tests/test__chk_map.py  test__chk_map.py-20090309114220-1kurz7oez2gwqtcf-2
=== modified file 'NEWS'

--- a/NEWS	2010-05-11 15:04:00 +0000
+++ b/NEWS	2010-05-11 17:22:12 +0000
@@ -170,6 +170,11 @@
 Internals
 *********
 
+* ``chk_map._bytes_to_text_key`` is now an optimized function to extract
+  the (file-id, revision-id) key from a CHKInventory entry. This can
+  potentially shave 5-10% time off during a large fetch. Related to bug
+  #562666. (John Arbash Meinel)
+
 * ``log._get_info_for_log_files`` now takes an add_cleanup callable.
   (Robert Collins)
 

=== modified file 'bzrlib/_chk_map_py.py'
--- a/bzrlib/_chk_map_py.py	2010-02-17 17:11:16 +0000
+++ b/bzrlib/_chk_map_py.py	2010-05-11 10:45:26 +0000
@@ -157,3 +157,11 @@
     result._node_width = len(prefix)
     result._search_prefix = common_prefix
     return result
+
+
+def _bytes_to_text_key(bytes):
+    """Take a CHKInventory value string and return a (file_id, rev_id) tuple"""
+    sections = bytes.split('\n')
+    kind, file_id = sections[0].split(': ')
+    return (intern(file_id), intern(sections[3]))
+

=== modified file 'bzrlib/_chk_map_pyx.pyx'
--- a/bzrlib/_chk_map_pyx.pyx	2010-04-29 22:08:34 +0000
+++ b/bzrlib/_chk_map_pyx.pyx	2010-05-11 15:59:12 +0000
@@ -35,7 +35,9 @@
     Py_ssize_t PyTuple_GET_SIZE(object t)
     int PyString_CheckExact(object)
     char *PyString_AS_STRING(object s)
+    PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)
     Py_ssize_t PyString_GET_SIZE(object)
+    void PyString_InternInPlace(PyObject **)
     unsigned long PyInt_AsUnsignedLongMask(object) except? -1
 
     int PyDict_SetItem(object d, object k, object v) except -1
@@ -44,6 +46,7 @@
     void PyTuple_SET_ITEM(object t, Py_ssize_t offset, object)
 
     void Py_INCREF(object)
+    void Py_DECREF_ptr "Py_DECREF" (PyObject *)
 
     PyObject * PyTuple_GET_ITEM_ptr "PyTuple_GET_ITEM" (object t,
                                                         Py_ssize_t offset)
@@ -55,7 +58,8 @@
 # cimport all of the definitions we will need to access
 from _static_tuple_c cimport StaticTuple,\
     import_static_tuple_c, StaticTuple_New, \
-    StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact
+    StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact, \
+    StaticTuple_GET_SIZE
 
 cdef extern from "_static_tuple_c.h":
     # Defined explicitly rather than cimport-ing. Trying to use cimport, the
@@ -93,6 +97,21 @@
     return NULL
 
 
+cdef object safe_interned_string_from_size(char *s, Py_ssize_t size):
+    cdef PyObject *py_str
+    if size < 0:
+        raise AssertionError(
+            'tried to create a string with an invalid size: %d @0x%x'
+            % (size, <int>s))
+    py_str = PyString_FromStringAndSize_ptr(s, size)
+    PyString_InternInPlace(&py_str)
+    result = <object>py_str
+    # Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
+    # DECREF it to avoid geting immortal strings
+    Py_DECREF_ptr(py_str)
+    return result
+
+
 def _search_key_16(key):
     """See chk_map._search_key_16."""
     cdef Py_ssize_t num_bits
@@ -171,6 +190,16 @@
     return value
 
 
+cdef _import_globals():
+    """Set the global attributes. Done lazy to avoid recursive import loops."""
+    global _LeafNode, _InternalNode, _unknown
+
+    from bzrlib import chk_map
+    _LeafNode = chk_map.LeafNode
+    _InternalNode = chk_map.InternalNode
+    _unknown = chk_map._unknown
+
+
 def _deserialise_leaf_node(bytes, key, search_key_func=None):
     """Deserialise bytes, with key key, into a LeafNode.
 
@@ -188,10 +217,7 @@
     cdef StaticTuple entry_bits
 
     if _LeafNode is None:
-        from bzrlib import chk_map
-        _LeafNode = chk_map.LeafNode
-        _InternalNode = chk_map.InternalNode
-        _unknown = chk_map._unknown
+        _import_globals()
 
     result = _LeafNode(search_key_func=search_key_func)
     # Splitlines can split on '\r' so don't use it, split('\n') adds an
@@ -295,7 +321,7 @@
                                                next_null - entry_start)
             Py_INCREF(entry)
             StaticTuple_SET_ITEM(entry_bits, i, entry)
-        if len(entry_bits) != width:
+        if StaticTuple_GET_SIZE(entry_bits) != width:
             raise AssertionError(
                 'Incorrect number of elements (%d vs %d)'
                 % (len(entry_bits)+1, width + 1))
@@ -331,10 +357,7 @@
     cdef char *prefix, *line_prefix, *next_null, *c_item_prefix
 
     if _InternalNode is None:
-        from bzrlib import chk_map
-        _LeafNode = chk_map.LeafNode
-        _InternalNode = chk_map.InternalNode
-        _unknown = chk_map._unknown
+        _import_globals()
     result = _InternalNode(search_key_func=search_key_func)
 
     if not StaticTuple_CheckExact(key):
@@ -395,3 +418,51 @@
     result._node_width = len(item_prefix)
     result._search_prefix = PyString_FromStringAndSize(prefix, prefix_length)
     return result
+
+
+def _bytes_to_text_key(bytes):
+    """Take a CHKInventory value string and return a (file_id, rev_id) tuple"""
+    cdef StaticTuple key
+    cdef char *byte_str, *cur_end, *file_id_str, *byte_end
+    cdef char *revision_str
+    cdef Py_ssize_t byte_size, pos, file_id_len
+
+    if not PyString_CheckExact(bytes):
+        raise TypeError('bytes must be a string')
+    byte_str = PyString_AS_STRING(bytes)
+    byte_size = PyString_GET_SIZE(bytes)
+    byte_end = byte_str + byte_size
+    cur_end = <char*>memchr(byte_str, c':', byte_size)
+    if cur_end == NULL:
+        raise ValueError('No kind section found.')
+    if cur_end[1] != c' ':
+        raise ValueError('Kind section should end with ": "')
+    file_id_str = cur_end + 2
+    # file_id is now the data up until the next newline
+    cur_end = <char*>memchr(file_id_str, c'\n', byte_end - file_id_str)
+    if cur_end == NULL:
+        raise ValueError('no newline after file-id')
+    file_id = safe_interned_string_from_size(file_id_str,
+                                             cur_end - file_id_str)
+    # this is the end of the parent_str
+    cur_end = <char*>memchr(cur_end + 1, c'\n', byte_end - cur_end - 1)
+    if cur_end == NULL:
+        raise ValueError('no newline after parent_str')
+    # end of the name str
+    cur_end = <char*>memchr(cur_end + 1, c'\n', byte_end - cur_end - 1)
+    if cur_end == NULL:
+        raise ValueError('no newline after name str')
+    # the next section is the revision info
+    revision_str = cur_end + 1
+    cur_end = <char*>memchr(cur_end + 1, c'\n', byte_end - cur_end - 1)
+    if cur_end == NULL:
+        # This is probably a dir: entry, which has revision as the last item
+        cur_end = byte_end
+    revision = safe_interned_string_from_size(revision_str,
+        cur_end - revision_str)
+    key = StaticTuple_New(2)
+    Py_INCREF(file_id)
+    StaticTuple_SET_ITEM(key, 0, file_id) 
+    Py_INCREF(revision)
+    StaticTuple_SET_ITEM(key, 1, revision) 
+    return StaticTuple_Intern(key)

=== modified file 'bzrlib/_static_tuple_c.h'
--- a/bzrlib/_static_tuple_c.h	2009-10-13 18:00:16 +0000
+++ b/bzrlib/_static_tuple_c.h	2010-05-11 14:13:31 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009 Canonical Ltd
+/* Copyright (C) 2009, 2010 Canonical Ltd
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -67,6 +67,7 @@
 #define StaticTuple_SET_ITEM(key, offset, val) \
     ((((StaticTuple*)(key))->items[(offset)]) = ((PyObject *)(val)))
 #define StaticTuple_GET_ITEM(key, offset) (((StaticTuple*)key)->items[offset])
+#define StaticTuple_GET_SIZE(key) (((StaticTuple*)key)->size)
 
 
 #ifdef STATIC_TUPLE_MODULE

=== modified file 'bzrlib/_static_tuple_c.pxd'
--- a/bzrlib/_static_tuple_c.pxd	2010-02-17 17:11:16 +0000
+++ b/bzrlib/_static_tuple_c.pxd	2010-05-11 14:13:31 +0000
@@ -42,3 +42,4 @@
     # not an 'object' return value.
     void *StaticTuple_GET_ITEM(StaticTuple key, Py_ssize_t offset)
     int StaticTuple_CheckExact(object)
+    Py_ssize_t StaticTuple_GET_SIZE(StaticTuple key)

=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py	2010-04-21 20:33:10 +0000
+++ b/bzrlib/chk_map.py	2010-05-11 10:45:26 +0000
@@ -1727,6 +1727,7 @@
 
 try:
     from bzrlib._chk_map_pyx import (
+        _bytes_to_text_key,
         _search_key_16,
         _search_key_255,
         _deserialise_leaf_node,
@@ -1735,6 +1736,7 @@
 except ImportError, e:
     osutils.failed_to_load_extension(e)
     from bzrlib._chk_map_py import (
+        _bytes_to_text_key,
         _search_key_16,
         _search_key_255,
         _deserialise_leaf_node,

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2010-02-10 02:17:15 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2010-05-11 10:45:26 +0000
@@ -262,7 +262,6 @@
         remaining_keys = set(keys)
         counter = [0]
         if self._gather_text_refs:
-            bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
             self._text_refs = set()
         def _get_referenced_stream(root_keys, parse_leaf_nodes=False):
             cur_keys = root_keys
@@ -289,8 +288,7 @@
                     # Store is None, because we know we have a LeafNode, and we
                     # just want its entries
                     for file_id, bytes in node.iteritems(None):
-                        name_utf8, file_id, revision_id = bytes_to_info(bytes)
-                        self._text_refs.add((file_id, revision_id))
+                        self._text_refs.add(chk_map._bytes_to_text_key(bytes))
                 def next_stream():
                     stream = source_vf.get_record_stream(cur_keys,
                                                          'as-requested', True)
@@ -647,10 +645,10 @@
         chk_diff = chk_map.iter_interesting_nodes(
             chk_bytes_no_fallbacks, root_key_info.interesting_root_keys,
             root_key_info.uninteresting_root_keys)
-        bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
         text_keys = set()
         try:
-            for record in _filter_text_keys(chk_diff, text_keys, bytes_to_info):
+            for record in _filter_text_keys(chk_diff, text_keys,
+                                            chk_map._bytes_to_text_key):
                 pass
         except errors.NoSuchRevision, e:
             # XXX: It would be nice if we could give a more precise error here.
@@ -1087,13 +1085,12 @@
                 uninteresting_root_keys.add(inv.id_to_entry.key())
                 uninteresting_pid_root_keys.add(
                     inv.parent_id_basename_to_file_id.key())
-        bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
         chk_bytes = self.from_repository.chk_bytes
         def _filter_id_to_entry():
             interesting_nodes = chk_map.iter_interesting_nodes(chk_bytes,
                         self._chk_id_roots, uninteresting_root_keys)
             for record in _filter_text_keys(interesting_nodes, self._text_keys,
-                    bytes_to_info):
+                    chk_map._bytes_to_text_key):
                 if record is not None:
                     yield record
             # Consumed
@@ -1187,19 +1184,13 @@
     return result
 
 
-def _filter_text_keys(interesting_nodes_iterable, text_keys, bytes_to_info):
+def _filter_text_keys(interesting_nodes_iterable, text_keys, bytes_to_text_key):
     """Iterate the result of iter_interesting_nodes, yielding the records
     and adding to text_keys.
     """
+    text_keys_update = text_keys.update
     for record, items in interesting_nodes_iterable:
-        for name, bytes in items:
-            # Note: we don't care about name_utf8, because groupcompress repos
-            # are always rich-root, so there are no synthesised root records to
-            # ignore.
-            _, file_id, revision_id = bytes_to_info(bytes)
-            file_id = intern(file_id)
-            revision_id = intern(revision_id)
-            text_keys.add(StaticTuple(file_id, revision_id).intern())
+        text_keys_update([bytes_to_text_key(b) for n,b in items])
         yield record
 
 

=== modified file 'bzrlib/smart/branch.py'
--- a/bzrlib/smart/branch.py	2010-05-06 23:41:35 +0000
+++ b/bzrlib/smart/branch.py	2010-05-11 14:14:55 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006 Canonical Ltd
+# Copyright (C) 2006-2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by

=== modified file 'bzrlib/tests/per_repository/test_refresh_data.py'
--- a/bzrlib/tests/per_repository/test_refresh_data.py	2010-05-11 10:14:23 +0000
+++ b/bzrlib/tests/per_repository/test_refresh_data.py	2010-05-11 14:14:55 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2009 Canonical Ltd
+# Copyright (C) 2009, 2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by

=== modified file 'bzrlib/tests/per_workingtree/test_locking.py'
--- a/bzrlib/tests/per_workingtree/test_locking.py	2010-05-06 23:41:35 +0000
+++ b/bzrlib/tests/per_workingtree/test_locking.py	2010-05-11 14:14:55 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006 Canonical Ltd
+# Copyright (C) 2006, 2008, 2009, 2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by

=== modified file 'bzrlib/tests/test__chk_map.py'
--- a/bzrlib/tests/test__chk_map.py	2010-02-17 17:11:16 +0000
+++ b/bzrlib/tests/test__chk_map.py	2010-05-11 10:45:26 +0000
@@ -18,6 +18,7 @@
 
 from bzrlib import (
     chk_map,
+    inventory,
     tests,
     )
 from bzrlib.static_tuple import StaticTuple
@@ -236,3 +237,44 @@
         self.assertEqual(("sha1:1234",), node.key())
         self.assertEqual('pref\x00fo', node._search_prefix)
         self.assertEqual({'pref\x00fo\x00': ('sha1:abcd',)}, node._items)
+
+
+class Test_BytesToTextKey(tests.TestCase):
+
+    def assertBytesToTextKey(self, key, bytes):
+        self.assertEqual(key,
+                         self.module._bytes_to_text_key(bytes))
+
+    def assertBytesToTextKeyRaises(self, bytes):
+        # These are invalid bytes, and we want to make sure the code under test
+        # raises an exception rather than segfaults, etc. We don't particularly
+        # care what exception.
+        self.assertRaises(Exception, self.module._bytes_to_text_key, bytes)
+
+    def test_file(self):
+        self.assertBytesToTextKey(('file-id', 'revision-id'),
+                 'file: file-id\nparent-id\nname\nrevision-id\n'
+                 'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN')
+
+    def test_invalid_no_kind(self):
+        self.assertBytesToTextKeyRaises(
+                 'file  file-id\nparent-id\nname\nrevision-id\n'
+                 'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN')
+
+    def test_invalid_no_space(self):
+        self.assertBytesToTextKeyRaises(
+                 'file:file-id\nparent-id\nname\nrevision-id\n'
+                 'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN')
+
+    def test_invalid_too_short_file_id(self):
+        self.assertBytesToTextKeyRaises('file:file-id')
+
+    def test_invalid_too_short_parent_id(self):
+        self.assertBytesToTextKeyRaises('file:file-id\nparent-id')
+
+    def test_invalid_too_short_name(self):
+        self.assertBytesToTextKeyRaises('file:file-id\nparent-id\nname')
+
+    def test_dir(self):
+        self.assertBytesToTextKey(('dir-id', 'revision-id'),
+                 'dir: dir-id\nparent-id\nname\nrevision-id')