Rev 5225: (jam) Some chk_map and CHKInventory tweaks to reduce CPU overhead for in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Tue May 11 18:22:17 BST 2010
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 5225 [merge]
revision-id: pqm at pqm.ubuntu.com-20100511172212-1khjwgh7p1d9zxlu
parent: pqm at pqm.ubuntu.com-20100511150400-7euir3548ln55n7s
parent: john at arbash-meinel.com-20100511155912-wvmknzw9idwx826w
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2010-05-11 18:22:12 +0100
message:
(jam) Some chk_map and CHKInventory tweaks to reduce CPU overhead for
full fetch.
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/_chk_map_py.py _chk_map_py.py-20090309114220-1kurz7oez2gwqtcf-1
bzrlib/_chk_map_pyx.pyx _chk_map_pyx.pyx-20090309111231-peyz7p2azr0dzdrb-1
bzrlib/_static_tuple_c.h _keys_type_c.h-20090913185138-f86v5xm1zlckguaj-1
bzrlib/_static_tuple_c.pxd _static_tuple_c.pxd-20091001185745-r6twbimalel9jetl-1
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
bzrlib/smart/branch.py branch.py-20061124031907-mzh3pla28r83r97f-1
bzrlib/tests/per_repository/test_refresh_data.py test_refresh_data.py-20090316045630-5sw0ipqwk7rvpn3h-1
bzrlib/tests/per_workingtree/test_locking.py test_locking.py-20060707151933-tav3o2hpibwi53u4-3
bzrlib/tests/test__chk_map.py test__chk_map.py-20090309114220-1kurz7oez2gwqtcf-2
=== modified file 'NEWS'
--- a/NEWS 2010-05-11 15:04:00 +0000
+++ b/NEWS 2010-05-11 17:22:12 +0000
@@ -170,6 +170,11 @@
Internals
*********
+* ``chk_map._bytes_to_text_key`` is now an optimized function to extract
+  the (file-id, revision-id) key from a CHKInventory entry. This can
+  potentially shave 5-10% time off during a large fetch. Related to bug
+  #562666. (John Arbash Meinel)
+
* ``log._get_info_for_log_files`` now takes an add_cleanup callable.
(Robert Collins)
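
For reference, a minimal sketch of how the new helper is called (the value
string below is the same illustrative file entry used by the new tests in
bzrlib/tests/test__chk_map.py at the end of this patch):

    from bzrlib import chk_map

    entry = ('file: file-id\nparent-id\nname\nrevision-id\n'
             'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN')
    # The helper returns the (file_id, revision_id) text key for the entry;
    # with the compiled extension this is an interned StaticTuple, which
    # still compares equal to a plain tuple.
    assert ('file-id', 'revision-id') == chk_map._bytes_to_text_key(entry)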
=== modified file 'bzrlib/_chk_map_py.py'
--- a/bzrlib/_chk_map_py.py 2010-02-17 17:11:16 +0000
+++ b/bzrlib/_chk_map_py.py 2010-05-11 10:45:26 +0000
@@ -157,3 +157,11 @@
result._node_width = len(prefix)
result._search_prefix = common_prefix
return result
+
+
+def _bytes_to_text_key(bytes):
+    """Take a CHKInventory value string and return a (file_id, rev_id) tuple"""
+    sections = bytes.split('\n')
+    kind, file_id = sections[0].split(': ')
+    return (intern(file_id), intern(sections[3]))
+
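
The fallback leans on the fixed line layout of a CHKInventory value string;
a rough sketch of that layout, as exercised by the tests added at the end of
this patch (field values here are illustrative):

    # line 0: "<kind>: <file-id>"   e.g. "file: file-id" or "dir: dir-id"
    # line 1: parent-id
    # line 2: name
    # line 3: revision-id           <- sections[3] above
    # file entries continue with sha1, size and the exec flag; dir entries
    # stop after the revision and have no trailing newline, which split('\n')
    # handles either way.
    sections = 'dir: dir-id\nparent-id\nname\nrevision-id'.split('\n')
    assert ('dir', 'dir-id') == tuple(sections[0].split(': '))
    assert 'revision-id' == sections[3]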
=== modified file 'bzrlib/_chk_map_pyx.pyx'
--- a/bzrlib/_chk_map_pyx.pyx 2010-04-29 22:08:34 +0000
+++ b/bzrlib/_chk_map_pyx.pyx 2010-05-11 15:59:12 +0000
@@ -35,7 +35,9 @@
Py_ssize_t PyTuple_GET_SIZE(object t)
int PyString_CheckExact(object)
char *PyString_AS_STRING(object s)
+ PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)
Py_ssize_t PyString_GET_SIZE(object)
+ void PyString_InternInPlace(PyObject **)
unsigned long PyInt_AsUnsignedLongMask(object) except? -1
int PyDict_SetItem(object d, object k, object v) except -1
@@ -44,6 +46,7 @@
void PyTuple_SET_ITEM(object t, Py_ssize_t offset, object)
void Py_INCREF(object)
+ void Py_DECREF_ptr "Py_DECREF" (PyObject *)
PyObject * PyTuple_GET_ITEM_ptr "PyTuple_GET_ITEM" (object t,
Py_ssize_t offset)
@@ -55,7 +58,8 @@
# cimport all of the definitions we will need to access
from _static_tuple_c cimport StaticTuple,\
import_static_tuple_c, StaticTuple_New, \
- StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact
+ StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact, \
+ StaticTuple_GET_SIZE
cdef extern from "_static_tuple_c.h":
# Defined explicitly rather than cimport-ing. Trying to use cimport, the
@@ -93,6 +97,21 @@
return NULL
+cdef object safe_interned_string_from_size(char *s, Py_ssize_t size):
+    cdef PyObject *py_str
+    if size < 0:
+        raise AssertionError(
+            'tried to create a string with an invalid size: %d @0x%x'
+            % (size, <int>s))
+    py_str = PyString_FromStringAndSize_ptr(s, size)
+    PyString_InternInPlace(&py_str)
+    result = <object>py_str
+    # Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
+    # DECREF it to avoid getting immortal strings
+    Py_DECREF_ptr(py_str)
+    return result
+
+
def _search_key_16(key):
"""See chk_map._search_key_16."""
cdef Py_ssize_t num_bits
@@ -171,6 +190,16 @@
return value
+cdef _import_globals():
+    """Set the global attributes. Done lazily to avoid recursive import loops."""
+    global _LeafNode, _InternalNode, _unknown
+
+    from bzrlib import chk_map
+    _LeafNode = chk_map.LeafNode
+    _InternalNode = chk_map.InternalNode
+    _unknown = chk_map._unknown
+
+
def _deserialise_leaf_node(bytes, key, search_key_func=None):
"""Deserialise bytes, with key key, into a LeafNode.
@@ -188,10 +217,7 @@
cdef StaticTuple entry_bits
if _LeafNode is None:
- from bzrlib import chk_map
- _LeafNode = chk_map.LeafNode
- _InternalNode = chk_map.InternalNode
- _unknown = chk_map._unknown
+ _import_globals()
result = _LeafNode(search_key_func=search_key_func)
# Splitlines can split on '\r' so don't use it, split('\n') adds an
@@ -295,7 +321,7 @@
next_null - entry_start)
Py_INCREF(entry)
StaticTuple_SET_ITEM(entry_bits, i, entry)
- if len(entry_bits) != width:
+ if StaticTuple_GET_SIZE(entry_bits) != width:
raise AssertionError(
'Incorrect number of elements (%d vs %d)'
% (len(entry_bits)+1, width + 1))
@@ -331,10 +357,7 @@
cdef char *prefix, *line_prefix, *next_null, *c_item_prefix
if _InternalNode is None:
- from bzrlib import chk_map
- _LeafNode = chk_map.LeafNode
- _InternalNode = chk_map.InternalNode
- _unknown = chk_map._unknown
+ _import_globals()
result = _InternalNode(search_key_func=search_key_func)
if not StaticTuple_CheckExact(key):
@@ -395,3 +418,51 @@
result._node_width = len(item_prefix)
result._search_prefix = PyString_FromStringAndSize(prefix, prefix_length)
return result
+
+
+def _bytes_to_text_key(bytes):
+    """Take a CHKInventory value string and return a (file_id, rev_id) tuple"""
+    cdef StaticTuple key
+    cdef char *byte_str, *cur_end, *file_id_str, *byte_end
+    cdef char *revision_str
+    cdef Py_ssize_t byte_size, pos, file_id_len
+
+    if not PyString_CheckExact(bytes):
+        raise TypeError('bytes must be a string')
+    byte_str = PyString_AS_STRING(bytes)
+    byte_size = PyString_GET_SIZE(bytes)
+    byte_end = byte_str + byte_size
+    cur_end = <char*>memchr(byte_str, c':', byte_size)
+    if cur_end == NULL:
+        raise ValueError('No kind section found.')
+    if cur_end[1] != c' ':
+        raise ValueError('Kind section should end with ": "')
+    file_id_str = cur_end + 2
+    # file_id is now the data up until the next newline
+    cur_end = <char*>memchr(file_id_str, c'\n', byte_end - file_id_str)
+    if cur_end == NULL:
+        raise ValueError('no newline after file-id')
+    file_id = safe_interned_string_from_size(file_id_str,
+                                             cur_end - file_id_str)
+    # this is the end of the parent_str
+    cur_end = <char*>memchr(cur_end + 1, c'\n', byte_end - cur_end - 1)
+    if cur_end == NULL:
+        raise ValueError('no newline after parent_str')
+    # end of the name str
+    cur_end = <char*>memchr(cur_end + 1, c'\n', byte_end - cur_end - 1)
+    if cur_end == NULL:
+        raise ValueError('no newline after name str')
+    # the next section is the revision info
+    revision_str = cur_end + 1
+    cur_end = <char*>memchr(cur_end + 1, c'\n', byte_end - cur_end - 1)
+    if cur_end == NULL:
+        # This is probably a dir: entry, which has revision as the last item
+        cur_end = byte_end
+    revision = safe_interned_string_from_size(revision_str,
+                                              cur_end - revision_str)
+    key = StaticTuple_New(2)
+    Py_INCREF(file_id)
+    StaticTuple_SET_ITEM(key, 0, file_id)
+    Py_INCREF(revision)
+    StaticTuple_SET_ITEM(key, 1, revision)
+    return StaticTuple_Intern(key)
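
The compiled version walks the string with memchr rather than splitting it,
and interns both strings as well as the resulting StaticTuple. A quick
sanity-check sketch comparing it against the pure-Python fallback (it
assumes the extension actually built in this tree):

    from bzrlib import _chk_map_py
    try:
        from bzrlib import _chk_map_pyx
    except ImportError:
        _chk_map_pyx = None

    samples = [
        'file: file-id\nparent-id\nname\nrevision-id\n'
        'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN',
        # dir entries end with the revision and have no trailing newline
        'dir: dir-id\nparent-id\nname\nrevision-id',
    ]
    if _chk_map_pyx is not None:
        for value in samples:
            # plain tuple from the fallback, interned StaticTuple from the
            # extension; the two compare equal
            assert (_chk_map_py._bytes_to_text_key(value) ==
                    _chk_map_pyx._bytes_to_text_key(value))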
=== modified file 'bzrlib/_static_tuple_c.h'
--- a/bzrlib/_static_tuple_c.h 2009-10-13 18:00:16 +0000
+++ b/bzrlib/_static_tuple_c.h 2010-05-11 14:13:31 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009 Canonical Ltd
+/* Copyright (C) 2009, 2010 Canonical Ltd
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -67,6 +67,7 @@
#define StaticTuple_SET_ITEM(key, offset, val) \
((((StaticTuple*)(key))->items[(offset)]) = ((PyObject *)(val)))
#define StaticTuple_GET_ITEM(key, offset) (((StaticTuple*)key)->items[offset])
+#define StaticTuple_GET_SIZE(key) (((StaticTuple*)key)->size)
#ifdef STATIC_TUPLE_MODULE
=== modified file 'bzrlib/_static_tuple_c.pxd'
--- a/bzrlib/_static_tuple_c.pxd 2010-02-17 17:11:16 +0000
+++ b/bzrlib/_static_tuple_c.pxd 2010-05-11 14:13:31 +0000
@@ -42,3 +42,4 @@
# not an 'object' return value.
void *StaticTuple_GET_ITEM(StaticTuple key, Py_ssize_t offset)
int StaticTuple_CheckExact(object)
+ Py_ssize_t StaticTuple_GET_SIZE(StaticTuple key)
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2010-04-21 20:33:10 +0000
+++ b/bzrlib/chk_map.py 2010-05-11 10:45:26 +0000
@@ -1727,6 +1727,7 @@
try:
from bzrlib._chk_map_pyx import (
+ _bytes_to_text_key,
_search_key_16,
_search_key_255,
_deserialise_leaf_node,
@@ -1735,6 +1736,7 @@
except ImportError, e:
osutils.failed_to_load_extension(e)
from bzrlib._chk_map_py import (
+ _bytes_to_text_key,
_search_key_16,
_search_key_255,
_deserialise_leaf_node,
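
Because both implementations are re-exported under the same name, callers
never need to know which one was loaded; one way to check at runtime, if it
matters, is a simple identity test (a small sketch; both modules ship with
bzrlib):

    from bzrlib import chk_map, _chk_map_py

    # When the compiled extension fails to import, chk_map falls back to the
    # pure-Python module, so the two names are then the very same function.
    using_fallback = (chk_map._bytes_to_text_key is
                      _chk_map_py._bytes_to_text_key)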
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py 2010-02-10 02:17:15 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py 2010-05-11 10:45:26 +0000
@@ -262,7 +262,6 @@
remaining_keys = set(keys)
counter = [0]
if self._gather_text_refs:
- bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
self._text_refs = set()
def _get_referenced_stream(root_keys, parse_leaf_nodes=False):
cur_keys = root_keys
@@ -289,8 +288,7 @@
# Store is None, because we know we have a LeafNode, and we
# just want its entries
for file_id, bytes in node.iteritems(None):
- name_utf8, file_id, revision_id = bytes_to_info(bytes)
- self._text_refs.add((file_id, revision_id))
+ self._text_refs.add(chk_map._bytes_to_text_key(bytes))
def next_stream():
stream = source_vf.get_record_stream(cur_keys,
'as-requested', True)
@@ -647,10 +645,10 @@
chk_diff = chk_map.iter_interesting_nodes(
chk_bytes_no_fallbacks, root_key_info.interesting_root_keys,
root_key_info.uninteresting_root_keys)
- bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
text_keys = set()
try:
- for record in _filter_text_keys(chk_diff, text_keys, bytes_to_info):
+ for record in _filter_text_keys(chk_diff, text_keys,
+ chk_map._bytes_to_text_key):
pass
except errors.NoSuchRevision, e:
# XXX: It would be nice if we could give a more precise error here.
@@ -1087,13 +1085,12 @@
uninteresting_root_keys.add(inv.id_to_entry.key())
uninteresting_pid_root_keys.add(
inv.parent_id_basename_to_file_id.key())
- bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
chk_bytes = self.from_repository.chk_bytes
def _filter_id_to_entry():
interesting_nodes = chk_map.iter_interesting_nodes(chk_bytes,
self._chk_id_roots, uninteresting_root_keys)
for record in _filter_text_keys(interesting_nodes, self._text_keys,
- bytes_to_info):
+ chk_map._bytes_to_text_key):
if record is not None:
yield record
# Consumed
@@ -1187,19 +1184,13 @@
return result
-def _filter_text_keys(interesting_nodes_iterable, text_keys, bytes_to_info):
+def _filter_text_keys(interesting_nodes_iterable, text_keys, bytes_to_text_key):
     """Iterate the result of iter_interesting_nodes, yielding the records
     and adding to text_keys.
     """
+    text_keys_update = text_keys.update
     for record, items in interesting_nodes_iterable:
-        for name, bytes in items:
-            # Note: we don't care about name_utf8, because groupcompress repos
-            # are always rich-root, so there are no synthesised root records to
-            # ignore.
-            _, file_id, revision_id = bytes_to_info(bytes)
-            file_id = intern(file_id)
-            revision_id = intern(revision_id)
-            text_keys.add(StaticTuple(file_id, revision_id).intern())
+        text_keys_update([bytes_to_text_key(b) for n,b in items])
         yield record
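
The rewritten loop replaces one set.add() (plus two intern() calls and a
StaticTuple construction) per inventory entry with a single set.update() per
leaf node, with the interning now done inside _bytes_to_text_key. A
standalone sketch of the same pattern, using illustrative names rather than
the repository code:

    def collect_text_keys(nodes_iterable, text_keys, bytes_to_text_key):
        # Binding the bound method once and batching into update() keeps the
        # per-entry Python overhead down to a single function call.
        text_keys_update = text_keys.update
        for record, items in nodes_iterable:
            text_keys_update([bytes_to_text_key(value) for name, value in items])
            yield record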
=== modified file 'bzrlib/smart/branch.py'
--- a/bzrlib/smart/branch.py 2010-05-06 23:41:35 +0000
+++ b/bzrlib/smart/branch.py 2010-05-11 14:14:55 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006 Canonical Ltd
+# Copyright (C) 2006-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=== modified file 'bzrlib/tests/per_repository/test_refresh_data.py'
--- a/bzrlib/tests/per_repository/test_refresh_data.py 2010-05-11 10:14:23 +0000
+++ b/bzrlib/tests/per_repository/test_refresh_data.py 2010-05-11 14:14:55 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2009 Canonical Ltd
+# Copyright (C) 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=== modified file 'bzrlib/tests/per_workingtree/test_locking.py'
--- a/bzrlib/tests/per_workingtree/test_locking.py 2010-05-06 23:41:35 +0000
+++ b/bzrlib/tests/per_workingtree/test_locking.py 2010-05-11 14:14:55 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2006 Canonical Ltd
+# Copyright (C) 2006, 2008, 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
=== modified file 'bzrlib/tests/test__chk_map.py'
--- a/bzrlib/tests/test__chk_map.py 2010-02-17 17:11:16 +0000
+++ b/bzrlib/tests/test__chk_map.py 2010-05-11 10:45:26 +0000
@@ -18,6 +18,7 @@
from bzrlib import (
chk_map,
+ inventory,
tests,
)
from bzrlib.static_tuple import StaticTuple
@@ -236,3 +237,44 @@
         self.assertEqual(("sha1:1234",), node.key())
         self.assertEqual('pref\x00fo', node._search_prefix)
         self.assertEqual({'pref\x00fo\x00': ('sha1:abcd',)}, node._items)
+
+
+class Test_BytesToTextKey(tests.TestCase):
+
+    def assertBytesToTextKey(self, key, bytes):
+        self.assertEqual(key,
+                         self.module._bytes_to_text_key(bytes))
+
+    def assertBytesToTextKeyRaises(self, bytes):
+        # These are invalid bytes, and we want to make sure the code under test
+        # raises an exception rather than segfaults, etc. We don't particularly
+        # care what exception.
+        self.assertRaises(Exception, self.module._bytes_to_text_key, bytes)
+
+    def test_file(self):
+        self.assertBytesToTextKey(('file-id', 'revision-id'),
+            'file: file-id\nparent-id\nname\nrevision-id\n'
+            'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN')
+
+    def test_invalid_no_kind(self):
+        self.assertBytesToTextKeyRaises(
+            'file file-id\nparent-id\nname\nrevision-id\n'
+            'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN')
+
+    def test_invalid_no_space(self):
+        self.assertBytesToTextKeyRaises(
+            'file:file-id\nparent-id\nname\nrevision-id\n'
+            'da39a3ee5e6b4b0d3255bfef95601890afd80709\n100\nN')
+
+    def test_invalid_too_short_file_id(self):
+        self.assertBytesToTextKeyRaises('file:file-id')
+
+    def test_invalid_too_short_parent_id(self):
+        self.assertBytesToTextKeyRaises('file:file-id\nparent-id')
+
+    def test_invalid_too_short_name(self):
+        self.assertBytesToTextKeyRaises('file:file-id\nparent-id\nname')
+
+    def test_dir(self):
+        self.assertBytesToTextKey(('dir-id', 'revision-id'),
+            'dir: dir-id\nparent-id\nname\nrevision-id')
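
The tests refer to self.module rather than a fixed import, so they are
presumably run against both the pure-Python and the compiled implementation
via the module's existing permute-for-extensions setup. They can be run on
their own with something like:

    bzr selftest -s bzrlib.tests.test__chk_map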