Rev 4412: (jam) Delay generating a delta index until we call make_delta() in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Fri Jun 5 05:50:35 BST 2009
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 4412
revision-id: pqm at pqm.ubuntu.com-20090605045030-yj7sm39ao623zoqo
parent: pqm at pqm.ubuntu.com-20090605035605-xfvzlg2to1qkop1c
parent: john at arbash-meinel.com-20090602212635-qvinez0cgr7gmt0p
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Fri 2009-06-05 05:50:30 +0100
message:
(jam) Delay generating a delta index until we call make_delta()
[saves a lot of memory during commit of large files.]
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/_groupcompress_pyx.pyx _groupcompress_c.pyx-20080724041824-yelg6ii7c7zxt4z0-1
bzrlib/groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
bzrlib/tests/test__groupcompress.py test__groupcompress_-20080724145854-koifwb7749cfzrvj-1
------------------------------------------------------------
revno: 4398.6.3
revision-id: john at arbash-meinel.com-20090602212635-qvinez0cgr7gmt0p
parent: john at arbash-meinel.com-20090602212305-vs2fwlvslptnjkxv
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.16-no-first-delta-index
timestamp: Tue 2009-06-02 16:26:35 -0500
message:
NEWS entry about improvements to 'bzr commit'
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
------------------------------------------------------------
revno: 4398.6.2
revision-id: john at arbash-meinel.com-20090602212305-vs2fwlvslptnjkxv
parent: john at arbash-meinel.com-20090602211118-fjsx4dxokahrqkrr
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.16-no-first-delta-index
timestamp: Tue 2009-06-02 16:23:05 -0500
message:
Add a TODO, marking the code that causes us to peak at 2x memory consumption
for commit of a large file, rather than 1x.
modified:
bzrlib/groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
------------------------------------------------------------
revno: 4398.6.1
revision-id: john at arbash-meinel.com-20090602211118-fjsx4dxokahrqkrr
parent: pqm at pqm.ubuntu.com-20090602153906-1q6bresxw669b34i
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.16-no-first-delta-index
timestamp: Tue 2009-06-02 16:11:18 -0500
message:
Change groupcompress.DeltaIndex to be lazy about indexing the first source.
This changes the performance characteristics of 'commit', especially of large files.
The main benefit is that during commit, we won't be doing any deltas as we add
all new content to a new group anyway.
Thus we know that we won't ever use the delta index we were creating, so
we can save both time and memory by never creating the index until it is
needed.
modified:
bzrlib/_groupcompress_pyx.pyx _groupcompress_c.pyx-20080724041824-yelg6ii7c7zxt4z0-1
bzrlib/tests/test__groupcompress.py test__groupcompress_-20080724145854-koifwb7749cfzrvj-1
=== modified file 'NEWS'
--- a/NEWS 2009-06-04 21:25:46 +0000
+++ b/NEWS 2009-06-05 04:50:30 +0000
@@ -36,6 +36,12 @@
bugs with stacking and non default formats.)
(John Arbash Meinel, #373455)
+* ``--development6-rich-root`` delays generating a delta index for the
+ first object inserted into a group. This has a beneficial impact on
+ ``bzr commit`` since each committed text goes to its own group. For
+ committing a 90MB file, it drops peak memory by about 200MB, and speeds
+ up commit from 7s => 4s. (John Arbash Meinel)
+
* Numerous operations are now faster for huge projects, i.e. those
with a large number of files and/or a large number of revisions,
particularly when the latest development format is used. These
=== modified file 'bzrlib/_groupcompress_pyx.pyx'
--- a/bzrlib/_groupcompress_pyx.pyx 2009-04-09 20:23:07 +0000
+++ b/bzrlib/_groupcompress_pyx.pyx 2009-06-02 21:11:18 +0000
@@ -118,6 +118,9 @@
self._index = NULL
safe_free(<void **>&self._source_infos)
+ def _has_index(self):
+ return (self._index != NULL)
+
def add_delta_source(self, delta, unadded_bytes):
"""Add a new delta to the source texts.
@@ -171,6 +174,9 @@
source_location = len(self._sources)
if source_location >= self._max_num_sources:
self._expand_sources()
+ if source_location != 0 and self._index == NULL:
+ # We were lazy about populating the index, create it now
+ self._populate_first_index()
self._sources.append(source)
c_source = PyString_AS_STRING(source)
c_source_size = PyString_GET_SIZE(source)
@@ -179,11 +185,24 @@
src.size = c_source_size
src.agg_offset = self._source_offset + unadded_bytes
- index = create_delta_index(src, self._index)
self._source_offset = src.agg_offset + src.size
- if index != NULL:
- free_delta_index(self._index)
- self._index = index
+ # We delay creating the index on the first insert
+ if source_location != 0:
+ index = create_delta_index(src, self._index)
+ if index != NULL:
+ free_delta_index(self._index)
+ self._index = index
+
+ cdef _populate_first_index(self):
+ cdef delta_index *index
+ if len(self._sources) != 1 or self._index != NULL:
+ raise AssertionError('_populate_first_index should only be'
+ ' called when we have a single source and no index yet')
+
+ # We know that self._index is already NULL, so whatever
+ # create_delta_index returns is fine
+ self._index = create_delta_index(&self._source_infos[0], NULL)
+ assert self._index != NULL
cdef _expand_sources(self):
raise RuntimeError('if we move self._source_infos, then we need to'
@@ -201,7 +220,10 @@
cdef unsigned long delta_size
if self._index == NULL:
- return None
+ if len(self._sources) == 0:
+ return None
+ # We were just lazy about generating the index
+ self._populate_first_index()
if not PyString_CheckExact(target_bytes):
raise TypeError('target is not a str')
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-06-03 15:27:03 +0000
+++ b/bzrlib/groupcompress.py 2009-06-05 04:50:30 +0000
@@ -754,6 +754,14 @@
After calling this, the compressor should no longer be used
"""
+ # TODO: this causes us to 'bloat' to 2x the size of content in the
+ # group. This has an impact for 'commit' of large objects.
+ # One possibility is to use self._content_chunks, and be lazy and
+ # only fill out self._content as a full string when we actually
+ # need it. That would at least drop the peak memory consumption
+ # for 'commit' down to ~1x the size of the largest file, at a
+ # cost of increased complexity within this code. 2x is still <<
+ # 3x the size of the largest file, so we are doing ok.
content = ''.join(self.chunks)
self.chunks = None
self._delta_index = None
=== modified file 'bzrlib/tests/test__groupcompress.py'
--- a/bzrlib/tests/test__groupcompress.py 2009-04-29 05:53:21 +0000
+++ b/bzrlib/tests/test__groupcompress.py 2009-06-02 21:11:18 +0000
@@ -272,6 +272,25 @@
di = self._gc_module.DeltaIndex('test text\n')
self.assertEqual('DeltaIndex(1, 10)', repr(di))
+ def test_first_add_source_doesnt_index_until_make_delta(self):
+ di = self._gc_module.DeltaIndex()
+ self.assertFalse(di._has_index())
+ di.add_source(_text1, 0)
+ self.assertFalse(di._has_index())
+ # However, asking to make a delta will trigger the index to be
+ # generated, and will generate a proper delta
+ delta = di.make_delta(_text2)
+ self.assertTrue(di._has_index())
+ self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n', delta)
+
+ def test_second_add_source_triggers_make_index(self):
+ di = self._gc_module.DeltaIndex()
+ self.assertFalse(di._has_index())
+ di.add_source(_text1, 0)
+ self.assertFalse(di._has_index())
+ di.add_source(_text2, 0)
+ self.assertTrue(di._has_index())
+
def test_make_delta(self):
di = self._gc_module.DeltaIndex(_text1)
delta = di.make_delta(_text2)
More information about the bazaar-commits
mailing list