Rev 4664: Work around bug #402623 by allowing BTreeGraphIndex(..., unlimited_cache=True). in http://bazaar.launchpad.net/~jameinel/bzr/2.0-cache-cix
John Arbash Meinel
john at arbash-meinel.com
Wed Sep 9 19:53:13 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.0-cache-cix
------------------------------------------------------------
revno: 4664
revision-id: john at arbash-meinel.com-20090909185256-rdaxy872xauoem46
parent: pqm at pqm.ubuntu.com-20090909084829-8nzz6kop4pnhoftv
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.0-cache-cix
timestamp: Wed 2009-09-09 13:52:56 -0500
message:
Work around bug #402623 by allowing BTreeGraphIndex(..., unlimited_cache=True).
The basic issue is that the access pattern for chk pages is fully random,
because the keys are 'sha1' handles. As such, we have no locality of
reference, and downloading a large project over HTTP can cause us to
redownload all of the .cix pages multiple times. The bug report
noticed the pages getting downloaded 4-5 times.
This was causing a significant increase in the total bytes downloaded.
(For Launchpad, downloading the 10MB cix file 5 times was 50MB, out of
around 160MB total download.)
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS 2009-09-09 06:06:44 +0000
+++ b/NEWS 2009-09-09 18:52:56 +0000
@@ -6,6 +6,18 @@
.. contents:: List of Releases
:depth: 1
+bzr 2.0.1 (not released yet)
+############################
+
+Bug Fixes
+*********
+
+* The CHK index pages now use an unlimited cache size. With a limited
+ cache and a large project, the random access of chk pages could cause us
+ to download the entire cix file many times.
+ (John Arbash Meinel, #402623)
+
+
bzr 2.0rc2 (not released yet)
#############################
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2009-08-17 22:11:06 +0000
+++ b/bzrlib/btree_index.py 2009-09-09 18:52:56 +0000
@@ -628,7 +628,7 @@
memory except when very large walks are done.
"""
- def __init__(self, transport, name, size):
+ def __init__(self, transport, name, size, unlimited_cache=False):
"""Create a B+Tree index object on the index name.
:param transport: The transport to read data for the index from.
@@ -638,6 +638,9 @@
the initial read (to read the root node header) can be done
without over-reading even on empty indices, and on small indices
allows single-IO to read the entire index.
+ :param unlimited_cache: If set to True, then instead of using an
+ LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
+ cache all leaf nodes.
"""
self._transport = transport
self._name = name
@@ -647,12 +650,15 @@
self._root_node = None
# Default max size is 100,000 leave values
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
- self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
- # We could limit this, but even a 300k record btree has only 3k leaf
- # nodes, and only 20 internal nodes. So the default of 100 nodes in an
- # LRU would mean we always cache everything anyway, no need to pay the
- # overhead of LRU
- self._internal_node_cache = fifo_cache.FIFOCache(100)
+ if unlimited_cache:
+ self._leaf_node_cache = {}
+ self._internal_node_cache = {}
+ else:
+ self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
+ # We use a FIFO here just to prevent possible blowout. However, a
+ # 300k record btree has only 3k leaf nodes, and only 20 internal
+ # nodes. A value of 100 scales to ~100*100*100 = 1M records.
+ self._internal_node_cache = fifo_cache.FIFOCache(100)
self._key_count = None
self._row_lengths = None
self._row_offsets = None # Start of each row, [-1] is the end
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2009-08-17 22:11:06 +0000
+++ b/bzrlib/index.py 2009-09-09 18:52:56 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007, 2008 Canonical Ltd
+# Copyright (C) 2007, 2008, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -368,7 +368,7 @@
suitable for production use. :XXX
"""
- def __init__(self, transport, name, size):
+ def __init__(self, transport, name, size, unlimited_cache=False):
"""Open an index called name on transport.
:param transport: A bzrlib.transport.Transport.
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2009-09-08 05:51:36 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2009-09-09 18:52:56 +0000
@@ -224,10 +224,14 @@
return self.index_name('text', name)
def _replace_index_with_readonly(self, index_type):
+ unlimited_cache = False
+ if index_type == 'chk':
+ unlimited_cache = True
setattr(self, index_type + '_index',
self.index_class(self.index_transport,
self.index_name(index_type, self.name),
- self.index_sizes[self.index_offset(index_type)]))
+ self.index_sizes[self.index_offset(index_type)],
+ unlimited_cache=unlimited_cache))
class ExistingPack(Pack):
@@ -1674,7 +1678,7 @@
txt_index = self._make_index(name, '.tix')
sig_index = self._make_index(name, '.six')
if self.chk_index is not None:
- chk_index = self._make_index(name, '.cix')
+ chk_index = self._make_index(name, '.cix', unlimited_cache=True)
else:
chk_index = None
result = ExistingPack(self._pack_transport, name, rev_index,
@@ -1699,7 +1703,8 @@
txt_index = self._make_index(name, '.tix', resume=True)
sig_index = self._make_index(name, '.six', resume=True)
if self.chk_index is not None:
- chk_index = self._make_index(name, '.cix', resume=True)
+ chk_index = self._make_index(name, '.cix', resume=True,
+ unlimited_cache=True)
else:
chk_index = None
result = self.resumed_pack_factory(name, rev_index, inv_index,
@@ -1735,7 +1740,7 @@
return self._index_class(self.transport, 'pack-names', None
).iter_all_entries()
- def _make_index(self, name, suffix, resume=False):
+ def _make_index(self, name, suffix, resume=False, unlimited_cache=False):
size_offset = self._suffix_offsets[suffix]
index_name = name + suffix
if resume:
@@ -1744,7 +1749,8 @@
else:
transport = self._index_transport
index_size = self._names[name][size_offset]
- return self._index_class(transport, index_name, index_size)
+ return self._index_class(transport, index_name, index_size,
+ unlimited_cache=unlimited_cache)
def _max_pack_count(self, total_revisions):
"""Return the maximum number of packs to use for total revisions.
=== modified file 'bzrlib/tests/per_repository_chk/test_supported.py'
--- a/bzrlib/tests/per_repository_chk/test_supported.py 2009-09-08 06:25:26 +0000
+++ b/bzrlib/tests/per_repository_chk/test_supported.py 2009-09-09 18:52:56 +0000
@@ -17,8 +17,10 @@
"""Tests for repositories that support CHK indices."""
from bzrlib import (
+ btree_index,
errors,
osutils,
+ repository,
)
from bzrlib.versionedfile import VersionedFiles
from bzrlib.tests.per_repository_chk import TestCaseWithRepositoryCHK
@@ -108,6 +110,39 @@
finally:
repo.unlock()
+ def test_chk_bytes_are_fully_buffered(self):
+ repo = self.make_repository('.')
+ repo.lock_write()
+ self.addCleanup(repo.unlock)
+ repo.start_write_group()
+ try:
+ sha1, len, _ = repo.chk_bytes.add_lines((None,),
+ None, ["foo\n", "bar\n"], random_id=True)
+ self.assertEqual('4e48e2c9a3d2ca8a708cb0cc545700544efb5021',
+ sha1)
+ self.assertEqual(
+ set([('sha1:4e48e2c9a3d2ca8a708cb0cc545700544efb5021',)]),
+ repo.chk_bytes.keys())
+ except:
+ repo.abort_write_group()
+ raise
+ else:
+ repo.commit_write_group()
+ # This may not always be correct if we change away from BTreeGraphIndex
+ # in the future. But for now, lets check that chk_bytes are fully
+ # buffered
+ index = repo.chk_bytes._index._graph_index._indices[0]
+ self.assertIsInstance(index, btree_index.BTreeGraphIndex)
+ self.assertIs(type(index._leaf_node_cache), dict)
+ # Re-opening the repository should also have a repo with everything
+ # fully buffered
+ repo2 = repository.Repository.open(self.get_url())
+ repo2.lock_read()
+ self.addCleanup(repo2.unlock)
+ index = repo2.chk_bytes._index._graph_index._indices[0]
+ self.assertIsInstance(index, btree_index.BTreeGraphIndex)
+ self.assertIs(type(index._leaf_node_cache), dict)
+
class TestCommitWriteGroupIntegrityCheck(TestCaseWithRepositoryCHK):
"""Tests that commit_write_group prevents various kinds of invalid data
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py 2009-08-13 19:56:26 +0000
+++ b/bzrlib/tests/test_btree_index.py 2009-09-09 18:52:56 +0000
@@ -23,6 +23,8 @@
from bzrlib import (
btree_index,
errors,
+ fifo_cache,
+ lru_cache,
osutils,
tests,
)
@@ -1115,6 +1117,30 @@
self.assertEqual({}, parent_map)
self.assertEqual(set([('one',), ('two',)]), missing_keys)
+ def test_supports_unlimited_cache(self):
+ builder = btree_index.BTreeBuilder(reference_lists=0, key_elements=1)
+ stream = builder.finish()
+ trans = get_transport(self.get_url())
+ size = trans.put_file('index', stream)
+ index = btree_index.BTreeGraphIndex(trans, 'index', size)
+ self.assertIsInstance(index._leaf_node_cache, lru_cache.LRUCache)
+ self.assertEqual(btree_index._NODE_CACHE_SIZE,
+ index._leaf_node_cache._max_cache)
+ self.assertIsInstance(index._internal_node_cache, fifo_cache.FIFOCache)
+ self.assertEqual(100, index._internal_node_cache._max_cache)
+ # No change if unlimited_cache=False is passed
+ index = btree_index.BTreeGraphIndex(trans, 'index', size,
+ unlimited_cache=False)
+ self.assertIsInstance(index._leaf_node_cache, lru_cache.LRUCache)
+ self.assertEqual(btree_index._NODE_CACHE_SIZE,
+ index._leaf_node_cache._max_cache)
+ self.assertIsInstance(index._internal_node_cache, fifo_cache.FIFOCache)
+ self.assertEqual(100, index._internal_node_cache._max_cache)
+ index = btree_index.BTreeGraphIndex(trans, 'index', size,
+ unlimited_cache=True)
+ self.assertIsInstance(index._leaf_node_cache, dict)
+ self.assertIs(type(index._internal_node_cache), dict)
+
class TestBTreeNodes(BTreeTestCase):
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2009-08-13 19:56:26 +0000
+++ b/bzrlib/tests/test_index.py 2009-09-09 18:52:56 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -1006,6 +1006,15 @@
self.assertEqual(set(), missing_keys)
self.assertEqual(set(), search_keys)
+ def test_supports_unlimited_cache(self):
+ builder = GraphIndexBuilder(0, key_elements=1)
+ stream = builder.finish()
+ trans = get_transport(self.get_url())
+ size = trans.put_file('index', stream)
+ # It doesn't matter what unlimited_cache does here, just that it can be
+ # passed
+ index = GraphIndex(trans, 'index', size, unlimited_cache=True)
+
class TestCombinedGraphIndex(TestCaseWithMemoryTransport):
More information about the bazaar-commits
mailing list