Rev 4664: Work around bug #402623 by allowing BTreeGraphIndex(..., unlimited_cache=True). in http://bazaar.launchpad.net/~jameinel/bzr/2.0-cache-cix
John Arbash Meinel
john at arbash-meinel.com
Wed Sep 9 19:53:13 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/2.0-cache-cix
------------------------------------------------------------
revno: 4664
revision-id: john at arbash-meinel.com-20090909185256-rdaxy872xauoem46
parent: pqm at pqm.ubuntu.com-20090909084829-8nzz6kop4pnhoftv
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.0-cache-cix
timestamp: Wed 2009-09-09 13:52:56 -0500
message:
Work around bug #402623 by allowing BTreeGraphIndex(..., unlimited_cache=True).
The basic issue is that the access pattern for chk pages is fully random,
because the keys are 'sha1' handles. As such, we have no locality of
reference, and downloading a large project over HTTP can cause us to
redownload all of the .cix pages multiple times. The bug report
noticed the pages getting downloaded 4-5 times.
This was causing a significant increase in the total bytes downloaded.
(For Launchpad, downloading the 10MB cix file 5 times was 50MB, out of
around 160MB total download.)
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS 2009-09-09 06:06:44 +0000
+++ b/NEWS 2009-09-09 18:52:56 +0000
@@ -6,6 +6,18 @@
.. contents:: List of Releases
:depth: 1
+bzr 2.0.1 (not released yet)
+############################
+
+Bug Fixes
+*********
+
+* The CHK index pages now use an unlimited cache size. With a limited
+ cache and a large project, the random access of chk pages could cause us
+ to download the entire cix file many times.
+ (John Arbash Meinel, #402623)
+
+
bzr 2.0rc2 (not released yet)
#############################
=== modified file 'bzrlib/btree_index.py'
--- a/bzrlib/btree_index.py 2009-08-17 22:11:06 +0000
+++ b/bzrlib/btree_index.py 2009-09-09 18:52:56 +0000
@@ -628,7 +628,7 @@
memory except when very large walks are done.
"""
- def __init__(self, transport, name, size):
+ def __init__(self, transport, name, size, unlimited_cache=False):
"""Create a B+Tree index object on the index name.
:param transport: The transport to read data for the index from.
@@ -638,6 +638,9 @@
the initial read (to read the root node header) can be done
without over-reading even on empty indices, and on small indices
allows single-IO to read the entire index.
+ :param unlimited_cache: If set to True, then instead of using an
+ LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
+ cache all leaf nodes.
"""
self._transport = transport
self._name = name
@@ -647,12 +650,15 @@
self._root_node = None
# Default max size is 100,000 leave values
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
- self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
- # We could limit this, but even a 300k record btree has only 3k leaf
- # nodes, and only 20 internal nodes. So the default of 100 nodes in an
- # LRU would mean we always cache everything anyway, no need to pay the
- # overhead of LRU
- self._internal_node_cache = fifo_cache.FIFOCache(100)
+ if unlimited_cache:
+ self._leaf_node_cache = {}
+ self._internal_node_cache = {}
+ else:
+ self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
+ # We use a FIFO here just to prevent possible blowout. However, a
+ # 300k record btree has only 3k leaf nodes, and only 20 internal
+ # nodes. A value of 100 scales to ~100*100*100 = 1M records.
+ self._internal_node_cache = fifo_cache.FIFOCache(100)
self._key_count = None
self._row_lengths = None
self._row_offsets = None # Start of each row, [-1] is the end
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2009-08-17 22:11:06 +0000
+++ b/bzrlib/index.py 2009-09-09 18:52:56 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007, 2008 Canonical Ltd
+# Copyright (C) 2007, 2008, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -368,7 +368,7 @@
suitable for production use. :XXX
"""
- def __init__(self, transport, name, size):
+ def __init__(self, transport, name, size, unlimited_cache=False):
"""Open an index called name on transport.
:param transport: A bzrlib.transport.Transport.
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2009-09-08 05:51:36 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2009-09-09 18:52:56 +0000
@@ -224,10 +224,14 @@
return self.index_name('text', name)
def _replace_index_with_readonly(self, index_type):
+ unlimited_cache = False
+ if index_type == 'chk':
+ unlimited_cache = True
setattr(self, index_type + '_index',
self.index_class(self.index_transport,
self.index_name(index_type, self.name),
- self.index_sizes[self.index_offset(index_type)]))
+ self.index_sizes[self.index_offset(index_type)],
+ unlimited_cache=unlimited_cache))
class ExistingPack(Pack):
@@ -1674,7 +1678,7 @@
txt_index = self._make_index(name, '.tix')
sig_index = self._make_index(name, '.six')
if self.chk_index is not None:
- chk_index = self._make_index(name, '.cix')
+ chk_index = self._make_index(name, '.cix', unlimited_cache=True)
else:
chk_index = None
result = ExistingPack(self._pack_transport, name, rev_index,
@@ -1699,7 +1703,8 @@
txt_index = self._make_index(name, '.tix', resume=True)
sig_index = self._make_index(name, '.six', resume=True)
if self.chk_index is not None:
- chk_index = self._make_index(name, '.cix', resume=True)
+ chk_index = self._make_index(name, '.cix', resume=True,
+ unlimited_cache=True)
else:
chk_index = None
result = self.resumed_pack_factory(name, rev_index, inv_index,
@@ -1735,7 +1740,7 @@
return self._index_class(self.transport, 'pack-names', None
).iter_all_entries()
- def _make_index(self, name, suffix, resume=False):
+ def _make_index(self, name, suffix, resume=False, unlimited_cache=False):
size_offset = self._suffix_offsets[suffix]
index_name = name + suffix
if resume:
@@ -1744,7 +1749,8 @@
else:
transport = self._index_transport
index_size = self._names[name][size_offset]
- return self._index_class(transport, index_name, index_size)
+ return self._index_class(transport, index_name, index_size,
+ unlimited_cache=unlimited_cache)
def _max_pack_count(self, total_revisions):
"""Return the maximum number of packs to use for total revisions.
=== modified file 'bzrlib/tests/per_repository_chk/test_supported.py'
--- a/bzrlib/tests/per_repository_chk/test_supported.py 2009-09-08 06:25:26 +0000
+++ b/bzrlib/tests/per_repository_chk/test_supported.py 2009-09-09 18:52:56 +0000
@@ -17,8 +17,10 @@
"""Tests for repositories that support CHK indices."""
from bzrlib import (
+ btree_index,
errors,
osutils,
+ repository,
)
from bzrlib.versionedfile import VersionedFiles
from bzrlib.tests.per_repository_chk import TestCaseWithRepositoryCHK
@@ -108,6 +110,39 @@
finally:
repo.unlock()
+ def test_chk_bytes_are_fully_buffered(self):
+ repo = self.make_repository('.')
+ repo.lock_write()
+ self.addCleanup(repo.unlock)
+ repo.start_write_group()
+ try:
+ sha1, len, _ = repo.chk_bytes.add_lines((None,),
+ None, ["foo\n", "bar\n"], random_id=True)
+ self.assertEqual('4e48e2c9a3d2ca8a708cb0cc545700544efb5021',
+ sha1)
+ self.assertEqual(
+ set([('sha1:4e48e2c9a3d2ca8a708cb0cc545700544efb5021',)]),
+ repo.chk_bytes.keys())
+ except:
+ repo.abort_write_group()
+ raise
+ else:
+ repo.commit_write_group()
+ # This may not always be correct if we change away from BTreeGraphIndex
+ # in the future. But for now, lets check that chk_bytes are fully
+ # buffered
+ index = repo.chk_bytes._index._graph_index._indices[0]
+ self.assertIsInstance(index, btree_index.BTreeGraphIndex)
+ self.assertIs(type(index._leaf_node_cache), dict)
+ # Re-opening the repository should also have a repo with everything
+ # fully buffered
+ repo2 = repository.Repository.open(self.get_url())
+ repo2.lock_read()
+ self.addCleanup(repo2.unlock)
+ index = repo2.chk_bytes._index._graph_index._indices[0]
+ self.assertIsInstance(index, btree_index.BTreeGraphIndex)
+ self.assertIs(type(index._leaf_node_cache), dict)
+
class TestCommitWriteGroupIntegrityCheck(TestCaseWithRepositoryCHK):
"""Tests that commit_write_group prevents various kinds of invalid data
=== modified file 'bzrlib/tests/test_btree_index.py'
--- a/bzrlib/tests/test_btree_index.py 2009-08-13 19:56:26 +0000
+++ b/bzrlib/tests/test_btree_index.py 2009-09-09 18:52:56 +0000
@@ -23,6 +23,8 @@
from bzrlib import (
btree_index,
errors,
+ fifo_cache,
+ lru_cache,
osutils,
tests,
)
@@ -1115,6 +1117,30 @@
self.assertEqual({}, parent_map)
self.assertEqual(set([('one',), ('two',)]), missing_keys)
+ def test_supports_unlimited_cache(self):
+ builder = btree_index.BTreeBuilder(reference_lists=0, key_elements=1)
+ stream = builder.finish()
+ trans = get_transport(self.get_url())
+ size = trans.put_file('index', stream)
+ index = btree_index.BTreeGraphIndex(trans, 'index', size)
+ self.assertIsInstance(index._leaf_node_cache, lru_cache.LRUCache)
+ self.assertEqual(btree_index._NODE_CACHE_SIZE,
+ index._leaf_node_cache._max_cache)
+ self.assertIsInstance(index._internal_node_cache, fifo_cache.FIFOCache)
+ self.assertEqual(100, index._internal_node_cache._max_cache)
+ # No change if unlimited_cache=False is passed
+ index = btree_index.BTreeGraphIndex(trans, 'index', size,
+ unlimited_cache=False)
+ self.assertIsInstance(index._leaf_node_cache, lru_cache.LRUCache)
+ self.assertEqual(btree_index._NODE_CACHE_SIZE,
+ index._leaf_node_cache._max_cache)
+ self.assertIsInstance(index._internal_node_cache, fifo_cache.FIFOCache)
+ self.assertEqual(100, index._internal_node_cache._max_cache)
+ index = btree_index.BTreeGraphIndex(trans, 'index', size,
+ unlimited_cache=True)
+ self.assertIsInstance(index._leaf_node_cache, dict)
+ self.assertIs(type(index._internal_node_cache), dict)
+
class TestBTreeNodes(BTreeTestCase):
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2009-08-13 19:56:26 +0000
+++ b/bzrlib/tests/test_index.py 2009-09-09 18:52:56 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -1006,6 +1006,15 @@
self.assertEqual(set(), missing_keys)
self.assertEqual(set(), search_keys)
+ def test_supports_unlimited_cache(self):
+ builder = GraphIndexBuilder(0, key_elements=1)
+ stream = builder.finish()
+ trans = get_transport(self.get_url())
+ size = trans.put_file('index', stream)
+ # It doesn't matter what unlimited_cache does here, just that it can be
+ # passed
+ index = GraphIndex(trans, 'index', size, unlimited_cache=True)
+
class TestCombinedGraphIndex(TestCaseWithMemoryTransport):
More information about the bazaar-commits
mailing list