Rev 5366: Find a case where we are wasting a bit of memory. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gc-build-details

John Arbash Meinel john at arbash-meinel.com
Thu Aug 5 17:27:54 BST 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gc-build-details

------------------------------------------------------------
revno: 5366
revision-id: john at arbash-meinel.com-20100805162735-172opvx34sr5gpbl
parent: pqm at pqm.ubuntu.com-20100802234934-d963xmqwx5gzevr0
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gc-build-details
timestamp: Thu 2010-08-05 11:27:35 -0500
message:
  Find a case where we are wasting a bit of memory.
  
  Specifically, the 'build_details' tuple contains a lot of wasted references,
  and we hold on to one of these for each record we are fetching.
  For something like 'bzr pack', that is every key.
  
  Just loading all the text build details on my bzr repository with:
    locations = b.repository.texts._index.get_build_details(b.repository.texts.keys())
  drops the memory consumption from:
    WorkingSize   77604KiB
  to
    WorkingSize   64640KiB
  
  That is a difference of around 12.7MB. I worked it out to a savings of about
  80 bytes/record on data that can have hundreds of thousands of records
  (on 32-bit builds).
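
The saving comes from replacing three per-record tuples (the outer 4-tuple,
the 5-element index_memo, and the ('group', None) record-details pair) with a
single fixed-slot object whose constant fields live on the class. As a rough
illustration of that trade-off (not part of the commit; BuildDetailsSketch is
a stand-in name, and sys.getsizeof is only a shallow measure):

    import sys

    class BuildDetailsSketch(object):
        """Stand-in mirroring _GCBuildDetails' per-instance layout."""
        __slots__ = ('_index', '_group_start', '_group_end', '_basis_end',
                     '_delta_end', '_parents')
        # Shared class attributes cost nothing per instance.
        method = 'group'
        compression_parent = None

        def __init__(self, parents, position_info):
            self._parents = parents
            (self._index, self._group_start, self._group_end,
             self._basis_end, self._delta_end) = position_info

    position = ('INDEX', 10, 20, 0, 5)
    parents = (('parent1',), ('parent2',))

    # The old layout kept three tuples alive per record; the parents tuple
    # is referenced by both layouts, so it is left out of the comparison.
    old = (position, None, parents, ('group', None))
    old_bytes = (sys.getsizeof(old) + sys.getsizeof(old[0])
                 + sys.getsizeof(old[3]))

    new = BuildDetailsSketch(parents, position)
    new_bytes = sys.getsizeof(new)

    print('old: %d bytes, new: %d bytes per record' % (old_bytes, new_bytes))

The exact numbers vary by Python build and pointer width, but the gap is in
the tens of bytes per record, consistent with the ~80 bytes/record above.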
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2010-05-20 02:57:52 +0000
+++ b/bzrlib/groupcompress.py	2010-08-05 16:27:35 +0000
@@ -1809,6 +1809,58 @@
         return result
 
 
+class _GCBuildDetails(object):
+    """A blob of data about the build details.
+
+    This stores the minimal data, which still allows compatibility with the
+    old API, without using as much memory.
+    """
+
+    __slots__ = ('_index', '_group_start', '_group_end', '_basis_end',
+                 '_delta_end', '_parents')
+
+    method = 'group'
+    compression_parent = None
+
+    def __init__(self, parents, position_info):
+        self._parents = parents
+        self._index = position_info[0]
+        self._group_start = position_info[1]
+        # Is this _end or length? Doesn't really matter to us
+        self._group_end = position_info[2]
+        self._basis_end = position_info[3]
+        self._delta_end = position_info[4]
+
+    def __repr__(self):
+        return '%s(%s, %s)' % (self.__class__.__name__,
+            self.index_memo, self._parents)
+
+    @property
+    def index_memo(self):
+        return (self._index, self._group_start, self._group_end,
+                self._basis_end, self._delta_end)
+
+    @property
+    def record_details(self):
+        return static_tuple.StaticTuple(self.method, None)
+
+    def __getitem__(self, offset):
+        """Compatibility thunk to act like a tuple."""
+        if offset == 0:
+            return self.index_memo
+        elif offset == 1:
+            return self.compression_parent # Always None
+        elif offset == 2:
+            return self._parents
+        elif offset == 3:
+            return self.record_details
+        else:
+            raise IndexError('offset out of range')
+
+    def __len__(self):
+        return 4
+
+
 class _GCGraphIndex(object):
     """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
 
@@ -2009,9 +2061,8 @@
                 parents = None
             else:
                 parents = entry[3][0]
-            method = 'group'
-            result[key] = (self._node_to_position(entry),
-                                  None, parents, (method, None))
+            details = _GCBuildDetails(parents, self._node_to_position(entry))
+            result[key] = details
         return result
 
     def keys(self):
@@ -2033,7 +2084,7 @@
         # each, or about 7MB. Note that it might be even more when you consider
         # how PyInt is allocated in separate slabs. And you can't return a slab
         # to the OS if even 1 int on it is in use. Note though that Python uses
-        # a LIFO when re-using PyInt slots, which probably causes more
+        # a LIFO when re-using PyInt slots, which might cause more
         # fragmentation.
         start = int(bits[0])
         start = self._int_cache.setdefault(start, start)
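
The _int_cache comment in the hunk above relies on an interning pattern:
dict.setdefault returns the already-stored object when the key is present, so
every node offset with the same value ends up referencing one shared int
instead of allocating a fresh PyInt per node. A standalone sketch of the
pattern (intern_int is a hypothetical helper; the real code calls
self._int_cache.setdefault inline):

    _int_cache = {}

    def intern_int(value):
        # Return the canonical object for this value, storing it on first use.
        return _int_cache.setdefault(value, value)

    start = intern_int(int('123456'))
    again = intern_int(int('123456'))
    assert start is again  # one shared object, not merely equal values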

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2010-02-17 17:11:16 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2010-08-05 16:27:35 +0000
@@ -1066,3 +1066,25 @@
         # consumption
         self.add_key_to_manager(('key4',), locations, block, manager)
         self.assertTrue(manager.check_is_well_utilized())
+
+
+class Test_GCBuildDetails(tests.TestCase):
+
+    def test_acts_like_tuple(self):
+        # _GCBuildDetails inlines some of the data that used to be spread out
+        # across a bunch of tuples
+        bd = groupcompress._GCBuildDetails((('parent1',), ('parent2',)),
+            ('INDEX', 10, 20, 0, 5))
+        self.assertEqual(4, len(bd))
+        self.assertEqual(('INDEX', 10, 20, 0, 5), bd[0])
+        self.assertEqual(None, bd[1]) # Compression Parent is always None
+        self.assertEqual((('parent1',), ('parent2',)), bd[2])
+        self.assertEqual(('group', None), bd[3]) # Record details
+
+    def test__repr__(self):
+        bd = groupcompress._GCBuildDetails((('parent1',), ('parent2',)),
+            ('INDEX', 10, 20, 0, 5))
+        self.assertEqual("_GCBuildDetails(('INDEX', 10, 20, 0, 5),"
+                         " (('parent1',), ('parent2',)))",
+                         repr(bd))
+
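
For reference, a usage sketch (assuming a bzrlib source tree on sys.path) of
why the sequence-style thunk keeps existing callers working: with only
__getitem__ and __len__ defined, Python's fallback iteration protocol lets
the object be indexed and tuple-unpacked exactly like the 4-tuple it
replaces, the IndexError terminating the unpacking:

    from bzrlib import groupcompress

    bd = groupcompress._GCBuildDetails(
        (('parent1',), ('parent2',)), ('INDEX', 10, 20, 0, 5))

    # Callers written against the old 4-tuple keep working unchanged.
    index_memo, compression_parent, parents, record_details = bd
    assert index_memo == ('INDEX', 10, 20, 0, 5)
    assert compression_parent is None
    assert parents == (('parent1',), ('parent2',))
    assert record_details == ('group', None)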



More information about the bazaar-commits mailing list