Rev 4783: Interning the start and stop group positions saves another 7MB peak mem. \o/ in http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple-chk-map

Fri Oct 23 16:46:14 BST 2009

At http://bazaar.launchpad.net/~jameinel/bzr/2.1-static-tuple-chk-map

------------------------------------------------------------
revno: 4783
revision-id: john at arbash-meinel.com-20091023154601-2gg0gcbs3s5m90c3
parent: john at arbash-meinel.com-20091023153654-n8in6fpsc8p4665h
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1-static-tuple-chk-map
timestamp: Fri 2009-10-23 10:46:01 -0500
message:
  Interning the start and stop group positions saves another 7MB peak mem. \o/
  
  I'm hoping some of this savings is the 'dark memory', but that is unclear at this point.
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'

--- a/bzrlib/groupcompress.py	2009-10-23 15:36:54 +0000
+++ b/bzrlib/groupcompress.py	2009-10-23 15:46:01 +0000
@@ -1269,6 +1269,7 @@
         """See VersionedFiles.clear_cache()"""
         self._group_cache.clear()
         self._index._graph_index.clear_cache()
+        self._index._int_cache.clear()
 
     def _check_add(self, key, lines, random_id, check_content):
         """check that version_id and lines are safe to add."""
@@ -1832,6 +1833,9 @@
         self.has_graph = parents
         self._is_locked = is_locked
         self._inconsistency_fatal = inconsistency_fatal
+        # GroupCompress records tend to have the same 'group' start + offset
+        # repeated over and over, this creates a surplus of ints
+        self._int_cache = {}
         if track_external_parent_refs:
             self._key_dependencies = knit._KeyRefs(
                 track_new_keys=track_new_keys)
@@ -2013,13 +2017,24 @@
         """Convert an index value to position details."""
         bits = node[2].split(' ')
         # It would be nice not to read the entire gzip.
-        # TODO: Intern the start and stop integers. They are *very* common
-        #       between all records in the index. See revno 4781
+        # start and stop are put into _int_cache because they are very common.
+        # They define the 'group' that an entry is in, and many groups can have
+        # thousands of objects.
+        # Branching Launchpad, for example, saves ~600k integers, at 12 bytes
+        # each, or about 7MB. Note that it might be even more when you consider
+        # how PyInt is allocated in separate slabs. And you can't return a slab
+        # to the OS if even 1 int on it is in use. Note though that Python uses
+        # a LIFO when re-using PyInt slots, which probably causes more
+        # fragmentation.
         start = int(bits[0])
+        start = self._int_cache.setdefault(start, start)
         stop = int(bits[1])
+        stop = self._int_cache.setdefault(stop, stop)
         basis_end = int(bits[2])
         delta_end = int(bits[3])
-        return node[0], start, stop, basis_end, delta_end
+        # We can't use StaticTuple here, because node[0] is a BTreeGraphIndex
+        # instance...
+        return (node[0], start, stop, basis_end, delta_end)
 
     def scan_unvalidated_index(self, graph_index):
         """Inform this _GCGraphIndex that there is an unvalidated index.