Rev 3764: Possible fix for bug #269456. in http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/lighter_iter_files_bytes

John Arbash Meinel <john@arbash-meinel.com>
Fri Oct 3 17:14:45 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.8-dev/lighter_iter_files_bytes

------------------------------------------------------------
revno: 3764
revision-id: john@arbash-meinel.com-20081003161439-h23zdckp4z78wh3r
parent: pqm@pqm.ubuntu.com-20081002172844-d6df1l8dzpsqzyup
committer: John Arbash Meinel <john@arbash-meinel.com>
branch nick: lighter_iter_files_bytes
timestamp: Fri 2008-10-03 11:14:39 -0500
message:
  Possible fix for bug #269456.
  
  During 'get_record_stream()', unpack only one file's content map
  at a time. This helps prevent excessive memory consumption.
  Also, free the lines cache entries as their texts are consumed.
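  
  As a rough sketch of the idea (illustrative code only, not part of
  the patch below; extract_group() here stands in for
  _get_content_maps()):
  
    def split_by_prefix(keys):
        # Group key tuples by their file-id prefix (first element);
        # unprefixed single-element keys all share the '' bucket.
        split = {}
        for key in keys:
            if len(key) == 1:
                prefix = ''
            else:
                prefix = key[0]
            split.setdefault(prefix, []).append(key)
        return split
  
    def stream_texts(keys, extract_group):
        # Yield (key, text) while holding at most one file's texts.
        # extract_group maps a list of keys to {key: list_of_lines},
        # much as _get_content_maps() does for real knit records.
        for prefix, group in split_by_prefix(keys).items():
            text_map = extract_group(group)  # one file's texts only
            for key in group:
                lines = text_map.pop(key)    # free lines once consumed
                yield key, ''.join(lines)
  
  With keys such as ('file-a', 'rev-1'), ('file-b', 'rev-1') and
  ('file-a', 'rev-2'), at most one file's texts are held in memory at
  a time, instead of the whole tree's.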
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS	2008-10-02 17:28:44 +0000
+++ b/NEWS	2008-10-03 16:14:39 +0000
@@ -83,6 +83,10 @@
       repository now preserves the repository format.
       (Andrew Bennetts, #269214)
 
+    * ``bzr co`` uses less memory. It used to unpack the entire working
+      tree into memory before writing it to disk. This was slightly
+      faster, but consumed far more memory. (John Arbash Meinel, #269456)
+
     * ``bzr log`` now accepts a ``--change`` option.
       (Vincent Ladeuil, #248427)
 

=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py	2008-10-01 05:40:45 +0000
+++ b/bzrlib/knit.py	2008-10-03 16:14:39 +0000
@@ -1124,6 +1124,26 @@
             record_map[key] = record, record_details, digest, next
         return record_map
 
+    def _split_by_prefix(self, keys):
+        """For the given keys, split them up based on their prefix.
+
+        To keep memory pressure somewhat under control, split the
+        requests back into per-file-id requests; otherwise "bzr co"
+        extracts the full tree into memory before writing it to disk.
+        This should be revisited if _get_content_maps() can ever cross
+        file-id boundaries.
+
+        :param keys: An iterable of key tuples
+        :return: A dict of {prefix: [key_list]}
+        """
+        split_by_prefix = {}
+        for key in keys:
+            if len(key) == 1:
+                split_by_prefix.setdefault('', []).append(key)
+            else:
+                split_by_prefix.setdefault(key[0], []).append(key)
+        return split_by_prefix
+
     def get_record_stream(self, keys, ordering, include_delta_closure):
         """Get a stream of records for keys.
 
@@ -1223,11 +1243,18 @@
         if include_delta_closure:
             # XXX: get_content_maps performs its own index queries; allow state
             # to be passed in.
-            text_map, _ = self._get_content_maps(present_keys,
-                needed_from_fallback - absent_keys)
-            for key in present_keys:
-                yield FulltextContentFactory(key, global_map[key], None,
-                    ''.join(text_map[key]))
+            non_local_keys = needed_from_fallback - absent_keys
+            prefix_split_keys = self._split_by_prefix(present_keys)
+            prefix_split_non_local_keys = self._split_by_prefix(non_local_keys)
+            for prefix, keys in prefix_split_keys.iteritems():
+                non_local = prefix_split_non_local_keys.get(prefix, [])
+                non_local = set(non_local)
+                text_map, _ = self._get_content_maps(keys, non_local)
+                for key in keys:
+                    lines = text_map.pop(key)
+                    text = ''.join(lines)
+                    yield FulltextContentFactory(key, global_map[key], None,
+                                                 text)
         else:
             for source, keys in source_keys:
                 if source is parent_maps[0]:



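For reference, a quick demonstration of what the new _split_by_prefix()
returns. The method never touches self, so it can be exercised without
building a real KnitVersionedFiles (the keys below are hypothetical,
and this assumes the branch's bzrlib is importable):

    from bzrlib.knit import KnitVersionedFiles

    keys = [('file-a', 'rev-1'), ('rev-0',), ('file-b', 'rev-1'),
            ('file-a', 'rev-2')]
    # im_func lets us call the method without an instance (Python 2).
    print KnitVersionedFiles._split_by_prefix.im_func(None, keys)
    # -> {'': [('rev-0',)],
    #     'file-a': [('file-a', 'rev-1'), ('file-a', 'rev-2')],
    #     'file-b': [('file-b', 'rev-1')]}

Note also the text_map.pop(key) in the reworked get_record_stream()
loop: each fulltext is dropped from the content map as soon as it has
been yielded, so memory for already-streamed texts can be reclaimed
while the consumer is still iterating.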