Rev 3905: Try splitting the chk stream based on prefix, rather than just 'newness'. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack

John Arbash Meinel john at arbash-meinel.com
Wed Mar 25 14:53:24 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack

------------------------------------------------------------
revno: 3905
revision-id: john at arbash-meinel.com-20090325145300-gycg1qqm2mtbm7tq
parent: john at arbash-meinel.com-20090324215215-ceot294mf4q57qh8
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: split_pack
timestamp: Wed 2009-03-25 09:53:00 -0500
message:
  Try splitting the chk stream based on prefix, rather than just 'newness'.
-------------- next part --------------
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2009-03-24 21:52:15 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2009-03-25 14:53:00 +0000
@@ -251,75 +251,63 @@
         inv = inventory.CHKInventory(None)
         self._text_refs = set()
         def _get_referenced_stream(root_keys, parse_leaf_nodes=False):
-            cur_keys = root_keys
-            while cur_keys:
-                keys_by_search_prefix = {}
-                remaining_keys.difference_update(cur_keys)
+            keys_by_search_prefix = {'':root_keys}
+            def handle_internal_node(node):
+                for prefix, value in node._items.iteritems():
+                    # We don't want to request the same key twice, and we
+                    # want to order it by the first time it is seen.
+                    # Even further, we don't want to request a key which is
+                    # not in this group of pack files (it should be in the
+                    # repo, but it doesn't have to be in the group being
+                    # packed.)
+                    # TODO: consider how to treat externally referenced chk
+                    #       pages as 'external_references' so that we
+                    #       always fill them in for stacked branches
+                    if value not in next_keys and value in remaining_keys:
+                        keys_by_search_prefix.setdefault(prefix,
+                            []).append(value)
+                        next_keys.add(value)
+            def handle_leaf_node(node):
+                # Store is None, because we know we have a LeafNode, and we
+                # just want its entries
+                for file_id, bytes in node.iteritems(None):
+                    entry = inv._bytes_to_entry(bytes)
+                    self._text_refs.add((entry.file_id, entry.revision))
+            def next_stream(subkeys):
+                remaining_keys.difference_update(subkeys)
+                stream = source_vf.get_record_stream(subkeys,
+                                                     'as-requested', True)
+                for record in stream:
+                    bytes = record.get_bytes_as('fulltext')
+                    # We don't care about search_key_func for this code,
+                    # because we only care about external references.
+                    node = chk_map._deserialise(bytes, record.key,
+                                                search_key_func=None)
+                    common_base = node._search_prefix
+                    if isinstance(node, chk_map.InternalNode):
+                        handle_internal_node(node)
+                    elif parse_leaf_nodes:
+                        handle_leaf_node(node)
+                    counter[0] += 1
+                    if pb is not None:
+                        pb.update('chk node', counter[0], total_keys)
+                    yield record
+            while keys_by_search_prefix:
                 next_keys = set()
-                def handle_internal_node(node):
-                    for prefix, value in node._items.iteritems():
-                        # We don't want to request the same key twice, and we
-                        # want to order it by the first time it is seen.
-                        # Even further, we don't want to request a key which is
-                        # not in this group of pack files (it should be in the
-                        # repo, but it doesn't have to be in the group being
-                        # packed.)
-                        # TODO: consider how to treat externally referenced chk
-                        #       pages as 'external_references' so that we
-                        #       always fill them in for stacked branches
-                        if value not in next_keys and value in remaining_keys:
-                            keys_by_search_prefix.setdefault(prefix,
-                                []).append(value)
-                            next_keys.add(value)
-                def handle_leaf_node(node):
-                    # Store is None, because we know we have a LeafNode, and we
-                    # just want its entries
-                    for file_id, bytes in node.iteritems(None):
-                        entry = inv._bytes_to_entry(bytes)
-                        self._text_refs.add((entry.file_id, entry.revision))
-                def next_stream():
-                    stream = source_vf.get_record_stream(cur_keys,
-                                                         'as-requested', True)
-                    for record in stream:
-                        bytes = record.get_bytes_as('fulltext')
-                        # We don't care about search_key_func for this code,
-                        # because we only care about external references.
-                        node = chk_map._deserialise(bytes, record.key,
-                                                    search_key_func=None)
-                        common_base = node._search_prefix
-                        if isinstance(node, chk_map.InternalNode):
-                            handle_internal_node(node)
-                        elif parse_leaf_nodes:
-                            handle_leaf_node(node)
-                        counter[0] += 1
-                        if pb is not None:
-                            pb.update('chk node', counter[0], total_keys)
-                        yield record
-                yield next_stream()
-                # Double check that we won't be emitting any keys twice
-                # If we get rid of the pre-calculation of all keys, we could
-                # turn this around and do
-                # next_keys.difference_update(seen_keys)
-                # However, we also may have references to chk pages in another
-                # pack file during autopack. We filter earlier, so we should no
-                # longer need to do this
-                # next_keys = next_keys.intersection(remaining_keys)
-                cur_keys = []
+                todo = []
+                # Empty the current keys_by_search_prefix, by popping them into
+                # a todo list. At that point, we will fill it up again, by
+                # processing those nodes
                 for prefix in sorted(keys_by_search_prefix):
-                    cur_keys.extend(keys_by_search_prefix.pop(prefix))
-        recent_roots = self._chk_id_roots[:self._RECENT_HORIZON]
-        old_roots = self._chk_id_roots[self._RECENT_HORIZON:]
-        del self._chk_id_roots
+                    todo.append(keys_by_search_prefix.pop(prefix))
+                for subkeys in todo:
+                    yield next_stream(subkeys)
         # Grab the text keys that are referenced by recent commits, so we can
         # prioritize those as well
-        for stream in _get_referenced_stream(recent_roots,
-                                            self._gather_text_refs):
-            yield stream
-        del recent_roots
-        for stream in _get_referenced_stream(old_roots,
+        for stream in _get_referenced_stream(self._chk_id_roots,
                                              self._gather_text_refs):
             yield stream
-        del old_roots
+        del self._chk_id_roots
         # while it isn't really possible for chk_id_roots to not be in the
         # local group of packs, it is possible that the tree shape has not
         # changed recently, so we need to filter _chk_p_id_roots by the
@@ -327,14 +315,9 @@
         chk_p_id_roots = [key for key in self._chk_p_id_roots
                           if key in remaining_keys]
         del self._chk_p_id_roots
-        recent_pid_roots = chk_p_id_roots[:self._RECENT_HORIZON]
-        old_pid_roots = chk_p_id_roots[self._RECENT_HORIZON:]
-        for stream in _get_referenced_stream(recent_pid_roots, False):
-            yield stream
-        del recent_pid_roots
-        for stream in _get_referenced_stream(old_pid_roots, False):
-            yield stream
-        del old_pid_roots
+        for stream in _get_referenced_stream(chk_p_id_roots, False):
+            yield stream
+        del chk_p_id_roots
         if remaining_keys:
             trace.mutter('There were %d keys in the chk index, %d of which'
                          ' were not referenced', total_keys,



More information about the bazaar-commits mailing list