Rev 2265: Eek out another 100ms improvement by using a single list instead of 2 in http://bzr.arbash-meinel.com/branches/bzr/0.15-dev/list_patch

John Arbash Meinel john at arbash-meinel.com
Wed Jan 31 00:28:34 GMT 2007


At http://bzr.arbash-meinel.com/branches/bzr/0.15-dev/list_patch

------------------------------------------------------------
revno: 2265
revision-id: john at arbash-meinel.com-20070131002829-8q0r5zqy24y11ng7
parent: john at arbash-meinel.com-20070130230331-f2ygx33kurrovx1p
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: list_patch
timestamp: Tue 2007-01-30 18:28:29 -0600
message:
  Eek out another 100ms improvement by using a single list instead of 2
modified:
  bzrlib/hybrid_linked_list.py   hybrid_linked_list.p-20070130001028-zlexwz74tzo56o2t-1
  bzrlib/tests/test_hybrid_linked_list.py test_hybrid_linked_l-20070130001028-zlexwz74tzo56o2t-2
-------------- next part --------------
=== modified file 'bzrlib/hybrid_linked_list.py'
--- a/bzrlib/hybrid_linked_list.py	2007-01-30 23:03:31 +0000
+++ b/bzrlib/hybrid_linked_list.py	2007-01-31 00:28:29 +0000
@@ -35,12 +35,10 @@
     nodes themselves are never modified.
 
     :ivar _head: Pointer to the first range, may be None
-    :ivar _ranges: The map of ranges. This maps handle => (start, end, content)
-    :ivar _links: This maps each range to the next range. It could technically
-        be stored with _ranges, but it makes it easier to update links if they
-        are in a separate dict. (less tuple churn)
+    :ivar _ranges: The map of ranges. This maps
+                    handle => (start, end, next, content)
     :ivar _length: The current length of the output content
-    :cvar _max_links: When copying, flatten the text if we exceed _max_links
+    :cvar _max_ranges: When copying, flatten the text if we exceed _max_ranges
     """
 
     # This was determined empirically on a knit with 55,000 lines of content,
@@ -64,9 +62,9 @@
     #
     # TODO: jam 20070130 This seems like a pretty good class to
     #       optimize as a C or pyrex extension.
-    _max_links = 500
+    _max_ranges = 500
 
-    __slots__ = ['_head', '_ranges', '_links', '_length']
+    __slots__ = ['_head', '_ranges', '_length']
 
     def __init__(self, starting=[]):
         """Create a new HybridLinkedList.
@@ -85,7 +83,6 @@
         self._head = None
         self._ranges = []
         # This is a singly linked list map for range to next range
-        self._links = []
         self._length = 0
 
         if starting:
@@ -96,12 +93,11 @@
         cur_range_id = self._head
 
         _ranges = self._ranges
-        _links = self._links
         while cur_range_id is not None:
-            start, end, content = _ranges[cur_range_id]
+            start, end, next_range_id, content = _ranges[cur_range_id]
             for pos in xrange(start, end):
                 yield content[pos]
-            cur_range_id = _links[cur_range_id]
+            cur_range_id = next_range_id
 
     def __len__(self):
         # Profiling shows __len__ to be adding a lot of overhead, so we use a
@@ -129,13 +125,12 @@
 
             # offset, start, end are in terms of the current content
             (prev_range_id, cur_range_id, offset,
-             start, end, content) = self._find_nodes_for_pos(cur)
+             start, end, next_range_id, content) = self._find_nodes_for_pos(cur)
 
             # They requested something out of range
             if cur_range_id is None:
                 return result
 
-            _links = self._links
             _ranges = self._ranges
 
             offset += start
@@ -143,11 +138,11 @@
                 while offset >= end:
                     # Keep skipping until we find next
                     offset -= end
-                    cur_range_id = _links[cur_range_id]
+                    cur_range_id = next_range_id
                     if cur_range_id is None:
                         content = None
                         break
-                    start, end, content = _ranges[cur_range_id]
+                    start, end, next_range_id, content = _ranges[cur_range_id]
                     offset += start
                 if content is None:
                     break
@@ -157,7 +152,7 @@
             return result
         else:
             (prev_range_id, cur_range_id, offset,
-             start, end, content) = self._find_nodes_for_pos(pos)
+             start, end, next_range_id, content) = self._find_nodes_for_pos(pos)
             return content[start+offset]
 
     def __setitem__(self, pos, value):
@@ -192,8 +187,9 @@
         cur = self._head
         _ranges = self._ranges
         while cur is not None:
-            flattened.append('%s:%s' % (cur, _ranges[cur]))
-            cur = self._links[cur]
+            start, end, next, content = _ranges[cur]
+            flattened.append('%s:%s' % (cur, (start, end, content)))
+            cur = next
         return 'HybridLinkedList(' + ', '.join(flattened) + ')'
 
     def flatten(self):
@@ -212,7 +208,7 @@
         prev_range_id = None
         cur_range_id = self._head
         cur_pos = 0
-        content = start = end = None
+        content = start = end = next_range_id = None
 
         # collapsing to local variables decreased the lsprof time from:
         # 4716 4.8879 bzrlib.hybrid_linked_list:182(_find_nodes_for_pos)
@@ -225,65 +221,71 @@
         # 4719 1.4945 bzrlib.hybrid_linked_list:183(_find_nodes_for_pos)
         # So while it takes 300+ms to flatten, we avoid bad O(N) behavior for
         # number of links.
+        # Using 500 links:
+        # 4723 0.7790 bzrlib.hybrid_linked_list:203(_find_nodes_for_pos)
+        #
+        # Collapsing to a single list instead of using to results in:
+        # 4723 0.7933 bzrlib.hybrid_linked_list:203(_find_nodes_for_pos)
         _ranges = self._ranges
-        _links = self._links
 
         while cur_range_id is not None:
-            start, end, content = _ranges[cur_range_id]
+            start, end, next_range_id, content = _ranges[cur_range_id]
             next_pos = cur_pos + (end - start)
             if pos < next_pos:
                 break
             prev_range_id = cur_range_id
-            cur_range_id = _links[cur_range_id]
+            cur_range_id = next_range_id
             cur_pos = next_pos
-            content = start = end = None
+        else:
+            content = start = end = next_range_id = None
         return (prev_range_id, cur_range_id, pos - cur_pos,
-                start, end, content)
+                start, end, next_range_id, content)
 
-    def _new_range_id(self, start, end, content):
-        """Get a new range id."""
+    def _new_range_id(self, start, end, next, content):
+        """Create a new range id."""
         new_range_id = len(self._ranges)
-        self._ranges.append((start, end, content))
-        self._links.append(None)
+        self._ranges.append((start, end, next, content))
         return new_range_id
 
+    def _change_next_link(self, range_id, next_range_id):
+        """Change the 'next' pointer for a given range_id."""
+        start, end, next, content = self._ranges[range_id]
+        self._ranges[range_id] = (start, end, next_range_id, content)
+
     def insert(self, start, content):
         """Insert a set of entries starting at the first position."""
         content_len = len(content)
-        new_range_id = self._new_range_id(0, content_len, content)
-
-        (prev_range_id, cur_range_id, offset,
-         start, end, cur_content) = self._find_nodes_for_pos(start)
-
-        _links = self._links
+        #new_range_id = self._new_range_id(0, content_len, content)
+
+        (prev_range_id, cur_range_id, offset, start, end,
+         next_range_id, cur_content) = self._find_nodes_for_pos(start)
+
+        _ranges = self._ranges
 
         if offset == 0:
             # We are inserting before an existing node, or at the exact end of
             # the list
             if prev_range_id is None:
                 # We are at _head, update its reference
-                _links[new_range_id] = self._head
+                new_range_id = self._new_range_id(0, content_len, self._head,
+                                                  content)
                 self._head = new_range_id
             else:
-                _links[new_range_id] = cur_range_id
-                _links[prev_range_id] = new_range_id
+                new_range_id = self._new_range_id(0, content_len, cur_range_id,
+                                                  content)
+                self._change_next_link(prev_range_id, new_range_id)
         # We are inserting inside an existing node
         elif cur_range_id is not None:
             # We need to break up the current node, and insert the
             # requested one
             break_point = start + offset
-            split_range_id = self._new_range_id(break_point, end, cur_content)
-            next_range_id = _links[cur_range_id]
-            _links[split_range_id] = next_range_id
-            _links[new_range_id] = split_range_id
+            split_range_id = self._new_range_id(break_point, end, next_range_id,
+                                                cur_content)
+            new_range_id = self._new_range_id(0, content_len, split_range_id,
+                                              content)
 
-            # We could make this atomic by creating a new id for the
-            # initial range, and then the atomic step is
-            # self._links[prev_range_id] = new_head_range
-            # And then afterwards we would want to delete the cur_range_id
-            # entry in the map
-            self._ranges[cur_range_id] = (start, break_point, cur_content)
-            _links[cur_range_id] = new_range_id
+            _ranges[cur_range_id] = (start, break_point, new_range_id,
+                                     cur_content)
         else:
             # We are appending a node
             # python lists have the property that x[BIG_NUM:BIGNUM] = [10]
@@ -291,8 +293,8 @@
             # range
             # TODO: For now, I'm duplicating that, but it seems to make
             #   more sense to raise IndexError or something
-            _links[new_range_id] = None
-            _links[prev_range_id] = new_range_id
+            new_range_id = self._new_range_id(0, content_len, None, content_len)
+            self._change_next_link(prev_range_id, new_range_id)
         self._length += content_len
 
     def delete(self, start, end):
@@ -309,15 +311,14 @@
             # Nothing to do
             return
 
-        (prev_range_id, cur_range_id, offset,
-         cur_start, cur_end, content) = self._find_nodes_for_pos(start)
+        (prev_range_id, cur_range_id, offset, cur_start, cur_end,
+         next_range_id, content) = self._find_nodes_for_pos(start)
 
         if cur_range_id is None:
             # We fell off the end, nothing to do
             # In python "del list[BIGNUM:BIGNUM]" does nothing
             return
 
-        _links = self._links
         _ranges = self._ranges
 
         while cur_range_id is not None:
@@ -339,21 +340,19 @@
             possible_remove_length = entry_length - offset
 
             if remove_length >= possible_remove_length:
-                next_range_id = _links[cur_range_id]
                 if offset == 0:
                     # CASE 1: this node will be fully removed
                     if prev_range_id is None:
                         # Update _head
                         self._head = next_range_id
                     else:
-                        _links[prev_range_id] = next_range_id
-                    _links[cur_range_id] = -1
+                        self._change_next_link(prev_range_id, next_range_id)
                     _ranges[cur_range_id] = None
                 else:
                     # CASE 2: we save the beginning, but remove the tail
                     # This only requires a node entry update
                     _ranges[cur_range_id] = (cur_start, cur_start + offset,
-                                             content)
+                                             next_range_id, content)
                     prev_range_id = cur_range_id
                 remove_length -= possible_remove_length
                 self._length -= possible_remove_length
@@ -362,13 +361,14 @@
                 # step to next
                 offset = 0
                 cur_range_id = next_range_id
-                cur_start, cur_end, content = _ranges[next_range_id]
+                (cur_start, cur_end, next_range_id,
+                 content) = _ranges[cur_range_id]
             else:
                 if offset == 0:
                     # CASE 3: We remove only the beginning of this node
                     # Only requires a node entry update
                     _ranges[cur_range_id] = (cur_start + remove_length, cur_end,
-                                             content)
+                                             next_range_id, content)
                 else:
                     # We have a beginning and a tail, we need to create a new
                     # record, and update the current one
@@ -378,13 +378,10 @@
                     new_end = cur_start + offset
                     new_start = new_end + remove_length
 
-                    new_tail_range_id = self._new_range_id(new_start, cur_end,
-                                                           content)
-                    next_range_id = _links[cur_range_id]
-                    _links[new_tail_range_id] = next_range_id
-
-                    _links[cur_range_id] = new_tail_range_id
-                    _ranges[cur_range_id] = (cur_start, new_end, content)
+                    new_range_id = self._new_range_id(new_start, cur_end,
+                                                      next_range_id, content)
+                    _ranges[cur_range_id] = (cur_start, new_end, new_range_id,
+                                             content)
                 # We officially removed enough, so we are done
                 self._length -= remove_length
                 break
@@ -419,11 +416,10 @@
         # Without flattening on the mozilla tree, with 458 deltas, we end up
         # with 4000 links. Which starts being a bottleneck in
         # _find_nodes_for_pos
-        if len(self._ranges) > self._max_links:
+        if len(self._ranges) > self._max_ranges:
             self.flatten()
         new_hll = HybridLinkedList()
         new_hll._head = self._head
         new_hll._ranges = self._ranges[:]
-        new_hll._links = self._links[:]
         new_hll._length = self._length
         return new_hll

=== modified file 'bzrlib/tests/test_hybrid_linked_list.py'
--- a/bzrlib/tests/test_hybrid_linked_list.py	2007-01-30 21:04:07 +0000
+++ b/bzrlib/tests/test_hybrid_linked_list.py	2007-01-31 00:28:29 +0000
@@ -40,15 +40,21 @@
         hll.delete(2, 3)
         hll.delete(5, 7)
         self.assertEqual([0, 1, 3, 4, 5, 8], hll)
-        self.assertEqual((None, 0, 0, 0, 2, content), hll._find_nodes_for_pos(0))
-        self.assertEqual((None, 0, 1, 0, 2, content), hll._find_nodes_for_pos(1))
-        self.assertEqual((0, 1, 0, 3, 6, content), hll._find_nodes_for_pos(2))
-        self.assertEqual((0, 1, 1, 3, 6, content), hll._find_nodes_for_pos(3))
-        self.assertEqual((0, 1, 2, 3, 6, content), hll._find_nodes_for_pos(4))
-        self.assertEqual((1, 2, 0, 8, 9, content), hll._find_nodes_for_pos(5))
-        self.assertEqual((2, None, 0, None, None, None),
+        self.assertEqual((None, 0, 0, 0, 2, 1, content),
+                         hll._find_nodes_for_pos(0))
+        self.assertEqual((None, 0, 1, 0, 2, 1, content),
+                         hll._find_nodes_for_pos(1))
+        self.assertEqual((0, 1, 0, 3, 6, 2, content),
+                         hll._find_nodes_for_pos(2))
+        self.assertEqual((0, 1, 1, 3, 6, 2, content),
+                         hll._find_nodes_for_pos(3))
+        self.assertEqual((0, 1, 2, 3, 6, 2, content),
+                         hll._find_nodes_for_pos(4))
+        self.assertEqual((1, 2, 0, 8, 9, None, content),
+                         hll._find_nodes_for_pos(5))
+        self.assertEqual((2, None, 0, None, None, None, None),
                          hll._find_nodes_for_pos(6))
-        self.assertEqual((2, None, 1, None, None, None),
+        self.assertEqual((2, None, 1, None, None, None, None),
                          hll._find_nodes_for_pos(7))
 
     def test__eq__(self):
@@ -184,11 +190,10 @@
         self.assertEqual(9, len(hll))
         self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([1, 2, 3, None], hll._links)
-        self.assertEqual([(0, 3, [0, 1, 2]),
-                          (0, 2, [3, 4]),
-                          (0, 3, [5, 6, 7]),
-                          (0, 1, [8]),
+        self.assertEqual([(0, 3, 1, [0, 1, 2]),
+                          (0, 2, 2, [3, 4]),
+                          (0, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
                          ], hll._ranges)
 
     def test_copy(self):
@@ -203,15 +208,14 @@
         # After copying, the contents should be identical
         self.assertEqual(hll._head, hll2._head)
         self.assertEqual(hll._ranges, hll2._ranges)
-        self.assertEqual(hll._links, hll2._links)
 
         # The content entries should be identical items, but the rest are not
         for range_id, value in enumerate(hll._ranges):
             if value is not None:
-                start, end, content = value
+                start, end, next, content = value
             else:
                 continue
-            start2, end2, content2 = hll2._ranges[range_id]
+            start2, end2, next2, content2 = hll2._ranges[range_id]
             self.assertEqual(id(content), id(content2))
 
         # Now modifying one should not effect the other
@@ -238,11 +242,10 @@
         hll.delete(0, 3)
         self.assertEqual([3, 4, 5, 6, 7, 8], hll)
         self.assertEqual(1, hll._head)
-        self.assertEqual([-1, 2, 3, None], hll._links)
         self.assertEqual([None,
-                          (0, 2, [3, 4]),
-                          (0, 3, [5, 6, 7]),
-                          (0, 1, [8]),
+                          (0, 2, 2, [3, 4]),
+                          (0, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
                          ], hll._ranges)
 
     def test_delete_middle_range(self):
@@ -250,11 +253,10 @@
         hll.delete(3, 5)
         self.assertEqual([0, 1, 2, 5, 6, 7, 8], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([2, -1, 3, None], hll._links)
-        self.assertEqual([(0, 3, [0, 1, 2]),
+        self.assertEqual([(0, 3, 2, [0, 1, 2]),
                           None,
-                          (0, 3, [5, 6, 7]),
-                          (0, 1, [8]),
+                          (0, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
                          ], hll._ranges)
 
     def test_delete_last_range(self):
@@ -262,10 +264,9 @@
         hll.delete(8, 9)
         self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([1, 2, None, -1], hll._links)
-        self.assertEqual([(0, 3, [0, 1, 2]),
-                          (0, 2, [3, 4]),
-                          (0, 3, [5, 6, 7]),
+        self.assertEqual([(0, 3, 1, [0, 1, 2]),
+                          (0, 2, 2, [3, 4]),
+                          (0, 3, None, [5, 6, 7]),
                           None,
                          ], hll._ranges)
 
@@ -274,11 +275,10 @@
         hll.delete(0, 2)
         self.assertEqual([2, 3, 4, 5, 6, 7, 8], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([1, 2, 3, None], hll._links)
-        self.assertEqual([(2, 3, [0, 1, 2]),
-                          (0, 2, [3, 4]),
-                          (0, 3, [5, 6, 7]),
-                          (0, 1, [8]),
+        self.assertEqual([(2, 3, 1, [0, 1, 2]),
+                          (0, 2, 2, [3, 4]),
+                          (0, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
                          ], hll._ranges)
 
     def test_delete_partial_ending(self):
@@ -286,11 +286,10 @@
         hll.delete(1, 3)
         self.assertEqual([0, 3, 4, 5, 6, 7, 8], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([1, 2, 3, None], hll._links)
-        self.assertEqual([(0, 1, [0, 1, 2]),
-                          (0, 2, [3, 4]),
-                          (0, 3, [5, 6, 7]),
-                          (0, 1, [8]),
+        self.assertEqual([(0, 1, 1, [0, 1, 2]),
+                          (0, 2, 2, [3, 4]),
+                          (0, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
                          ], hll._ranges)
 
     def test_delete_middle_section(self):
@@ -298,27 +297,25 @@
         hll.delete(1, 2)
         self.assertEqual([0, 2, 3, 4, 5, 6, 7, 8], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([4, 2, 3, None, 1], hll._links)
-        self.assertEqual([(0, 1, [0, 1, 2]),
-                          (0, 2, [3, 4]),
-                          (0, 3, [5, 6, 7]),
-                          (0, 1, [8]),
-                          (2, 3, [0, 1, 2]),
+        self.assertEqual([(0, 1, 4, [0, 1, 2]),
+                          (0, 2, 2, [3, 4]),
+                          (0, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
+                          (2, 3, 1, [0, 1, 2]),
                          ], hll._ranges)
         # Make sure that even though we have 2 entries with the same content,
         # it just copied a reference, not the actual content.
-        self.assertEqual(id(hll._ranges[0][2]), id(hll._ranges[4][2]))
+        self.assertEqual(id(hll._ranges[0][-1]), id(hll._ranges[4][-1]))
 
     def test_delete_middle(self):
         hll = self.get_sectioned_hll()
         hll.delete(3, 4)
         self.assertEqual([0, 1, 2, 4, 5, 6, 7, 8], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([1, 2, 3, None], hll._links)
-        self.assertEqual([(0, 3, [0, 1, 2]),
-                          (1, 2, [3, 4]),
-                          (0, 3, [5, 6, 7]),
-                          (0, 1, [8]),
+        self.assertEqual([(0, 3, 1, [0, 1, 2]),
+                          (1, 2, 2, [3, 4]),
+                          (0, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
                          ], hll._ranges)
 
     def test_delete_across_sections(self):
@@ -326,11 +323,10 @@
         hll.delete(1, 7)
         self.assertEqual([0, 7, 8], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([2, -1, 3, None], hll._links)
-        self.assertEqual([(0, 1, [0, 1, 2]),
+        self.assertEqual([(0, 1, 2, [0, 1, 2]),
                           None,
-                          (2, 3, [5, 6, 7]),
-                          (0, 1, [8]),
+                          (2, 3, 3, [5, 6, 7]),
+                          (0, 1, None, [8]),
                          ], hll._ranges)
 
     def test_delete_off_end(self):
@@ -338,8 +334,7 @@
         hll.delete(1, 9)
         self.assertEqual([0], hll)
         self.assertEqual(0, hll._head)
-        self.assertEqual([None, -1, -1, -1], hll._links)
-        self.assertEqual([(0, 1, [0, 1, 2]),
+        self.assertEqual([(0, 1, None, [0, 1, 2]),
                           None,
                           None,
                           None,
@@ -350,7 +345,6 @@
         hll.delete(0, 9)
         self.assertEqual([], hll)
         self.assertEqual(None, hll._head)
-        self.assertEqual([-1, -1, -1, -1], hll._links)
         self.assertEqual([None, None, None, None], hll._ranges)
 
         hll.insert(0, [1])



More information about the bazaar-commits mailing list