Rev 2265: Eek out another 100ms improvement by using a single list instead of 2 in http://bzr.arbash-meinel.com/branches/bzr/0.15-dev/list_patch
John Arbash Meinel
john at arbash-meinel.com
Tue Jan 30 23:46:30 GMT 2007
At http://bzr.arbash-meinel.com/branches/bzr/0.15-dev/list_patch
------------------------------------------------------------
revno: 2265
revision-id: john at arbash-meinel.com-20070130234624-vydiov14ynhuas13
parent: john at arbash-meinel.com-20070130230331-f2ygx33kurrovx1p
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: list_patch
timestamp: Tue 2007-01-30 17:46:24 -0600
message:
Eek out another 100ms improvement by using a single list instead of 2
modified:
bzrlib/hybrid_linked_list.py hybrid_linked_list.p-20070130001028-zlexwz74tzo56o2t-1
bzrlib/knit.py knit.py-20051212171256-f056ac8f0fbe1bd9
bzrlib/tests/test_hybrid_linked_list.py test_hybrid_linked_l-20070130001028-zlexwz74tzo56o2t-2
bzrlib/versionedfile.py versionedfile.py-20060222045106-5039c71ee3b65490
-------------- next part --------------
=== modified file 'bzrlib/hybrid_linked_list.py'
--- a/bzrlib/hybrid_linked_list.py 2007-01-30 23:03:31 +0000
+++ b/bzrlib/hybrid_linked_list.py 2007-01-30 23:46:24 +0000
@@ -35,12 +35,10 @@
nodes themselves are never modified.
:ivar _head: Pointer to the first range, may be None
- :ivar _ranges: The map of ranges. This maps handle => (start, end, content)
- :ivar _links: This maps each range to the next range. It could technically
- be stored with _ranges, but it makes it easier to update links if they
- are in a separate dict. (less tuple churn)
+ :ivar _ranges: The map of ranges. This maps
+ handle => (start, end, next, content)
:ivar _length: The current length of the output content
- :cvar _max_links: When copying, flatten the text if we exceed _max_links
+ :cvar _max_ranges: When copying, flatten the text if we exceed _max_ranges
"""
# This was determined empirically on a knit with 55,000 lines of content,
@@ -64,9 +62,9 @@
#
# TODO: jam 20070130 This seems like a pretty good class to
# optimize as a C or pyrex extension.
- _max_links = 500
+ _max_ranges = 500
- __slots__ = ['_head', '_ranges', '_links', '_length']
+ __slots__ = ['_head', '_ranges', '_length']
def __init__(self, starting=[]):
"""Create a new HybridLinkedList.
@@ -85,7 +83,6 @@
self._head = None
self._ranges = []
# This is a singly linked list map for range to next range
- self._links = []
self._length = 0
if starting:
@@ -96,12 +93,11 @@
cur_range_id = self._head
_ranges = self._ranges
- _links = self._links
while cur_range_id is not None:
- start, end, content = _ranges[cur_range_id]
+ start, end, next_range_id, content = _ranges[cur_range_id]
for pos in xrange(start, end):
yield content[pos]
- cur_range_id = _links[cur_range_id]
+ cur_range_id = next_range_id
def __len__(self):
# Profiling shows __len__ to be adding a lot of overhead, so we use a
@@ -129,13 +125,12 @@
# offset, start, end are in terms of the current content
(prev_range_id, cur_range_id, offset,
- start, end, content) = self._find_nodes_for_pos(cur)
+ start, end, next_range_id, content) = self._find_nodes_for_pos(cur)
# They requested something out of range
if cur_range_id is None:
return result
- _links = self._links
_ranges = self._ranges
offset += start
@@ -143,11 +138,11 @@
while offset >= end:
# Keep skipping until we find next
offset -= end
- cur_range_id = _links[cur_range_id]
+ cur_range_id = next_range_id
if cur_range_id is None:
content = None
break
- start, end, content = _ranges[cur_range_id]
+ start, end, next_range_id, content = _ranges[cur_range_id]
offset += start
if content is None:
break
@@ -157,7 +152,7 @@
return result
else:
(prev_range_id, cur_range_id, offset,
- start, end, content) = self._find_nodes_for_pos(pos)
+ start, end, next_range_id, content) = self._find_nodes_for_pos(pos)
return content[start+offset]
def __setitem__(self, pos, value):
@@ -192,8 +187,9 @@
cur = self._head
_ranges = self._ranges
while cur is not None:
- flattened.append('%s:%s' % (cur, _ranges[cur]))
- cur = self._links[cur]
+ start, end, next, content = _ranges[cur]
+ flattened.append('%s:%s' % (cur, (start, end, content)))
+ cur = next
return 'HybridLinkedList(' + ', '.join(flattened) + ')'
def flatten(self):
@@ -212,7 +208,7 @@
prev_range_id = None
cur_range_id = self._head
cur_pos = 0
- content = start = end = None
+ content = start = end = next_range_id = None
# collapsing to local variables decreased the lsprof time from:
# 4716 4.8879 bzrlib.hybrid_linked_list:182(_find_nodes_for_pos)
@@ -225,65 +221,71 @@
# 4719 1.4945 bzrlib.hybrid_linked_list:183(_find_nodes_for_pos)
# So while it takes 300+ms to flatten, we avoid bad O(N) behavior for
# number of links.
+ # Using 500 links:
+ # 4723 0.7790 bzrlib.hybrid_linked_list:203(_find_nodes_for_pos)
+ #
+ # Collapsing to a single list instead of using to results in:
+ # 4723 0.7933 bzrlib.hybrid_linked_list:203(_find_nodes_for_pos)
_ranges = self._ranges
- _links = self._links
while cur_range_id is not None:
- start, end, content = _ranges[cur_range_id]
+ start, end, next_range_id, content = _ranges[cur_range_id]
next_pos = cur_pos + (end - start)
if pos < next_pos:
break
prev_range_id = cur_range_id
- cur_range_id = _links[cur_range_id]
+ cur_range_id = next_range_id
cur_pos = next_pos
- content = start = end = None
+ else:
+ content = start = end = next_range_id = None
return (prev_range_id, cur_range_id, pos - cur_pos,
- start, end, content)
+ start, end, next_range_id, content)
- def _new_range_id(self, start, end, content):
- """Get a new range id."""
+ def _new_range_id(self, start, end, next, content):
+ """Create a new range id."""
new_range_id = len(self._ranges)
- self._ranges.append((start, end, content))
- self._links.append(None)
+ self._ranges.append((start, end, next, content))
return new_range_id
+ def _change_next_link(self, range_id, next_range_id):
+ """Change the 'next' pointer for a given range_id."""
+ start, end, next, content = self._ranges[range_id]
+ self._ranges[range_id] = (start, end, next_range_id, content)
+
def insert(self, start, content):
"""Insert a set of entries starting at the first position."""
content_len = len(content)
- new_range_id = self._new_range_id(0, content_len, content)
-
- (prev_range_id, cur_range_id, offset,
- start, end, cur_content) = self._find_nodes_for_pos(start)
-
- _links = self._links
+ #new_range_id = self._new_range_id(0, content_len, content)
+
+ (prev_range_id, cur_range_id, offset, start, end,
+ next_range_id, cur_content) = self._find_nodes_for_pos(start)
+
+ _ranges = self._ranges
if offset == 0:
# We are inserting before an existing node, or at the exact end of
# the list
if prev_range_id is None:
# We are at _head, update its reference
- _links[new_range_id] = self._head
+ new_range_id = self._new_range_id(0, content_len, self._head,
+ content)
self._head = new_range_id
else:
- _links[new_range_id] = cur_range_id
- _links[prev_range_id] = new_range_id
+ new_range_id = self._new_range_id(0, content_len, cur_range_id,
+ content)
+ self._change_next_link(prev_range_id, new_range_id)
# We are inserting inside an existing node
elif cur_range_id is not None:
# We need to break up the current node, and insert the
# requested one
break_point = start + offset
- split_range_id = self._new_range_id(break_point, end, cur_content)
- next_range_id = _links[cur_range_id]
- _links[split_range_id] = next_range_id
- _links[new_range_id] = split_range_id
+ split_range_id = self._new_range_id(break_point, end, next_range_id,
+ cur_content)
+ new_range_id = self._new_range_id(0, content_len, split_range_id,
+ content)
- # We could make this atomic by creating a new id for the
- # initial range, and then the atomic step is
- # self._links[prev_range_id] = new_head_range
- # And then afterwards we would want to delete the cur_range_id
- # entry in the map
- self._ranges[cur_range_id] = (start, break_point, cur_content)
- _links[cur_range_id] = new_range_id
+ _ranges[cur_range_id] = (start, break_point, new_range_id,
+ cur_content)
else:
# We are appending a node
# python lists have the property that x[BIG_NUM:BIGNUM] = [10]
@@ -291,8 +293,8 @@
# range
# TODO: For now, I'm duplicating that, but it seems to make
# more sense to raise IndexError or something
- _links[new_range_id] = None
- _links[prev_range_id] = new_range_id
+ new_range_id = self._new_range_id(0, content_len, None, content_len)
+ self._change_next_link(prev_range_id, new_range_id)
self._length += content_len
def delete(self, start, end):
@@ -309,15 +311,14 @@
# Nothing to do
return
- (prev_range_id, cur_range_id, offset,
- cur_start, cur_end, content) = self._find_nodes_for_pos(start)
+ (prev_range_id, cur_range_id, offset, cur_start, cur_end,
+ next_range_id, content) = self._find_nodes_for_pos(start)
if cur_range_id is None:
# We fell off the end, nothing to do
# In python "del list[BIGNUM:BIGNUM]" does nothing
return
- _links = self._links
_ranges = self._ranges
while cur_range_id is not None:
@@ -339,21 +340,19 @@
possible_remove_length = entry_length - offset
if remove_length >= possible_remove_length:
- next_range_id = _links[cur_range_id]
if offset == 0:
# CASE 1: this node will be fully removed
if prev_range_id is None:
# Update _head
self._head = next_range_id
else:
- _links[prev_range_id] = next_range_id
- _links[cur_range_id] = -1
+ self._change_next_link(prev_range_id, next_range_id)
_ranges[cur_range_id] = None
else:
# CASE 2: we save the beginning, but remove the tail
# This only requires a node entry update
_ranges[cur_range_id] = (cur_start, cur_start + offset,
- content)
+ next_range_id, content)
prev_range_id = cur_range_id
remove_length -= possible_remove_length
self._length -= possible_remove_length
@@ -362,13 +361,14 @@
# step to next
offset = 0
cur_range_id = next_range_id
- cur_start, cur_end, content = _ranges[next_range_id]
+ (cur_start, cur_end, next_range_id,
+ content) = _ranges[cur_range_id]
else:
if offset == 0:
# CASE 3: We remove only the beginning of this node
# Only requires a node entry update
_ranges[cur_range_id] = (cur_start + remove_length, cur_end,
- content)
+ next_range_id, content)
else:
# We have a beginning and a tail, we need to create a new
# record, and update the current one
@@ -378,13 +378,10 @@
new_end = cur_start + offset
new_start = new_end + remove_length
- new_tail_range_id = self._new_range_id(new_start, cur_end,
- content)
- next_range_id = _links[cur_range_id]
- _links[new_tail_range_id] = next_range_id
-
- _links[cur_range_id] = new_tail_range_id
- _ranges[cur_range_id] = (cur_start, new_end, content)
+ new_range_id = self._new_range_id(new_start, cur_end,
+ next_range_id, content)
+ _ranges[cur_range_id] = (cur_start, new_end, new_range_id,
+ content)
# We officially removed enough, so we are done
self._length -= remove_length
break
@@ -419,11 +416,10 @@
# Without flattening on the mozilla tree, with 458 deltas, we end up
# with 4000 links. Which starts being a bottleneck in
# _find_nodes_for_pos
- if len(self._ranges) > self._max_links:
+ if len(self._ranges) > self._max_ranges:
self.flatten()
new_hll = HybridLinkedList()
new_hll._head = self._head
new_hll._ranges = self._ranges[:]
- new_hll._links = self._links[:]
new_hll._length = self._length
return new_hll
=== modified file 'bzrlib/knit.py'
--- a/bzrlib/knit.py 2007-01-17 15:37:08 +0000
+++ b/bzrlib/knit.py 2007-01-30 23:46:24 +0000
@@ -73,6 +73,7 @@
from bzrlib import (
cache_utf8,
errors,
+ hybrid_linked_list,
patiencediff,
progress,
ui,
@@ -122,7 +123,7 @@
"""Content of a knit version to which deltas can be applied."""
def __init__(self, lines):
- self._lines = lines
+ self._lines = hybrid_linked_list.HybridLinkedList(lines)
def annotate_iter(self):
"""Yield tuples of (origin, text) for each content line."""
@@ -150,7 +151,9 @@
return [text for origin, text in self._lines]
def copy(self):
- return KnitContent(self._lines[:])
+ content = KnitContent([])
+ content._lines = self._lines.copy()
+ return content
class _KnitFactory(object):
@@ -606,6 +609,13 @@
# content for any line that matches the last-checked parent.
# FIXME: save the sequence control data for delta compression
# against the most relevant parent rather than rediffing.
+ # TODO: jam 20070130 We could actually do this by sending
+ # the complete content over and just using a specific
+ # range, rather than having to extract a slice from
+ # merge_content._lines[]. Consider doing that.
+ # But it starts getting ugly if merge_content._lines
+ # is also a bunch of delta hunks. So this may be the
+ # cleanest overall.
content._lines[j:j+n] = merge_content._lines[i:i+n]
if delta:
if not annotated:
=== modified file 'bzrlib/tests/test_hybrid_linked_list.py'
--- a/bzrlib/tests/test_hybrid_linked_list.py 2007-01-30 21:04:07 +0000
+++ b/bzrlib/tests/test_hybrid_linked_list.py 2007-01-30 23:46:24 +0000
@@ -40,15 +40,21 @@
hll.delete(2, 3)
hll.delete(5, 7)
self.assertEqual([0, 1, 3, 4, 5, 8], hll)
- self.assertEqual((None, 0, 0, 0, 2, content), hll._find_nodes_for_pos(0))
- self.assertEqual((None, 0, 1, 0, 2, content), hll._find_nodes_for_pos(1))
- self.assertEqual((0, 1, 0, 3, 6, content), hll._find_nodes_for_pos(2))
- self.assertEqual((0, 1, 1, 3, 6, content), hll._find_nodes_for_pos(3))
- self.assertEqual((0, 1, 2, 3, 6, content), hll._find_nodes_for_pos(4))
- self.assertEqual((1, 2, 0, 8, 9, content), hll._find_nodes_for_pos(5))
- self.assertEqual((2, None, 0, None, None, None),
+ self.assertEqual((None, 0, 0, 0, 2, 1, content),
+ hll._find_nodes_for_pos(0))
+ self.assertEqual((None, 0, 1, 0, 2, 1, content),
+ hll._find_nodes_for_pos(1))
+ self.assertEqual((0, 1, 0, 3, 6, 2, content),
+ hll._find_nodes_for_pos(2))
+ self.assertEqual((0, 1, 1, 3, 6, 2, content),
+ hll._find_nodes_for_pos(3))
+ self.assertEqual((0, 1, 2, 3, 6, 2, content),
+ hll._find_nodes_for_pos(4))
+ self.assertEqual((1, 2, 0, 8, 9, None, content),
+ hll._find_nodes_for_pos(5))
+ self.assertEqual((2, None, 0, None, None, None, None),
hll._find_nodes_for_pos(6))
- self.assertEqual((2, None, 1, None, None, None),
+ self.assertEqual((2, None, 1, None, None, None, None),
hll._find_nodes_for_pos(7))
def test__eq__(self):
@@ -184,11 +190,10 @@
self.assertEqual(9, len(hll))
self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([1, 2, 3, None], hll._links)
- self.assertEqual([(0, 3, [0, 1, 2]),
- (0, 2, [3, 4]),
- (0, 3, [5, 6, 7]),
- (0, 1, [8]),
+ self.assertEqual([(0, 3, 1, [0, 1, 2]),
+ (0, 2, 2, [3, 4]),
+ (0, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
], hll._ranges)
def test_copy(self):
@@ -203,15 +208,14 @@
# After copying, the contents should be identical
self.assertEqual(hll._head, hll2._head)
self.assertEqual(hll._ranges, hll2._ranges)
- self.assertEqual(hll._links, hll2._links)
# The content entries should be identical items, but the rest are not
for range_id, value in enumerate(hll._ranges):
if value is not None:
- start, end, content = value
+ start, end, next, content = value
else:
continue
- start2, end2, content2 = hll2._ranges[range_id]
+ start2, end2, next2, content2 = hll2._ranges[range_id]
self.assertEqual(id(content), id(content2))
# Now modifying one should not effect the other
@@ -238,11 +242,10 @@
hll.delete(0, 3)
self.assertEqual([3, 4, 5, 6, 7, 8], hll)
self.assertEqual(1, hll._head)
- self.assertEqual([-1, 2, 3, None], hll._links)
self.assertEqual([None,
- (0, 2, [3, 4]),
- (0, 3, [5, 6, 7]),
- (0, 1, [8]),
+ (0, 2, 2, [3, 4]),
+ (0, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
], hll._ranges)
def test_delete_middle_range(self):
@@ -250,11 +253,10 @@
hll.delete(3, 5)
self.assertEqual([0, 1, 2, 5, 6, 7, 8], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([2, -1, 3, None], hll._links)
- self.assertEqual([(0, 3, [0, 1, 2]),
+ self.assertEqual([(0, 3, 2, [0, 1, 2]),
None,
- (0, 3, [5, 6, 7]),
- (0, 1, [8]),
+ (0, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
], hll._ranges)
def test_delete_last_range(self):
@@ -262,10 +264,9 @@
hll.delete(8, 9)
self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([1, 2, None, -1], hll._links)
- self.assertEqual([(0, 3, [0, 1, 2]),
- (0, 2, [3, 4]),
- (0, 3, [5, 6, 7]),
+ self.assertEqual([(0, 3, 1, [0, 1, 2]),
+ (0, 2, 2, [3, 4]),
+ (0, 3, None, [5, 6, 7]),
None,
], hll._ranges)
@@ -274,11 +275,10 @@
hll.delete(0, 2)
self.assertEqual([2, 3, 4, 5, 6, 7, 8], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([1, 2, 3, None], hll._links)
- self.assertEqual([(2, 3, [0, 1, 2]),
- (0, 2, [3, 4]),
- (0, 3, [5, 6, 7]),
- (0, 1, [8]),
+ self.assertEqual([(2, 3, 1, [0, 1, 2]),
+ (0, 2, 2, [3, 4]),
+ (0, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
], hll._ranges)
def test_delete_partial_ending(self):
@@ -286,11 +286,10 @@
hll.delete(1, 3)
self.assertEqual([0, 3, 4, 5, 6, 7, 8], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([1, 2, 3, None], hll._links)
- self.assertEqual([(0, 1, [0, 1, 2]),
- (0, 2, [3, 4]),
- (0, 3, [5, 6, 7]),
- (0, 1, [8]),
+ self.assertEqual([(0, 1, 1, [0, 1, 2]),
+ (0, 2, 2, [3, 4]),
+ (0, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
], hll._ranges)
def test_delete_middle_section(self):
@@ -298,27 +297,25 @@
hll.delete(1, 2)
self.assertEqual([0, 2, 3, 4, 5, 6, 7, 8], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([4, 2, 3, None, 1], hll._links)
- self.assertEqual([(0, 1, [0, 1, 2]),
- (0, 2, [3, 4]),
- (0, 3, [5, 6, 7]),
- (0, 1, [8]),
- (2, 3, [0, 1, 2]),
+ self.assertEqual([(0, 1, 4, [0, 1, 2]),
+ (0, 2, 2, [3, 4]),
+ (0, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
+ (2, 3, 1, [0, 1, 2]),
], hll._ranges)
# Make sure that even though we have 2 entries with the same content,
# it just copied a reference, not the actual content.
- self.assertEqual(id(hll._ranges[0][2]), id(hll._ranges[4][2]))
+ self.assertEqual(id(hll._ranges[0][-1]), id(hll._ranges[4][-1]))
def test_delete_middle(self):
hll = self.get_sectioned_hll()
hll.delete(3, 4)
self.assertEqual([0, 1, 2, 4, 5, 6, 7, 8], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([1, 2, 3, None], hll._links)
- self.assertEqual([(0, 3, [0, 1, 2]),
- (1, 2, [3, 4]),
- (0, 3, [5, 6, 7]),
- (0, 1, [8]),
+ self.assertEqual([(0, 3, 1, [0, 1, 2]),
+ (1, 2, 2, [3, 4]),
+ (0, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
], hll._ranges)
def test_delete_across_sections(self):
@@ -326,11 +323,10 @@
hll.delete(1, 7)
self.assertEqual([0, 7, 8], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([2, -1, 3, None], hll._links)
- self.assertEqual([(0, 1, [0, 1, 2]),
+ self.assertEqual([(0, 1, 2, [0, 1, 2]),
None,
- (2, 3, [5, 6, 7]),
- (0, 1, [8]),
+ (2, 3, 3, [5, 6, 7]),
+ (0, 1, None, [8]),
], hll._ranges)
def test_delete_off_end(self):
@@ -338,8 +334,7 @@
hll.delete(1, 9)
self.assertEqual([0], hll)
self.assertEqual(0, hll._head)
- self.assertEqual([None, -1, -1, -1], hll._links)
- self.assertEqual([(0, 1, [0, 1, 2]),
+ self.assertEqual([(0, 1, None, [0, 1, 2]),
None,
None,
None,
@@ -350,7 +345,6 @@
hll.delete(0, 9)
self.assertEqual([], hll)
self.assertEqual(None, hll._head)
- self.assertEqual([-1, -1, -1, -1], hll._links)
self.assertEqual([None, None, None, None], hll._ranges)
hll.insert(0, [1])
=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py 2007-01-17 15:37:08 +0000
+++ b/bzrlib/versionedfile.py 2007-01-30 23:46:24 +0000
@@ -387,7 +387,6 @@
def _apply_delta(self, lines, delta):
"""Apply delta to lines."""
- lines = list(lines)
offset = 0
for start, end, count, delta_lines in delta:
lines[offset+start:offset+end] = delta_lines
More information about the bazaar-commits
mailing list