Rev 5533: Trying to do some instrumentation of where the time went. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Tue Nov 23 21:10:43 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5533
revision-id: john at arbash-meinel.com-20101123211007-v5eu386vc5jv0vbj
parent: john at arbash-meinel.com-20101123185819-016sfnhqupjnat7p
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Tue 2010-11-23 15:10:07 -0600
message:
Trying to do some instrumentation of where the time went.
Looks to be all time spent in make_delta_source. I wonder if the new
heuristic penalizes us there, by creating more 'good but not great' matches.
Basic timing:
spent 0.676s in add_source
spent 1.689s in add_delta_source
spent 132.955s in make_delta_source
Trying to profile make_delta_source doesn't work so well:
spent 0.653s in add_source
spent 1.789s in add_delta_source
spent 1246.300s in make_delta_source
spent 669.533s in _find_better_match
that's a *lot* of overhead (10x overhead for a couple of time.clock() calls)
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-23 18:58:19 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-23 21:10:07 +0000
@@ -70,12 +70,30 @@
import time
cdef object _timer
_timer = time.clock
-cdef double total_time
-total_time = 0.0
+cdef double add_total_time
+add_total_time = 0.0
+cdef double add_delta_total_time
+add_delta_total_time = 0.0
+cdef double make_delta_total_time
+make_delta_total_time = 0.0
+cdef double find_better_total_time
+find_better_total_time = 0.0
+cdef int matches_checked
+matches_checked = 0
+cdef int try_again_count
+try_again_count = 0
+cdef int try_again_matches_checked
+try_again_matches_checked = 0
import atexit, sys
def report_total_time():
- sys.stderr.write('spent %.3fs in _limit_hash_entries\n' % (total_time,))
+ sys.stderr.write('spent %.3fs in add_source\n' % (add_total_time,))
+ sys.stderr.write('spent %.3fs in add_delta_source\n' % (add_delta_total_time,))
+ sys.stderr.write('spent %.3fs in make_delta_source\n' % (make_delta_total_time,))
+ sys.stderr.write('spent %.3fs in _find_better_match\n' % (find_better_total_time,))
+ sys.stderr.write('%d try again, %d checks resulted in retry,'
+ ' %d total matches checked\n' % (
+ try_again_count, try_again_matches_checked, matches_checked))
atexit.register(report_total_time)
@@ -436,10 +454,7 @@
entry.global_offset = global_offset
self.entry_count += 1
if search.match_count + 1 >= MAX_HASH_BUCKET_ENTRIES:
- global total_time
- c = _timer()
self._limit_hash_entries(val, search)
- total_time += <double>(_timer() - c)
return entry
def _py_add(self, val, global_offset, pre_bytes, post_bytes):
@@ -748,18 +763,22 @@
"""
if not PyString_CheckExact(source):
raise TypeError("source must be a str, not %s" % (type(source),))
+ t = _timer()
start_offset = self.total_source_bytes + extra_offset
new_info = SourceInfo(source, start_offset)
self.sources.append(new_info)
self._hash_map.reserve(len(source) / RABIN_WINDOW)
self._compute_offsets(source, start_offset)
self.total_source_bytes += len(source) + extra_offset
+ global add_total_time
+ add_total_time += _timer() - t
def add_delta_source(self, delta_source, extra_offset=0):
"""Add more source content for this delta data."""
if not PyString_CheckExact(delta_source):
raise TypeError("delta_source must be a str, not %s"
% (type(delta_source),))
+ t = _timer()
start_offset = self.total_source_bytes + extra_offset
new_info = SourceInfo(delta_source, start_offset)
self.sources.append(new_info)
@@ -767,14 +786,20 @@
self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
self._compute_offsets_from_delta(delta_source, start_offset)
self.total_source_bytes += len(delta_source) + extra_offset
+ global add_delta_total_time
+ add_delta_total_time += _timer() - t
def make_delta(self, content, max_delta_size=0, noisy=False):
"""Create a delta stream for the given content vs the current state."""
cdef _DeltaCreator creator
if not self.sources:
return None
+ t = _timer()
creator = _DeltaCreator(self, content, max_delta_size, noisy)
- return creator.create()
+ val = creator.create()
+ global make_delta_total_time
+ make_delta_total_time += _timer() - t
+ return val
cdef class _DeltaCreator:
@@ -834,8 +859,8 @@
self.out[self.out_pos] = <unsigned char>(i)
self.out_pos += 1
- cdef _find_better_match(self, const_data data, const_data data_end,
- unsigned int cur_rabin_val):
+ cdef int _find_better_match(self, const_data data, const_data data_end,
+ unsigned int cur_rabin_val) except -1:
cdef rabin_entry *cur_entry
cdef const_data cur
cdef const_data ref
@@ -875,7 +900,8 @@
self.match_entry = cur_entry
if cur_match_size >= 4096:
# Stop searching for better matches
- return
+ return search.match_count
+ return search.match_count
cdef _finalize_pending_insert(self):
cdef unsigned char insert_byte
@@ -1000,6 +1026,7 @@
cdef int i
cdef unsigned char c
cdef Py_ssize_t target_len
+ cdef int count
if PyString_GET_SIZE(self.content) < RABIN_WINDOW:
# We don't have enough content to match, so don't try
@@ -1021,19 +1048,27 @@
self.insert_count = RABIN_WINDOW
self.match_size = 0
+ count = 0
while data < data_end:
if self.match_size < 4096:
# Our current match isn't "good enough", so look for a better
# one
# Pull in the next data character
RABIN_UPDATE(cur_rabin_val, data[-RABIN_WINDOW], data[0])
- self._find_better_match(data, data_end, cur_rabin_val)
+ t = _timer()
+ count = self._find_better_match(data, data_end, cur_rabin_val)
+ global find_better_total_time, matches_checked
+ find_better_total_time += _timer() - t
+ matches_checked += (count - 1)
if self.match_size < 4:
# Our best match is only 4 extra bytes, just count this as an
# insert
self._output_one(data[0])
data += 1
self.match_size = 0
+ global try_again_count, try_again_matches_checked
+ try_again_count += 1
+ try_again_matches_checked += (count -1)
else:
data = self._output_match(data)
if self.match_size < 4096:
More information about the bazaar-commits
mailing list