Rev 5542: sorting-on-repack doesn't seem to help a lot. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Thu Dec 2 21:26:50 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5542
revision-id: john at arbash-meinel.com-20101202212630-vycb3zf5uy5iz2tc
parent: john at arbash-meinel.com-20101202201238-31yot935saovm9z5
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Thu 2010-12-02 15:26:30 -0600
message:
sorting-on-repack doesn't seem to help a lot.
It adds a couple of seconds to 'build' time, because of more overhead and lots of
'old' search steps.
I'm a bit surprised at how many old steps are taken — maybe because the table is
now 'overly full', which is, of course, why we are resizing in the first place.
Total time is a fairly significant loss (2m2s). Only a couple of seconds of that
are due to the time spent resizing, so the distribution is probably somehow
'less optimal' now...
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-12-02 20:12:38 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-12-02 21:26:30 +0000
@@ -115,6 +115,9 @@
unsigned int rabin_val
# The number of bytes since the start of all sources
unsigned int global_offset
+ # How many times has this entry been considered a success?
+ unsigned short match_count
+
# The maximum number of bytes that can be matched around this pointer.
# Note that this is limited to 64kB, and a theoretical match could be
# longer. However, that will be handled by a later match
@@ -124,9 +127,6 @@
# 256 values here
unsigned char match_pre
- # How many times has this entry been considered a success?
- unsigned short match_count
-
cdef class RabinHashMap
@@ -164,12 +164,11 @@
self.free_slot = NULL
self._step = 0
- # Consider, self._next_hash & self._table_mask never see the 'upper'
- # bits of the rabin_val. Is there a decent way to 'mix that in', to
- # help keep chains from overlapping too much? (ATM 0x10001 and 0x20001
- # will overlap for any table size less than 0x10000, at *all* steps) I
- # don't know how common collisions are, but it may be more-than-normal
- # because RABIN isn't a super-strong hash.
+ # Note: Adding in the upper bits of rabin_val to _next_hash (via
+ # something like (rabin_val ^ (rabin_val >> 16)), appears to be
+ # worse than not. It might be that ^ is bleeding through similar
+ # characters (eg, content of 'aaaa' could cause a hash of
+ # 'aa' ^ 'aa' == 00.
self._next_hash = rabin_val
@cython.profile(False)
@@ -257,7 +256,7 @@
# Dummy entries are noted by having ptr == dummy_ptr
cdef rabin_entry *_table
- # TODO: two options for 'soft' resizing.
+ # TODO: two options for 'softer' resizing.
# a) 2 active tables. One for 'new' content. When 'new' is full, we
# allocate a new table of size(new)*2. We then copy all of the records
# from 'old' and free old. Then new items go into new. As an example:
@@ -316,10 +315,15 @@
# it to the new table. Though this func should be renamed 'move' at
# that point.
cdef rabin_entry *slot, *new_slot, *old_table
- cdef unsigned int i, new_offset, old_mask
- cdef unsigned long total_matches, total_steps, total_items
- cdef RabinEntrySearch search
+ cdef unsigned int i, new_offset, old_mask, rabin_val
+ cdef unsigned long total_old_steps, total_steps, total_items
+ cdef RabinEntrySearch search_old, search_new
+ cdef rabin_entry temp_entries[128]
+ cdef int temp_idx
+ cdef list all_entries
+ # search_old will maintain pointers to the old table and mask
+ search_old = RabinEntrySearch(self)
old_table = self._table
old_mask = self.table_mask
self._table = new_table
@@ -350,29 +354,51 @@
# have to walk over items we already walked. With the current
# code w/ 64 entries in a bucket, we walk, 1, 2, 3, 4, 5...
# 2047 steps.
- total_steps = total_matches = total_items = 0
+ total_steps = total_old_steps = total_items = 0
+ all_entries = []
try:
- search = RabinEntrySearch(self)
+ search_new = RabinEntrySearch(self)
for i from 0 <= i <= old_mask:
slot = old_table + i
if (slot.ptr == NULL or slot.ptr == dummy_ptr):
# An empty slot
continue
- total_items += 1
- search.start(slot.rabin_val)
- while search.free_slot == NULL:
- # Note: It should be fine to use search.next() here,
- # because there should be no dummy pointers in the new
- # table. Might want to validate that, though.
- search.next()
- search.free_slot[0] = slot[0]
- total_steps += search._step
- total_matches += search.match_count
+ temp_idx = 0
+ rabin_val = slot.rabin_val
+ search_old.start(rabin_val)
+ search_old.next()
+ del all_entries[:]
+ while search_old.entry != NULL:
+ temp_entries[temp_idx] = search_old.entry[0]
+ all_entries.append((self._weight_for_entry(search_old.entry), temp_idx))
+ temp_idx += 1
+ if temp_idx >= 128:
+ raise RuntimeError('we overflowed the temp_entries list')
+ # We've consumed this entry, delete it
+ # TODO: We should track search_old steps
+ search_old.entry.ptr = dummy_ptr
+ search_old.entry.rabin_val = 0
+ search_old.next()
+ all_entries.sort(reverse=True)
+ total_old_steps += search_old._step
+
+ search_new.start(rabin_val)
+ for _, temp_idx in all_entries:
+ total_items += 1
+ while search_new.free_slot == NULL:
+ # Note: It should be fine to use search.next() here,
+ # because there should be no dummy pointers in the new
+ # table we would want to fill in. Might want to
+ # validate that, though.
+ search_new.next()
+ search_new.free_slot[0] = temp_entries[temp_idx]
+ search_new.free_slot = NULL
+ total_steps += search_new._step
finally:
PyMem_Free(old_table)
- trace.mutter(' took %d steps, %d matches,'
+ trace.mutter(' took %d steps, %d old_steps,'
' for %d items'
- % (total_steps, total_matches,
+ % (total_steps, total_old_steps,
total_items))
def reserve(self, new_entries):
@@ -430,18 +456,51 @@
# 4) For now, we just use a dirt-simple algorithm that says
# remove-every-other entry when the bucket gets full. We have to
# use 'dummy' entries, because collisions work across buckets.
- cdef list all_entries
- cdef int weight
- cdef unsigned int offset
- cdef rabin_entry *entry
- cdef int i
- cdef int temp_idx
# Must be larger than MAX_HASH_BUCKET_ENTRIES. Also, after packing we
# add dummy entries, which can cause other hash chains to insert early.
# As such, we can have > 64 entries in a chain.
cdef rabin_entry temp_entries[128]
-
- search.start(rabin_val)
+ cdef list all_entries
+
+ search.start(rabin_val)
+ all_entries = self._find_entries_to_sort(search, temp_entries)
+ search.start(rabin_val)
+ self._refill_table_entries(search, all_entries, temp_entries)
+
+ cdef int _weight_for_entry(self, rabin_entry *entry) except? -1:
+ cdef unsigned int offset
+ cdef int weight
+
+ weight = 0
+ offset = entry.global_offset
+ while offset > 0:
+ # Every 8 bits of offset is 1 byte of copy command
+ offset >>= 8
+ weight -= 1
+ # 'pre' bytes is less valuable than 'post' bytes, because of how
+ # the matching works, so double-weight post bytes.
+ # Having hits is very valuable, more so than a byte or two
+ # available
+ weight += ((entry.match_count * 20)
+ + (entry.match_tail * 4)
+ + entry.match_pre * 2
+ + entry.global_offset & 0xF)
+ return weight
+
+ cdef _find_entries_to_sort(self, RabinEntrySearch search,
+ rabin_entry *temp_entries):
+ """Build up temp_entries into entries that need to be sorted.
+
+ :param search: A RabinEntrySearch object. We assume it has already been
+ "start()"ed and just needs to be iterated.
+ :return: A list of weights for each entry offset.
+ """
+ cdef list all_entries
+ cdef unsigned int offset
+ cdef rabin_entry *entry
+ cdef int i
+ cdef int temp_idx
+
search.next()
all_entries = []
temp_idx = 0
@@ -452,37 +511,34 @@
# content at the beginning should be favored to content at the end,
# since the delta is smaller, though only 1 byte per 2^7 bytes
# offset.
- offset = search.entry.global_offset
- while offset > 0:
- # Every 8 bits of offset is 1 byte of copy command
- offset >>= 8
- weight -= 1
- # 'pre' bytes is less valuable than 'post' bytes, because of how
- # the matching works, so double-weight post bytes.
- # Having hits is very valuable, more so than a byte or two
- # available
- weight += ((search.entry.match_count * 20)
- + (search.entry.match_tail * 4)
- + search.entry.match_pre * 2
- + search.entry.global_offset & 0xF)
- all_entries.append((weight, temp_idx))
+ all_entries.append((self._weight_for_entry(search.entry), temp_idx))
temp_entries[temp_idx] = search.entry[0]
temp_idx += 1
if temp_idx >= 128:
raise RuntimeError('we overflowed the temp_entries list')
search.next()
all_entries.sort(reverse=True)
+ return all_entries
+
+ cdef _refill_table_entries(self, RabinEntrySearch search, list all_entries,
+ rabin_entry *temp_entries):
+ """Use a sorted list of offsets, a list of entries, and fill the table
+
+ :param search: A RabinEntrySearch used for finding where the entries in
+ the table should go. Should already be seeded with start(rabin_val)
+ """
+ cdef rabin_entry *entry
+ cdef int i
+ cdef int temp_idx
# Now do the search again, and rewrite the entries. We fill in all
# slots and all dummy slots with the newly sorted entries. We also
# check _keep_map, to see if this is an entry we should omit.
i = 0
- search.start(rabin_val)
entry = search.next_with_dummy()
while entry != NULL:
if entry.ptr != dummy_ptr:
self.entry_count -= 1
- assert entry.rabin_val == 0 or entry.rabin_val == rabin_val
while i < MAX_HASH_BUCKET_ENTRIES and _keep_map[i] == c'n':
# Skip entries that are marked to be removed
temp_idx = all_entries[i][1]
More information about the bazaar-commits
mailing list