Rev 5530: (broken, segfaulting) Trying to do a 'fancier' limit buckets heuristic. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Mon Nov 22 22:50:38 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5530
revision-id: john at arbash-meinel.com-20101122225006-y8e23h0r864l1mnr
parent: john at arbash-meinel.com-20101122203416-6q45s5j57ik4i1hn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 16:50:06 -0600
message:
(broken, segfaulting) Trying to do a 'fancier' limit buckets heuristic.
We'll see. I think I have the basic idea down, just that right now it's segfaulting...
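The heuristic, roughly, in throwaway pure Python (the dict-based entries, helper
names and the keep_map parameter here are mine for illustration, not the Cython
API in the patch below):

    def entry_weight(entry):
        # Bytes that can match after the hash point ('tail') count double
        # compared to bytes before it ('pre'); entries far into the source
        # pay one point per byte needed to encode their offset.
        weight = 2 * entry['match_tail'] + entry['match_pre']
        offset = entry['global_offset']
        while offset > 0:
            offset >>= 8
            weight -= 1
        return weight

    def limit_bucket(entries, keep_map):
        # Rank the over-full bucket best-first, then keep the i-th ranked
        # entry only where keep_map[i] == 'y' (dense near the top of the
        # ranking, sparse near the bottom; the real map is in _rabin_consts.h).
        ranked = sorted(entries, key=entry_weight, reverse=True)
        return [e for i, e in enumerate(ranked)
                if i < len(keep_map) and keep_map[i] == 'y']

The intent is that an over-full bucket shrinks back to 32 entries, biased toward
entries with lots of matchable bytes around them and toward early offsets.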
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-22 20:34:16 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-22 22:50:06 +0000
@@ -50,6 +50,7 @@
    void RABIN_REMOVE(unsigned int val, unsigned char old_data)
    void RABIN_UPDATE(unsigned int val, unsigned char old_data,
                      unsigned char new_data)
+    char _keep_map[64]

cdef extern from "delta.h":
    unsigned long get_delta_hdr_size(unsigned char **datap,
@@ -66,6 +67,7 @@
MAX_MATCH_TAIL = 65535
+
cdef struct rabin_entry:
    # A pointer to the actual matching bytes
    const_data ptr
@@ -83,6 +85,12 @@
    unsigned char match_pre

+cdef class RabinEntryProxy:
+    """Just a wrapper around a rabin_entry struct, so we can use python."""
+
+    cdef rabin_entry *entry
+
+
cdef class RabinHashMap
# This encapsulates the context needed to search for multiple values that match
@@ -120,6 +128,26 @@
        self._step = 0
        self._next_hash = rabin_val

+    cdef rabin_entry* next_with_dummy(self) except? NULL:
+        cdef rabin_entry *slot
+        while self._step < self._table_mask:
+            # print self.rabin_val, self._step, (
+            #     self._next_hash & self._table_mask)
+            slot = self._table + (self._next_hash & self._table_mask)
+            self._step += 1
+            self._next_hash = self._next_hash + self._step
+            if slot.ptr == NULL:
+                # We found a blank entry, so this chain is done
+                return NULL
+            elif slot.ptr == dummy_ptr:
+                return slot
+            if slot.rabin_val != self.rabin_val:
+                continue
+            # We found an entry that matches exactly, return it
+            self.match_count += 1
+            return slot
+        raise RuntimeError('Too many steps in next_with_dummy()')
+
    cdef next(self):
        cdef rabin_entry *slot
        # This uses Quadratic Probing:
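Aside: the new next_with_dummy() above walks the same probe chain as next(),
but also hands back 'dummy' (tombstone) slots so the caller can rewrite them in
place. A rough pure-Python picture of that probing (the DUMMY sentinel, dict
entries and generator shape are illustrative only, not the real struct layout):

    DUMMY = object()   # plays the role of dummy_ptr

    def probe_with_dummy(table, rabin_val, table_mask):
        # Probe with an ever-growing step, as the Cython code does.
        next_hash = rabin_val
        step = 0
        while step < table_mask:
            idx = next_hash & table_mask
            step += 1
            next_hash += step
            entry = table[idx]
            if entry is None:
                return              # empty slot: the chain ends here
            if entry is DUMMY:
                yield idx, None     # tombstone: reusable, keep walking
                continue
            if entry['rabin_val'] == rabin_val:
                yield idx, entry    # a live entry for this bucket
        raise RuntimeError('too many probe steps')

The point of the dummies is that other buckets' probe chains may pass through
these slots, so the slots must stay occupied even after their entries are
evicted.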
@@ -335,24 +363,61 @@
        # 4) For now, we just use a dirt-simple algorithm that says
        #    remove-every-other entry when the bucket gets full. We have to
        #    use 'dummy' entries, because collisions work across buckets.
+        cdef RabinEntryProxy proxy
+        cdef list all_entries
+        cdef int weight
+        cdef unsigned int offset
+        cdef rabin_entry *entry
+        cdef int i
+        # Must be larger than MAX_HASH_BUCKET_ENTRIES
+        cdef rabin_entry temp_entries[70]
+
        search.start(rabin_val)
        search.next()
-        # We will remove all 'odd' entries, by pointing them at the dummy
-        # pointer. We start the loop at an 'even' (0 is even), and then step
-        # once to get to odd, set that to dummy, and step again.
-        count = 0
+        all_entries = []
+        i = 0
        while search.entry != NULL:
-            search.next()
-            if search.entry == NULL:
-                break
-            search.entry.rabin_val = 0
-            search.entry.ptr = dummy_ptr
-            self.entry_count -= 1
-            # We could set the search.free_slot here, but we don't actually
-            # care.
-            count += 1
-            search.next()
-        # print "dummified %d entries" % (count,)
+            proxy.entry = search.entry
+            # We're going for a 'weighted entry' style of sorting. The idea is
+            # that entries that have a larger area 'around' them can provide
+            # better matches than ones with less. Also, that all things equal,
+            # content at the beginning should be favored to content at the end,
+            # since the delta is smaller, though only 1 byte per 2^7 bytes
+            # offset.
+            offset = search.entry.global_offset
+            while offset > 0:
+                # Every 8 bits of offset is 1 byte of copy command
+                offset >>= 8
+                weight -= 1
+            # 'pre' bytes is less valuable than 'post' bytes, because of how
+            # the matching works, so double-weight post bytes.
+            weight += (search.entry.match_tail * 2) + search.entry.match_pre
+            all_entries.append((weight, i))
+            temp_entries[i] = search.entry[0]
+            i += 1
+            if i > 70:
+                raise RuntimeError('we overflowed the temp_entries list')
+        all_entries.sort(reverse=True)
+        assert len(all_entries) == 64
+
+        # Now do the search again, and rewrite the entries. We fill in all
+        # slots and all dummy slots with the newly sorted entries. We also
+        # check _keep_map, to see if this is an entry we should omit.
+        i = 0
+        search.start(rabin_val)
+        entry = search.next_with_dummy()
+        while entry != NULL:
+            if i > MAX_HASH_BUCKET_ENTRIES or _keep_map[i] == c'n':
+                # We won't keep this entry, but we have to keep a 'dummy' entry
+                # because of all the other possible hash chains that would step
+                # on it
+                entry.ptr = dummy_ptr
+                entry.rabin_val = 0
+            else:
+                entry[0] = temp_entries[all_entries[i][1]]
+            entry = search.next_with_dummy()
+            i += 1
+        self.entry_count -= 32

    cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
                          unsigned int val, Py_ssize_t match_pre,
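To make the weighting above concrete, a worked example with made-up numbers
(assuming weight starts at zero for each entry):

    global_offset = 70000: 70000 >> 8 = 273 -> 1 -> 0   (3 shifts, penalty 3)
    match_tail = 10, match_pre = 4
    weight = 2*10 + 4 - 3 = 21
    the same match at global_offset = 200 needs only 1 offset byte:
    weight = 2*10 + 4 - 1 = 23

So, all else being equal, earlier content wins.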
=== modified file 'bzrlib/_rabin_consts.h'
--- a/bzrlib/_rabin_consts.h 2010-10-20 22:34:22 +0000
+++ b/bzrlib/_rabin_consts.h 2010-11-22 22:50:06 +0000
@@ -121,3 +121,11 @@
// Update a rabin hash by removing old data, and adding new
#define RABIN_UPDATE(val, old_data, new_data) \
RABIN_REMOVE(val, old_data); RABIN_ADD(val, new_data)
+
+// Remove every 5th, remove every 4th, remove every 3rd, keep every other
+// keep every 3rd, keep every 4th, keep every 5th.
+// This matches exactly 32 entries to be kept out of 64.
+static char _keep_map[64] = "yyyynyyyyn" "yyynyyyn"
+                            "yynyynyyn" "ynynynynyn"
+                            "ynnynnynn" "ynnnynnn"
+                            "ynnnnynnnn";