Rev 5530: (broken, segfaulting) Trying to do a 'fancier' limit buckets heuristic. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Mon Nov 22 22:50:38 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5530
revision-id: john at arbash-meinel.com-20101122225006-y8e23h0r864l1mnr
parent: john at arbash-meinel.com-20101122203416-6q45s5j57ik4i1hn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Mon 2010-11-22 16:50:06 -0600
message:
  (broken, segfaulting) Trying to do a 'fancier' limit buckets heuristic.
  
  We'll see. I think I have the basic idea down, just that right now it's segfaulting...
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-11-22 20:34:16 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-11-22 22:50:06 +0000
@@ -50,6 +50,7 @@
     void RABIN_REMOVE(unsigned int val, unsigned char old_data)
     void RABIN_UPDATE(unsigned int val, unsigned char old_data,
                       unsigned char new_data)
+    char _keep_map[64]
 
 cdef extern from "delta.h":
     unsigned long get_delta_hdr_size(unsigned char **datap,
@@ -66,6 +67,7 @@
 MAX_MATCH_TAIL = 65535
 
 
+
 cdef struct rabin_entry:
     # A pointer to the actual matching bytes
     const_data ptr
@@ -83,6 +85,12 @@
     unsigned char match_pre
 
 
+cdef class RabinEntryProxy:
+    """Just a wrapper around a rabin_entry struct, so we can use python."""
+
+    cdef rabin_entry *entry
+
+
 cdef class RabinHashMap
 
 # This encapsulates the context needed to search for multiple values that match
@@ -120,6 +128,26 @@
         self._step = 0
         self._next_hash = rabin_val
 
+    cdef rabin_entry* next_with_dummy(self) except? NULL:
+        cdef rabin_entry *slot
+        while self._step < self._table_mask:
+            # print self.rabin_val, self._step, (
+            #         self._next_hash & self._table_mask)
+            slot = self._table + (self._next_hash & self._table_mask)
+            self._step += 1
+            self._next_hash = self._next_hash + self._step
+            if slot.ptr == NULL:
+                # We found a blank entry, so this chain is done
+                return NULL
+            elif slot.ptr == dummy_ptr:
+                return slot
+            if slot.rabin_val != self.rabin_val:
+                continue
+            # We found an entry that matches exactly, return it
+            self.match_count += 1
+            return slot
+        raise RuntimeError('Too many steps in next_with_dummy()')
+
     cdef next(self):
         cdef rabin_entry *slot
         # This uses Quadratic Probing:
@@ -335,24 +363,64 @@
         #   4) For now, we just use a dirt-simple algorithm that says
         #      remove-every-other entry when the bucket gets full. We have to
         #      use 'dummy' entries, because collisions work across buckets.
+        cdef list all_entries
+        cdef int weight
+        cdef unsigned int offset
+        cdef rabin_entry *entry
+        cdef int i
+        cdef int kept
+        # Must be larger than MAX_HASH_BUCKET_ENTRIES
+        cdef rabin_entry temp_entries[70]
+
         search.start(rabin_val)
         search.next()
-        # We will remove all 'odd' entries, by pointing them at the dummy
-        # pointer. We start the loop at an 'even' (0 is even), and then step
-        # once to get to odd, set that to dummy, and step again.
-        count = 0
+        all_entries = []
+        i = 0
         while search.entry != NULL:
-            search.next()
-            if search.entry == NULL:
-                break
-            search.entry.rabin_val = 0
-            search.entry.ptr = dummy_ptr
-            self.entry_count -= 1
-            # We could set the search.free_slot here, but we don't actually
-            # care.
-            count += 1
-            search.next()
-        # print "dummified %d entries" % (count,)
+            # We're going for a 'weighted entry' style of sorting. The idea is
+            # that entries that have a larger area 'around' them can provide
+            # better matches than ones with less. Also, that all things equal,
+            # content at the beginning should be favored to content at the end,
+            # since the delta is smaller, though only 1 byte per 2^7 bytes
+            # offset.
+            weight = 0
+            offset = search.entry.global_offset
+            while offset > 0:
+                # Every 8 bits of offset is 1 byte of copy command
+                offset >>= 8
+                weight -= 1
+            # 'pre' bytes is less valuable than 'post' bytes, because of how
+            # the matching works, so double-weight post bytes.
+            weight += (search.entry.match_tail * 2) + search.entry.match_pre
+            all_entries.append((weight, i))
+            temp_entries[i] = search.entry[0]
+            i += 1
+            if i >= 70:
+                raise RuntimeError('we overflowed the temp_entries list')
+            search.next()
+        all_entries.sort(reverse=True)
+        assert len(all_entries) == 64
+
+        # Now do the search again, and rewrite the entries. We fill in all
+        # slots and all dummy slots with the newly sorted entries. We also
+        # check _keep_map, to see if this is an entry we should omit.
+        i = 0
+        kept = 0
+        search.start(rabin_val)
+        entry = search.next_with_dummy()
+        while entry != NULL:
+            if i >= MAX_HASH_BUCKET_ENTRIES or _keep_map[i] == c'n':
+                # We won't keep this entry, but we have to keep a 'dummy' entry
+                # because of all the other possible hash chains that would step
+                # on it
+                entry.ptr = dummy_ptr
+                entry.rabin_val = 0
+            else:
+                entry[0] = temp_entries[all_entries[i][1]]
+                kept += 1
+            entry = search.next_with_dummy()
+            i += 1
+        self.entry_count -= (len(all_entries) - kept)
 
     cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
                           unsigned int val, Py_ssize_t match_pre,

=== modified file 'bzrlib/_rabin_consts.h'
--- a/bzrlib/_rabin_consts.h	2010-10-20 22:34:22 +0000
+++ b/bzrlib/_rabin_consts.h	2010-11-22 22:50:06 +0000
@@ -121,3 +121,11 @@
 // Update a rabin hash by removing old data, and adding new
 #define RABIN_UPDATE(val, old_data, new_data) \
     RABIN_REMOVE(val, old_data); RABIN_ADD(val, new_data)
+
+// Remove every 5th, remove every 4th, remove every 3rd, keep every other
+// keep every 3rd, keep every 4th, keep every 5th.
+// This matches exactly 32 entries to be kept out of 64.
+static char _keep_map[64] = "yyyynyyyyn" "yyynyyyn"
+                            "yynyynyyn" "ynynynynyn"
+                            "ynnynnynn" "ynnnynnn"
+                            "ynnnnynnnn";



More information about the bazaar-commits mailing list