Rev 5522: (broken, but compiles) Almost there. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Fri Nov 19 22:30:12 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5522
revision-id: john at arbash-meinel.com-20101119222956-kklvotp0bxojgvia
parent: john at arbash-meinel.com-20101119200510-rslzmtz8ofvjez5i
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Fri 2010-11-19 16:29:56 -0600
message:
  (broken, but compiles) Almost there.
  
  The code has been switched over to use the new structures; now we need to
  start iterating on getting the test suite passing again, and we'll be good.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-11-19 20:05:10 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-11-19 22:29:56 +0000
@@ -71,8 +71,10 @@
     const_data ptr
     # The RABIN_VAL associated with the matching bytes
     unsigned int rabin_val
-    # The maximum number of bytes that can be matched following this pointer.
-    # Note that this is limited to 65kB, and a theoretical match could be
+    # The index for the actual SourceInfo object
+    unsigned short source_idx
+    # The maximum number of bytes that can be matched around this pointer.
+    # Note that this is limited to 64kB, and a theoretical match could be
     # longer. However, that will be handled by a later match
     unsigned short match_tail
     # We know that we can't match more than 127 bytes before this pointer
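
For reference, a rough plain-Python sketch of what one hash-map slot now
carries (the class and names here are illustrative, not part of the
patch; the limits follow from the field types, an unsigned char for
match_pre and an unsigned short for match_tail):

    MAX_MATCH_PRE = 127       # "can't match more than 127 bytes before"
    MAX_MATCH_TAIL = 65535    # match_tail is an unsigned short (64kB)

    class RabinEntrySketch(object):
        def __init__(self, ptr, rabin_val, source_idx, match_pre, match_tail):
            self.ptr = ptr                # where the matching bytes start
            self.rabin_val = rabin_val    # RABIN_VAL of this window
            self.source_idx = source_idx  # index into the sources list
            self.match_pre = min(match_pre, MAX_MATCH_PRE)
            self.match_tail = min(match_tail, MAX_MATCH_TAIL)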
@@ -100,14 +102,14 @@
     # Have we found a 'dummy' entry, that we could use later?
     cdef rabin_entry *free_slot
 
-    cdef rabin_entry *table
-    cdef unsigned int table_mask
+    cdef rabin_entry *_table
+    cdef unsigned int _table_mask
 
     def __init__(self, hash_map):
         cdef RabinHashMap rabin_hash_map
         rabin_hash_map = hash_map
-        self.table = rabin_hash_map._table
-        self.table_mask = rabin_hash_map.table_mask
+        self._table = rabin_hash_map._table
+        self._table_mask = rabin_hash_map.table_mask
 
     cdef start(self, unsigned int rabin_val):
         self.rabin_val = rabin_val
@@ -132,8 +134,8 @@
         # Note that all of these are '& mask', but that is computed *after* the
         # offset.
         # Don't search forever, even if we have bogus entries in the table
-        while self._step < self.table_mask:
-            slot = self.table + (self._next_hash & self.table_mask)
+        while self._step < self._table_mask:
+            slot = self._table + (self._next_hash & self._table_mask)
             self._step += 1
             self._next_hash = self._next_hash + self._step
             if slot.rabin_val == self.rabin_val:
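
The loop above is triangular-number probing: the step grows by one on
every miss, so probe i lands on (hash + i*(i+1)/2) & mask, and with a
power-of-two table that sequence touches every slot (the real loop stops
after table_mask probes so it can never spin forever). A standalone
sketch of the same probe order:

    def probe_order(rabin_val, table_size):
        # table_size must be a power of two for the mask trick to work
        mask = table_size - 1
        next_hash = rabin_val
        step = 0
        while step <= mask:
            yield next_hash & mask
            step += 1
            next_hash += step

    # every slot of a small table is visited exactly once:
    assert sorted(probe_order(0x1234, 8)) == list(range(8))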
@@ -305,9 +307,9 @@
         search.start(rabin_val)
         return search
 
-    cdef rabin_entry *add(self, const_data ptr, unsigned int val,
-                          unsigned char match_pre, unsigned short match_tail
-                          ) except NULL:
+    cdef rabin_entry *add(self, const_data ptr, unsigned short source_idx,
+                          unsigned int val, Py_ssize_t match_pre,
+                          Py_ssize_t match_tail) except NULL:
         """Add this entry to the hash map."""
         cdef RabinEntrySearch search
         cdef rabin_entry *entry
@@ -317,11 +319,16 @@
             search.next()
         # We found either a dummy slot, or a fully empty slot. Either way we
         # don't really care
+        if match_pre > MAX_MATCH_PRE:
+            match_pre = MAX_MATCH_PRE
+        if match_tail > MAX_MATCH_TAIL:
+            match_tail = MAX_MATCH_TAIL
         entry = search.free_slot
         entry.rabin_val = val
         entry.match_pre = match_pre
         entry.match_tail = match_tail
         entry.ptr = ptr
+        entry.source_idx = source_idx
         self.entry_count += 1
         return entry
 
@@ -334,7 +341,7 @@
         """
         assert pre_bytes < 256
         assert post_bytes < 65536
-        self.add(NULL, val, pre_bytes, post_bytes)
+        self.add(NULL, 0, val, pre_bytes, post_bytes)
 
     def _py_find_all(self, val):
         """Find all entries that match the given rabin_val.
@@ -380,12 +387,10 @@
     cdef public object buf
     # Start offset of this data, vs the start of all source
     cdef public unsigned long start_offset
-    cdef public int is_delta
 
-    def __init__(self, buf, start_offset, is_delta):
+    def __init__(self, buf, start_offset):
         self.buf = buf
         self.start_offset = start_offset
-        self.is_delta = is_delta
 
 
 cdef int DEFAULT_NUM_COLLISIONS = 4
@@ -468,6 +473,10 @@
         def __get__(self):
             return self.total_source_bytes
 
+    property num_entries:
+        def __get__(self):
+            return self._hash_map.entry_count
+
     def __init__(self):
         self._hash_map = RabinHashMap()
         # Just initialize the basic buckets to start with
@@ -475,20 +484,6 @@
         self.total_source_bytes = 0
         self.max_bucket_size = MAX_HASH_BUCKET_ENTRIES
 
-    cdef _find_source_for_ptr(self, const_data ptr):
-        """Search through the source list for which one holds this pointer."""
-        cdef SourceInfo info
-        cdef char *start, *end
-
-        if ptr == NULL:
-            return -1, None
-        for idx, info in enumerate(self.sources):
-            start = PyString_AS_STRING(info.buf)
-            end = start + PyString_GET_SIZE(info.buf)
-            if ptr >= start and ptr <= end:
-                return idx, info
-        return -1, None
-
     def _matches_as_dict(self):
         cdef SourceInfo source
         cdef unsigned int i
@@ -501,10 +496,9 @@
             if (slot.rabin_val == 0
                 and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
                 continue
-            idx, source = self._find_source_for_ptr(slot.ptr)
-            if source is not None:
-                local_offset = slot.ptr - PyString_AS_STRING(source.buf)
-                matches[slot.rabin_val] = source.start_offset + local_offset
+            source = self.sources[slot.source_idx]
+            local_offset = slot.ptr - PyString_AS_STRING(source.buf)
+            matches[slot.rabin_val] = source.start_offset + local_offset
         return matches
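
With source_idx stored on every entry, the _matches_as_dict hunk above
maps a slot back to a global offset by direct lookup instead of the
linear scan _find_source_for_ptr used to do. In outline (entry.ptr is a
raw pointer in the pyx code; here it is modelled as an offset into
source.buf):

    def global_offset(sources, entry):
        source = sources[entry.source_idx]   # O(1), no scan over sources
        return source.start_offset + entry.ptr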
 
     def _compute_offsets(self, source):
@@ -512,6 +506,7 @@
         cdef unsigned short source_counter
         cdef const_data ptr
         cdef const_data start
+        cdef const_data end
         cdef unsigned long num_entries
         cdef unsigned int val
         cdef Py_ssize_t match_pre
@@ -519,7 +514,6 @@
         cdef int i
         cdef Py_ssize_t size
         cdef rabin_entry *last
-        cdef RabinBucket bucket
         cdef unsigned int hash_offset
 
         if not PyString_CheckExact(source):
@@ -540,29 +534,22 @@
         # Note: trying to align this so the pointers end up on a 4/8-byte
         #       alignment didn't seem to help anything. We only ended up with a
         #       1/4 chance of having both source and target aligned.
-        ptr = (start + (num_entries * RABIN_WINDOW) - RABIN_WINDOW)
+        ptr = start
         last = NULL
-        while ptr >= start:
+        while ptr < end - RABIN_WINDOW - 1:
             val = 0
             for i from 1 <= i <= RABIN_WINDOW:
                 RABIN_ADD(val, ptr[i])
             match_pre = ptr - start
-            if match_pre > MAX_MATCH_PRE:
-                match_pre = MAX_MATCH_PRE
             match_tail = end - ptr
-            if match_tail > MAX_MATCH_TAIL:
-                match_tail = MAX_MATCH_TAIL
+            ptr += RABIN_WINDOW
             if (last != NULL and val == last.rabin_val):
-                # Keep the most recent of consecutive entries, which means the
-                # closest to the start of the source
-                # XXX: test this
-                last.ptr = (ptr + RABIN_WINDOW)
-                last.match_pre = <unsigned char>match_pre
-                last.match_tail = <unsigned short>match_tail
-                num_entries -= 1
+                # We have a repeated entry. We only store the first of
+                # consecutive entries, so we just skip this one
+                pass
             else:
-                last = self._hash_map.add(ptr, val, match_pre, match_tail)
-            ptr -= RABIN_WINDOW
+                last = self._hash_map.add(ptr, source_counter,
+                                          val, match_pre, match_tail)
 
     def _compute_offsets_from_delta(self, delta_source):
         cdef unsigned short source_counter
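
The new forward pass in _compute_offsets above is easier to follow in
plain Python (rabin_add below is a toy stand-in for the RABIN_ADD macro,
which really mixes in a 256-entry table; RABIN_WINDOW is 16 in the delta
code this derives from):

    RABIN_WINDOW = 16

    def rabin_add(val, byte):
        # toy stand-in; keeps val inside 32 bits like the C version
        return ((val << 8) | ord(byte)) & 0xFFFFFFFF

    def compute_offsets(hash_map, buf, source_idx):
        start, end = 0, len(buf)
        ptr = start
        last_val = None
        while ptr < end - RABIN_WINDOW - 1:
            val = 0
            for i in range(1, RABIN_WINDOW + 1):
                val = rabin_add(val, buf[ptr + i])  # matching skips byte 0
            match_pre = ptr - start
            match_tail = end - ptr
            ptr += RABIN_WINDOW
            if last_val is not None and val == last_val:
                pass   # keep only the first of consecutive duplicates
            else:
                hash_map.add(ptr, source_idx, val, match_pre, match_tail)
                last_val = val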
@@ -578,7 +565,6 @@
         cdef unsigned char c
         cdef int i
         cdef Py_ssize_t size
-        cdef RabinBucket bucket
         cdef unsigned int hash_offset
 
         if not PyString_CheckExact(delta_source):
@@ -623,11 +609,7 @@
                 #       alignment
                 while c > RABIN_WINDOW + 3:
                     match_pre = ptr - insert_start
-                    if match_pre > MAX_MATCH_PRE:
-                        match_pre = MAX_MATCH_PRE
                     match_tail = insert_end - ptr
-                    if match_tail > MAX_MATCH_TAIL:
-                        match_tail = MAX_MATCH_TAIL
                     val = 0
                     # We shift-by-one because the matching code skips the first
                     # byte
@@ -638,7 +620,8 @@
                     if val == prev_val:
                         # Keep only the first of matching sequences
                         continue
-                    last = self._hash_map.add(ptr, val, match_pre, match_tail)
+                    self._hash_map.add(ptr, source_counter, val, match_pre,
+                                       match_tail)
                     num_entries += 1
                 ptr += c
             else:
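
_compute_offsets_from_delta only indexes the literal bytes of a delta.
In the git-style delta encoding this code derives from, a command byte
with the high bit set is a copy instruction whose operand length is the
popcount of its low seven bits, and any other command byte introduces
that many literal bytes. Only the literal runs can match new content, so
only they get hash entries. Roughly (hash_insert_region is a
hypothetical stand-in for the windowing loop in the hunk above):

    def compute_offsets_from_delta(hash_map, delta, source_idx):
        pos = 0   # assume the two size headers were already consumed
        while pos < len(delta):
            c = ord(delta[pos])
            pos += 1
            if c & 0x80:
                # copy command: skip its offset/length operand bytes
                pos += bin(c & 0x7F).count('1')
            else:
                # insert of c literal bytes: hash windows inside them
                hash_insert_region(hash_map, delta, pos, c, source_idx)
                pos += c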
@@ -649,74 +632,35 @@
 
     def _limit_hash_buckets(self):
         """Avoid pathological behavior by limiting the entries in buckets."""
-        cdef RabinBucket bucket
-        cdef rabin_offset entry
-        cdef rabin_offset keep
-        cdef unsigned int extra
-
-        for bucket in self.buckets:
-            if bucket is None or bucket.count < self.max_bucket_size:
-                continue
-            # This bucket is over-sized, so lets remove a few entries
-            extra = bucket.count - self.max_bucket_size
-            entry = bucket.first
-            # For now, we just mirror the existing code
-            acc = 0
-            while entry:
-                acc += extra
-                if acc > 0:
-                    keep = entry
-                    while acc > 0:
-                        entry = entry.next
-                        acc -= self.max_bucket_size
-                    keep.next = entry.next
-                entry = entry.next
-            self.num_entries -= extra
-            bucket.count -= extra
-
-    def _max_match(self, offset):
-        """What is the maximum number of bytes that this ptr could match.
-
-        This is based on the internal matching logic.
-        """
-        cdef SourceInfo source
-        cdef const_data source_data
-        cdef int max_match
-        cdef int offset_to_start
-        cdef rabin_offset ra_offset
-
-        # Note: this is a prototype of how to decide which regions to remove
-        #       when decreasing the content in hash buckets. The idea is that
-        #       buckets that have more possible content to match should be
-        #       favored, since they *can* produce better matches (although any
-        #       given point may not be specifically better).
-        source = self.sources[offset.source]
-        if source.is_delta:
-            # We know that the best match will be less than 127 bytes, because
-            # that is the maximum length insert
-            # To be more accurate we would have to find what insert this offset
-            # is in, and compute its length (better would just be to cache this
-            # info on the offset object.)
-            return 127
-        # we can match at most 127 bytes back + the rest of the output
-        ra_offset = offset
-        source_data = <const_data>PyString_AS_STRING(source.buf)
-        max_match = PyString_GET_SIZE(source.buf)
-        offset_to_start = ra_offset.ptr - source_data
-        if offset_to_start > 127:
-            max_match -= (offset_to_start - 127)
-        return max_match
-
-    def _offset_into_source(self, offset):
-        """Return the byts-from-start for this rabin-offset."""
-        cdef rabin_offset rab_offset
-        cdef SourceInfo si
-        cdef const_data source_data
-
-        rab_offset = offset
-        si = self.sources[rab_offset.source]
-        source_data = <const_data>PyString_AS_STRING(si.buf)
-        return (<long>(rab_offset.ptr - source_data))
+        # TODO: when adding items, we can get a feel for how many items are in
+        #       each 'bucket', and determine which ones need to be updated.
+        #       Especially since we don't store the data into explicit buckets
+        #       anymore.
+        pass
+        # cdef RabinBucket bucket
+        # cdef rabin_offset entry
+        # cdef rabin_offset keep
+        # cdef unsigned int extra
+
+        # for bucket in self.buckets:
+        #     if bucket is None or bucket.count < self.max_bucket_size:
+        #         continue
+        #     # This bucket is over-sized, so lets remove a few entries
+        #     extra = bucket.count - self.max_bucket_size
+        #     entry = bucket.first
+        #     # For now, we just mirror the existing code
+        #     acc = 0
+        #     while entry:
+        #         acc += extra
+        #         if acc > 0:
+        #             keep = entry
+        #             while acc > 0:
+        #                 entry = entry.next
+        #                 acc -= self.max_bucket_size
+        #             keep.next = entry.next
+        #         entry = entry.next
+        #     self.num_entries -= extra
+        #     bucket.count -= extra
 
     def add_source(self, source, extra_offset=0):
         """Add a source of more data to delta against.
@@ -729,7 +673,7 @@
         if not PyString_CheckExact(source):
             raise TypeError("source must be a str, not %s" % (type(source),))
         start_offset = self.total_source_bytes + extra_offset
-        new_info = SourceInfo(source, start_offset, is_delta=False)
+        new_info = SourceInfo(source, start_offset)
         self.sources.append(new_info)
         self._hash_map.reserve(len(source) / RABIN_WINDOW)
         self._compute_offsets(source)
@@ -742,10 +686,10 @@
             raise TypeError("delta_source must be a str, not %s"
                             % (type(delta_source),))
         start_offset = self.total_source_bytes + extra_offset
-        new_info = SourceInfo(delta_source, start_offset, is_delta=True)
+        new_info = SourceInfo(delta_source, start_offset)
         self.sources.append(new_info)
-        # This is a false value, but it will always be an upper bound
-        self._ensure_enough_buckets(len(delta_source))
+        # This is an upper bound of the number of entries we'll find
+        self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
         self._compute_offsets_from_delta(delta_source)
         self._limit_hash_buckets()
         self.total_source_bytes += len(delta_source) + extra_offset
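
Hypothetical usage of the two entry points above (the index class name
is assumed from context, it does not appear in this hunk):

    index = DeltaIndex()
    index.add_source(base_bytes)         # one window per RABIN_WINDOW bytes
    index.add_delta_source(delta_bytes)  # only the insert regions
    print index.num_entries              # backed by _hash_map.entry_count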
@@ -774,8 +718,7 @@
     cdef unsigned int insert_count
 
     cdef Py_ssize_t match_size
-    cdef Py_ssize_t match_offset
-    cdef unsigned short match_source_idx
+    cdef rabin_entry *match_entry
 
     cdef int noisy
 
@@ -818,10 +761,7 @@
 
     cdef _find_better_match(self, const_data data, const_data data_end,
                             unsigned int cur_rabin_val):
-        cdef int bucket_idx
-        cdef RabinBucket bucket
-        cdef rabin_offset offset
-        cdef rabin_offset this
+        cdef rabin_entry *cur_entry
         cdef const_data cur
         cdef const_data ref
         cdef const_data ref_start
@@ -829,41 +769,35 @@
         cdef Py_ssize_t cur_match_size
         cdef Py_ssize_t remaining_data_len
         cdef SourceInfo ref_source
+        cdef RabinEntrySearch search
 
-        bucket_idx = cur_rabin_val & self.index.hash_mask
-        bucket = self.index.buckets[bucket_idx]
-        if bucket is None:
-            offset = None
-        else:
-            offset = bucket.first
+        search = RabinEntrySearch(self.index._hash_map)
+        search.start(cur_rabin_val)
+        search.next()
         remaining_data_len = data_end - data
-        while offset is not None:
-            this = offset
-            offset = offset.next
-            if this.rabin_val != cur_rabin_val:
-                continue
-            ref = this.ptr
-            ref_source = self.index.sources[this.source]
-            ref_start = <const_data>PyString_AS_STRING(ref_source.buf)
-            ref_end = ref_start + PyString_GET_SIZE(ref_source.buf)
-            # Maximum match size
-            cur_match_size = ref_end - ref
-            if cur_match_size > remaining_data_len:
-                cur_match_size = remaining_data_len
+        while search.entry != NULL:
+            cur_entry = search.entry
+            search.next()
+            ref = cur_entry.ptr
+            cur_match_size = remaining_data_len
+            if cur_entry.match_tail < remaining_data_len:
+                cur_match_size = cur_entry.match_tail
+            ref_end = ref + cur_entry.match_tail
             if cur_match_size <= self.match_size:
-                # Best possible match is smaller than our current match
+                # Our current best match is better than this one could possibly
+                # be
                 continue
+            # Now actually compare the bytes, and see how many match
             cur = data
             while cur_match_size > 0 and cur[0] == ref[0]:
                 cur += 1
                 ref += 1
                 cur_match_size -= 1
             cur_match_size = (cur - data)
-            if self.match_size < cur_match_size:
+            if cur_match_size > self.match_size:
                 # This match is better
                 self.match_size = cur_match_size
-                self.match_source_idx = this.source
-                self.match_offset = this.ptr - ref_start
+                self.match_entry = cur_entry
                 if cur_match_size >= 4096:
                     # Stop searching for better matches
                     return
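
The rewritten search loop never touches source bytes it cannot use: the
entry's cached match_tail bounds the best possible match, so candidates
that cannot beat the current best are skipped without a single byte
compare. A plain-Python outline (entry.buf and entry.ptr model the raw
pointer in the pyx code):

    def find_better_match(data, pos, candidates, best_size):
        # candidates: every entry whose rabin_val matched this window
        best_entry = None
        remaining = len(data) - pos
        for entry in candidates:
            limit = min(entry.match_tail, remaining)
            if limit <= best_size:
                continue            # cannot possibly beat the current best
            n = 0
            while n < limit and data[pos + n] == entry.buf[entry.ptr + n]:
                n += 1
            if n > best_size:
                best_size = n
                best_entry = entry
                if n >= 4096:
                    break           # good enough, stop searching
        return best_entry, best_size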
@@ -899,16 +833,17 @@
         """Check if we can expand the current match by data we've output."""
         cdef const_data ref_data
         cdef SourceInfo ref_source
+        cdef Py_ssize_t match_pre
 
         if not self.insert_count:
             # Nothing to expand
             return data
-        ref_source = self.index.sources[self.match_source_idx]
-        ref_data = <const_data>PyString_AS_STRING(ref_source.buf)
-        while (self.match_offset > 0
-               and ref_data[self.match_offset - 1] == data[-1]):
+        ref_data = self.match_entry.ptr
+        match_pre = self.match_entry.match_pre
+        while (match_pre > 0
+               and ref_data[match_pre - 1] == data[-1]):
             self.match_size += 1
-            self.match_offset -= 1
+            match_pre -= 1
             data -= 1
             self.out_pos -= 1
             self.insert_count -= 1
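
_expand_match walks backwards: bytes that were pending as a literal
insert get folded into the copy while the reference bytes agree, bounded
by the entry's match_pre budget. In outline (buffers modelled as byte
strings and positions as integers; the pyx code does the same walk with
raw pointers):

    def expand_match(data, pos, ref, ref_pos, match_pre, match_size,
                     insert_count):
        while (match_pre > 0 and insert_count > 0
               and data[pos - 1] == ref[ref_pos - 1]):
            pos -= 1
            ref_pos -= 1
            match_pre -= 1
            insert_count -= 1
            match_size += 1
        return pos, match_size, insert_count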
@@ -962,7 +897,9 @@
 
     cdef const_data _output_match(self, const_data data) except NULL:
         """We have a match that we like, insert it into the output stream"""
+        cdef SourceInfo source
         cdef Py_ssize_t match_size
+        cdef Py_ssize_t local_offset
 
         data = self._expand_match(data)
         self._finalize_pending_insert()
@@ -970,16 +907,15 @@
         # Matches are limited to 64kB in the current version
         if match_size > 0x10000:
             match_size = 0x10000
-        source = self.index.sources[self.match_source_idx]
-        if self.match_offset > PyString_GET_SIZE(source.buf):
-            raise RuntimeError("match_offset larger than source content len")
-        if self.match_offset + self.match_size > PyString_GET_SIZE(source.buf):
-            raise RuntimeError("match_offset + match larger than source"
-                               " content")
-        self._output_copy(self.match_offset + source.start_offset, match_size)
+        source = self.index.sources[self.match_entry.source_idx]
+        local_offset = data - PyString_AS_STRING(source.buf)
+        if local_offset < 0 or local_offset > PyString_GET_SIZE(source.buf):
+            raise RuntimeError("match start after source content len")
+        if local_offset + self.match_size > PyString_GET_SIZE(source.buf):
+            raise RuntimeError("match end after than source content")
+        self._output_copy(local_offset + source.start_offset, match_size)
 
         data += match_size
-        self.match_offset += match_size
         if self.match_size > match_size:
             self.match_size -= match_size
         else:
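
Copy instructions in the output stream are capped at 64kB, so an
oversized match is consumed a slice at a time: the hunk above emits up
to 0x10000 bytes per call and leaves the remainder in match_size for the
next round. The net effect, as a loop (emit_copy is hypothetical):

    def output_long_match(global_offset, match_size, emit_copy):
        while match_size > 0:
            chunk = min(match_size, 0x10000)   # 64kB per copy instruction
            emit_copy(global_offset, chunk)
            global_offset += chunk
            match_size -= chunk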
@@ -1015,9 +951,7 @@
             data += 1
 
         self.insert_count = RABIN_WINDOW
-        self.match_source_idx = -1
         self.match_size = 0
-        self.match_offset = 0
         while data < data_end:
             if self.match_size < 4096:
                 # Our current match isn't "good enough", so look for a better


