Rev 5522: (broken, but compiles) Almost there. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Fri Nov 19 22:30:12 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5522
revision-id: john at arbash-meinel.com-20101119222956-kklvotp0bxojgvia
parent: john at arbash-meinel.com-20101119200510-rslzmtz8ofvjez5i
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Fri 2010-11-19 16:29:56 -0600
message:
(broken, but compiles) Almost there.
The code has been switched over to use the new structures, now we need to
start iterating on getting the test suite passing again, and we'll be good.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-19 20:05:10 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-11-19 22:29:56 +0000
@@ -71,8 +71,10 @@
const_data ptr
# The RABIN_VAL associated with the matching bytes
unsigned int rabin_val
- # The maximum number of bytes that can be matched following this pointer.
- # Note that this is limited to 65kB, and a theoretical match could be
+ # The index for the actual SourceInfo object
+ unsigned short source_idx
+ # The maximum number of bytes that can be matched around this pointer.
+ # Note that this is limited to 64kB, and a theoretical match could be
# longer. However, that will be handled by a later match
unsigned short match_tail
# We know that we can't match more than 127 bytes before this pointer
@@ -100,14 +102,14 @@
# Have we found a 'dummy' entry, that we could use later?
cdef rabin_entry *free_slot
- cdef rabin_entry *table
- cdef unsigned int table_mask
+ cdef rabin_entry *_table
+ cdef unsigned int _table_mask
def __init__(self, hash_map):
cdef RabinHashMap rabin_hash_map
rabin_hash_map = hash_map
- self.table = rabin_hash_map._table
- self.table_mask = rabin_hash_map.table_mask
+ self._table = rabin_hash_map._table
+ self._table_mask = rabin_hash_map.table_mask
cdef start(self, unsigned int rabin_val):
self.rabin_val = rabin_val
@@ -132,8 +134,8 @@
# Note that all of these are '& mask', but that is computed *after* the
# offset.
# Don't search forever, even if we have bogus entries in the table
- while self._step < self.table_mask:
- slot = self.table + (self._next_hash & self.table_mask)
+ while self._step < self._table_mask:
+ slot = self._table + (self._next_hash & self._table_mask)
self._step += 1
self._next_hash = self._next_hash + self._step
if slot.rabin_val == self.rabin_val:
@@ -305,9 +307,9 @@
search.start(rabin_val)
return search
- cdef rabin_entry *add(self, const_data ptr, unsigned int val,
- unsigned char match_pre, unsigned short match_tail
- ) except NULL:
+ cdef rabin_entry *add(self, const_data ptr, unsigned short source_idx,
+ unsigned int val, Py_ssize_t match_pre,
+ Py_ssize_t match_tail) except NULL:
"""Add this entry to the hash map."""
cdef RabinEntrySearch search
cdef rabin_entry *entry
@@ -317,11 +319,16 @@
search.next()
# We found either a dummy slot, or a fully empty slot. Either way we
# don't really care
+ if match_pre > MAX_MATCH_PRE:
+ match_pre = MAX_MATCH_PRE
+ if match_tail > MAX_MATCH_TAIL:
+ match_tail = MAX_MATCH_TAIL
entry = search.free_slot
entry.rabin_val = val
entry.match_pre = match_pre
entry.match_tail = match_tail
entry.ptr = ptr
+ entry.source_idx = source_idx
self.entry_count += 1
return entry
@@ -334,7 +341,7 @@
"""
assert pre_bytes < 256
assert post_bytes < 65536
- self.add(NULL, val, pre_bytes, post_bytes)
+ self.add(NULL, 0, val, pre_bytes, post_bytes)
def _py_find_all(self, val):
"""Find all entries that match the given rabin_val.
@@ -380,12 +387,10 @@
cdef public object buf
# Start offset of this data, vs the start of all source
cdef public unsigned long start_offset
- cdef public int is_delta
- def __init__(self, buf, start_offset, is_delta):
+ def __init__(self, buf, start_offset):
self.buf = buf
self.start_offset = start_offset
- self.is_delta = is_delta
cdef int DEFAULT_NUM_COLLISIONS = 4
@@ -468,6 +473,10 @@
def __get__(self):
return self.total_source_bytes
+ property num_entries:
+ def __get__(self):
+ return self._hash_map.entry_count
+
def __init__(self):
self._hash_map = RabinHashMap()
# Just initialize the basic buckets to start with
@@ -475,20 +484,6 @@
self.total_source_bytes = 0
self.max_bucket_size = MAX_HASH_BUCKET_ENTRIES
- cdef _find_source_for_ptr(self, const_data ptr):
- """Search through the source list for which one holds this pointer."""
- cdef SourceInfo info
- cdef char *start, *end
-
- if ptr == NULL:
- return -1, None
- for idx, info in enumerate(self.sources):
- start = PyString_AS_STRING(info.buf)
- end = start + PyString_GET_SIZE(info.buf)
- if ptr >= start && ptr <= end:
- return idx, info
- return -1, None
-
def _matches_as_dict(self):
cdef SourceInfo source
cdef unsigned int i
@@ -501,10 +496,9 @@
if (slot.rabin_val == 0
and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
continue
- idx, source = self._find_source_for_ptr(slot.ptr)
- if source is not None:
- local_offset = slot.ptr - PyString_AS_STRING(source.buf)
- matches[slot.rabin_val] = source.start_offset + local_offset
+ source = self.sources[slot.source_idx]
+ local_offset = slot.ptr - PyString_AS_STRING(source.buf)
+ matches[slot.rabin_val] = source.start_offset + local_offset
return matches
def _compute_offsets(self, source):
@@ -512,6 +506,7 @@
cdef unsigned short source_counter
cdef const_data ptr
cdef const_data start
+ cdef const_data end
cdef unsigned long num_entries
cdef unsigned int val
cdef Py_ssize_t match_pre
@@ -519,7 +514,6 @@
cdef int i
cdef Py_ssize_t size
cdef rabin_entry *last
- cdef RabinBucket bucket
cdef unsigned int hash_offset
if not PyString_CheckExact(source):
@@ -540,29 +534,22 @@
# Note: trying to align this so the pointers end up on a 4/8-byte
# alignment didn't seem to help anything. We only ended up with a
# 1/4 chance of having both source and target aligned.
- ptr = (start + (num_entries * RABIN_WINDOW) - RABIN_WINDOW)
+ ptr = start
last == NULL
- while ptr >= start:
+ while ptr < end - RABIN_WINDOW - 1:
val = 0
for i from 1 <= i <= RABIN_WINDOW:
RABIN_ADD(val, ptr[i])
match_pre = ptr - start
- if match_pre > MAX_MATCH_PRE:
- match_pre = MAX_MATCH_PRE
match_tail = end - ptr
- if match_tail > MAX_MATCH_TAIL:
- match_tail = MAX_MATCH_TAIL
+ ptr += RABIN_WINDOW
if (last != NULL and val == last.rabin_val):
- # Keep the most recent of consecutive entries, which means the
- # closest to the start of the source
- # XXX: test this
- last.ptr = (ptr + RABIN_WINDOW)
- last.match_pre = <unsigned char>match_pre
- last.match_tail = <unsigned short>match_tail
- num_entries -= 1
+ # We have a repeated entry. We only store the first of
+ # consecutive entries, so we just skip this one
+ pass
else:
- last = self._hash_map.add(ptr, val, match_pre, match_tail)
- ptr -= RABIN_WINDOW
+ last = self._hash_map.add(ptr, source_counter,
+ val, match_pre, match_tail)
def _compute_offsets_from_delta(self, delta_source):
cdef unsigned short source_counter
@@ -578,7 +565,6 @@
cdef unsigned char c
cdef int i
cdef Py_ssize_t size
- cdef RabinBucket bucket
cdef unsigned int hash_offset
if not PyString_CheckExact(delta_source):
@@ -623,11 +609,7 @@
# alignment
while c > RABIN_WINDOW + 3:
match_pre = ptr - insert_start
- if match_pre > MAX_MATCH_PRE:
- match_pre = MAX_MATCH_PRE
match_tail = insert_end - ptr
- if match_tail > MAX_MATCH_TAIL:
- match_tail = MAX_MATCH_TAIL
val = 0
# We shift-by-one because the matching code skips the first
# byte
@@ -638,7 +620,8 @@
if val == prev_val:
# Keep only the first of matching sequences
continue
- last = self._hash_map.add(ptr, val, match_pre, match_tail)
+ self._hash_map.add(ptr, source_counter, val, match_pre,
+ match_tail)
num_entries += 1
ptr += c
else:
@@ -649,74 +632,35 @@
def _limit_hash_buckets(self):
"""Avoid pathological behavior by limiting the entries in buckets."""
- cdef RabinBucket bucket
- cdef rabin_offset entry
- cdef rabin_offset keep
- cdef unsigned int extra
-
- for bucket in self.buckets:
- if bucket is None or bucket.count < self.max_bucket_size:
- continue
- # This bucket is over-sized, so lets remove a few entries
- extra = bucket.count - self.max_bucket_size
- entry = bucket.first
- # For now, we just mirror the existing code
- acc = 0
- while entry:
- acc += extra
- if acc > 0:
- keep = entry
- while acc > 0:
- entry = entry.next
- acc -= self.max_bucket_size
- keep.next = entry.next
- entry = entry.next
- self.num_entries -= extra
- bucket.count -= extra
-
- def _max_match(self, offset):
- """What is the maximum number of bytes that this ptr could match.
-
- This is based on the internal matching logic.
- """
- cdef SourceInfo source
- cdef const_data source_data
- cdef int max_match
- cdef int offset_to_start
- cdef rabin_offset ra_offset
-
- # Note: this is a prototype of how to decide which regions to remove
- # when decreasing the content in hash buckets. The idea is that
- # buckets that have more possible content to match should be
- # favored, since they *can* produce better matches (although any
- # given point may not be specifically better).
- source = self.sources[offset.source]
- if source.is_delta:
- # We know that the best match will be less than 127 bytes, because
- # that is the maximum length insert
- # To be more accurate we would have to find what insert this offset
- # is in, and compute its length (better would just be to cache this
- # info on the offset object.)
- return 127
- # we can match at most 127 bytes back + the rest of the output
- ra_offset = offset
- source_data = <const_data>PyString_AS_STRING(source.buf)
- max_match = PyString_GET_SIZE(source.buf)
- offset_to_start = ra_offset.ptr - source_data
- if offset_to_start > 127:
- max_match -= (offset_to_start - 127)
- return max_match
-
- def _offset_into_source(self, offset):
- """Return the byts-from-start for this rabin-offset."""
- cdef rabin_offset rab_offset
- cdef SourceInfo si
- cdef const_data source_data
-
- rab_offset = offset
- si = self.sources[rab_offset.source]
- source_data = <const_data>PyString_AS_STRING(si.buf)
- return (<long>(rab_offset.ptr - source_data))
+ # TODO: when adding items, we can get a feel for how many items are in
+ # each 'bucket', and determine which ones need to be updated.
+ # Especially since we don't store the data into explicit buckets
+ # anymore.
+ pass
+ # cdef RabinBucket bucket
+ # cdef rabin_offset entry
+ # cdef rabin_offset keep
+ # cdef unsigned int extra
+
+ # for bucket in self.buckets:
+ # if bucket is None or bucket.count < self.max_bucket_size:
+ # continue
+ # # This bucket is over-sized, so lets remove a few entries
+ # extra = bucket.count - self.max_bucket_size
+ # entry = bucket.first
+ # # For now, we just mirror the existing code
+ # acc = 0
+ # while entry:
+ # acc += extra
+ # if acc > 0:
+ # keep = entry
+ # while acc > 0:
+ # entry = entry.next
+ # acc -= self.max_bucket_size
+ # keep.next = entry.next
+ # entry = entry.next
+ # self.num_entries -= extra
+ # bucket.count -= extra
def add_source(self, source, extra_offset=0):
"""Add a source of more data to delta against.
@@ -729,7 +673,7 @@
if not PyString_CheckExact(source):
raise TypeError("source must be a str, not %s" % (type(source),))
start_offset = self.total_source_bytes + extra_offset
- new_info = SourceInfo(source, start_offset, is_delta=False)
+ new_info = SourceInfo(source, start_offset)
self.sources.append(new_info)
self._hash_map.reserve(len(source) / RABIN_WINDOW)
self._compute_offsets(source)
@@ -742,10 +686,10 @@
raise TypeError("delta_source must be a str, not %s"
% (type(delta_source),))
start_offset = self.total_source_bytes + extra_offset
- new_info = SourceInfo(delta_source, start_offset, is_delta=True)
+ new_info = SourceInfo(delta_source, start_offset)
self.sources.append(new_info)
- # This is a false value, but it will always be an upper bound
- self._ensure_enough_buckets(len(delta_source))
+ # This is an upper bound of the number of entries we'll find
+ self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
self._compute_offsets_from_delta(delta_source)
self._limit_hash_buckets()
self.total_source_bytes += len(delta_source) + extra_offset
@@ -774,8 +718,7 @@
cdef unsigned int insert_count
cdef Py_ssize_t match_size
- cdef Py_ssize_t match_offset
- cdef unsigned short match_source_idx
+ cdef rabin_entry *match_entry
cdef int noisy
@@ -818,10 +761,7 @@
cdef _find_better_match(self, const_data data, const_data data_end,
unsigned int cur_rabin_val):
- cdef int bucket_idx
- cdef RabinBucket bucket
- cdef rabin_offset offset
- cdef rabin_offset this
+ cdef rabin_entry *cur_entry
cdef const_data cur
cdef const_data ref
cdef const_data ref_start
@@ -829,41 +769,35 @@
cdef Py_ssize_t cur_match_size
cdef Py_ssize_t remaining_data_len
cdef SourceInfo ref_source
+ cdef RabinEntrySearch search
- bucket_idx = cur_rabin_val & self.index.hash_mask
- bucket = self.index.buckets[bucket_idx]
- if bucket is None:
- offset = None
- else:
- offset = bucket.first
+ search = RabinEntrySearch(self.index._hash_map)
+ search.start(cur_rabin_val)
+ search.next()
remaining_data_len = data_end - data
- while offset is not None:
- this = offset
- offset = offset.next
- if this.rabin_val != cur_rabin_val:
- continue
- ref = this.ptr
- ref_source = self.index.sources[this.source]
- ref_start = <const_data>PyString_AS_STRING(ref_source.buf)
- ref_end = ref_start + PyString_GET_SIZE(ref_source.buf)
- # Maximum match size
- cur_match_size = ref_end - ref
- if cur_match_size > remaining_data_len:
- cur_match_size = remaining_data_len
+ while search.entry != NULL:
+ cur_entry = search.entry
+ search.next()
+ ref = cur_entry.ptr
+ cur_match_size = remaining_data_len
+ if cur_entry.match_tail < remaining_data_len:
+ cur_match_size = cur_entry.match_tail
+ ref_end = ref + cur_entry.match_tail
if cur_match_size <= self.match_size:
- # Best possible match is smaller than our current match
+ # Our current best match is better than this one could possibly
+ # be
continue
+ # Now actually compare the bytes, and see how many match
cur = data
while cur_match_size > 0 and cur[0] == ref[0]:
cur += 1
ref += 1
cur_match_size -= 1
cur_match_size = (cur - data)
- if self.match_size < cur_match_size:
+ if cur_match_size > self.match_size:
# This match is better
self.match_size = cur_match_size
- self.match_source_idx = this.source
- self.match_offset = this.ptr - ref_start
+ self.match_entry = cur_entry
if cur_match_size >= 4096:
# Stop searching for better matches
return
@@ -899,16 +833,17 @@
"""Check if we can expand the current match by data we've output."""
cdef const_data ref_data
cdef SourceInfo ref_source
+ cdef Py_ssize_t match_pre
if not self.insert_count:
# Nothing to expand
return data
- ref_source = self.index.sources[self.match_source_idx]
- ref_data = <const_data>PyString_AS_STRING(ref_source.buf)
- while (self.match_offset > 0
- and ref_data[self.match_offset - 1] == data[-1]):
+ ref_data = self.match_entry.ptr
+ match_pre = self.match_entry.match_pre
+ while (match_pre > 0
+ and ref_data[match_pre - 1] == data[-1]):
self.match_size += 1
- self.match_offset -= 1
+ match_pre -= 1
data -= 1
self.out_pos -= 1
self.insert_count -= 1
@@ -962,7 +897,9 @@
cdef const_data _output_match(self, const_data data) except NULL:
"""We have a match that we like, insert it into the output stream"""
+ cdef SourceInfo source
cdef Py_ssize_t match_size
+ cdef Py_ssize_t local_offset
data = self._expand_match(data)
self._finalize_pending_insert()
@@ -970,16 +907,15 @@
# Matches are limited to 64kB in the current version
if match_size > 0x10000:
match_size = 0x10000
- source = self.index.sources[self.match_source_idx]
- if self.match_offset > PyString_GET_SIZE(source.buf):
- raise RuntimeError("match_offset larger than source content len")
- if self.match_offset + self.match_size > PyString_GET_SIZE(source.buf):
- raise RuntimeError("match_offset + match larger than source"
- " content")
- self._output_copy(self.match_offset + source.start_offset, match_size)
+ source = self.index.sources[self.match_entry.source_idx]
+ local_offset = data - PyString_AS_STRING(source.buf)
+ if local_offset < 0 or local_offset > PyString_GET_SIZE(source.buf):
+ raise RuntimeError("match start after source content len")
+ if local_offset + self.match_size > PyString_GET_SIZE(source.buf):
+ raise RuntimeError("match end after than source content")
+ self._output_copy(local_offset + source.start_offset, match_size)
data += match_size
- self.match_offset += match_size
if self.match_size > match_size:
self.match_size -= match_size
else:
@@ -1015,9 +951,7 @@
data += 1
self.insert_count = RABIN_WINDOW
- self.match_source_idx = -1
self.match_size = 0
- self.match_offset = 0
while data < data_end:
if self.match_size < 4096:
# Our current match isn't "good enough", so look for a better
More information about the bazaar-commits
mailing list