Rev 72: Change the code so that we can pass in multiple sources to match against. in http://bzr.arbash-meinel.com/plugins/groupcompress_rabin
John Arbash Meinel
john at arbash-meinel.com
Mon Mar 2 18:52:40 GMT 2009
At http://bzr.arbash-meinel.com/plugins/groupcompress_rabin
------------------------------------------------------------
revno: 72
revision-id: john at arbash-meinel.com-20090302185236-gm5ckgaic13q6vvs
parent: john at arbash-meinel.com-20090302180420-8m229eh99p2bp2r5
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress_rabin
timestamp: Mon 2009-03-02 12:52:36 -0600
message:
Change the code so that we can pass in multiple sources to match against.
At the moment, we only use a single source, but that will soon change.
-------------- next part --------------
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx 2009-03-02 18:04:20 +0000
+++ b/_groupcompress_pyx.pyx 2009-03-02 18:52:36 +0000
@@ -24,18 +24,17 @@
void memcpy(void *, void *, size_t)
cdef extern from "delta.h":
- void * diff_delta(void *src_buf, unsigned long src_bufsize,
- void *trg_buf, unsigned long trg_bufsize,
- unsigned long *delta_size, unsigned long max_delta_size)
struct delta_index:
unsigned long memsize
void *src_buf
unsigned long src_size
unsigned int hash_mask
# struct index_entry *hash[]
- delta_index * create_delta_index(void *buf, unsigned long bufsize)
+ delta_index * create_delta_index(void *buf, unsigned long bufsize, unsigned
+ long agg_src_offset)
void free_delta_index(delta_index *index)
- void * create_delta(delta_index *index,
+ void *create_delta(delta_index **indexes,
+ unsigned int num_indexes,
void *buf, unsigned long bufsize,
unsigned long *delta_size, unsigned long max_delta_size)
unsigned long get_delta_hdr_size(unsigned char **datap,
@@ -112,7 +111,7 @@
# fit just fine into the structure. But for now, we just wrap
# create_delta_index (For example, we could always reserve enough
# space to hash a 4MB string, etc.)
- self._index = create_delta_index(c_source, c_source_size)
+ self._index = create_delta_index(c_source, c_source_size, 0)
# TODO: Handle if _index == NULL
cdef _ensure_no_index(self):
@@ -142,7 +141,7 @@
# TODO: inline some of create_delta so we at least don't have to double
# malloc, and can instead use PyString_FromStringAndSize, to
# allocate the bytes into the final string
- delta = create_delta(self._index, target, target_size,
+ delta = create_delta(&self._index, 1, target, target_size,
&delta_size, max_delta_size)
result = None
if delta:
@@ -175,9 +174,9 @@
target_size = PyString_GET_SIZE(target_bytes)
result = None
- index = create_delta_index(source, source_size)
+ index = create_delta_index(source, source_size, 0)
if index != NULL:
- delta = create_delta(index, target, target_size,
+ delta = create_delta(&index, 1, target, target_size,
&delta_size, max_delta_size)
free_delta_index(index);
if delta:
=== modified file 'delta.h'
--- a/delta.h 2009-02-27 17:32:04 +0000
+++ b/delta.h 2009-03-02 18:52:36 +0000
@@ -16,7 +16,8 @@
* using free_delta_index().
*/
extern struct delta_index *
-create_delta_index(const void *buf, unsigned long bufsize);
+create_delta_index(const void *buf, unsigned long bufsize,
+ unsigned long agg_src_offset);
/*
* free_delta_index: free the index created by create_delta_index()
@@ -43,7 +44,8 @@
* must be freed by the caller.
*/
extern void *
-create_delta(const struct delta_index *index,
+create_delta(struct delta_index **indexes,
+ unsigned int num_indexes,
const void *buf, unsigned long bufsize,
unsigned long *delta_size, unsigned long max_delta_size);
@@ -60,9 +62,9 @@
const void *trg_buf, unsigned long trg_bufsize,
unsigned long *delta_size, unsigned long max_delta_size)
{
- struct delta_index *index = create_delta_index(src_buf, src_bufsize);
+ struct delta_index *index = create_delta_index(src_buf, src_bufsize, 0);
if (index) {
- void *delta = create_delta(index, trg_buf, trg_bufsize,
+ void *delta = create_delta(&index, 1, trg_buf, trg_bufsize,
delta_size, max_delta_size);
free_delta_index(index);
return delta;
=== modified file 'diff-delta.c'
--- a/diff-delta.c 2009-02-27 17:32:04 +0000
+++ b/diff-delta.c 2009-03-02 18:52:36 +0000
@@ -123,14 +123,18 @@
};
struct delta_index {
- unsigned long memsize;
- const void *src_buf;
- unsigned long src_size;
- unsigned int hash_mask;
+ unsigned long memsize; /* Total bytes pointed to by this index */
+ const void *src_buf; /* Pointer to the beginning of source data */
+ unsigned long src_size; /* Total length of source data */
+ unsigned long agg_src_offset; /* Start of source data as part of the
+ aggregate source */
+ unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
+ entry */
struct index_entry *hash[];
};
-struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
+struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
+ unsigned long agg_src_offset)
{
unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
const unsigned char *data, *buffer = buf;
@@ -174,8 +178,8 @@
/* then populate the index */
prev_val = ~0;
for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
- data >= buffer;
- data -= RABIN_WINDOW) {
+ data >= buffer;
+ data -= RABIN_WINDOW) {
unsigned int val = 0;
for (i = 1; i <= RABIN_WINDOW; i++)
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
@@ -196,7 +200,7 @@
/*
* Determine a limit on the number of entries in the same hash
- * bucket. This guards us against pathological data sets causing
+ * bucket. This guards us against pathological data sets causing
* really bad hash distribution with most entries in the same hash
* bucket that would bring us to O(m*n) computing costs (m and n
* corresponding to reference and target buffer sizes).
@@ -221,7 +225,7 @@
/*
* Assume that this loop is gone through exactly
* HASH_LIMIT times and is entered and left with
- * acc==0. So the first statement in the loop
+ * acc==0. So the first statement in the loop
* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
* to the accumulator, and the inner loop consequently
* is run (hash_count[i]-HASH_LIMIT) times, removing
@@ -262,6 +266,7 @@
index->memsize = memsize;
index->src_buf = buf;
index->src_size = bufsize;
+ index->agg_src_offset = agg_src_offset;
index->hash_mask = hmask;
mem = index->hash;
@@ -308,11 +313,13 @@
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
void *
-create_delta(const struct delta_index *index,
- const void *trg_buf, unsigned long trg_size,
- unsigned long *delta_size, unsigned long max_size)
+create_delta(struct delta_index **indexes,
+ unsigned int num_indexes,
+ const void *trg_buf, unsigned long trg_size,
+ unsigned long *delta_size, unsigned long max_size)
{
- unsigned int i, outpos, outsize, moff, msize, val;
+ unsigned int i, j, outpos, outsize, moff, msize, val;
+ const struct delta_index *index, *mindex;
int inscnt;
const unsigned char *ref_data, *ref_top, *data, *top;
unsigned char *out;
@@ -329,7 +336,11 @@
return NULL;
/* store reference buffer size */
- i = index->src_size;
+ i = 0;
+ for (j = 0; j < num_indexes; ++j) {
+ index = indexes[j];
+ i += index->src_size;
+ }
while (i >= 0x80) {
out[outpos++] = i | 0x80;
i >>= 7;
@@ -344,55 +355,85 @@
}
out[outpos++] = i;
- ref_data = index->src_buf;
- ref_top = ref_data + index->src_size;
data = trg_buf;
top = (const unsigned char *) trg_buf + trg_size;
- outpos++;
+ /* Start the matching by filling out with a simple 'insert' instruction, of
+ * the first RABIN_WINDOW bytes of the input.
+ */
+ outpos++; /* leave a byte for the insert command */
val = 0;
for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
out[outpos++] = *data;
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
}
+ /* we are now setup with an insert of 'i' bytes and val contains the RABIN
+ * hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
+ * input.
+ */
inscnt = i;
moff = 0;
msize = 0;
+ mindex = NULL;
while (data < top) {
if (msize < 4096) {
+ /* we don't have a 'worthy enough' match yet, so let's look for
+ * one.
+ */
struct index_entry *entry;
+ /* Shift the window by one byte. */
val ^= U[data[-RABIN_WINDOW]];
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
- i = val & index->hash_mask;
- for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
- const unsigned char *ref = entry->ptr;
- const unsigned char *src = data;
- unsigned int ref_size = ref_top - ref;
- if (entry->val != val)
- continue;
- if (ref_size > top - src)
- ref_size = top - src;
- if (ref_size <= msize)
- break;
- while (ref_size-- && *src++ == *ref)
- ref++;
- if (msize < ref - entry->ptr) {
- /* this is our best match so far */
- msize = ref - entry->ptr;
- moff = entry->ptr - ref_data;
- if (msize >= 4096) /* good enough */
+ for (j = 0; j < num_indexes; ++j) {
+ index = indexes[j];
+ i = val & index->hash_mask;
+ ref_top = index->src_buf + index->src_size;
+ ref_data = index->src_buf;
+ for (entry = index->hash[i]; entry < index->hash[i+1];
+ entry++) {
+ const unsigned char *ref = entry->ptr;
+ const unsigned char *src = data;
+ unsigned int ref_size = ref_top - ref;
+ if (entry->val != val)
+ continue;
+ /* ref_size is the longest possible match that we could make
+ * here. If ref_size <= msize, then we know that we cannot
+ * match more bytes with this location that we have already
+ * matched.
+ */
+ if (ref_size > top - src)
+ ref_size = top - src;
+ if (ref_size <= msize)
break;
+ /* See how many bytes actually match at this location. */
+ while (ref_size-- && *src++ == *ref)
+ ref++;
+ if (msize < ref - entry->ptr) {
+ /* this is our best match so far */
+ msize = ref - entry->ptr;
+ moff = entry->ptr - ref_data;
+ mindex = index;
+ if (msize >= 4096) /* good enough */
+ break;
+ }
}
}
}
if (msize < 4) {
+ /* The best match right now is less than 4 bytes long. So just add
+ * the current byte to the insert instruction. Increment the insert
+ * counter, and copy the byte of data into the output buffer.
+ */
if (!inscnt)
outpos++;
out[outpos++] = *data++;
inscnt++;
if (inscnt == 0x7f) {
+ /* We have a max length insert instruction, finalize it in the
+ * output.
+ */
out[outpos - inscnt - 1] = inscnt;
inscnt = 0;
}
@@ -402,6 +443,7 @@
unsigned char *op;
if (inscnt) {
+ ref_data = mindex->src_buf;
while (moff && ref_data[moff-1] == data[-1]) {
/* we can match one byte back */
msize++;
@@ -425,6 +467,10 @@
op = out + outpos++;
i = 0x80;
+ /* so far, moff has been the offset in a single source, however,
+ * now we encode it as the offset in the aggregate source
+ */
+ moff = moff + mindex->agg_src_offset;
if (moff & 0x000000ff)
out[outpos++] = moff >> 0, i |= 0x01;
if (moff & 0x0000ff00)
@@ -450,7 +496,7 @@
val = 0;
for (j = -RABIN_WINDOW; j < 0; j++)
val = ((val << 8) | data[j])
- ^ T[val >> RABIN_SHIFT];
+ ^ T[val >> RABIN_SHIFT];
}
}
@@ -480,3 +526,5 @@
*delta_size = outpos;
return out;
}
+
+/* vim: noet ts=4 sw=4 sts=4 */
More information about the bazaar-commits
mailing list