Rev 3892: The new layout is working. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/gc_delta_index_room

Wed Mar 18 21:51:02 GMT 2009

At http://bzr.arbash-meinel.com/branches/bzr/brisbane/gc_delta_index_room

------------------------------------------------------------
revno: 3892
revision-id: john at arbash-meinel.com-20090318215056-dzx4j8ym5yhwh67b
parent: john at arbash-meinel.com-20090318174524-djvr0t7qwyca10jn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: gc_delta_index_room
timestamp: Wed 2009-03-18 16:50:56 -0500
message:
  The new layout is working.
  
  Commenting out the debug info for now.
  What I'm finding is a surprising number of repeated strings.
  Basically, common strings of length < 20, which then end up
  indexed by the RABIN code, but don't get copied in the output.
  (because RABIN is a 16-byte match, but the copy command has
  a minimum size of 20-bytes. Perhaps we need to change the
  code so that it doesn't try to index <20 character inserts.
  Or change the copy code so that it allows shorter copies.
-------------- next part --------------
=== modified file 'bzrlib/delta.h'

--- a/bzrlib/delta.h	2009-03-11 06:26:22 +0000
+++ b/bzrlib/delta.h	2009-03-18 21:50:56 +0000
@@ -47,7 +47,7 @@
  */
 extern struct delta_index *
 create_delta_index_from_delta(const struct source_info *delta,
-                              const struct delta_index *old);
+                              struct delta_index *old);
 /*
  * free_delta_index: free the index created by create_delta_index()
  *

=== modified file 'bzrlib/diff-delta.c'
--- a/bzrlib/diff-delta.c	2009-03-18 17:45:24 +0000
+++ b/bzrlib/diff-delta.c	2009-03-18 21:50:56 +0000
@@ -12,6 +12,8 @@
  * published by the Free Software Foundation.
  */
 
+#include <stdio.h>
+
 #include "delta.h"
 #include <stdlib.h>
 #include <string.h>
@@ -248,6 +250,12 @@
     /* Sentinel value to indicate the length of the last hash bucket */
     packed_hash[hsize] = packed_entry;
 
+    if (packed_entry - (struct index_entry *)mem
+        != num_entries + hsize*EXTRA_NULLS) {
+        // fprintf(stderr, "We expected %d entries, but created %d\n",
+        //         num_entries + hsize*EXTRA_NULLS,
+        //         (int)(packed_entry - (struct index_entry*)mem));
+    }
     assert(packed_entry - (struct index_entry *)mem
             == num_entries + hsize*EXTRA_NULLS);
     index->last_entry = (packed_entry - 1);
@@ -285,6 +293,7 @@
     }
 }
 
+
 struct delta_index *
 create_index_from_old_and_hash(const struct delta_index *old,
                                struct unpacked_index_entry **hash,
@@ -502,18 +511,234 @@
     return index;
 }
 
+
+struct delta_index *
+create_index_from_old_and_new_entries(const struct delta_index *old_index,
+                                      struct index_entry *entries,
+                                      unsigned int num_entries)
+{
+    unsigned int i, j, hsize, hmask, total_num_entries, found_null;
+    struct delta_index *index;
+    struct index_entry *entry, *packed_entry, **packed_hash;
+    struct index_entry *last_entry, null_entry = {0};
+    void *mem;
+    unsigned long memsize;
+
+    /* Determine index hash size.  Note that indexing skips the
+       first byte to allow for optimizing the Rabin's polynomial
+       initialization in create_delta(). */
+    total_num_entries = num_entries + old_index->num_entries;
+    hsize = total_num_entries / 4;
+    for (i = 4; (1u << i) < hsize && i < 31; i++);
+    hsize = 1 << i;
+    if (hsize < old_index->hash_mask) {
+        /* For some reason, there was a code path that would actually *shrink*
+         * the hash size. This screws with some later code, and in general, I
+         * think it better to make the hash bigger, rather than smaller. So
+         * we'll just force the size here.
+         * Possibly done by create_delta_index running into a
+         * limit_hash_buckets call, that ended up transitioning across a
+         * power-of-2. The cause isn't 100% clear, though.
+         */
+        hsize = old_index->hash_mask + 1;
+    }
+    hmask = hsize - 1;
+    // fprintf(stderr, "resizing index to insert %d entries into array"
+    //                 " with %d entries: %x => %x\n",
+    //         num_entries, old_index->num_entries, old_index->hash_mask, hmask);
+
+    memsize = sizeof(*index)
+        + sizeof(*packed_hash) * (hsize+1)
+        + sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
+    mem = malloc(memsize);
+    if (!mem) {
+        return NULL;
+    }
+    index = mem;
+    index->memsize = memsize;
+    index->hash_mask = hmask;
+    index->num_entries = total_num_entries;
+    index->last_src = old_index->last_src;
+
+    mem = index->hash;
+    packed_hash = mem;
+    mem = packed_hash + (hsize+1);
+    packed_entry = mem;
+
+    last_entry = entries + num_entries;
+    for (i = 0; i < hsize; i++) {
+        /*
+         * Coalesce all entries belonging in one hash bucket
+         * into consecutive array entries.
+         * The entries in old_index all come before 'entries'.
+         */
+        packed_hash[i] = packed_entry;
+        /* Copy any of the old entries across */
+        /* Would we rather use memcpy? */
+        if (hmask == old_index->hash_mask) {
+            found_null = 0;
+            for (entry = old_index->hash[i];
+                 entry < old_index->hash[i+1];
+                 ++entry) {
+                if (entry->ptr == NULL) {
+                    found_null = 1;
+                    continue;
+                } else if (found_null) {
+                    fprintf(stderr, "Found a non-null entry after a"
+                                    " NULL entry!\n"
+                                    " new offset %x"
+                                    " w/ value %x\n",
+                                    i, entry->val);
+                }
+                assert((entry->val & hmask) == i);
+                *packed_entry++ = *entry;
+            }
+        } else {
+            /* If we resized the index from this action, all of the old values
+             * will be found in the previous location, but they will end up
+             * spread across the new locations.
+             */
+            j = i & old_index->hash_mask;
+            found_null = 0;
+            for (entry = old_index->hash[j];
+                 entry < old_index->hash[j+1];
+                 ++entry) {
+                if (entry->ptr == NULL) {
+                    found_null = 1;
+                    continue;
+                } else if (found_null) {
+                    fprintf(stderr, "Found a non-null entry after a"
+                                    " NULL entry!\n"
+                                    " offset %x for new offset %x"
+                                    " w/ value %x\n",
+                                    j, i, entry->val);
+                }
+                assert((entry->val & old_index->hash_mask) == j);
+                if (!(j == i || j + old_index->hash_mask + 1 == i)) {
+                    fprintf(stderr, "Old hash entry %x"
+                                    " doesn't fit with new %x\n"
+                                    "old_mask: %x new_mask: %x\n",
+                                    j, i, old_index->hash_mask, hmask);
+                }
+                assert(j == i || j + old_index->hash_mask + 1 == i);
+                if ((entry->val & hmask) == i) {
+                    /* Any entries not picked up here will be picked up on the
+                     * next pass.
+                     */
+                    *packed_entry++ = *entry;
+                }
+            }
+        }
+        /* Now see if we need to insert any of the new entries.
+         * Note that loop ends up O(hsize*num_entries), so we expect that
+         * num_entries is always small.
+         * We also help a little bit by collapsing the entry range when the
+         * endpoints are inserted. However, an alternative would be to build a
+         * quick hash lookup for just the new entries.
+         * Testing shows that this list can easily get up to about 100
+         * entries, the tradeoff is a malloc, 1 pass over the entries, copying
+         * them into a sorted buffer, and a free() when done,
+         */
+        for (entry = entries; entry < last_entry; ++entry) {
+            if (entry->ptr != NULL && ((entry->val & hmask) == i)) {
+                /* A non-empty entry, which fits in this hash bucket */
+                *packed_entry++ = *entry;
+                *entry = null_entry;
+                if (entry == entries) {
+                    /* This is the first entry in the list, skip all of the
+                     * null entries from the front.
+                     */
+                    while (entries < last_entry && entries->ptr == 0) {
+                        ++entries;
+                        /* we can also skip processing any of these entries in
+                         * the current pass
+                         */
+                        ++entry;
+                    }
+                    /* But we need to adjust back one, because the loop will
+                     * increment as well.
+                     */
+                    --entry;
+                } else if (entry == (last_entry - 1)) {
+                    /* This is the last entry, remove all the trailing null
+                     * entries from the list.
+                     */
+                    --last_entry;
+                    while (last_entry > entries && last_entry->ptr == NULL) {
+                        --last_entry;
+                    }
+                    ++last_entry;
+                }
+            }
+        }
+        /* Now insert some extra nulls */
+        for (j = 0; j < EXTRA_NULLS; ++j) {
+            *packed_entry++ = null_entry;
+        }
+    }
+
+    /* Sentinel value to indicate the length of the last hash bucket */
+    packed_hash[hsize] = packed_entry;
+
+    if ((packed_entry - (struct index_entry *)mem)
+        != (total_num_entries + hsize*EXTRA_NULLS)) {
+        fprintf(stderr, "We expected %d entries, but created %d\n",
+                total_num_entries + hsize*EXTRA_NULLS,
+                (int)(packed_entry - (struct index_entry*)mem));
+        fflush(stderr);
+    }
+    assert((packed_entry - (struct index_entry *)mem)
+           == (total_num_entries + hsize * EXTRA_NULLS));
+    index->last_entry = (packed_entry - 1);
+    return index;
+}
+
+
+void
+get_text(char buff[128], const unsigned char *ptr)
+{
+    unsigned int i;
+    const unsigned char *start;
+    unsigned char cmd;
+    start = (ptr-RABIN_WINDOW-1);
+    cmd = *(start);
+    if (cmd < 0x80) {// This is likely to be an insert instruction
+        if (cmd < RABIN_WINDOW) {
+            cmd = RABIN_WINDOW;
+        }
+    } else {
+        /* This was either a copy [should never be] or it
+         * was a longer insert so the insert start happened at 16 more
+         * bytes back.
+         */
+        cmd = RABIN_WINDOW + 1;
+    }
+    if (cmd > 60) {
+        cmd = 60; /* Be friendly to 80char terms */
+    }
+    /* Copy the 1 byte command, and 4 bytes after the insert */
+    cmd += 5;
+    memcpy(buff, start, cmd);
+    buff[cmd] = 0;
+    for (i = 0; i < cmd; ++i) {
+        if (buff[i] == '\n') {
+            buff[i] = 'N';
+        } else if (buff[i] == '\t') {
+            buff[i] = 'T';
+        }
+    }
+}
+
 struct delta_index *
 create_delta_index_from_delta(const struct source_info *src,
-                              const struct delta_index *old)
+                              struct delta_index *old_index)
 {
-    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
-    unsigned int total_num_entries;
+    unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
+    unsigned int hash_offset;
     const unsigned char *data, *buffer, *top;
     unsigned char cmd;
-    struct delta_index *index;
-    struct unpacked_index_entry *entry, **hash;
-    void *mem;
-    unsigned long memsize;
+    struct delta_index *new_index;
+    struct index_entry *entry, *entries, *old_entry;
 
     if (!src->buf || !src->size)
         return NULL;
@@ -527,47 +752,21 @@
        actual number will be recomputed during processing.
        */
 
-    num_entries = (src->size - 1)  / RABIN_WINDOW;
-    if (old != NULL)
-        total_num_entries = num_entries + old->num_entries;
-    else
-        total_num_entries = num_entries;
-    hsize = total_num_entries / 4;
-    for (i = 4; (1u << i) < hsize && i < 31; i++);
-    hsize = 1 << i;
-    hmask = hsize - 1;
-
-    /* allocate lookup index */
-    memsize = sizeof(*hash) * hsize +
-          sizeof(*entry) * total_num_entries;
-    mem = malloc(memsize);
-    if (!mem)
-        return NULL;
-    hash = mem;
-    mem = hash + hsize;
-    entry = mem;
-
-    memset(hash, 0, hsize * sizeof(*hash));
-
-    /* allocate an array to count hash num_entries */
-    hash_count = calloc(hsize, sizeof(*hash_count));
-    if (!hash_count) {
-        free(hash);
-        return NULL;
-    }
+    max_num_entries = (src->size - 1)  / RABIN_WINDOW;
+
+    /* allocate an array to hold whatever entries we find */
+    entries = malloc(sizeof(*entry) * max_num_entries);
+    if (!entries) /* malloc failure */
+        return NULL;
 
     /* then populate the index for the new data */
-    /* The create_delta_index code starts at the end and works backward,
-     * because it is easier to update the entry pointers (you always update the
-     * head, rather than updating the tail).
-     * However, we can't walk the delta structure that way.
-     */
     prev_val = ~0;
     data = buffer;
     /* source size */
     get_delta_hdr_size(&data, top);
     /* target size */
     get_delta_hdr_size(&data, top);
+    entry = entries; /* start at the first slot */
     num_entries = 0; /* calculate the real number of entries */
     while (data < top) {
         cmd = *data++;
@@ -594,27 +793,16 @@
                 if (val != prev_val) {
                     /* Only keep the first of consecutive data */
                     prev_val = val;
-                    i = val & hmask;
-                    entry->entry.ptr = data + RABIN_WINDOW;
-                    entry->entry.val = val;
-                    entry->entry.src = src;
-                    entry->next = NULL;
-                    /* Now we have to insert this entry at the end of the hash
-                     * map
-                     */
-                    if (hash[i] == NULL) {
-                        hash[i] = entry;
-                    } else {
-                        struct unpacked_index_entry *this;
-                        for (this = hash[i];
-                            this->next != NULL;
-                            this = this->next) { /* No action */ }
-                        this->next = entry;
-                        this = entry;
-                    }
-                    hash_count[i]++;
+                    num_entries++;
+                    entry->ptr = data + RABIN_WINDOW;
+                    entry->val = val;
+                    entry->src = src;
                     entry++;
-                    num_entries++;
+                    if (num_entries > max_num_entries) {
+                        /* We ran out of entry room, something is really wrong
+                         */
+                        break;
+                    }
                 }
             }
             /* Move the data pointer by whatever remainder is left */
@@ -630,49 +818,71 @@
     }
     if (data != top) {
         /* Something was wrong with this delta */
-        free(hash);
-        free(hash_count);
+        free(entries);
         return NULL;
     }
     if (num_entries == 0) {
         /** Nothing to index **/
-        free(hash);
-        free(hash_count);
+        free(entries);
         return NULL;
     }
-    if (old != NULL) {
-        if (hmask == old->hash_mask) {
-            /* The total hash table size didn't change, which means that
-             * entries will end up in the same pages. We can do bulk-copying to
-             * get the final output
-             */
-            index = create_index_from_old_and_hash(old, hash, hsize,
-                                                   num_entries);
-            free(hash_count);
-            free(hash);
-            if (!index) {
-                return NULL;
-            }
-            index->last_src = src;
-            return index;
-        } else {
-            total_num_entries = num_entries + old->num_entries;
-            include_entries_from_index(hash, hash_count, hsize, entry, old);
-        }
+    assert(old_index != NULL);
+    old_index->last_src = src;
+    /* See if we can fill in these values into the holes in the array */
+    entry = entries;
+    num_inserted = 0;
+    for (; num_entries > 0; --num_entries, ++entry) {
+        hash_offset = (entry->val & old_index->hash_mask);
+        /* The basic structure is a hash => packed_entries that fit in that
+         * hash bucket. Things are structured such that the hash-pointers are
+         * strictly ordered. So we start by pointing to the next pointer, and
+         * walk back until we stop getting NULL targets, and then go back
+         * forward. If there are no NULL targets, then we know because
+         * entry->ptr will not be NULL.
+         */
+        old_entry = old_index->hash[hash_offset + 1];
+        old_entry--;
+        while (old_entry->ptr == NULL
+               && old_entry >= old_index->hash[hash_offset]) {
+            old_entry--;
+        }
+        old_entry++;
+        if (old_entry->ptr != NULL
+            || old_entry >= old_index->hash[hash_offset + 1]) {
+            // char buff[128];
+            // get_text(buff, entry->ptr);
+            // fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
+            //         hash_offset, entry->val, buff);
+            // for (old_entry = old_index->hash[hash_offset];
+            //      old_entry < old_index->hash[hash_offset+1];
+            //      ++old_entry) {
+            //     get_text(buff, old_entry->ptr);
+            //     fprintf(stderr, "  [%2d] %8x %8x: '%s'\n",
+            //             (int)(old_entry - old_index->hash[hash_offset]),
+            //             old_entry->val, old_entry->ptr, buff);
+            // }
+            break;
+        }
+        num_inserted++;
+        *old_entry = *entry;
+        /* For entries which we *do* manage to insert into old_index, we don't
+         * want them double copied into the final output.
+         */
+        old_index->num_entries++;
+    }
+    if (num_entries > 0) {
+        /* We couldn't fit the new entries into the old index, so allocate a
+         * new one, and fill it with stuff.
+         */
+        // fprintf(stderr, "inserted %d before resize\n", num_inserted);
+        new_index = create_index_from_old_and_new_entries(old_index,
+            entry, num_entries);
     } else {
-        total_num_entries = num_entries;
-    }
-
-    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
-                                           total_num_entries);
-    free(hash_count);
-    index = pack_delta_index(hash, hsize, total_num_entries);
-    free(hash);
-    if (!index) {
-        return NULL;
-    }
-    index->last_src = src;
-    return index;
+        new_index = NULL;
+        // fprintf(stderr, "inserted %d without resizing\n", num_inserted);
+    }
+    free(entries);
+    return new_index;
 }
 
 void free_delta_index(struct delta_index *index)

=== modified file 'bzrlib/tests/test__groupcompress_pyx.py'
--- a/bzrlib/tests/test__groupcompress_pyx.py	2009-03-11 06:50:59 +0000
+++ b/bzrlib/tests/test__groupcompress_pyx.py	2009-03-18 21:50:56 +0000
@@ -82,6 +82,16 @@
 common with the next text
 """
 
+_fourth_text = """\
+123456789012345
+same rabin hash
+123456789012345
+same rabin hash
+123456789012345
+same rabin hash
+123456789012345
+same rabin hash
+"""
 
 class Test_GroupCompress(tests.TestCase):
     """Direct tests for the compiled extension."""
@@ -191,15 +201,17 @@
 
     def test_delta_with_delta_bytes(self):
         di = self._gc_module.DeltaIndex()
+        source = _first_text
         di.add_source(_first_text, 0)
         self.assertEqual(len(_first_text), di._source_offset)
         delta = di.make_delta(_second_text)
         self.assertEqual('Dh\tsome more\x91\x019'
                          '&previous text\nand has some extra text\n', delta)
         di.add_delta_source(delta, 0)
+        source += delta
         self.assertEqual(len(_first_text) + len(delta), di._source_offset)
-        third_delta = di.make_delta(_third_text)
-        result = self._gc_module.apply_delta(_first_text + delta, third_delta)
+        second_delta = di.make_delta(_third_text)
+        result = self._gc_module.apply_delta(source, second_delta)
         self.assertEqualDiff(_third_text, result)
         # We should be able to match against the 'previous text\nand has some...'
         # that was part of the delta bytes
@@ -207,4 +219,31 @@
         # enough to match in the original text, and those bytes are not present
         # in the delta for the second text.
         self.assertEqual('z\x85\x01\x90\x14\x1chas some in common with the '
+                         '\x91T&\x03and\x91\x18,', second_delta)
+        # Add this delta, and create a new delta for the same text. We should
+        # find the remaining text, and only insert the short 'and' text.
+        di.add_delta_source(second_delta, 0)
+        source += second_delta
+        third_delta = di.make_delta(_third_text)
+        result = self._gc_module.apply_delta(source, third_delta)
+        self.assertEqualDiff(_third_text, result)
+        self.assertEqual('\xa6\x01\x85\x01\x90\x14\x91\x80\x1c'
                          '\x91T&\x03and\x91\x18,', third_delta)
+        # Now create a delta, which we know won't be able to be 'fit' into the
+        # existing index
+        fourth_delta = di.make_delta(_fourth_text)
+        self.assertEqual(_fourth_text,
+                         self._gc_module.apply_delta(source, fourth_delta))
+        self.assertEqual('\xa6\x01\x80\x01'
+                         '\x7f123456789012345\nsame rabin hash\n'
+                         '123456789012345\nsame rabin hash\n'
+                         '123456789012345\nsame rabin hash\n'
+                         '123456789012345\nsame rabin hash'
+                         '\x01\n', fourth_delta)
+        di.add_delta_source(fourth_delta, 0)
+        source += fourth_delta
+        # With the next delta, everything should be found
+        fifth_delta = di.make_delta(_fourth_text)
+        self.assertEqual(_fourth_text,
+                         self._gc_module.apply_delta(source, fifth_delta))
+        self.assertEqual('\xac\x02\x80\x01\x91\xab\x7f\x01\n', fifth_delta)