Rev 3889: (broken, in progress), change the data structures to allow for inserting small deltas. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/gc_delta_index_room

John Arbash Meinel john at arbash-meinel.com
Tue Mar 17 22:20:34 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/gc_delta_index_room

------------------------------------------------------------
revno: 3889
revision-id: john at arbash-meinel.com-20090317222027-tko3r2lzg6blf96y
parent: john at arbash-meinel.com-20090317201340-amjnj1wl78iwcxae
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: gc_delta_index_room
timestamp: Tue 2009-03-17 17:20:27 -0500
message:
  (broken, in progress), change the data structures to allow for inserting small deltas.
  By adding 2 blank slots per hash bucket, we can normally update the structure without
  having to resize the whole thing.
  We'll probably want to tune how many extra slots to provide.
  The general work is probably good, but I need to finish handling the case when we
  really *do* need to resize the structure.
-------------- next part --------------
=== modified file 'bzrlib/diff-delta.c'
--- a/bzrlib/diff-delta.c	2009-03-11 06:50:59 +0000
+++ b/bzrlib/diff-delta.c	2009-03-17 22:20:27 +0000
@@ -203,14 +203,16 @@
     struct unpacked_index_entry *entry;
     struct delta_index *index;
     struct index_entry *packed_entry, **packed_hash;
+    struct index_entry null_entry = {0};
     void *mem;
     /*
      * Now create the packed index in array form
      * rather than linked lists.
+     * Leave a 2-entry gap for inserting more entries between the groups
      */
     memsize = sizeof(*index)
         + sizeof(*packed_hash) * (hsize+1)
-        + sizeof(*packed_entry) * num_entries;
+        + sizeof(*packed_entry) * (num_entries + hsize*2);
     mem = malloc(memsize);
     if (!mem) {
         return NULL;
@@ -232,8 +234,12 @@
          * into consecutive array entries.
          */
         packed_hash[i] = packed_entry;
-        for (entry = hash[i]; entry; entry = entry->next)
+        for (entry = hash[i]; entry; entry = entry->next) {
             *packed_entry++ = entry->entry;
+        }
+        /* Now add 2 'NULL' entries that we can use for future expansion. */
+        *packed_entry++ = null_entry;
+        *packed_entry++ = null_entry;
     }
 
     /* Sentinel value to indicate the length of the last hash bucket */
@@ -273,6 +279,31 @@
     }
 }
 
+int
+insert_entries_into_index(struct delta_index *old,
+                          struct unpacked_index_entry **hash,
+                          unsigned int hsize,
+                          unsigned int num_entries)
+{
+    unsigned int i;
+    for (i = 0; i < hsize; ++i) {
+        for (entry = hash[i]; entry; entry = entry->next) {
+            /* We have an entry to insert, so flush the copy buffer */
+            if (total_copy > 0) {
+                assert(copy_to_start != NULL);
+                assert(copy_from_start != NULL);
+                memcpy(copy_to_start, copy_from_start,
+                       total_copy * sizeof(struct index_entry));
+                bytes_copied += (total_copy * sizeof(struct index_entry));
+                total_copy = 0;
+                copy_from_start = copy_to_start = NULL;
+                num_ops += 1;
+            }
+            *packed_entry++ = entry->entry;
+        }
+    }
+}
+
 struct delta_index *
 create_index_from_old_and_hash(const struct delta_index *old,
                                struct unpacked_index_entry **hash,
@@ -461,18 +492,55 @@
     return index;
 }
 
+
+struct delta_index *
+create_index_from_old_and_new_entries(const struct delta_index *old_index,
+                                      const struct index_entry *entries,
+                                      unsigned int num_entries)
+{
+    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
+    unsigned int total_num_entries;
+    const unsigned char *data, *buffer;
+    struct delta_index *index;
+    struct index_entry *entry, *packed_entry, **packed_hash;
+    void *mem;
+    unsigned long memsize;
+
+    /* Determine index hash size.  Note that indexing skips the
+       first byte to allow for optimizing the Rabin's polynomial
+       initialization in create_delta(). */
+    total_num_entries = num_entries + old_index->num_entries;
+    hsize = total_num_entries / 4;
+    for (i = 4; (1u << i) < hsize && i < 31; i++);
+    hsize = 1 << i;
+    hmask = hsize - 1;
+
+    memsize = sizeof(*index)
+        + sizeof(*packed_hash) * (hsize+1)
+        + sizeof(*packed_entry) * (total_num_entries + hsize*2);
+    mem = malloc(memsize);
+    if (!mem) {
+        return NULL;
+    }
+    hash = mem;
+    mem = hash + hsize;
+    entry = mem;
+
+    memset(hash, 0, hsize * sizeof(*hash));
+
+}
+
+
 struct delta_index *
 create_delta_index_from_delta(const struct source_info *src,
-                              const struct delta_index *old)
+                              struct delta_index *old_index)
 {
-    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
-    unsigned int total_num_entries;
+    unsigned int i, num_entries, max_num_entries, prev_val;
+    unsigned int hash_offset;
     const unsigned char *data, *buffer, *top;
     unsigned char cmd;
-    struct delta_index *index;
-    struct unpacked_index_entry *entry, **hash;
-    void *mem;
-    unsigned long memsize;
+    struct delta_index *new_index;
+    struct index_entry *entry, *entries, *old_entry;
 
     if (!src->buf || !src->size)
         return NULL;
@@ -486,47 +554,21 @@
        actual number will be recomputed during processing.
        */
 
-    num_entries = (src->size - 1)  / RABIN_WINDOW;
-    if (old != NULL)
-        total_num_entries = num_entries + old->num_entries;
-    else
-        total_num_entries = num_entries;
-    hsize = total_num_entries / 4;
-    for (i = 4; (1u << i) < hsize && i < 31; i++);
-    hsize = 1 << i;
-    hmask = hsize - 1;
-
-    /* allocate lookup index */
-    memsize = sizeof(*hash) * hsize +
-          sizeof(*entry) * total_num_entries;
-    mem = malloc(memsize);
-    if (!mem)
-        return NULL;
-    hash = mem;
-    mem = hash + hsize;
-    entry = mem;
-
-    memset(hash, 0, hsize * sizeof(*hash));
-
-    /* allocate an array to count hash num_entries */
-    hash_count = calloc(hsize, sizeof(*hash_count));
-    if (!hash_count) {
-        free(hash);
-        return NULL;
-    }
+    max_num_entries = (src->size - 1)  / RABIN_WINDOW;
+
+    /* allocate an array to hold whatever entries we find */
+    entries = malloc(sizeof(*entry) * max_num_entries);
+    if (!entries) /* malloc failure */
+        return NULL;
 
     /* then populate the index for the new data */
-    /* The create_delta_index code starts at the end and works backward,
-     * because it is easier to update the entry pointers (you always update the
-     * head, rather than updating the tail).
-     * However, we can't walk the delta structure that way.
-     */
     prev_val = ~0;
     data = buffer;
     /* source size */
     get_delta_hdr_size(&data, top);
     /* target size */
     get_delta_hdr_size(&data, top);
+    entry = entries; /* start at the first slot */
     num_entries = 0; /* calculate the real number of entries */
     while (data < top) {
         cmd = *data++;
@@ -553,27 +595,16 @@
                 if (val != prev_val) {
                     /* Only keep the first of consecutive data */
                     prev_val = val;
-                    i = val & hmask;
-                    entry->entry.ptr = data + RABIN_WINDOW;
-                    entry->entry.val = val;
-                    entry->entry.src = src;
-                    entry->next = NULL;
-                    /* Now we have to insert this entry at the end of the hash
-                     * map
-                     */
-                    if (hash[i] == NULL) {
-                        hash[i] = entry;
-                    } else {
-                        struct unpacked_index_entry *this;
-                        for (this = hash[i];
-                            this->next != NULL;
-                            this = this->next) { /* No action */ }
-                        this->next = entry;
-                        this = entry;
+                    num_entries++;
+                    entry->ptr = data + RABIN_WINDOW;
+                    entry->val = val;
+                    entry->src = src;
+                    entry++;
+                    if (num_entries > max_num_entries) {
+                        /* We ran out of entry room, something is really wrong
+                         */
+                        break;
                     }
-                    hash_count[i]++;
-                    entry++;
-                    num_entries++;
                 }
             }
             /* Move the data pointer by whatever remainder is left */
@@ -589,49 +620,54 @@
     }
     if (data != top) {
         /* Something was wrong with this delta */
-        free(hash);
-        free(hash_count);
+        free(entries);
         return NULL;
     }
     if (num_entries == 0) {
         /** Nothing to index **/
-        free(hash);
-        free(hash_count);
+        free(entries);
         return NULL;
     }
-    if (old != NULL) {
-        if (hmask == old->hash_mask) {
-            /* The total hash table size didn't change, which means that
-             * entries will end up in the same pages. We can do bulk-copying to
-             * get the final output
-             */
-            index = create_index_from_old_and_hash(old, hash, hsize,
-                                                   num_entries);
-            free(hash_count);
-            free(hash);
-            if (!index) {
-                return NULL;
-            }
-            index->last_src = src;
-            return index;
-        } else {
-            total_num_entries = num_entries + old->num_entries;
-            include_entries_from_index(hash, hash_count, hsize, entry, old);
-        }
+    assert(old_index != NULL);
+    /* See if we can fill in these values into the holes in the array */
+    entry = entries;
+    for (i = 0; i < num_entries; ++i, ++entry) {
+        hash_offset = entry->val & old_index->hmask;
+        /* The basic structure is a hash => packed_entries that fit in that
+         * hash bucket. Things are structured such that the hash-pointers are
+         * strictly ordered. So we start by pointing to the next pointer, and
+         * walk back until we stop getting NULL targets, and then go back
+         * forward. If there are no NULL targets, then we know because
+         * entry->src will not be NULL.
+         */
+        old_entry = old_index->hash[hash_offset + 1];
+        old_entry--;
+        while (old_entry->src == NULL) {
+            old_entry--;
+        }
+        old_entry++;
+        if (old_entry->src != NULL) { /* there are no open spots */
+            /* We need to realloc the hash map and start from scratch */
+            break;
+        }
+        *old_entry = *entry;
+        /* For entries which we *do* manage to insert into old_index, we don't
+         * want them double copied into the final output.
+         */
+        num_entries--;
+    }
+    if (i < num_entries) {
+        /* We couldn't fit the new entries into the old index, so allocate a
+         * new one, and fill it with stuff.
+         */
+        struct delta_index * new_index;
+        new_index = create_index_from_old_and_new_entries(old_index,
+            entry, num_entries);
     } else {
-        total_num_entries = num_entries;
-    }
-
-    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
-                                           total_num_entries);
-    free(hash_count);
-    index = pack_delta_index(hash, hsize, total_num_entries);
-    free(hash);
-    if (!index) {
-        return NULL;
-    }
-    index->last_src = src;
-    return index;
+        new_index = old_index;
+    }
+    free(entries);
+    return old_index;
 }
 
 void free_delta_index(struct delta_index *index)
@@ -731,7 +767,8 @@
              *       You end up getting a lot more collisions in the hash,
              *       which doesn't actually lead to a entry->val match.
              */
-            for (entry = index->hash[i]; entry < index->hash[i+1];
+            for (entry = index->hash[i];
+                 entry < index->hash[i+1] && entry->src != NULL;
                  entry++) {
                 const unsigned char *ref;
                 const unsigned char *src;



More information about the bazaar-commits mailing list