Rev 3897: When expanding an index put the entries into a hash. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/gc_delta_index_room
John Arbash Meinel
john at arbash-meinel.com
Thu Mar 19 03:52:50 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/gc_delta_index_room
------------------------------------------------------------
revno: 3897
revision-id: john at arbash-meinel.com-20090319035245-54xrfw2ztsjzbdo3
parent: john at arbash-meinel.com-20090318224524-ve32it3ddqfzvi6q
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: gc_delta_index_room
timestamp: Wed 2009-03-18 22:52:45 -0500
message:
When expanding an index put the entries into a hash.
Rather than iterating all entries for every hash index, create another
mini hash, and pull them out of there.
-------------- next part --------------
=== modified file 'bzrlib/diff-delta.c'
--- a/bzrlib/diff-delta.c 2009-03-18 22:45:24 +0000
+++ b/bzrlib/diff-delta.c 2009-03-19 03:52:45 +0000
@@ -25,7 +25,7 @@
#define RABIN_SHIFT 23
#define RABIN_WINDOW 16
-/* The hash map is sized to put 4 entries per bucket, this gives us 2 blank
+/* The hash map is sized to put 4 entries per bucket, this gives us 3 blank
* spaces.
*/
#define EXTRA_NULLS 3
@@ -128,6 +128,11 @@
unsigned int val;
};
+struct index_entry_linked_list {
+ struct index_entry *p_entry;
+ struct index_entry_linked_list *next;
+};
+
struct unpacked_index_entry {
struct index_entry entry;
struct unpacked_index_entry *next;
@@ -424,8 +429,9 @@
}
-struct delta_index * create_delta_index(const struct source_info *src,
- const struct delta_index *old)
+struct delta_index *
+create_delta_index(const struct source_info *src,
+ const struct delta_index *old)
{
unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
unsigned int total_num_entries;
@@ -514,18 +520,76 @@
return index;
}
+/* Take some entries, and put them into a custom hash.
+ * @param entries A list of entries, sorted by position in file
+ * @param num_entries Length of entries
+ * @param out_hsize The maximum size of the hash, the final size will be
+ * returned here
+ */
+struct index_entry_linked_list **
+_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
+ unsigned int *out_hsize)
+{
+ unsigned int i, hsize, hmask, memsize;
+ struct index_entry *entry, *last_entry;
+ struct index_entry_linked_list *out_entry, **hash;
+ void *mem;
+
+ /* Avoid getting collisions, as it makes the code faster, and this will be
+ * a temp buffer anyway.
+ */
+ hsize = num_entries * 2;
+ for (i = 4; (1u << i) < hsize && i < 31; i++);
+ hsize = 1 << i;
+ if (hsize > *out_hsize) {
+ hsize = *out_hsize;
+ }
+ hmask = hsize - 1;
+ *out_hsize = hsize;
+
+ memsize = sizeof(*hash) * hsize +
+ sizeof(*out_entry) * num_entries;
+ mem = malloc(memsize);
+ if (!mem)
+ return NULL;
+ hash = mem;
+ mem = hash + hsize;
+ out_entry = mem;
+
+ memset(hash, 0, sizeof(*hash)*(hsize+1));
+
+ /* We know that entries are in the order we want in the output, but they
+ * aren't "grouped" by hash bucket yet.
+ */
+ last_entry = entries + num_entries;
+ for (entry = entries + num_entries - 1; entry >= entries; --entry) {
+ i = entry->val & hmask;
+ out_entry->p_entry = entry;
+ out_entry->next = hash[i];
+ /* TODO: Remove entries that have identical vals, or at least filter
+ * the map a little bit.
+ * if (hash[i] != NULL) {
+ * }
+ */
+ hash[i] = out_entry;
+ }
+ return hash;
+}
+
struct delta_index *
create_index_from_old_and_new_entries(const struct delta_index *old_index,
struct index_entry *entries,
unsigned int num_entries)
{
- unsigned int i, j, hsize, hmask, total_num_entries, found_null;
+ unsigned int i, j, hsize, hmask, total_num_entries;
struct delta_index *index;
struct index_entry *entry, *packed_entry, **packed_hash;
struct index_entry *last_entry, null_entry = {0};
void *mem;
unsigned long memsize;
+ struct index_entry_linked_list *unpacked_entry, **mini_hash;
+ unsigned int mini_hsize, mini_hmask;
/* Determine index hash size. Note that indexing skips the
first byte to allow for optimizing the Rabin's polynomial
@@ -568,6 +632,14 @@
mem = packed_hash + (hsize+1);
packed_entry = mem;
+ mini_hsize = hsize;
+ mini_hash = _put_entries_into_hash(entries, num_entries, &mini_hsize);
+ if (mini_hash == NULL) {
+ free(index);
+ return NULL;
+ }
+ mini_hmask = mini_hsize - 1;
+ assert(mini_hmask <= hmask);
last_entry = entries + num_entries;
for (i = 0; i < hsize; i++) {
/*
@@ -620,36 +692,11 @@
* entries, the tradeoff is a malloc, 1 pass over the entries, copying
* them into a sorted buffer, and a free() when done,
*/
- for (entry = entries; entry < last_entry; ++entry) {
- if (entry->ptr != NULL && ((entry->val & hmask) == i)) {
- /* A non-empty entry, which fits in this hash bucket */
- *packed_entry++ = *entry;
- *entry = null_entry;
- if (entry == entries) {
- /* This is the first entry in the list, skip all of the
- * null entries from the front.
- */
- while (entries < last_entry && entries->ptr == 0) {
- ++entries;
- /* we can also skip processing any of these entries in
- * the current pass
- */
- ++entry;
- }
- /* But we need to adjust back one, because the loop will
- * increment as well.
- */
- --entry;
- } else if (entry == (last_entry - 1)) {
- /* This is the last entry, remove all the trailing null
- * entries from the list.
- */
- --last_entry;
- while (last_entry > entries && last_entry->ptr == NULL) {
- --last_entry;
- }
- ++last_entry;
- }
+ for (unpacked_entry = mini_hash[i & mini_hmask];
+ unpacked_entry;
+ unpacked_entry = unpacked_entry->next) {
+ if ((unpacked_entry->p_entry->val & hmask) == i) {
+ *packed_entry++ = *(unpacked_entry->p_entry);
}
}
/* Now insert some extra nulls */
@@ -657,6 +704,7 @@
*packed_entry++ = null_entry;
}
}
+ free(mini_hash);
/* Sentinel value to indicate the length of the last hash bucket */
packed_hash[hsize] = packed_entry;
More information about the bazaar-commits
mailing list