Rev 91: Change the internals to allow delta indexes to be expanded with new source data. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin

John Arbash Meinel john at arbash-meinel.com
Tue Mar 3 18:06:10 GMT 2009


At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin

------------------------------------------------------------
revno: 91
revision-id: john at arbash-meinel.com-20090303180544-mfgw9jsndwiwj047
parent: john at arbash-meinel.com-20090303163107-l4j0114btw2efmjp
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: rabin
timestamp: Tue 2009-03-03 12:05:44 -0600
message:
  Change the internals to allow delta indexes to be expanded with new source data.
  Now when adding a new source, the old index entries are included in the new structure.
  This generally seems better than keeping multiple indexes, as it improves the
  efficiency of the internal hash map and avoids extra iteration.
  Bring back the _FAST flag. At the moment, with _FAST=True, 'bzr pack' takes about
  37s rather than 1min, but produces 9.7MB of texts rather than roughly 8.2MB.
  So for now it is still a useful flag to have.
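
  A rough usage sketch of the new single-index behaviour, modeled on the tests
  updated in this commit (the module path and sample texts are illustrative
  assumptions, not part of the commit itself):

      from bzrlib.plugins.groupcompress import _groupcompress_pyx as gc

      _first_text = 'a bit of text\nthat is repeated\n' * 10
      _second_text = 'a bit of text\nwith some differences\n' * 10
      _third_text = 'a bit of text\nthat is repeated\nwith some differences\n' * 10

      di = gc.DeltaIndex()
      di.add_source(_first_text, 0)   # entries all go into a single index;
      di.add_source(_second_text, 0)  # old entries are folded into the new one
      delta = di.make_delta(_third_text)
      if delta is not None:
          # deltas are applied against the concatenation of all added sources
          assert gc.apply_delta(_first_text + _second_text, delta) == _third_text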
-------------- next part --------------
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx	2009-03-03 16:31:07 +0000
+++ b/_groupcompress_pyx.pyx	2009-03-03 18:05:44 +0000
@@ -34,7 +34,7 @@
         unsigned long src_size
         unsigned int hash_mask
         # struct index_entry *hash[]
-    delta_index * create_delta_index(source_info *src)
+    delta_index * create_delta_index(source_info *src, delta_index *old)
     void free_delta_index(delta_index *index)
     void *create_delta(delta_index **indexes,
              unsigned int num_indexes,
@@ -88,33 +88,29 @@
     # cdef readonly list _sources
     cdef readonly object _sources
     cdef source_info *_source_infos
-    cdef delta_index **_indexes
-    cdef readonly unsigned int _num_indexes
-    cdef readonly unsigned int _max_num_indexes
+    cdef delta_index *_index
     cdef readonly unsigned int _max_num_sources
     cdef public unsigned long _source_offset
 
     def __repr__(self):
-        return '%s(%d, %d, %d)' % (self.__class__.__name__,
-            len(self._sources), self._source_offset,
-            self._num_indexes)
+        return '%s(%d, %d)' % (self.__class__.__name__,
+            len(self._sources), self._source_offset)
 
     def __init__(self, source=None):
         self._sources = []
-        self._max_num_indexes = 1024
-        self._indexes = <delta_index**>safe_malloc(sizeof(delta_index*)
-                                                   * self._max_num_indexes)
-        self._max_num_sources = 1024
+        self._index = NULL
+        self._max_num_sources = 4096
         self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
                                                         * self._max_num_sources)
-        self._num_indexes = 0
         self._source_offset = 0
 
         if source is not None:
             self.add_source(source, 0)
 
     def __dealloc__(self):
-        self._ensure_no_indexes()
+        if self._index != NULL:
+            free_delta_index(self._index)
+            self._index = NULL
         safe_free(<void **>&self._source_infos)
 
     def add_source(self, source, unadded_bytes):
@@ -151,39 +147,20 @@
         #       create_delta_index (For example, we could always reserve enough
         #       space to hash a 4MB string, etc.)
         src.agg_offset = self._source_offset + unadded_bytes
-        index = create_delta_index(src)
+        index = create_delta_index(src, self._index)
         self._source_offset = src.agg_offset + src.size
         if index != NULL:
-            num_indexes = self._num_indexes + 1
-            if num_indexes >= self._max_num_indexes:
-                self._expand_indexes()
-            self._indexes[self._num_indexes] = index
-            self._num_indexes = num_indexes
-
-    cdef _expand_indexes(self):
-        self._max_num_indexes = self._max_num_indexes * 2
-        self._indexes = <delta_index **>safe_realloc(self._indexes,
-                                                sizeof(delta_index *)
-                                                * self._max_num_indexes)
+            free_delta_index(self._index)
+            self._index = index
 
     cdef _expand_sources(self):
+        raise RuntimeError('if we move self._source_infos, then we need to'
+                           ' change all of the index pointers as well.')
         self._max_num_sources = self._max_num_sources * 2
         self._source_infos = <source_info *>safe_realloc(self._source_infos,
                                                 sizeof(source_info)
                                                 * self._max_num_sources)
 
-    cdef _ensure_no_indexes(self):
-        cdef int i
-
-        if self._indexes != NULL:
-            for i from 0 <= i < self._num_indexes:
-                free_delta_index(self._indexes[i])
-                self._indexes[i] = NULL
-            free(self._indexes)
-            self._indexes = NULL
-            self._max_num_indexes = 0
-            self._num_indexes = 0
-
     def make_delta(self, target_bytes, max_delta_size=0):
         """Create a delta from the current source to the target bytes."""
         cdef char *target
@@ -191,7 +168,7 @@
         cdef void * delta
         cdef unsigned long delta_size
 
-        if self._num_indexes == 0:
+        if self._index == NULL:
             return None
 
         if not PyString_CheckExact(target_bytes):
@@ -203,7 +180,7 @@
         # TODO: inline some of create_delta so we at least don't have to double
         #       malloc, and can instead use PyString_FromStringAndSize, to
         #       allocate the bytes into the final string
-        delta = create_delta(self._indexes, self._num_indexes,
+        delta = create_delta(&self._index, 1,
                              target, target_size,
                              &delta_size, max_delta_size)
         result = None

=== modified file 'delta.h'
--- a/delta.h	2009-03-03 16:31:07 +0000
+++ b/delta.h	2009-03-03 18:05:44 +0000
@@ -23,7 +23,8 @@
  * using free_delta_index().
  */
 extern struct delta_index *
-create_delta_index(const struct source_info *src);
+create_delta_index(const struct source_info *src,
+				   const struct delta_index *old);
 
 /*
  * free_delta_index: free the index created by create_delta_index()

=== modified file 'diff-delta.c'
--- a/diff-delta.c	2009-03-03 16:31:07 +0000
+++ b/diff-delta.c	2009-03-03 18:05:44 +0000
@@ -129,6 +129,7 @@
 	unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
 							   entry */
 	unsigned int num_entries; /* The total number of entries in this index */
+	struct index_entry *last_entry; /* Pointer to the last valid entry */
 	struct index_entry *hash[];
 };
 
@@ -193,7 +194,7 @@
 
 static struct delta_index *
 pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
-				 unsigned int entries)
+				 unsigned int num_entries)
 {
 	unsigned int i, memsize;
 	struct unpacked_index_entry *entry;
@@ -206,7 +207,7 @@
 	 */
 	memsize = sizeof(*index)
 		+ sizeof(*packed_hash) * (hsize+1)
-		+ sizeof(*packed_entry) * entries;
+		+ sizeof(*packed_entry) * num_entries;
 	mem = malloc(memsize);
 	if (!mem) {
 		return NULL;
@@ -215,7 +216,7 @@
 	index = mem;
 	index->memsize = memsize;
 	index->hash_mask = hsize - 1;
-	index->num_entries = entries;
+	index->num_entries = num_entries;
 
 	mem = index->hash;
 	packed_hash = mem;
@@ -235,14 +236,41 @@
 	/* Sentinel value to indicate the length of the last hash bucket */
 	packed_hash[hsize] = packed_entry;
 
-	assert(packed_entry - (struct index_entry *)mem == entries);
+	assert(packed_entry - (struct index_entry *)mem == num_entries);
+	index->last_entry = (packed_entry - 1);
 	return index;
 }
 
-
-struct delta_index * create_delta_index(const struct source_info *src)
-{
-	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
+void include_entries_from_index(struct unpacked_index_entry **hash,
+								unsigned int *hash_count,
+								unsigned int hsize,
+								struct unpacked_index_entry *entry,
+								const struct delta_index *old)
+{
+	unsigned int i, hmask, masked_val;
+	struct index_entry *old_entry;
+
+	hmask = hsize - 1;
+	/* Iterate backwards through the existing entries
+	 * This is so that matches early in the file end up being the first pointer
+	 * in the linked list.
+	 */
+	for (old_entry = old->last_entry, i = 0; i < old->num_entries;
+		 i++, old_entry--) {
+		masked_val = old_entry->val & hmask;
+		entry->entry = *old_entry;
+		entry->next = hash[masked_val];
+		hash[masked_val] = entry++;
+		hash_count[masked_val]++;
+	}
+}
+
+
+struct delta_index * create_delta_index(const struct source_info *src,
+										const struct delta_index *old)
+{
+	unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
+	unsigned int total_num_entries;
 	const unsigned char *data, *buffer;
 	struct delta_index *index;
 	struct unpacked_index_entry *entry, **hash;
@@ -256,15 +284,19 @@
 	/* Determine index hash size.  Note that indexing skips the
 	   first byte to allow for optimizing the Rabin's polynomial
 	   initialization in create_delta(). */
-	entries = (src->size - 1)  / RABIN_WINDOW;
-	hsize = entries / 4;
+	num_entries = (src->size - 1)  / RABIN_WINDOW;
+	if (old != NULL)
+		total_num_entries = num_entries + old->num_entries;
+	else
+		total_num_entries = num_entries;
+	hsize = total_num_entries / 4;
 	for (i = 4; (1u << i) < hsize && i < 31; i++);
 	hsize = 1 << i;
 	hmask = hsize - 1;
 
 	/* allocate lookup index */
 	memsize = sizeof(*hash) * hsize +
-		  sizeof(*entry) * entries;
+		  sizeof(*entry) * total_num_entries;
 	mem = malloc(memsize);
 	if (!mem)
 		return NULL;
@@ -274,16 +306,16 @@
 
 	memset(hash, 0, hsize * sizeof(*hash));
 
-	/* allocate an array to count hash entries */
+	/* allocate an array to count hash num_entries */
 	hash_count = calloc(hsize, sizeof(*hash_count));
 	if (!hash_count) {
 		free(hash);
 		return NULL;
 	}
 
-	/* then populate the index */
+	/* then populate the index for the new data */
 	prev_val = ~0;
-	for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+	for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
 		 data >= buffer;
 		 data -= RABIN_WINDOW) {
 		unsigned int val = 0;
@@ -292,7 +324,8 @@
 		if (val == prev_val) {
 			/* keep the lowest of consecutive identical blocks */
 			entry[-1].entry.ptr = data + RABIN_WINDOW;
-			--entries;
+			--num_entries;
+			--total_num_entries;
 		} else {
 			prev_val = val;
 			i = val & hmask;
@@ -304,10 +337,17 @@
 			hash_count[i]++;
 		}
 	}
+	/* If we have an existing delta_index, we want to bring its info into the
+	 * new structure. We assume that the existing structure's objects all occur
+	 * before the new source, so they get first precedence in the index.
+	 */
+	if (old != NULL)
+		include_entries_from_index(hash, hash_count, hsize, entry, old);
 
-	entries = limit_hash_buckets(hash, hash_count, hsize, entries);
+	total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
+										   total_num_entries);
 	free(hash_count);
-	index = pack_delta_index(hash, hsize, entries);
+	index = pack_delta_index(hash, hsize, total_num_entries);
 	index->src = src;
 	free(hash);
 	if (!index) {
@@ -347,6 +387,7 @@
 	int inscnt;
 	const unsigned char *ref_data, *ref_top, *data, *top;
 	unsigned char *out;
+	unsigned long source_size;
 
 	if (!trg_buf || !trg_size)
 		return NULL;
@@ -370,6 +411,7 @@
 	}
 	assert(i <= index->src->size + index->src->agg_offset);
 	i = index->src->size + index->src->agg_offset;
+	source_size = i;
 	while (i >= 0x80) {
 		out[outpos++] = i | 0x80;
 		i >>= 7;
@@ -507,7 +549,9 @@
 			/* moff is the offset in the local structure, for encoding, we need
 			 * to push it into the global offset
 			 */
+			assert(moff < msource->size);
 			moff += msource->agg_offset;
+			assert(moff + msize <= source_size);
 			if (moff & 0x000000ff)
 				out[outpos++] = moff >> 0,  i |= 0x01;
 			if (moff & 0x0000ff00)
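
The core of the diff-delta.c change is include_entries_from_index() above: when a
new source is indexed, the entries of the previous delta_index are copied into the
new hash table, walking backwards so that data from earlier sources ends up first
in each bucket chain and therefore gets first precedence when matching. A minimal
Python model of that merge (illustrative names only, not the C implementation):

    def merge_old_entries(buckets, old_entries, hmask):
        """Fold a previous index's entries into the new hash buckets.

        buckets: dict mapping (val & hmask) -> list of entries for the new source.
        old_entries: the previous index's entries, in their original order.
        """
        for val, ptr in reversed(old_entries):
            chain = buckets.setdefault(val & hmask, [])
            # Prepend, so older source data sits at the head of the chain.
            chain.insert(0, (val, ptr))
        return buckets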

=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-03-03 14:48:15 +0000
+++ b/groupcompress.py	2009-03-03 18:05:44 +0000
@@ -53,6 +53,7 @@
     )
 
 _NO_LABELS = False
+_FAST = False
 
 def parse(bytes):
     if _NO_LABELS:
@@ -190,11 +191,14 @@
             else:
                 new_chunks.insert(0, 'delta\n')
                 new_chunks.append('len:%s\n' % (len(delta),))
-            # unadded_bytes = sum(map(len, new_chunks))
-            # self._delta_index.add_source(delta, unadded_bytes)
-            new_chunks.append(delta)
-            unadded_bytes = sum(map(len, new_chunks))
-            self._delta_index._source_offset += unadded_bytes
+            if _FAST:
+                new_chunks.append(delta)
+                unadded_bytes = sum(map(len, new_chunks))
+                self._delta_index._source_offset += unadded_bytes
+            else:
+                unadded_bytes = sum(map(len, new_chunks))
+                self._delta_index.add_source(delta, unadded_bytes)
+                new_chunks.append(delta)
         delta_start = (self.endpoint, len(self.lines))
         self.output_chunks(new_chunks)
         self.input_bytes += input_len

=== modified file 'tests/test__groupcompress_pyx.py'
--- a/tests/test__groupcompress_pyx.py	2009-03-02 19:36:29 +0000
+++ b/tests/test__groupcompress_pyx.py	2009-03-03 18:05:44 +0000
@@ -152,7 +152,7 @@
 
     def test_repr(self):
         di = self._gc_module.DeltaIndex('test text\n')
-        self.assertEqual('DeltaIndex(1, 10, 1)', repr(di))
+        self.assertEqual('DeltaIndex(1, 10)', repr(di))
 
     def test_make_delta(self):
         di = self._gc_module.DeltaIndex(_text1)
@@ -162,12 +162,8 @@
     def test_delta_against_multiple_sources(self):
         di = self._gc_module.DeltaIndex()
         di.add_source(_first_text, 0)
-        self.assertEqual(1, di._num_indexes)
-        self.assertEqual(1024, di._max_num_indexes)
         self.assertEqual(len(_first_text), di._source_offset)
         di.add_source(_second_text, 0)
-        self.assertEqual(2, di._num_indexes)
-        self.assertEqual(1024, di._max_num_indexes)
         self.assertEqual(len(_first_text) + len(_second_text), di._source_offset)
         delta = di.make_delta(_third_text)
         result = self._gc_module.apply_delta(_first_text + _second_text, delta)
@@ -178,12 +174,8 @@
     def test_delta_with_offsets(self):
         di = self._gc_module.DeltaIndex()
         di.add_source(_first_text, 5)
-        self.assertEqual(1, di._num_indexes)
-        self.assertEqual(1024, di._max_num_indexes)
         self.assertEqual(len(_first_text) + 5, di._source_offset)
         di.add_source(_second_text, 10)
-        self.assertEqual(2, di._num_indexes)
-        self.assertEqual(1024, di._max_num_indexes)
         self.assertEqual(len(_first_text) + len(_second_text) + 15,
                          di._source_offset)
         delta = di.make_delta(_third_text)


