Rev 5536: Some interesting results from using cython: profile=True, in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
John Arbash Meinel
john at arbash-meinel.com
Wed Dec 1 19:18:20 GMT 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem
------------------------------------------------------------
revno: 5536
revision-id: john at arbash-meinel.com-20101201191814-laegibyb63r913f3
parent: john at arbash-meinel.com-20101130203740-twiaq89vtwrvyq1p
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Wed 2010-12-01 13:18:14 -0600
message:
Some interesting results from using cython: profile=True,
Having 'next_with_dummy()' defined as 'except? NULL' was quite a bit of
overhead. My guess is that a NULL return is quite common, and PyErr_Occurred
is a little expensive. All the callers handle it gracefully (.next()
knows to handle falling off the end, and the limiting loop just treats it
as no more entries to fill in, which is valid).
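
A minimal toy sketch (not code from the branch; the struct and function
names here are made up for illustration) of why the declaration matters:
with 'except? NULL', Cython emits a PyErr_Occurred() check at every call
site whenever NULL comes back, to tell "not found" apart from "exception
raised"; dropping the clause makes NULL an ordinary sentinel value.

    # sketch.pyx -- toy illustration only
    cdef struct rabin_entry:
        unsigned int rabin_val

    # Old style: a NULL return *might* mean an exception, so the caller
    # also has to call PyErr_Occurred() after every NULL result.
    cdef rabin_entry* find_old(rabin_entry* table, Py_ssize_t n,
                               unsigned int val) except? NULL:
        cdef Py_ssize_t i
        for i in range(n):
            if table[i].rabin_val == val:
                return &table[i]
        return NULL

    # New style: no except clause, so NULL just means 'not found' and no
    # exception check is generated in the calling code.
    cdef rabin_entry* find_new(rabin_entry* table, Py_ssize_t n,
                               unsigned int val):  # noexcept
        cdef Py_ssize_t i
        for i in range(n):
            if table[i].rabin_val == val:
                return &table[i]
        return NULL
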
This is now within spitting distance of the existing code, and much faster
with lower memory against large file content, but I still need to think
closely about some tweaks for the match rules.
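
For reference, the profiling hooks involved are just Cython directives, as
seen in the diff below. A rough sketch of how numbers like these can be
gathered (the module and function names here are placeholders, not the
branch's API):

    # cython: profile=True
    # Module-level directive near the top of the .pyx file; it makes this
    # module's functions visible to Python's profilers.

    cimport cython

    @cython.profile(False)        # opt a hot helper back out of profiling
    cdef int hot_helper(int x):
        return (x * 31) ^ (x >> 3)

    def run_workload():
        cdef int i
        cdef long long total = 0
        for i in range(1000000):
            total += hot_helper(i)
        return total

and then, from plain Python:

    import cProfile, pyximport; pyximport.install()
    import sketch_profile         # hypothetical module name for the sketch
    cProfile.run('sketch_profile.run_workload()', sort='cumulative')

The committed file flips the module directive to profile=False, so the
instrumentation is off by default.
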
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx 2010-11-30 20:37:40 +0000
+++ b/bzrlib/_delta_index_pyx.pyx 2010-12-01 19:18:14 +0000
@@ -1,3 +1,5 @@
+# cython: profile=False
+
# Copyright (C) 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
@@ -57,6 +59,8 @@
unsigned char *top) nogil
+cimport cython
+
from bzrlib import trace
cdef int MAX_HASH_BUCKET_ENTRIES
@@ -148,6 +152,7 @@
self._table = rabin_hash_map._table
self._table_mask = rabin_hash_map.table_mask
+ @cython.profile(False)
cdef start(self, unsigned int rabin_val):
self.rabin_val = rabin_val
self.entry = NULL
@@ -163,7 +168,8 @@
# because RABIN isn't a super-strong hash.
self._next_hash = rabin_val
- cdef rabin_entry* next_with_dummy(self) except? NULL:
+ @cython.profile(False)
+ cdef rabin_entry* next_with_dummy(self): # noexcept
cdef rabin_entry *slot
# This uses Quadratic Probing:
# http://en.wikipedia.org/wiki/Quadratic_probing
@@ -194,11 +200,9 @@
# We found an entry that matches exactly, return it
self.match_count += 1
return slot
- raise RuntimeError("We seem to have stepped too far while searching"
- " for rabin_val = %d, we took %d steps, hash %d"
- % (self.rabin_val, self._step,
- self._next_hash))
+ return NULL
+ @cython.profile(False)
cdef next(self):
cdef rabin_entry *slot
# Don't search forever, even if we have bogus entries in the table
@@ -383,11 +387,10 @@
table_size = MIN_TABLE_SIZE
while table_size < total_entries * 2 and table_size < 2**31:
table_size = table_size * 2
- if self._table != NULL:
- trace.mutter('resizing table %08x from %d to %d'
- ' for %d new_entries, %d total'
- % (id(self), self.table_mask + 1, table_size,
- new_entries, total_entries))
+ trace.mutter('resizing table %08x from %d to %d'
+ ' for %d new_entries, %d total'
+ % (id(self), self.table_mask + 1, table_size,
+ new_entries, total_entries))
n_bytes = sizeof(rabin_entry) * table_size
new_table = <rabin_entry*>PyMem_Malloc(n_bytes)
if new_table == NULL:
@@ -692,6 +695,7 @@
cdef Py_ssize_t size
cdef rabin_entry *last
cdef unsigned int hash_offset
+ cdef int stride
cdef RabinEntrySearch search
if not PyString_CheckExact(source):
@@ -702,6 +706,25 @@
# TODO: Test that empty strings don't screw things up
# Nothing to add
return
+ stride = RABIN_WINDOW
+ if size > 10*1024*1024:
+ # For objects > 10MB, we increase the stride, so that we don't
+ # waste peak memory. The idea is that if the content is large, then
+ # either we will have very little overlap, or the overlap chunks
+ # are going to be very big. We don't need to find every possible
+ # 20-byte match.
+ # under 10MB, stride = 16
+ stride = size / 655360
+ # TODO: Consider capping the maximum stride. Rationale: we will
+ # insert up to 127 bytes before finding an 'acceptable'
+ # match. After that, we finalize the insert. So say every 200
+ # bytes has a match in a different location. We would end up
+ # inserting 127 bytes, and then matching the remaining 73
+ # bytes.
+ trace.mutter('Encountered large content %.3fMB,'
+ ' using alternate stride: %d'
+ % ((size / (1024. * 1024.)), stride))
+ self._hash_map.reserve(size / stride)
# We walk backwards through the data, so that early matches are the
# first entries in the hash buckets.
start = (<const_data>PyString_AS_STRING(source))
@@ -719,7 +742,7 @@
ptr += RABIN_WINDOW
match_pre = ptr - start
match_tail = end - ptr
- global_offset += RABIN_WINDOW
+ global_offset += stride
if (last != NULL and val == last.rabin_val):
# We have a repeated entry. We only store the first of
# consecutive entries, so we just skip this one
@@ -728,6 +751,7 @@
last = self._hash_map.add(ptr, global_offset,
val, match_pre, match_tail,
search)
+ ptr += stride - RABIN_WINDOW
def _compute_offsets_from_delta(self, delta_source, start_offset):
cdef unsigned int global_offset
@@ -753,6 +777,8 @@
# TODO: Test that empty strings don't screw things up
# Nothing to add
return
+ # This is an upper bound of the number of entries we'll find
+ self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
start = (<const_data>PyString_AS_STRING(delta_source))
end = start + size
ptr = start
@@ -821,7 +847,6 @@
start_offset = self.total_source_bytes + extra_offset
new_info = SourceInfo(source, start_offset)
self.sources.append(new_info)
- self._hash_map.reserve(len(source) / RABIN_WINDOW)
self._compute_offsets(source, start_offset)
self.total_source_bytes += len(source) + extra_offset
global add_total_time
@@ -836,8 +861,6 @@
start_offset = self.total_source_bytes + extra_offset
new_info = SourceInfo(delta_source, start_offset)
self.sources.append(new_info)
- # This is an upper bound of the number of entries we'll find
- self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
self._compute_offsets_from_delta(delta_source, start_offset)
self.total_source_bytes += len(delta_source) + extra_offset
global add_delta_total_time
@@ -848,11 +871,11 @@
cdef _DeltaCreator creator
if not self.sources:
return None
- t = _timer()
+ # t = _timer()
creator = _DeltaCreator(self, content, max_delta_size, noisy)
val = creator.create()
- global make_delta_total_time
- make_delta_total_time += _timer() - t
+ # global make_delta_total_time
+ # make_delta_total_time += _timer() - t
return val
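
A back-of-the-envelope sketch (plain Python, not from the branch; it assumes
RABIN_WINDOW is 16) of what the large-content stride hunk in _compute_offsets
above works out to: once the source passes 10MB, the stride grows with the
size, so the number of reserved hash map entries stays roughly constant.

    RABIN_WINDOW = 16

    def stride_for(size):
        if size > 10 * 1024 * 1024:
            # size // 655360 is 16 right at the 10MB boundary and grows
            # linearly, so size // stride (the reserved entry count) stays
            # near 655360 regardless of how large the content gets.
            return size // 655360
        return RABIN_WINDOW

    for mb in (1, 10, 64, 256):
        size = mb * 1024 * 1024
        stride = stride_for(size)
        print('%4dMB: stride=%4d, ~%d entries' % (mb, stride, size // stride))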