Rev 3079: using PyObject_Hash, but memcmp if both sides are strings in http://bzr.arbash-meinel.com/branches/bzr/0.93-dev/patience_tuples

Wed Dec 5 13:50:59 GMT 2007

At http://bzr.arbash-meinel.com/branches/bzr/0.93-dev/patience_tuples

------------------------------------------------------------
revno: 3079
revision-id:john at arbash-meinel.com-20071205134925-4s1wo9k920cbnavb
parent: john at arbash-meinel.com-20071204170735-1l5oc5vt7w473qds
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: patience_tuples
timestamp: Wed 2007-12-05 07:49:25 -0600
message:
  using PyObject_Hash, but memcmp if both sides are strings
  Using original djb2 hash and only handling strings: 4.340s
  With PyObject_Hash and memcmp/PyObject_Compare: 5.550s :(
modified:
  bzrlib/_patiencediff_c.c       _patiencediff_c.c-20070721205602-q3imkipwlgagp3cy-1
-------------- next part --------------
=== modified file 'bzrlib/_patiencediff_c.c'

--- a/bzrlib/_patiencediff_c.c	2007-12-04 17:07:35 +0000
+++ b/bzrlib/_patiencediff_c.c	2007-12-05 13:49:25 +0000
@@ -75,6 +75,7 @@
     Py_ssize_t equiv;  /* equivalence class */
     Py_ssize_t len;
     const PyObject *data;
+    const char *c_data;
 };
 
 
@@ -151,9 +152,14 @@
 static inline int
 compare_lines(struct line *a, struct line *b)
 {
-    return ((a->hash != b->hash) || (a->len != b->len) ||
-            PyObject_Compare((PyObject *)a->data, (PyObject *)b->data));
-            //memcmp(a->data, b->data, a->len));
+    if (((a->hash != b->hash) || (a->len != b->len))) {
+        return 0;
+    }
+    if (a->c_data != NULL && b->c_data != NULL) {
+        return memcmp(a->c_data, b->c_data, a->len);
+    } else {
+        return PyObject_Compare((PyObject *)a->data, (PyObject *)b->data);
+    }
 }
 
 
@@ -570,12 +576,26 @@
 
     for (i = 0; i < size; i++) {
         item = PySequence_Fast_GET_ITEM(seq, i);
+        line->data = item;
+        line->c_data = NULL;
         if (PyString_Check(item)) {
+            long hash;
+            const char *p;
             /* we could use the 'djb2' hash here if we find it is better than
                PyObject_Hash, however it seems measurably slower to compute,
                and doesn't give particularly better results
              */
             line->len = PyString_GET_SIZE(item);
+            line->c_data = p = PyString_AS_STRING(item);
+            // /* 'djb2' hash. This gives us a nice compromise between fast hash
+            //     function and a hash with less collisions. The algorithm doesn't
+            //     use the hash for actual lookups, only for building the table
+            //     so a better hash function wouldn't bring us much benefit, but
+            //     would make this loading code slower. */
+            // hash = 5381;
+            // for (j = 0; j < line->len; j++)
+            //     hash = ((hash << 5) + hash) + *p++;
+            line->hash = PyObject_Hash(item);
         } else if (PyTuple_Check(item)) {
             total_len = 0;
             tuple_len = PyObject_Length(item);
@@ -589,6 +609,7 @@
                 total_len += cur_len;
             }
             line->len = total_len;
+            line->hash = PyObject_Hash(item);
         } else {
             /* Generic length */
             cur_len = PyObject_Length(item);
@@ -597,9 +618,8 @@
                 goto cleanup;
             }
             line->len = cur_len;
+            line->hash = PyObject_Hash(item);
         }
-        line->data = item;
-        line->hash = PyObject_Hash(item);
         if (line->hash == (-1)) {
             /* Propogate the hash exception */
             size = -1;