Rev 3079: using PyObject_Hash, but memcmp if both sides are strings in http://bzr.arbash-meinel.com/branches/bzr/0.93-dev/patience_tuples
John Arbash Meinel
john at arbash-meinel.com
Wed Dec 5 13:50:59 GMT 2007
At http://bzr.arbash-meinel.com/branches/bzr/0.93-dev/patience_tuples
------------------------------------------------------------
revno: 3079
revision-id: john at arbash-meinel.com-20071205134925-4s1wo9k920cbnavb
parent: john at arbash-meinel.com-20071204170735-1l5oc5vt7w473qds
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: patience_tuples
timestamp: Wed 2007-12-05 07:49:25 -0600
message:
using PyObject_Hash, but memcmp if both sides are strings
Using original djb2 hash and only handling strings: 4.340s
With PyObject_Hash and memcmp/PyObject_Compare: 5.550s :(
modified:
bzrlib/_patiencediff_c.c _patiencediff_c.c-20070721205602-q3imkipwlgagp3cy-1
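In short, the change keeps PyObject_Hash() for hashing every line, but remembers the raw bytes of string lines so the equality check can use memcmp() instead of PyObject_Compare(). Condensed, the comparison looks roughly like this (a sketch against the Python 2 C API; struct line_sketch and compare_lines_sketch are renamed, trimmed-down stand-ins for the real struct line and compare_lines in the diff below):

#include <string.h>     /* memcmp */
#include <Python.h>     /* Python 2 C API: PyObject_Compare, Py_ssize_t */

/* Trimmed-down stand-in for the per-line record; the real struct line in
 * _patiencediff_c.c carries additional bookkeeping fields. */
struct line_sketch {
    long hash;              /* PyObject_Hash() of the line */
    Py_ssize_t len;         /* length of the line */
    const PyObject *data;   /* the line object itself */
    const char *c_data;     /* raw bytes if the line is a PyString, else NULL */
};

/* Returns 0 when the lines are equal, non-zero when they differ, matching the
 * memcmp()/PyObject_Compare() convention used by compare_lines(). */
static inline int
compare_lines_sketch(struct line_sketch *a, struct line_sketch *b)
{
    if ((a->hash != b->hash) || (a->len != b->len))
        return 1;           /* different hash or length: cannot be equal */
    if (a->c_data != NULL && b->c_data != NULL)
        return memcmp(a->c_data, b->c_data, (size_t)a->len);   /* both strings */
    return PyObject_Compare((PyObject *)a->data, (PyObject *)b->data);
}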
-------------- next part --------------
=== modified file 'bzrlib/_patiencediff_c.c'
--- a/bzrlib/_patiencediff_c.c 2007-12-04 17:07:35 +0000
+++ b/bzrlib/_patiencediff_c.c 2007-12-05 13:49:25 +0000
@@ -75,6 +75,7 @@
     Py_ssize_t equiv;           /* equivalence class */
     Py_ssize_t len;
     const PyObject *data;
+    const char *c_data;
 };
@@ -151,9 +152,14 @@
 static inline int
 compare_lines(struct line *a, struct line *b)
 {
-    return ((a->hash != b->hash) || (a->len != b->len) ||
-            PyObject_Compare((PyObject *)a->data, (PyObject *)b->data));
-    //memcmp(a->data, b->data, a->len));
+    if ((a->hash != b->hash) || (a->len != b->len)) {
+        return 1;
+    }
+    if (a->c_data != NULL && b->c_data != NULL) {
+        return memcmp(a->c_data, b->c_data, a->len);
+    } else {
+        return PyObject_Compare((PyObject *)a->data, (PyObject *)b->data);
+    }
 }
@@ -570,12 +576,26 @@
     for (i = 0; i < size; i++) {
         item = PySequence_Fast_GET_ITEM(seq, i);
+        line->data = item;
+        line->c_data = NULL;
         if (PyString_Check(item)) {
+            long hash;
+            const char *p;
             /* we could use the 'djb2' hash here if we find it is better than
                PyObject_Hash, however it seems measurably slower to compute,
                and doesn't give particularly better results
              */
             line->len = PyString_GET_SIZE(item);
+            line->c_data = p = PyString_AS_STRING(item);
+            // /* 'djb2' hash. This gives us a nice compromise between fast hash
+            //    function and a hash with less collisions. The algorithm doesn't
+            //    use the hash for actual lookups, only for building the table
+            //    so a better hash function wouldn't bring us much benefit, but
+            //    would make this loading code slower. */
+            // hash = 5381;
+            // for (j = 0; j < line->len; j++)
+            //     hash = ((hash << 5) + hash) + *p++;
+            line->hash = PyObject_Hash(item);
         } else if (PyTuple_Check(item)) {
             total_len = 0;
             tuple_len = PyObject_Length(item);
@@ -589,6 +609,7 @@
                 total_len += cur_len;
             }
             line->len = total_len;
+            line->hash = PyObject_Hash(item);
         } else {
             /* Generic length */
             cur_len = PyObject_Length(item);
@@ -597,9 +618,8 @@
                 goto cleanup;
             }
             line->len = cur_len;
+            line->hash = PyObject_Hash(item);
         }
-        line->data = item;
-        line->hash = PyObject_Hash(item);
         if (line->hash == (-1)) {
             /* Propogate the hash exception */
             size = -1;
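For reference, the 'djb2' hash that the timings above compare against (and that survives only as the commented-out block in the diff) is the classic hash = hash * 33 + byte scheme seeded with 5381. A standalone sketch, with a function name and signature of my own choosing:

#include <stddef.h>

/* Classic 'djb2' string hash, equivalent to the commented-out loop above. */
static unsigned long
djb2_hash(const char *data, size_t len)
{
    unsigned long hash = 5381;
    size_t i;

    for (i = 0; i < len; i++)
        hash = ((hash << 5) + hash) + (unsigned char)data[i];
    return hash;
}

Per the numbers at the top of the message, the earlier djb2 + strings-only variant still comes out ahead of PyObject_Hash with memcmp/PyObject_Compare on this test (4.340s vs. 5.550s).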