Rev 3075: Change the C PatienceDiff implementation to support arbitrary objects. in http://bzr.arbash-meinel.com/branches/bzr/0.93-dev/patience_tuples

John Arbash Meinel john at arbash-meinel.com
Tue Dec 4 15:24:59 GMT 2007


At http://bzr.arbash-meinel.com/branches/bzr/0.93-dev/patience_tuples

------------------------------------------------------------
revno: 3075
revision-id:john at arbash-meinel.com-20071204152439-hlj4y6a8a2fb6nkz
parent: pqm at pqm.ubuntu.com-20071204035213-2kot5u403spjchen
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: patience_tuples
timestamp: Tue 2007-12-04 09:24:39 -0600
message:
  Change the C PatienceDiff implementation to support arbitrary objects.
modified:
  bzrlib/_patiencediff_c.c       _patiencediff_c.c-20070721205602-q3imkipwlgagp3cy-1
  bzrlib/tests/test_diff.py      testdiff.py-20050727164403-d1a3496ebb12e339
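
The net effect: the C matcher no longer requires a list of strings; any
hashable, comparable items work, tuples of strings in particular. A
minimal usage sketch (Python), assuming the matcher is reached through
the bzrlib.patiencediff wrapper, which selects the C implementation
when it has been compiled:

    from bzrlib import patiencediff

    a = [('a', '1'), ('b', '2'), ('c', '3')]
    b = [('a', '1'), ('x', '9'), ('c', '3')]
    s = patiencediff.PatienceSequenceMatcher(None, a, b)
    # -> [(0, 0, 1), (2, 2, 1), (3, 3, 0)]; the last entry is the usual
    #    difflib-style sentinel (len(a), len(b), 0).
    print s.get_matching_blocks()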
-------------- next part --------------
=== modified file 'bzrlib/_patiencediff_c.c'
--- a/bzrlib/_patiencediff_c.c	2007-09-04 09:10:35 +0000
+++ b/bzrlib/_patiencediff_c.c	2007-12-04 15:24:39 +0000
@@ -70,11 +70,11 @@
 
 
 struct line {
-    int hash;          /* hash code of the string */
+    long hash;         /* hash code of the string/object */
     Py_ssize_t next;   /* next line from the same equivalence class */
     Py_ssize_t equiv;  /* equivalence class */
     Py_ssize_t len;
-    const char *data;
+    const PyObject *data;
 };
 
 
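Each line record now stores the item itself (a borrowed PyObject
pointer) plus Python's own hash of it, instead of a char buffer and a
hand-rolled hash. Roughly, per input item, the equivalent Python-side
record would be (field names mirror the struct; this is a sketch, not
bzrlib API):

    line = {
        'hash': hash(item),  # PyObject_Hash(item) in the C code
        'len': len(item),    # see the load_lines hunk for the tuple case
        'data': item,        # the object itself; no copy of the bytes
    }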
@@ -152,7 +152,7 @@
 compare_lines(struct line *a, struct line *b)
 {
     return ((a->hash != b->hash) || (a->len != b->len) ||
-            memcmp(a->data, b->data, a->len));
+            PyObject_Compare((PyObject *)a->data, (PyObject *)b->data));
 }
 
 
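compare_lines() keeps its contract (nonzero means "the lines differ"),
but the byte-wise memcmp() becomes a PyObject_Compare() call, with the
hash and length checks still acting as cheap early rejects. In Python 2
terms, a rough equivalent would be:

    def compare_lines(a, b):
        # Only items with equal hashes and equal lengths fall through
        # to the (potentially expensive) full comparison.
        return (a['hash'] != b['hash'] or a['len'] != b['len']
                or cmp(a['data'], b['data']) != 0)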
@@ -545,9 +545,8 @@
 static Py_ssize_t
 load_lines(PyObject *orig, struct line **lines)
 {
-    Py_ssize_t size, i, j;
-    int h;
-    char *p;
+    Py_ssize_t size, i, j, tuple_len;
+    Py_ssize_t total_len;
     struct line *line;
     PyObject *seq, *item;
 
@@ -571,25 +570,23 @@
 
     for (i = 0; i < size; i++) {
         item = PySequence_Fast_GET_ITEM(seq, i);
-        if (!PyString_Check(item)){
-            PyErr_Format(PyExc_TypeError,
-                     "sequence item %zd: expected string,"
-                     " %.80s found",
-                     i, item->ob_type->tp_name);
-            Py_DECREF(seq);
-            return -1;
+        if (PyString_Check(item)) {
+            /* We could use the 'djb2' hash here if we find it is
+               better than PyObject_Hash. */
+            line->len = PyString_GET_SIZE(item);
+        } else if (PyTuple_Check(item)) {
+            total_len = 0;
+            tuple_len = PyObject_Length(item);
+            for (j = 0; j < tuple_len; ++j) {
+                total_len += PyObject_Length(PySequence_Fast_GET_ITEM(item, j));
+            }
+            line->len = total_len;
+        } else {
+            /* Generic length */
+            line->len = PyObject_Length(item);
         }
-        line->len = PyString_GET_SIZE(item);
-        line->data = p = PyString_AS_STRING(item);
-        /* 'djb2' hash. This gives us a nice compromise between fast hash
-            function and a hash with less collisions. The algorithm doesn't
-            use the hash for actual lookups, only for building the table
-            so a better hash function wouldn't bring us much benefit, but
-            would make this loading code slower. */
-        h = 5381;
-        for (j = 0; j < line->len; j++)
-            h = ((h << 5) + h) + *p++;
-        line->hash = h;
+        line->data = item;
+        line->hash = PyObject_Hash(item);
         line->next = SENTINEL;
         line++;
     }

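load_lines() now computes a per-item length instead of assuming a
string: plain strings keep the fast PyString_GET_SIZE path, tuples get
the summed lengths of their elements, and anything else falls back to
PyObject_Length. A rough Python rendering (the helper name is
illustrative):

    def item_len(item):
        if isinstance(item, str):
            return len(item)                  # PyString_GET_SIZE
        elif isinstance(item, tuple):
            return sum(len(e) for e in item)  # summed element lengths
        else:
            return len(item)                  # PyObject_Length fallback

The summed-length rule is what lets ('c', 'd') and ('c', 'dX') be
rejected on length alone; the new tests below exercise exactly that
case.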
=== modified file 'bzrlib/tests/test_diff.py'
--- a/bzrlib/tests/test_diff.py	2007-11-25 21:52:24 +0000
+++ b/bzrlib/tests/test_diff.py	2007-12-04 15:24:39 +0000
@@ -800,6 +800,38 @@
         chk_blocks('abbabbbb', 'cabbabbc', [])
         chk_blocks('bbbbbbbb', 'cbbbbbbc', [])
 
+    def test_matching_blocks_tuples(self):
+        def chk_blocks(a, b, expected_blocks):
+            # difflib-style matchers always append a sentinel block
+            # (len(a), len(b), 0) that matches no entries
+            s = self._PatienceSequenceMatcher(None, a, b)
+            blocks = s.get_matching_blocks()
+            self.assertEquals((len(a), len(b), 0), blocks[-1])
+            self.assertEquals(expected_blocks, blocks[:-1])
+
+        # Some basic matching tests
+        chk_blocks([], [], [])
+        chk_blocks([('a',), ('b',), ('c',)], [], [])
+        chk_blocks([], [('a',), ('b',), ('c',)], [])
+        chk_blocks([('a',), ('b',), ('c',)],
+                   [('a',), ('b',), ('c',)],
+                   [(0, 0, 3)])
+        chk_blocks([('a',), ('b',), ('c',)],
+                   [('a',), ('b',), ('d',)],
+                   [(0, 0, 2)])
+        chk_blocks([('d',), ('b',), ('c',)],
+                   [('a',), ('b',), ('c',)],
+                   [(1, 1, 2)])
+        chk_blocks([('d',), ('a',), ('b',), ('c',)],
+                   [('a',), ('b',), ('c',)],
+                   [(1, 0, 3)])
+        chk_blocks([('a', 'b'), ('c', 'd'), ('e', 'f')],
+                   [('a', 'b'), ('c', 'X'), ('e', 'f')],
+                   [(0, 0, 1), (2, 2, 1)])
+        chk_blocks([('a', 'b'), ('c', 'd'), ('e', 'f')],
+                   [('a', 'b'), ('c', 'dX'), ('e', 'f')],
+                   [(0, 0, 1), (2, 2, 1)])
+
     def test_opcodes(self):
         def chk_ops(a, b, expected_codes):
             s = self._PatienceSequenceMatcher(None, a, b)

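For completeness, the same behaviour can be checked against the C
extension directly; the class name below matches how the tests import
it, but this is a sketch and assumes the extension compiled:

    from bzrlib._patiencediff_c import PatienceSequenceMatcher_c

    s = PatienceSequenceMatcher_c(None,
                                  [('a', 'b'), ('c', 'd'), ('e', 'f')],
                                  [('a', 'b'), ('c', 'X'), ('e', 'f')])
    # Middle tuples differ, so only the first and last lines match:
    # [(0, 0, 1), (2, 2, 1), (3, 3, 0)]
    print s.get_matching_blocks()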

