Rev 4412: Some small tweaks to decoding strings (avoid passing over the length 2x) in lp:///~jameinel/bzr/bencode_serializer

Thu Jun 4 18:12:48 BST 2009

At lp:///~jameinel/bzr/bencode_serializer

------------------------------------------------------------
revno: 4412
revision-id: john at arbash-meinel.com-20090604171229-kbgfatt63y3u3uh1
parent: john at arbash-meinel.com-20090604165033-bfdo0lyf4yt4vjcz
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: bencode_serializer
timestamp: Thu 2009-06-04 12:12:29 -0500
message:
  Some small tweaks to decoding strings (avoid passing over the length 2x)
  
  Down to 1.1s (from 1.4s) for decoding all of bzr.dev.
  Also, favor decoding strings and then lists in _decode_object, since that is the
  frequency we have those types inside Revisions.
-------------- next part --------------
=== modified file 'bzrlib/_bencode_pyx.pyx'

--- a/bzrlib/_bencode_pyx.pyx	2009-06-04 16:50:33 +0000
+++ b/bzrlib/_bencode_pyx.pyx	2009-06-04 17:12:29 +0000
@@ -98,14 +98,14 @@
             raise RuntimeError("too deeply nested")
         try:
             ch = self.tail[0]
-            if ch == c'i':
-                D_UPDATE_TAIL(self, 1)
-                return self._decode_int()
-            elif c'0' <= ch <= c'9':
+            if c'0' <= ch <= c'9':
                 return self._decode_string()
             elif ch == c'l':
                 D_UPDATE_TAIL(self, 1)
                 return self._decode_list()
+            elif ch == c'i':
+                D_UPDATE_TAIL(self, 1)
+                return self._decode_int()
             elif ch == c'd':
                 D_UPDATE_TAIL(self, 1)
                 return self._decode_dict()
@@ -144,10 +144,20 @@
         return ret
 
     cdef object _decode_string(self):
-        cdef int n, i
-        i = self._read_digits(c':')
-        n = strtol(self.tail, NULL, 10)
-        D_UPDATE_TAIL(self, i+1)
+        cdef int n
+        cdef char *next_tail
+        # strtol allows leading whitespace, negatives, and leading zeros
+        # however, all callers have already checked that '0' <= tail[0] <= '9'
+        # or they wouldn't have called _decode_string
+        # strtol will stop at trailing whitespace, etc
+        n = strtol(self.tail, &next_tail, 10)
+        if next_tail == NULL or next_tail[0] != c':':
+            raise ValueError('string len not terminated by ":"')
+        # strtol allows leading zeros, so validate that we don't have that
+        if (self.tail[0] == c'0'
+            and (n != 0 or (next_tail - self.tail != 1))):
+            raise ValueError('leading zeros are not allowed')
+        D_UPDATE_TAIL(self, next_tail - self.tail + 1)
         if n == 0:
             return ''
         if n > self.size:
@@ -170,6 +180,9 @@
                 else:
                     return result
             else:
+                # As a quick shortcut, check to see if the next object is a
+                # string, since we know that won't be creating recursion
+                # if self.tail[0] >= c'0' and self.tail[0] <= c'9':
                 PyList_Append(result, self._decode_object())
 
         raise ValueError('malformed list')
@@ -187,6 +200,8 @@
                 return result
             else:
                 # keys should be strings only
+                if self.tail[0] < c'0' or self.tail[0] > c'9':
+                    raise ValueError('key was not a simple string.')
                 key = self._decode_string()
                 if lastkey >= key:
                     raise ValueError('dict keys disordered')

=== modified file 'bzrlib/tests/test_bencode.py'
--- a/bzrlib/tests/test_bencode.py	2009-06-03 14:14:31 +0000
+++ b/bzrlib/tests/test_bencode.py	2009-06-04 17:12:29 +0000
@@ -101,7 +101,11 @@
         self._run_check_error(ValueError, '00:')
         self._run_check_error(ValueError, '35208734823ljdahflajhdf')
         self._run_check_error(ValueError, '432432432432432:foo')
-        self._run_check_error(ValueError, 'i' + ('1' * 1000) + ':')
+        self._run_check_error(ValueError, ' 1:x') # leading whitespace
+        self._run_check_error(ValueError, '-1:x') # negative
+        self._run_check_error(ValueError, '1 x') # space vs colon
+        self._run_check_error(ValueError, '1x') # missing colon
+        self._run_check_error(ValueError, ('1' * 1000) + ':')
 
     def test_list(self):
         self._check([], 'le')