Rev 2427: Rather than using split hunks, implement a bisect_dirblocks in http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate

John Arbash Meinel john at arbash-meinel.com
Tue Feb 27 01:08:12 GMT 2007


At http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate

------------------------------------------------------------
revno: 2427
revision-id: john at arbash-meinel.com-20070227010804-nb6bg1c5fvs2nsri
parent: john at arbash-meinel.com-20070227004109-o0rc0nwaqjzxaved
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Mon 2007-02-26 19:08:04 -0600
message:
  Rather than using split hunks, implement a bisect_dirblocks
  which knows that it is bisecting dirblocks and only compares the first
  hunk, and does the comparison in split mode.
modified:
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-02-27 00:41:09 +0000
+++ b/bzrlib/dirstate.py	2007-02-27 01:08:04 +0000
@@ -201,11 +201,11 @@
 
 from bzrlib import (
     errors,
+    inventory,
     lock,
+    osutils,
     trace,
     )
-import bzrlib.inventory
-from bzrlib import osutils
 from bzrlib.osutils import (
     pathjoin,
     sha_file,
@@ -248,6 +248,8 @@
     NULLSTAT = 'x' * 32
     NULL_PARENT_DETAILS = ('a', '', 0, False, '')
 
+    HEADER_FORMAT_2 = '#bazaar dirstate flat format 2\n'
+
     def __init__(self, path):
         """Create a  DirState object.
 
@@ -302,7 +304,7 @@
         # find the location in the block.
         # check its not there
         # add it.
-        #------- copied from bzrlib.inventory.make_entry
+        #------- copied from inventory.make_entry
         # --- normalized_filename wants a unicode basename only, so get one.
         dirname, basename = osutils.split(path)
         # we dont import normalized_filename directly because we want to be
@@ -945,7 +947,7 @@
         """
         if key[0:2] == ('', ''):
             return 0, True
-        block_index = bisect.bisect_left(self._dirblocks, (key[0], []), 1)
+        block_index = bisect_dirblock(self._dirblocks, key[0], 1)
         # _right returns one-past-where-key is so we have to subtract
         # one to use it. we use _right here because there are two
         # '' blocks - the root, and the contents of root
@@ -1232,7 +1234,7 @@
         empty_tree_dirblocks = [('', []), ('', [])]
         # a new root directory, with a NULLSTAT.
         empty_tree_dirblocks[0][1].append(
-            (('', '', bzrlib.inventory.ROOT_ID), [
+            (('', '', inventory.ROOT_ID), [
                 ('d', '', 0, False, DirState.NULLSTAT),
             ]))
         result.lock_write()
@@ -1298,7 +1300,7 @@
         :param lines: A sequece of lines containing the parents list and the
             path lines.
         """
-        output_lines = ['#bazaar dirstate flat format 2\n']
+        output_lines = [DirState.HEADER_FORMAT_2]
         lines.append('') # a final newline
         inventory_text = '\0\n\0'.join(lines)
         output_lines.append('adler32: %s\n' % (zlib.adler32(inventory_text),))
@@ -1469,7 +1471,7 @@
         and their ids. Followed by a newline.
         """
         header = self._state_file.readline()
-        assert header == '#bazaar dirstate flat format 2\n', \
+        assert header == DirState.HEADER_FORMAT_2, \
             'invalid header line: %r' % (header,)
         adler_line = self._state_file.readline()
         assert adler_line.startswith('adler32: '), 'missing adler32 checksum'
@@ -1794,7 +1796,7 @@
             # Remove it, its meaningless.
             block = self._find_block(current_old[0])
             entry_index, present = self._find_entry_index(current_old[0], block[1])
-            assert present
+            assert present, 'could not find entry for %s' % (current_old,)
             block[1].pop(entry_index)
             # if we have an id_index in use, remove this key from it for this id.
             if self._id_index is not None:
@@ -1806,10 +1808,10 @@
         for update_key in all_remaining_keys:
             update_block_index, present = \
                 self._find_block_index_from_key(update_key)
-            assert present
+            assert present, 'could not find block for %s' % (update_key,)
             update_entry_index, present = \
                 self._find_entry_index(update_key, self._dirblocks[update_block_index][1])
-            assert present
+            assert present, 'could not find entry for %s' % (update_key,)
             update_tree_details = self._dirblocks[update_block_index][1][update_entry_index][1]
             # it must not be absent at the moment
             assert update_tree_details[0][0] != 'a' # absent
@@ -1859,9 +1861,14 @@
                     # the test for existing kinds is different: this can be
                     # factored out to a helper though.
                     other_block_index, present = self._find_block_index_from_key(other_key)
-                    assert present
-                    other_entry_index, present = self._find_entry_index(other_key, self._dirblocks[other_block_index][1])
-                    assert present
+                    if not present:
+                        import pdb; pdb.set_trace()
+                    assert present, 'could not find block for %s' % (other_key,)
+                    other_entry_index, present = self._find_entry_index(other_key,
+                                            self._dirblocks[other_block_index][1])
+                    if not present:
+                        import pdb; pdb.set_trace()
+                    assert present, 'could not find entry for %s' % (other_key,)
                     assert path_utf8 is not None
                     self._dirblocks[other_block_index][1][other_entry_index][1][0] = \
                         ('r', path_utf8, 0, False, '')
@@ -1874,10 +1881,10 @@
                     # records.
                     update_block_index, present = \
                         self._find_block_index_from_key(other_key)
-                    assert present
+                    assert present, 'could not find block for %s' % (other_key,)
                     update_entry_index, present = \
                         self._find_entry_index(other_key, self._dirblocks[update_block_index][1])
-                    assert present
+                    assert present, 'could not find entry for %s' % (other_key,)
                     update_details = self._dirblocks[update_block_index][1][update_entry_index][1][lookup_index]
                     if update_details[0] in ('r', 'a'): # relocated, absent
                         # its a pointer or absent in lookup_index's tree, use
@@ -1965,36 +1972,33 @@
             raise errors.ObjectNotLocked(self)
 
 
-def bisect_split(a, x, lo=0, hi=None, cache={}):
-    """Return the index where to insert item x in list a, assuming a is sorted.
-
-    The return value i is such that all e in a[:i] have e < x, and all e in
-    a[i:] have e >= x.  So if x already appears in the list, i points just
-    before the leftmost x already there.
-
-    Optional args lo (default 0) and hi (default len(a)) bound the
+def bisect_dirblock(dirblocks, dirname, lo=0, hi=None, cache={}):
+    """Return the index where to insert dirname into the dirblocks.
+
+    The return value idx is such that all directories blocks in dirblock[:idx]
+    have names < dirname, and all blocks in dirblock[idx:] have names >=
+    dirname.
+
+    Optional args lo (default 0) and hi (default len(dirblocks)) bound the
     slice of a to be searched.
-
-    The one trick is that we compare entries by spliting them into directory
-    path chunks, rather than by individual strings.
     """
     if hi is None:
-        hi = len(a)
+        hi = len(dirblocks)
     try:
-        x_split = cache[x]
+        dirname_split = cache[dirname]
     except KeyError:
-        x_split = x.split('/')
-        cache[x] = x_split
-    x_split = x.split('/')
+        dirname_split = dirname.split('/')
+        cache[dirname] = dirname_split
     while lo < hi:
         mid = (lo+hi)//2
-        cur = a[mid]
+        # Grab the dirname for the current dirblock
+        cur = dirblocks[mid][0]
         try:
             cur_split = cache[cur]
         except KeyError:
             cur_split = cur.split('/')
             cache[cur] = cur_split
-        if cur_split < x_split: lo = mid+1
+        if cur_split < dirname_split: lo = mid+1
         else: hi = mid
     return lo
 

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2007-02-27 00:41:09 +0000
+++ b/bzrlib/tests/test_dirstate.py	2007-02-27 01:08:04 +0000
@@ -1292,44 +1292,59 @@
                                    state, ['b'])
 
 
-class TestBisectSplit(TestCase):
-    """Test that bisect_split() returns the expected values.
+class TestBisectDirblock(TestCase):
+    """Test that bisect_dirblock() returns the expected values.
 
-    bisect_split is intended to work like bisect.bisect_left() except it splits
-    the path by directory sections, rather than using a raw comparison.
+    bisect_dirblock is intended to work like bisect.bisect_left() except it
+    knows it is working on dirblocks and that dirblocks are sorted by ('path',
+    'to', 'foo') chunks rather than by raw 'path/to/foo'.
     """
 
-    def assertBisect(self, paths, split_paths, key, *args, **kwargs):
+    def assertBisect(self, dirblocks, split_dirblocks, path, *args, **kwargs):
         """Assert that bisect_split works like bisect_left on the split paths.
 
-        :param paths: The plain list of paths. It should be split sorted
-        :param split_paths: A list of split paths sorted identical to paths.
-        :param key: The path we are bisecting for.
+        :param dirblocks: A list of (path, [info]) pairs.
+        :param split_dirblocks: A list of ((split, path), [info]) pairs.
+        :param path: The path we are indexing.
+
+        All other arguments will be passed along.
         """
-        bisect_split_idx = dirstate.bisect_split(paths, key, *args, **kwargs)
-        bisect_left_idx = bisect.bisect_left(split_paths, key.split('/'),
+        bisect_split_idx = dirstate.bisect_dirblock(dirblocks, path,
+                                                 *args, **kwargs)
+        split_dirblock = (path.split('/'), [])
+        bisect_left_idx = bisect.bisect_left(split_dirblocks, split_dirblock,
                                              *args)
         self.assertEqual(bisect_left_idx, bisect_split_idx,
                          'bisect_split disagreed. %s != %s'
                          ' for key %s'
-                         % (bisect_left_idx, bisect_split_idx, key)
+                         % (bisect_left_idx, bisect_split_idx, path)
                          )
 
+    def paths_to_dirblocks(self, paths):
+        """Convert a list of paths into dirblock form.
+
+        Also, ensure that the paths are in proper sorted order.
+        """
+        dirblocks = [(path, []) for path in paths]
+        split_dirblocks = [(path.split('/'), []) for path in paths]
+        self.assertEqual(sorted(split_dirblocks), split_dirblocks)
+        return dirblocks, split_dirblocks
+
     def test_simple(self):
         """In the simple case it works just like bisect_left"""
         paths = ['', 'a', 'b', 'c', 'd']
-        split_paths = [[''], ['a'], ['b'], ['c'], ['d']]
+        dirblocks, split_dirblocks = self.paths_to_dirblocks(paths)
         for path in paths:
-            self.assertBisect(paths, split_paths, path)
-        self.assertBisect(paths, split_paths, '_')
-        self.assertBisect(paths, split_paths, 'aa')
-        self.assertBisect(paths, split_paths, 'bb')
-        self.assertBisect(paths, split_paths, 'cc')
-        self.assertBisect(paths, split_paths, 'dd')
-        self.assertBisect(paths, split_paths, 'a/a')
-        self.assertBisect(paths, split_paths, 'b/b')
-        self.assertBisect(paths, split_paths, 'c/c')
-        self.assertBisect(paths, split_paths, 'd/d')
+            self.assertBisect(dirblocks, split_dirblocks, path)
+        self.assertBisect(dirblocks, split_dirblocks, '_')
+        self.assertBisect(dirblocks, split_dirblocks, 'aa')
+        self.assertBisect(dirblocks, split_dirblocks, 'bb')
+        self.assertBisect(dirblocks, split_dirblocks, 'cc')
+        self.assertBisect(dirblocks, split_dirblocks, 'dd')
+        self.assertBisect(dirblocks, split_dirblocks, 'a/a')
+        self.assertBisect(dirblocks, split_dirblocks, 'b/b')
+        self.assertBisect(dirblocks, split_dirblocks, 'c/c')
+        self.assertBisect(dirblocks, split_dirblocks, 'd/d')
 
     def test_involved(self):
         """This is where bisect_left diverges slightly."""
@@ -1341,10 +1356,9 @@
                  'z/z', 'z/z/a', 'z/z/z', 'z/z-a', 'z/z-z',
                  'z-a', 'z-z',
                 ]
-        split_paths = [p.split('/') for p in paths]
-        self.assertEqual(split_paths, sorted(split_paths))
+        dirblocks, split_dirblocks = self.paths_to_dirblocks(paths)
         for path in paths:
-            self.assertBisect(paths, split_paths, path)
+            self.assertBisect(dirblocks, split_dirblocks, path)
 
     def test_involved_cached(self):
         """This is where bisect_left diverges slightly."""
@@ -1357,7 +1371,7 @@
                  'z-a', 'z-z',
                 ]
         cache = {}
-        split_paths = [p.split('/') for p in paths]
-        self.assertEqual(split_paths, sorted(split_paths))
+        dirblocks, split_dirblocks = self.paths_to_dirblocks(paths)
         for path in paths:
-            self.assertBisect(paths, split_paths, path, cache=cache)
+            self.assertBisect(dirblocks, split_dirblocks, path, cache=cache)
+

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2007-02-26 21:51:04 +0000
+++ b/bzrlib/workingtree_4.py	2007-02-27 01:08:04 +0000
@@ -294,7 +294,7 @@
         for block in state._dirblocks[1:]: # skip the root
             dirname = block[0]
             try:
-                parent_ie = parent_ies[block[0]]
+                parent_ie = parent_ies[dirname]
             except KeyError:
                 # all the paths in this block are not versioned in this tree
                 continue



More information about the bazaar-commits mailing list