Rev 2424: Change the sort order to match walkdirs() and iter_entries_by_dir() in http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate

Tue Feb 27 02:46:06 GMT 2007

At http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate

------------------------------------------------------------
revno: 2424
revision-id: john at arbash-meinel.com-20070227024458-qehj0qcykr3zqckm
parent: john at arbash-meinel.com-20070226221107-q3sz1brqz1yxugos
parent: john at arbash-meinel.com-20070227015806-z9bf0at2itqkc2h0
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Mon 2007-02-26 20:44:58 -0600
message:
  Change the sort order to match walkdirs() and iter_entries_by_dir()
modified:
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/intertree_implementations/test_compare.py test_compare.py-20060724101752-09ysswo1a92uqyoz-2
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
    ------------------------------------------------------------
    revno: 2423.1.6
    merged: john at arbash-meinel.com-20070227015806-z9bf0at2itqkc2h0
    parent: john at arbash-meinel.com-20070227013027-tuqcr879a36obvx3
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: dirstate
    timestamp: Mon 2007-02-26 19:58:06 -0600
    message:
      Because the disk format (sorting) has now changed
      bump the format number. Converting between them is trivial, and
      could actually be done on the fly if desired.
    ------------------------------------------------------------
    revno: 2423.1.5
    merged: john at arbash-meinel.com-20070227013027-tuqcr879a36obvx3
    parent: john at arbash-meinel.com-20070227010804-nb6bg1c5fvs2nsri
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: dirstate
    timestamp: Mon 2007-02-26 19:30:27 -0600
    message:
      Add a test that dirstate adds records in the right order.
    ------------------------------------------------------------
    revno: 2423.1.4
    merged: john at arbash-meinel.com-20070227010804-nb6bg1c5fvs2nsri
    parent: john at arbash-meinel.com-20070227004109-o0rc0nwaqjzxaved
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: dirstate
    timestamp: Mon 2007-02-26 19:08:04 -0600
    message:
      Rather than using split hunks, implement a bisect_dirblocks
      which knows that it is bisecting dirblocks and only compares the first
      hunk, and does the comparison in split mode.
    ------------------------------------------------------------
    revno: 2423.1.3
    merged: john at arbash-meinel.com-20070227004109-o0rc0nwaqjzxaved
    parent: john at arbash-meinel.com-20070227003412-6i0cl98a6f3l71ue
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: dirstate
    timestamp: Mon 2007-02-26 18:41:09 -0600
    message:
      Add the ability to cache the split output
      rather than having to always split paths.
    ------------------------------------------------------------
    revno: 2423.1.2
    merged: john at arbash-meinel.com-20070227003412-6i0cl98a6f3l71ue
    parent: john at arbash-meinel.com-20070226230506-8l3jxhu3ed4ruiek
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: dirstate
    timestamp: Mon 2007-02-26 18:34:12 -0600
    message:
      Add a helper function, which allows us to store keys as plain paths,
      but bisect across them as split paths.
    ------------------------------------------------------------
    revno: 2423.1.1
    merged: john at arbash-meinel.com-20070226230506-8l3jxhu3ed4ruiek
    parent: john at arbash-meinel.com-20070226221107-q3sz1brqz1yxugos
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: dirstate
    timestamp: Mon 2007-02-26 17:05:06 -0600
    message:
      (broken) Update the test to actually expose the iter_changes bug.
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'

--- a/bzrlib/dirstate.py	2007-02-26 22:08:14 +0000
+++ b/bzrlib/dirstate.py	2007-02-27 01:58:06 +0000
@@ -201,11 +201,11 @@
 
 from bzrlib import (
     errors,
+    inventory,
     lock,
+    osutils,
     trace,
     )
-import bzrlib.inventory
-from bzrlib import osutils
 from bzrlib.osutils import (
     pathjoin,
     sha_file,
@@ -248,6 +248,9 @@
     NULLSTAT = 'x' * 32
     NULL_PARENT_DETAILS = ('a', '', 0, False, '')
 
+    HEADER_FORMAT_2 = '#bazaar dirstate flat format 2\n'
+    HEADER_FORMAT_3 = '#bazaar dirstate flat format 3\n'
+
     def __init__(self, path):
         """Create a  DirState object.
 
@@ -302,7 +305,7 @@
         # find the location in the block.
         # check its not there
         # add it.
-        #------- copied from bzrlib.inventory.make_entry
+        #------- copied from inventory.make_entry
         # --- normalized_filename wants a unicode basename only, so get one.
         dirname, basename = osutils.split(path)
         # we dont import normalized_filename directly because we want to be
@@ -945,7 +948,7 @@
         """
         if key[0:2] == ('', ''):
             return 0, True
-        block_index = bisect.bisect_left(self._dirblocks, (key[0], []), 1)
+        block_index = bisect_dirblock(self._dirblocks, key[0], 1)
         # _right returns one-past-where-key is so we have to subtract
         # one to use it. we use _right here because there are two
         # '' blocks - the root, and the contents of root
@@ -1232,7 +1235,7 @@
         empty_tree_dirblocks = [('', []), ('', [])]
         # a new root directory, with a NULLSTAT.
         empty_tree_dirblocks[0][1].append(
-            (('', '', bzrlib.inventory.ROOT_ID), [
+            (('', '', inventory.ROOT_ID), [
                 ('d', '', 0, False, DirState.NULLSTAT),
             ]))
         result.lock_write()
@@ -1298,7 +1301,7 @@
         :param lines: A sequece of lines containing the parents list and the
             path lines.
         """
-        output_lines = ['#bazaar dirstate flat format 2\n']
+        output_lines = [DirState.HEADER_FORMAT_3]
         lines.append('') # a final newline
         inventory_text = '\0\n\0'.join(lines)
         output_lines.append('adler32: %s\n' % (zlib.adler32(inventory_text),))
@@ -1426,6 +1429,11 @@
                 entries = [fields_to_entry(fields[pos:pos+entry_size])
                            for pos in xrange(cur, field_count, entry_size)]
                 self._entries_to_current_state(entries)
+            # To convert from format 2  => format 3
+            # self._dirblocks = sorted(self._dirblocks,
+            #                          key=lambda blk:blk[0].split('/'))
+            # To convert from format 3 => format 2
+            # self._dirblocks = sorted(self._dirblocks)
             self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED
 
     def _read_header(self):
@@ -1469,7 +1477,7 @@
         and their ids. Followed by a newline.
         """
         header = self._state_file.readline()
-        assert header == '#bazaar dirstate flat format 2\n', \
+        assert header == DirState.HEADER_FORMAT_3, \
             'invalid header line: %r' % (header,)
         adler_line = self._state_file.readline()
         assert adler_line.startswith('adler32: '), 'missing adler32 checksum'
@@ -1794,7 +1802,7 @@
             # Remove it, its meaningless.
             block = self._find_block(current_old[0])
             entry_index, present = self._find_entry_index(current_old[0], block[1])
-            assert present
+            assert present, 'could not find entry for %s' % (current_old,)
             block[1].pop(entry_index)
             # if we have an id_index in use, remove this key from it for this id.
             if self._id_index is not None:
@@ -1806,10 +1814,10 @@
         for update_key in all_remaining_keys:
             update_block_index, present = \
                 self._find_block_index_from_key(update_key)
-            assert present
+            assert present, 'could not find block for %s' % (update_key,)
             update_entry_index, present = \
                 self._find_entry_index(update_key, self._dirblocks[update_block_index][1])
-            assert present
+            assert present, 'could not find entry for %s' % (update_key,)
             update_tree_details = self._dirblocks[update_block_index][1][update_entry_index][1]
             # it must not be absent at the moment
             assert update_tree_details[0][0] != 'a' # absent
@@ -1859,9 +1867,14 @@
                     # the test for existing kinds is different: this can be
                     # factored out to a helper though.
                     other_block_index, present = self._find_block_index_from_key(other_key)
-                    assert present
-                    other_entry_index, present = self._find_entry_index(other_key, self._dirblocks[other_block_index][1])
-                    assert present
+                    if not present:
+                        import pdb; pdb.set_trace()
+                    assert present, 'could not find block for %s' % (other_key,)
+                    other_entry_index, present = self._find_entry_index(other_key,
+                                            self._dirblocks[other_block_index][1])
+                    if not present:
+                        import pdb; pdb.set_trace()
+                    assert present, 'could not find entry for %s' % (other_key,)
                     assert path_utf8 is not None
                     self._dirblocks[other_block_index][1][other_entry_index][1][0] = \
                         ('r', path_utf8, 0, False, '')
@@ -1874,10 +1887,10 @@
                     # records.
                     update_block_index, present = \
                         self._find_block_index_from_key(other_key)
-                    assert present
+                    assert present, 'could not find block for %s' % (other_key,)
                     update_entry_index, present = \
                         self._find_entry_index(other_key, self._dirblocks[update_block_index][1])
-                    assert present
+                    assert present, 'could not find entry for %s' % (other_key,)
                     update_details = self._dirblocks[update_block_index][1][update_entry_index][1][lookup_index]
                     if update_details[0] in ('r', 'a'): # relocated, absent
                         # its a pointer or absent in lookup_index's tree, use
@@ -1964,6 +1977,38 @@
         if not self._lock_token:
             raise errors.ObjectNotLocked(self)
 
+
+def bisect_dirblock(dirblocks, dirname, lo=0, hi=None, cache={}):
+    """Return the index where to insert dirname into the dirblocks.
+
+    The return value idx is such that all directories blocks in dirblock[:idx]
+    have names < dirname, and all blocks in dirblock[idx:] have names >=
+    dirname.
+
+    Optional args lo (default 0) and hi (default len(dirblocks)) bound the
+    slice of a to be searched.
+    """
+    if hi is None:
+        hi = len(dirblocks)
+    try:
+        dirname_split = cache[dirname]
+    except KeyError:
+        dirname_split = dirname.split('/')
+        cache[dirname] = dirname_split
+    while lo < hi:
+        mid = (lo+hi)//2
+        # Grab the dirname for the current dirblock
+        cur = dirblocks[mid][0]
+        try:
+            cur_split = cache[cur]
+        except KeyError:
+            cur_split = cur.split('/')
+            cache[cur] = cur_split
+        if cur_split < dirname_split: lo = mid+1
+        else: hi = mid
+    return lo
+
+
 def pack_stat(st, _encode=base64.encodestring, _pack=struct.pack):
     """Convert stat values into a packed representation."""
     # jam 20060614 it isn't really worth removing more entries if we
@@ -1976,4 +2021,3 @@
     return _encode(_pack('>llllll'
         , st.st_size, st.st_mtime, st.st_ctime
         , st.st_dev, st.st_ino, st.st_mode))[:-1]
-

=== modified file 'bzrlib/tests/intertree_implementations/test_compare.py'
--- a/bzrlib/tests/intertree_implementations/test_compare.py	2007-02-26 21:51:04 +0000
+++ b/bzrlib/tests/intertree_implementations/test_compare.py	2007-02-26 23:05:06 +0000
@@ -312,13 +312,22 @@
         """Create a tree with filenames chosen to exercise the walk order."""
         tree1 = self.make_branch_and_tree('tree1')
         tree2 = self.make_to_branch_and_tree('tree2')
-        from_paths = ['b-ar', 'b-foo', 'b-zar',
-                   'bar', 'bfoo','bzar',
-                   'b/', 'b/ar', 'b/foo/', 'b/zar',
-                   'b/foo-a', 'b/foo-z',
-                   'b/fooa', 'b/fooz',
-                   'b/foo/a', 'b/foo/z',
-                  ]
+        from_paths = ['b-ar/', 'b-ar/a',
+                      'b-foo/', 'b-foo/a',
+                      'b-zar/', 'b-zar/a',
+                      'bar/', 'bar/a',
+                      'bfoo/', 'bfoo/a',
+                      'bzar/', 'bzar/a',
+                      'b/', 'b/a',
+                      'b/ar/', 'b/ar/a',
+                      'b/foo/', 'b/foo/a',
+                      'b/zar/', 'b/zar/a',
+                      'b/foo-a/', 'b/foo-a/a',
+                      'b/foo-z/', 'b/foo-z/a',
+                      'b/fooa/', 'b/fooa/a',
+                      'b/fooz/', 'b/fooz/a',
+                      'b/foo/z/', 'b/foo/z/a',
+                     ]
         self.build_tree(['tree2/' + p for p in from_paths])
         paths_no_slashes = [p.strip('/') for p in from_paths]
         path_ids = [p.replace('/', '_') + '-id' for p in paths_no_slashes]

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2007-02-25 14:45:50 +0000
+++ b/bzrlib/tests/test_dirstate.py	2007-02-27 01:30:27 +0000
@@ -16,6 +16,7 @@
 
 """Tests of the dirstate functionality being built for WorkingTreeFormat4."""
 
+import bisect
 import os
 
 from bzrlib import (
@@ -24,7 +25,7 @@
     osutils,
     )
 from bzrlib.memorytree import MemoryTree
-from bzrlib.tests import TestCaseWithTransport
+from bzrlib.tests import TestCase, TestCaseWithTransport
 
 
 # TODO:
@@ -897,6 +898,37 @@
             state.unlock()
 
 
+class TestDirstateSortOrder(TestCaseWithTransport):
+    """Test that DirState adds entries in the right order."""
+
+    def test_add_sorting(self):
+        """Add entries in lexicographical order, we get path sorted order."""
+        dirs = ['a', 'a-a', 'a-z',
+                'a/a', 'a/a-a', 'a/a-z', 'a/a/a', 'a/a/z',
+                'a/z', 'a/z-a', 'a/z-z', 'a/z/a', 'a/z/z',
+                'z',
+               ]
+        null_sha = ''
+        state = dirstate.DirState.initialize('dirstate')
+        self.addCleanup(state.unlock)
+
+        fake_stat = os.stat('dirstate')
+        for d in dirs:
+            d_id = d.replace('/', '_')+'-id'
+            file_path = d + '/f'
+            file_id = file_path.replace('/', '_')+'-id'
+            state.add(d, d_id, 'directory', fake_stat, null_sha)
+            state.add(file_path, file_id, 'file', fake_stat, null_sha)
+
+        expected = ['', '', 'a',
+                'a/a', 'a/a/a', 'a/a/z', 'a/a-a', 'a/a-z',
+                'a/z', 'a/z/a', 'a/z/z', 'a/z-a', 'a/z-z',
+                'a-a', 'a-z', 'z',
+               ]
+        dirblock_names = [d[0] for d in state._dirblocks]
+        self.assertEqual(expected, dirblock_names)
+
+
 class TestBisect(TestCaseWithTransport):
     """Test the ability to bisect into the disk format."""
 
@@ -1290,3 +1322,87 @@
                                               'b/d/e', 'b/g', 'h', 'h/e'],
                                    state, ['b'])
 
+
+class TestBisectDirblock(TestCase):
+    """Test that bisect_dirblock() returns the expected values.
+
+    bisect_dirblock is intended to work like bisect.bisect_left() except it
+    knows it is working on dirblocks and that dirblocks are sorted by ('path',
+    'to', 'foo') chunks rather than by raw 'path/to/foo'.
+    """
+
+    def assertBisect(self, dirblocks, split_dirblocks, path, *args, **kwargs):
+        """Assert that bisect_split works like bisect_left on the split paths.
+
+        :param dirblocks: A list of (path, [info]) pairs.
+        :param split_dirblocks: A list of ((split, path), [info]) pairs.
+        :param path: The path we are indexing.
+
+        All other arguments will be passed along.
+        """
+        bisect_split_idx = dirstate.bisect_dirblock(dirblocks, path,
+                                                 *args, **kwargs)
+        split_dirblock = (path.split('/'), [])
+        bisect_left_idx = bisect.bisect_left(split_dirblocks, split_dirblock,
+                                             *args)
+        self.assertEqual(bisect_left_idx, bisect_split_idx,
+                         'bisect_split disagreed. %s != %s'
+                         ' for key %s'
+                         % (bisect_left_idx, bisect_split_idx, path)
+                         )
+
+    def paths_to_dirblocks(self, paths):
+        """Convert a list of paths into dirblock form.
+
+        Also, ensure that the paths are in proper sorted order.
+        """
+        dirblocks = [(path, []) for path in paths]
+        split_dirblocks = [(path.split('/'), []) for path in paths]
+        self.assertEqual(sorted(split_dirblocks), split_dirblocks)
+        return dirblocks, split_dirblocks
+
+    def test_simple(self):
+        """In the simple case it works just like bisect_left"""
+        paths = ['', 'a', 'b', 'c', 'd']
+        dirblocks, split_dirblocks = self.paths_to_dirblocks(paths)
+        for path in paths:
+            self.assertBisect(dirblocks, split_dirblocks, path)
+        self.assertBisect(dirblocks, split_dirblocks, '_')
+        self.assertBisect(dirblocks, split_dirblocks, 'aa')
+        self.assertBisect(dirblocks, split_dirblocks, 'bb')
+        self.assertBisect(dirblocks, split_dirblocks, 'cc')
+        self.assertBisect(dirblocks, split_dirblocks, 'dd')
+        self.assertBisect(dirblocks, split_dirblocks, 'a/a')
+        self.assertBisect(dirblocks, split_dirblocks, 'b/b')
+        self.assertBisect(dirblocks, split_dirblocks, 'c/c')
+        self.assertBisect(dirblocks, split_dirblocks, 'd/d')
+
+    def test_involved(self):
+        """This is where bisect_left diverges slightly."""
+        paths = ['', 'a',
+                 'a/a', 'a/a/a', 'a/a/z', 'a/a-a', 'a/a-z',
+                 'a/z', 'a/z/a', 'a/z/z', 'a/z-a', 'a/z-z',
+                 'a-a', 'a-z',
+                 'z', 'z/a/a', 'z/a/z', 'z/a-a', 'z/a-z',
+                 'z/z', 'z/z/a', 'z/z/z', 'z/z-a', 'z/z-z',
+                 'z-a', 'z-z',
+                ]
+        dirblocks, split_dirblocks = self.paths_to_dirblocks(paths)
+        for path in paths:
+            self.assertBisect(dirblocks, split_dirblocks, path)
+
+    def test_involved_cached(self):
+        """This is where bisect_left diverges slightly."""
+        paths = ['', 'a',
+                 'a/a', 'a/a/a', 'a/a/z', 'a/a-a', 'a/a-z',
+                 'a/z', 'a/z/a', 'a/z/z', 'a/z-a', 'a/z-z',
+                 'a-a', 'a-z',
+                 'z', 'z/a/a', 'z/a/z', 'z/a-a', 'z/a-z',
+                 'z/z', 'z/z/a', 'z/z/z', 'z/z-a', 'z/z-z',
+                 'z-a', 'z-z',
+                ]
+        cache = {}
+        dirblocks, split_dirblocks = self.paths_to_dirblocks(paths)
+        for path in paths:
+            self.assertBisect(dirblocks, split_dirblocks, path, cache=cache)
+

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2007-02-26 21:51:04 +0000
+++ b/bzrlib/workingtree_4.py	2007-02-27 01:08:04 +0000
@@ -294,7 +294,7 @@
         for block in state._dirblocks[1:]: # skip the root
             dirname = block[0]
             try:
-                parent_ie = parent_ies[block[0]]
+                parent_ie = parent_ies[dirname]
             except KeyError:
                 # all the paths in this block are not versioned in this tree
                 continue