Rev 2380: Initial effort at adding a basic _bisect function to DirState. in http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate

John Arbash Meinel john at arbash-meinel.com
Fri Feb 23 17:48:11 GMT 2007


At http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate

------------------------------------------------------------
revno: 2380
revision-id: john at arbash-meinel.com-20070223174705-n1j4wyfh5jrlku7s
parent: john at arbash-meinel.com-20070223151256-g1imdh471lwffdzs
parent: john at arbash-meinel.com-20070222154410-svh8iq9j7f8ci302
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Fri 2007-02-23 11:47:05 -0600
message:
  Initial effort at adding a basic _bisect function to DirState.
  It doesn't handle the fact that a single path can occur multiple times, but that
  will be fixed.
modified:
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
    ------------------------------------------------------------
    revno: 2343.1.2
    merged: john at arbash-meinel.com-20070222154410-svh8iq9j7f8ci302
    parent: john at arbash-meinel.com-20070221201710-siq76rbcfpj68gqx
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: dirstate-jam
    timestamp: Thu 2007-02-22 09:44:10 -0600
    message:
      (broken) some basic work on adding bisect functionality to dirstate.
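
For orientation: the dirstate file keeps its entries sorted by (dir, name),
one '\n'-terminated row per entry, which is what makes an on-disk binary
search possible. Seek to an arbitrary byte offset, resync on the next '\n',
and compare keys without parsing the whole file. Below is a minimal sketch of
that idea, simplified to one key per call and one whole line per probe; the
names are illustrative, not bzrlib API. The real _bisect in this patch reads
page-sized blocks, resolves many keys per pass, and keys on NUL-separated
fields rather than whole lines.

    import os

    def _sync_to_line_start(f, pos):
        # Offset of the first line starting at or after pos: land one
        # byte early and consume through the next '\n'.
        if pos == 0:
            return 0
        f.seek(pos - 1)
        f.readline()
        return f.tell()

    def bisect_sorted_file(f, key):
        # Return the first line >= key (stripped) from a file of
        # sorted, '\n'-terminated lines, or None if there is none.
        size = os.fstat(f.fileno()).st_size
        lo, hi = 0, size
        while lo < hi:
            mid = (lo + hi) // 2
            start = _sync_to_line_start(f, mid)
            if start >= size:
                hi = mid                 # no complete line after mid
                continue
            line = f.readline().rstrip('\n')
            if line < key:
                lo = f.tell()            # answer starts after this line
            else:
                hi = mid                 # answer starts at or before mid
        f.seek(_sync_to_line_start(f, lo))
        line = f.readline()
        return line.rstrip('\n') if line else None

The same resync trick appears in the patch below, where _bisect discards a
partial entries[0] and advances start past it.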
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-02-23 02:16:44 +0000
+++ b/bzrlib/dirstate.py	2007-02-23 17:47:05 +0000
@@ -214,6 +214,10 @@
     )
 
 
+class _Bisector(object):
+    """This just keeps track of information as we are bisecting."""
+
+
 class DirState(object):
     """Record directory and metadata state for fast access.
 
@@ -276,6 +280,8 @@
         self._state_file = None
         self._filename = path
         self._lock_token = None
+        self._end_of_header = None
+        self._bisect_page_size = DirState.BISECT_PAGE_SIZE
 
     def add(self, path, file_id, kind, stat, link_or_sha1):
         """Add a path to be tracked.
@@ -353,6 +359,153 @@
            self._ensure_block(block_index, entry_index, utf8path)
         self._dirblock_state = DirState.IN_MEMORY_MODIFIED
 
+    def _bisect(self, dir_name_list):
+        """Bisect through the disk structure for specific rows."""
+        self._requires_lock()
+        # We need the file pointer to be right after the initial header block
+        self._read_header_if_needed()
+        # If _dirblock_state was in memory, we should just return info from
+        # there, this function is only meant to handle when we want to read
+        # part of the disk.
+        assert self._dirblock_state == DirState.NOT_IN_MEMORY
+
+        # The disk representation is generally info + '\0\n\0' at the end.
+        # But for bisecting it is easier to treat this as '\0' + info + '\0\n',
+        # because that lets us sync on the '\n' characters.
+        state_file = self._state_file
+        file_size = os.fstat(state_file.fileno()).st_size
+        # Because of the extra \0 we should have one extra field
+        entry_field_count = self._fields_per_row() + 1
+
+        low = self._end_of_header
+        high = file_size - 1 # Ignore the final '\0'
+        # Map from (dir, name) to the parsed entry record
+        found = {}
+
+        # Avoid infinite seeking
+        max_count = 30*len(dir_name_list)
+        count = 0
+        pending = [(low, high, dir_name_list)]
+
+        page_size = self._bisect_page_size
+
+        fields_to_entry = self._get_fields_to_entry()
+        def add_one_record(num):
+            info = entries[num].split('\0')
+            # We have to offset by 1 because we are syncing on the earlier '\n'
+            # rather than the '\0'
+            record = fields_to_entry(info[1:])
+            found[(record[0][0], record[0][1])] = record
+
+        while pending:
+            low, high, cur_files = pending.pop()
+
+            if not cur_files:
+                # No files to look for
+                continue
+
+            if low >= high:
+                # Did not find cur_files, these will be returned as None
+                # However, other logic should probably prevent this from ever
+                # happening.
+                continue
+
+            count += 1
+            if count > max_count:
+                raise errors.BzrError('Too many seeks, most likely a bug.')
+
+            mid = max(low, (low+high-page_size)/2)
+
+            state_file.seek(mid)
+            block = state_file.read(page_size)
+            entries = block.split('\n')
+
+            start = mid
+            after = mid + len(block)
+
+            # Check the first and last entries, in case they are partial, or if
+            # we don't care about the rest of this page
+            first_entry_num = 0
+            first_info = entries[0].split('\0')
+            if len(first_info) < entry_field_count:
+                # We didn't get the complete first entry
+                # so move start, and grab the next, which
+                # should be a full entry
+                start += len(entries[0])+1
+                first_info = entries[1].split('\0')
+                first_entry_num = 1
+
+            # Find what entries we are looking for, which occur before and
+            # after this first record.
+            first_info_dir_and_name = (first_info[1], first_info[2])
+            first_loc = bisect.bisect_left(cur_files, first_info_dir_and_name)
+
+            # These exist before the current location
+            pre = cur_files[:first_loc]
+            # These occur after the current location, which may be in the data
+            # we read, or might be after the last entry
+            middle_files = cur_files[first_loc:]
+            # These are only after the last entry
+            post = []
+
+            if middle_files:
+                # We have something to look for
+
+                # Parse the last entry
+                last_entry_num = len(entries)-1
+                last_info = entries[last_entry_num].split('\0')
+                if len(last_info) < entry_field_count:
+                    # The very last hunk was not complete,
+                    # read the previous hunk
+                    # TODO: jam 20070217 There may be an edge case here if
+                    #       not enough entries were read.
+                    after -= len(entries[-1])
+                    last_entry_num -= 1
+                    last_info = entries[last_entry_num].split('\0')
+
+                last_info_dir_and_name = (last_info[1], last_info[2])
+                last_loc = bisect.bisect_right(middle_files,
+                                               last_info_dir_and_name)
+
+                post = middle_files[last_loc:]
+                middle_files = middle_files[:last_loc]
+
+                if middle_files:
+                    # We have files that should occur in this block
+                    # (>= first, <= last)
+                    # Either we will find them here, or we can mark them as
+                    # missing.
+
+                    # Find out what paths we have
+                    # TODO: jam 20070223 There should be faster ways of doing
+                    #       this, but in the short term this should work
+                    #       Also, consider that we already parsed the
+                    #       first_info and last_info, so we don't need to split
+                    #       them again.
+                    paths = dict((tuple(entries[num].split('\0', 3)[1:3]), num)
+                        for num in xrange(first_entry_num+1, last_entry_num))
+                    paths[first_info_dir_and_name] = first_entry_num
+                    paths[last_info_dir_and_name] = last_entry_num
+
+                    for fname in middle_files:
+                        num = paths.get(fname, None)
+                        if num is not None:
+                            add_one_record(num)
+
+            # Now we have split up everything into pre, middle, and post, and
+            # we have handled everything that fell in 'middle'.
+            # We add 'post' first, so that we prefer to seek towards the
+            # beginning, so that we will tend to go as early as we need, and
+            # then only seek forward after that.
+            if post:
+                pending.append((after, high, post))
+            if pre:
+                pending.append((low, start-1, pre))
+
+        return [found.get(path_key, None) for path_key in dir_name_list]
+
+
+
     def _empty_parent_info(self):
         return [DirState.NULL_PARENT_DETAILS] * (len(self._parents) -
                                                     len(self._ghosts))
@@ -986,6 +1139,7 @@
         assert num_ghosts == len(info)-3, 'incorrect ghost info line'
         self._ghosts = info[2:-1]
         self._header_state = DirState.IN_MEMORY_UNMODIFIED
+        self._end_of_header = self._state_file.tell()
 
     def _read_header_if_needed(self):
         """Read the header of the dirstate file if needed."""

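A note on the framing that add_one_record and entry_field_count rely on in
the hunk above: with rows separated by '\0\n\0', splitting a block on '\n'
leaves every complete piece starting with a leftover '\0', so splitting that
piece on '\0' yields an extra empty field at the front. A toy illustration
(field names and contents are invented; only the framing matches the code's
assumptions):

    block = '\0adir\0aname\0rest1\0\n\0bdir\0bname\0rest2\0\n'
    entries = block.split('\n')
    info = entries[1].split('\0')
    assert info[0] == ''          # the leftover '\0' -> an empty field
    assert (info[1], info[2]) == ('bdir', 'bname')   # the (dir, name) key

That leading empty field is why add_one_record passes info[1:] to
fields_to_entry, and why entry_field_count is _fields_per_row() + 1. Note
also the biased midpoint, mid = max(low, (low+high-page_size)/2): with
low=0, high=10000 and page_size=4096 it gives mid = 2952, so the page read
covers bytes 2952-7047 and straddles the plain midpoint 5000 rather than
starting at it.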
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2007-02-23 15:12:56 +0000
+++ b/bzrlib/tests/test_dirstate.py	2007-02-23 17:47:05 +0000
@@ -18,7 +18,11 @@
 
 import os
 
-from bzrlib import dirstate, errors
+from bzrlib import (
+    dirstate,
+    errors,
+    osutils,
+    )
 from bzrlib.memorytree import MemoryTree
 from bzrlib.tests import TestCaseWithTransport
 
@@ -891,3 +895,137 @@
             self.assertEntryEqual('', '', 'a-root-value', state, '', 0)
         finally:
             state.unlock()
+
+
+class TestBisect(TestCaseWithTransport):
+    """Test the ability to bisect into the disk format."""
+
+    def create_basic_dirstate(self):
+        """Create a dirstate with a few files and directories.
+
+            a
+            b/
+              c
+              d/
+                e
+            f
+        """
+        tree = self.make_branch_and_tree('tree')
+        paths = ['a', 'b/', 'b/c', 'b/d/', 'b/d/e', 'f']
+        file_ids = ['a-id', 'b-id', 'c-id', 'd-id', 'e-id', 'f-id']
+        self.build_tree(['tree/' + p for p in paths])
+        tree.set_root_id('TREE_ROOT')
+        tree.add([p.rstrip('/') for p in paths], file_ids)
+        tree.commit('initial', rev_id='rev-1')
+        revision_id = 'rev-1'
+        # a_packed_stat = dirstate.pack_stat(os.stat('tree/a'))
+        t = self.get_transport().clone('tree')
+        a_text = t.get_bytes('a')
+        a_sha = osutils.sha_string(a_text)
+        a_len = len(a_text)
+        # b_packed_stat = dirstate.pack_stat(os.stat('tree/b'))
+        # c_packed_stat = dirstate.pack_stat(os.stat('tree/b/c'))
+        c_text = t.get_bytes('b/c')
+        c_sha = osutils.sha_string(c_text)
+        c_len = len(c_text)
+        # d_packed_stat = dirstate.pack_stat(os.stat('tree/b/d'))
+        # e_packed_stat = dirstate.pack_stat(os.stat('tree/b/d/e'))
+        e_text = t.get_bytes('b/d/e')
+        e_sha = osutils.sha_string(e_text)
+        e_len = len(e_text)
+        # f_packed_stat = dirstate.pack_stat(os.stat('tree/f'))
+        f_text = t.get_bytes('f')
+        f_sha = osutils.sha_string(f_text)
+        f_len = len(f_text)
+        null_stat = dirstate.DirState.NULLSTAT
+        expected = {
+            '':(('', '', 'TREE_ROOT'), [
+                  ('d', '', 0, False, null_stat),
+                  ('d', '', 0, False, revision_id),
+                ]),
+            'a':(('', 'a', 'a-id'), [
+                   ('f', '', 0, False, null_stat),
+                   ('f', a_sha, a_len, False, revision_id),
+                 ]),
+            'b':(('', 'b', 'b-id'), [
+                  ('d', '', 0, False, null_stat),
+                  ('d', '', 0, False, revision_id),
+                 ]),
+            'b/c':(('b', 'c', 'c-id'), [
+                    ('f', '', 0, False, null_stat),
+                    ('f', c_sha, c_len, False, revision_id),
+                   ]),
+            'b/d':(('b', 'd', 'd-id'), [
+                    ('d', '', 0, False, null_stat),
+                    ('d', '', 0, False, revision_id),
+                   ]),
+            'b/d/e':(('b/d', 'e', 'e-id'), [
+                      ('f', '', 0, False, null_stat),
+                      ('f', e_sha, e_len, False, revision_id),
+                     ]),
+            'f':(('', 'f', 'f-id'), [
+                  ('f', '', 0, False, null_stat),
+                  ('f', f_sha, f_len, False, revision_id),
+                 ]),
+        }
+        state = dirstate.DirState.from_tree(tree, 'dirstate')
+        try:
+            state.save()
+        finally:
+            state.unlock()
+        # Use a different object, to make sure nothing is pre-cached in memory.
+        state = dirstate.DirState.on_file('dirstate')
+        state.lock_read()
+        self.addCleanup(state.unlock)
+        self.assertEqual(dirstate.DirState.NOT_IN_MEMORY,
+                         state._dirblock_state)
+        return tree, state, expected
+
+    def assertBisect(self, expected, state, paths):
+        """Assert that bisecting for paths returns the right result.
+
+        :param expected: The list of entries we expect.
+        :param state: The DirState object.
+        :param paths: A list of paths. These will automatically be split into
+                      (dir, name) tuples, and sorted in the order that
+                      _bisect requires.
+        """
+        dir_names = sorted(osutils.split(p) for p in paths)
+        result = state._bisect(dir_names)
+        self.assertEqual(expected, result)
+
+    def test_bisect_each(self):
+        """Find a single record using bisect."""
+        tree, state, expected = self.create_basic_dirstate()
+
+        # Bisect should return the rows for the specified files.
+        self.assertBisect([expected['']], state, [''])
+        self.assertBisect([expected['a']], state, ['a'])
+        self.assertBisect([expected['b']], state, ['b'])
+        self.assertBisect([expected['b/c']], state, ['b/c'])
+        self.assertBisect([expected['b/d']], state, ['b/d'])
+        self.assertBisect([expected['b/d/e']], state, ['b/d/e'])
+        self.assertBisect([expected['f']], state, ['f'])
+
+    def test_bisect_multi(self):
+        """Bisect can be used to find multiple records at the same time."""
+        tree, state, expected = self.create_basic_dirstate()
+        # Bisect should be capable of finding multiple entries at the same time
+        self.assertBisect([expected['a'], expected['b'], expected['f']],
+                         state, ['a', 'b', 'f'])
+        # ('', 'f') sorts before the others
+        self.assertBisect([expected['f'], expected['b/d'], expected['b/d/e']],
+                         state, ['b/d', 'b/d/e', 'f'])
+
+    def test_bisect_split_pages(self):
+        """Test bisect when crossing page boundaries"""
+        tree, state, expected = self.create_basic_dirstate()
+        # Make sure we can't read all the records in a single page, but also
+        # that a *single* entry does fit within the page size.
+        state._bisect_page_size = 100
+        # Bisect should be capable of finding multiple entries at the same time
+        self.assertBisect([expected['a'], expected['b'], expected['f']],
+                         state, ['a', 'b', 'f'])
+        # ('', 'f') sorts before the others
+        self.assertBisect([expected['f'], expected['b/d'], expected['b/d/e']],
+                         state, ['b/d', 'b/d/e', 'f'])
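
The "('', 'f') sorts before the others" comments are about key order:
_bisect compares (directory, name) tuples rather than flat path strings, so
entries in the tree root (directory '') precede everything in
subdirectories. A quick self-contained illustration; this split() is a
stand-in for the osutils.split the tests call:

    def split(path):
        # Stand-in for bzrlib's osutils.split: a '/'-separated path
        # becomes a (dir, name) tuple, with dir == '' in the root.
        if '/' in path:
            return tuple(path.rsplit('/', 1))
        return ('', path)

    dir_names = sorted(split(p) for p in ['b/d', 'b/d/e', 'f'])
    assert dir_names == [('', 'f'), ('b', 'd'), ('b/d', 'e')]

Sorted as flat strings, 'f' would come last; as tuples, ('', 'f') comes
first, which is the order assertBisect hands to _bisect.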


