Rev 2380: Initial effort at adding a basic _bisect function to DirState. in http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate
John Arbash Meinel
john at arbash-meinel.com
Fri Feb 23 17:48:11 GMT 2007
At http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate
------------------------------------------------------------
revno: 2380
revision-id: john at arbash-meinel.com-20070223174705-n1j4wyfh5jrlku7s
parent: john at arbash-meinel.com-20070223151256-g1imdh471lwffdzs
parent: john at arbash-meinel.com-20070222154410-svh8iq9j7f8ci302
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Fri 2007-02-23 11:47:05 -0600
message:
Initial effort at adding a basic _bisect function to DirState.
It doesn't handle the fact that a single path can occur multiple times, but that
will be fixed.
modified:
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
------------------------------------------------------------
revno: 2343.1.2
merged: john at arbash-meinel.com-20070222154410-svh8iq9j7f8ci302
parent: john at arbash-meinel.com-20070221201710-siq76rbcfpj68gqx
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate-jam
timestamp: Thu 2007-02-22 09:44:10 -0600
message:
(broken) some basic work on adding bisect functionality to dirstate.
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2007-02-23 02:16:44 +0000
+++ b/bzrlib/dirstate.py 2007-02-23 17:47:05 +0000
@@ -214,6 +214,10 @@
)
+class _Bisector(object):
+ """This just keeps track of information as we are bisecting."""
+
+
class DirState(object):
"""Record directory and metadata state for fast access.
@@ -276,6 +280,8 @@
self._state_file = None
self._filename = path
self._lock_token = None
+ self._end_of_header = None
+ self._bisect_page_size = DirState.BISECT_PAGE_SIZE
def add(self, path, file_id, kind, stat, link_or_sha1):
"""Add a path to be tracked.
@@ -353,6 +359,153 @@
self._ensure_block(block_index, entry_index, utf8path)
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
+ def _bisect(self, dir_name_list):
+ """Bisect through the disk structure for specific rows."""
+ self._requires_lock()
+ # We need the file pointer to be right after the initial header block
+ self._read_header_if_needed()
+ # If _dirblock_state was in memory, we should just return info from
+ # there, this function is only meant to handle when we want to read
+ # part of the disk.
+ assert self._dirblock_state == DirState.NOT_IN_MEMORY
+
+ # The disk representation is generally info + '\0\n\0' at the end. But
+ # for bisecting, it is easier to treat this as '\0' + info + '\0\n'
+ # Because it means we can sync on the '\n'
+ state_file = self._state_file
+ file_size = os.fstat(state_file.fileno()).st_size
+ # Because of the extra \0 we should have one extra field
+ entry_field_count = self._fields_per_row() + 1
+
+ low = self._end_of_header
+ high = file_size - 1 # Ignore the final '\0'
+ # Map from (
+ found = {}
+
+ # Avoid infinite seeking
+ max_count = 30*len(dir_name_list)
+ count = 0
+ pending = [(low, high, dir_name_list)]
+
+ page_size = self.BISECT_PAGE_SIZE
+
+ fields_to_entry = self._get_fields_to_entry()
+ def add_one_record(num):
+ info = entries[num].split('\0')
+ # We have to offset by 1 because we are syncing on the earlier '\n'
+ # rather than the '\0'
+ record = fields_to_entry(info[1:])
+ found[(record[0][0], record[0][1])] = record
+
+ while pending:
+ low, high, cur_files = pending.pop()
+
+ if not cur_files:
+ # No files to look for
+ continue
+
+ if low >= high:
+ # Did not find cur_files, these will be returned as None
+ # However, other logic should probably prevent this from ever
+ # happening.
+ continue
+
+ count += 1
+ if count > max_count:
+ raise errors.BzrError('Too many seeks, most likely a bug.')
+
+ mid = max(low, (low+high-page_size)/2)
+
+ state_file.seek(mid)
+ block = state_file.read(page_size)
+ entries = block.split('\n')
+
+ start = mid
+ after = mid + len(block)
+
+ # Check the first and last entries, in case they are partial, or if
+ # we don't care about the rest of this page
+ first_entry_num = 0
+ first_info = entries[0].split('\0')
+ if len(first_info) < entry_field_count:
+ # We didn't get the complete first entry
+ # so move start, and grab the next, which
+ # should be a full entry
+ start += len(entries[0])+1
+ first_info = entries[1].split('\0')
+ first_entry_num = 1
+
+ # Find what entries we are looking for, which occur before and
+ # after this first record.
+ first_info_dir_and_name = (first_info[1], first_info[2])
+ first_loc = bisect.bisect_left(cur_files, first_info_dir_and_name)
+
+ # These exist before the current location
+ pre = cur_files[:first_loc]
+ # These occur after the current location, which may be in the data
+ # we read, or might be after the last entry
+ middle_files = cur_files[first_loc:]
+ # These are only after the last entry
+ post = []
+
+ if middle_files:
+ # We have something to look for
+
+ # Parse the last entry
+ last_entry_num = len(entries)-1
+ last_info = entries[last_entry_num].split('\0')
+ if len(last_info) < entry_field_count:
+ # The very last hunk was not complete,
+ # read the previous hunk
+ # TODO: jam 20070217 There may be an edge case if there are
+ # not enough entries that were read.
+ after -= len(entries[-1])
+ last_entry_num -= 1
+ last_info = entries[last_entry_num].split('\0')
+
+ last_info_dir_and_name = (last_info[1], last_info[2])
+ last_loc = bisect.bisect_right(middle_files,
+ last_info_dir_and_name)
+
+ post = middle_files[last_loc:]
+ middle_files = middle_files[:last_loc]
+
+ if middle_files:
+ # We have files that should occur in this block
+ # (>= first, <= last)
+ # Either we will find them here, or we can mark them as
+ # missing.
+
+ # Find out what paths we have
+ # TODO: jam 20070223 There should be faster ways of doing
+ # this, but in the short term this should work
+ # Also, consider that we already parsed the
+ # first_info and last_info, so we don't need to split
+ # them again.
+ paths = dict((tuple(entries[num].split('\0', 3)[1:3]), num)
+ for num in xrange(first_entry_num+1, last_entry_num))
+ paths[first_info_dir_and_name] = first_entry_num
+ paths[last_info_dir_and_name] = last_entry_num
+
+ for fname in middle_files:
+ num = paths.get(fname, None)
+ if num is not None:
+ add_one_record(num)
+
+ # Now we have split up everything into pre, middle, and post, and
+ # we have handled everything that fell in 'middle'.
+ # We add 'post' first, so that we prefer to seek towards the
+ # beginning, so that we will tend to go as early as we need, and
+ # then only seek forward after that.
+ if post:
+ pending.append((after, high, post))
+ if pre:
+ pending.append((low, start-1, pre))
+
+ return [found.get(path_key, None) for path_key in dir_name_list]
+
+
+
def _empty_parent_info(self):
return [DirState.NULL_PARENT_DETAILS] * (len(self._parents) -
len(self._ghosts))
@@ -986,6 +1139,7 @@
assert num_ghosts == len(info)-3, 'incorrect ghost info line'
self._ghosts = info[2:-1]
self._header_state = DirState.IN_MEMORY_UNMODIFIED
+ self._end_of_header = self._state_file.tell()
def _read_header_if_needed(self):
"""Read the header of the dirstate file if needed."""
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2007-02-23 15:12:56 +0000
+++ b/bzrlib/tests/test_dirstate.py 2007-02-23 17:47:05 +0000
@@ -18,7 +18,11 @@
import os
-from bzrlib import dirstate, errors
+from bzrlib import (
+ dirstate,
+ errors,
+ osutils,
+ )
from bzrlib.memorytree import MemoryTree
from bzrlib.tests import TestCaseWithTransport
@@ -891,3 +895,137 @@
self.assertEntryEqual('', '', 'a-root-value', state, '', 0)
finally:
state.unlock()
+
+
+class TestBisect(TestCaseWithTransport):
+ """Test the ability to bisect into the disk format."""
+
+ def create_basic_dirstate(self):
+ """Create a dirstate with a few files and directories.
+
+ a
+ b/
+ c
+ d/
+ e
+ f
+ """
+ tree = self.make_branch_and_tree('tree')
+ paths = ['a', 'b/', 'b/c', 'b/d/', 'b/d/e', 'f']
+ file_ids = ['a-id', 'b-id', 'c-id', 'd-id', 'e-id', 'f-id']
+ self.build_tree(['tree/' + p for p in paths])
+ tree.set_root_id('TREE_ROOT')
+ tree.add([p.rstrip('/') for p in paths], file_ids)
+ tree.commit('initial', rev_id='rev-1')
+ revision_id = 'rev-1'
+ # a_packed_stat = dirstate.pack_stat(os.stat('tree/a'))
+ t = self.get_transport().clone('tree')
+ a_text = t.get_bytes('a')
+ a_sha = osutils.sha_string(a_text)
+ a_len = len(a_text)
+ # b_packed_stat = dirstate.pack_stat(os.stat('tree/b'))
+ # c_packed_stat = dirstate.pack_stat(os.stat('tree/b/c'))
+ c_text = t.get_bytes('b/c')
+ c_sha = osutils.sha_string(c_text)
+ c_len = len(c_text)
+ # d_packed_stat = dirstate.pack_stat(os.stat('tree/b/d'))
+ # e_packed_stat = dirstate.pack_stat(os.stat('tree/b/d/e'))
+ e_text = t.get_bytes('b/d/e')
+ e_sha = osutils.sha_string(e_text)
+ e_len = len(e_text)
+ # f_packed_stat = dirstate.pack_stat(os.stat('tree/f'))
+ f_text = t.get_bytes('f')
+ f_sha = osutils.sha_string(f_text)
+ f_len = len(f_text)
+ null_stat = dirstate.DirState.NULLSTAT
+ expected = {
+ '':(('', '', 'TREE_ROOT'), [
+ ('d', '', 0, False, null_stat),
+ ('d', '', 0, False, revision_id),
+ ]),
+ 'a':(('', 'a', 'a-id'), [
+ ('f', '', 0, False, null_stat),
+ ('f', a_sha, a_len, False, revision_id),
+ ]),
+ 'b':(('', 'b', 'b-id'), [
+ ('d', '', 0, False, null_stat),
+ ('d', '', 0, False, revision_id),
+ ]),
+ 'b/c':(('b', 'c', 'c-id'), [
+ ('f', '', 0, False, null_stat),
+ ('f', c_sha, c_len, False, revision_id),
+ ]),
+ 'b/d':(('b', 'd', 'd-id'), [
+ ('d', '', 0, False, null_stat),
+ ('d', '', 0, False, revision_id),
+ ]),
+ 'b/d/e':(('b/d', 'e', 'e-id'), [
+ ('f', '', 0, False, null_stat),
+ ('f', e_sha, e_len, False, revision_id),
+ ]),
+ 'f':(('', 'f', 'f-id'), [
+ ('f', '', 0, False, null_stat),
+ ('f', f_sha, f_len, False, revision_id),
+ ]),
+ }
+ state = dirstate.DirState.from_tree(tree, 'dirstate')
+ try:
+ state.save()
+ finally:
+ state.unlock()
+ # Use a different object, to make sure nothing is pre-cached in memory.
+ state = dirstate.DirState.on_file('dirstate')
+ state.lock_read()
+ self.addCleanup(state.unlock)
+ self.assertEqual(dirstate.DirState.NOT_IN_MEMORY,
+ state._dirblock_state)
+ return tree, state, expected
+
+ def assertBisect(self, expected, state, paths):
+ """Assert that bisecting for paths returns the right result.
+
+ :param expected: The list of entries we expect.
+ :param state: The DirState object.
+ :param paths: A list of paths, these will automatically be split into
+ (dir, name) tuples, and sorted according to how _bisect
+ requires.
+ """
+ dir_names = sorted(osutils.split(p) for p in paths)
+ result = state._bisect(dir_names)
+ self.assertEqual(expected, result)
+
+ def test_bisect_each(self):
+ """Find a single record using bisect."""
+ tree, state, expected = self.create_basic_dirstate()
+
+ # Bisect should return the rows for the specified files.
+ self.assertBisect([expected['']], state, [''])
+ self.assertBisect([expected['a']], state, ['a'])
+ self.assertBisect([expected['b']], state, ['b'])
+ self.assertBisect([expected['b/c']], state, ['b/c'])
+ self.assertBisect([expected['b/d']], state, ['b/d'])
+ self.assertBisect([expected['b/d/e']], state, ['b/d/e'])
+ self.assertBisect([expected['f']], state, ['f'])
+
+ def test_bisect_multi(self):
+ """Bisect can be used to find multiple records at the same time."""
+ tree, state, expected = self.create_basic_dirstate()
+ # Bisect should be capable of finding multiple entries at the same time
+ self.assertBisect([expected['a'], expected['b'], expected['f']],
+ state, ['a', 'b', 'f'])
+ # ('', 'f') sorts before the others
+ self.assertBisect([expected['f'], expected['b/d'], expected['b/d/e']],
+ state, ['b/d', 'b/d/e', 'f'])
+
+ def test_bisect_split_pages(self):
+ """Test bisect when crossing page boundaries"""
+ tree, state, expected = self.create_basic_dirstate()
+ # Make sure we can't read all the records in a single page, but also
+ # that we *can* read a single entry in the page size.
+ state._bisect_page_size = 100
+ # Bisect should be capable of finding multiple entries at the same time
+ self.assertBisect([expected['a'], expected['b'], expected['f']],
+ state, ['a', 'b', 'f'])
+ # ('', 'f') sorts before the others
+ self.assertBisect([expected['f'], expected['b/d'], expected['b/d/e']],
+ state, ['b/d', 'b/d/e', 'f'])
More information about the bazaar-commits
mailing list