Rev 2345: (broken) some basic work on adding bisect functionality to dirstate. in http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate-jam
John Arbash Meinel
john at arbash-meinel.com
Thu Feb 22 15:44:25 GMT 2007
At http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate-jam
------------------------------------------------------------
revno: 2345
revision-id: john at arbash-meinel.com-20070222154410-svh8iq9j7f8ci302
parent: john at arbash-meinel.com-20070221201710-siq76rbcfpj68gqx
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate-jam
timestamp: Thu 2007-02-22 09:44:10 -0600
message:
(broken) some basic work on adding bisect functionality to dirstate.
modified:
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2007-02-21 20:17:10 +0000
+++ b/bzrlib/dirstate.py 2007-02-22 15:44:10 +0000
@@ -210,6 +210,158 @@
)
+class _Bisector(object):
+ """This just keeps track of information as we are bisecting."""
+
+
+ def bisect(self):
+ """Start finding the requested files."""
+ # We need the file pointer to be right after the initial header block
+ self._state._read_header_if_needed()
+ # If _dirblock_state was in memory, we should just return info from
+ # there, this function is only meant to handle when we want to read
+ # part of the disk.
+ assert self._state._dirblock_state == DirState.NOT_IN_MEMORY
+
+ # The disk representation is generally info + '\0\n\0' at the end. But
+ # for bisecting, it is easier to treat this as '\0' + info + '\0\n'
+ # Because it means we can sync on the '\n'
+ state_file = self._state._state_file
+ file_size = os.fstat(state_file).st_size
+ # 3 fields for the key
+ # + number of fields per tree_data (5) * tree count
+ # + newline
+ entry_size = self._fields_per_row()
+
+ low = self._state._end_of_header
+ high = file_size - 1 # Ignore the final '\0'
+ found = {}
+
+ # Avoid infinite seeking
+ max_count = 30*len(self._path_name_list)
+ count = 0
+ pending = [(low, high, self._path_name_list)]
+
+ page_size = self._page_size
+
+ if num_present_parents == 0:
+ def split_one(info):
+ return (info[1:8],)
+ elif num_present_parents == 1:
+ def split_one(info):
+ return (info[1:8], [info[8:15]])
+ elif num_present_parents == 2:
+ def split_one(info):
+ return (info[1:8], [info[8:15], info[16:22]])
+ else:
+ def split_one(info):
+ return (info[1:8], [info[cur:cur+7]
+ for cur in xrange(8, len(info)-1, 7)])
+ def add_one_record(num):
+ info = entries[num].split('\0')
+ record = split_one(info)
+ found[record[0][:2]] = record
+
+ while pending:
+ low, high, cur_files = pending.pop()
+
+ if not cur_files:
+ # No files to look for
+ continue
+
+ if low >= high:
+ # Did not find cur_files, these will be returned as None
+ # However, other logic should probably prevent this from ever
+ # happening.
+ continue
+
+ count += 1
+ if count > max_count:
+ raise errors.BzrError('Too many seeks, most likely a bug.')
+
+ mid = max(low, (low+high-page_size)/2)
+
+ state_file.seek(mid)
+ block = state_file.read(page_size)
+ entries = block.split('\n')
+
+ start = mid
+ after = mid + len(block)
+
+ # Check the first and last entries, in case they are partial, or if
+ # we don't care about the rest of this page
+ first_entry_num = 0
+ first_info = entries[0].split('\0')
+ if len(first_info) < entry_size:
+ # We didn't get the complete first entry
+ # so move start, and grab the next, which
+ # should be a full entry
+ start += len(entries[0])+1
+ first_info = entries[1].split('\0')
+ first_entry_num = 1
+
+ # Find what entries we are looking for, which occur before and
+ # after this first record.
+ first_info_dir_and_name = tuple(first_info[1:3])
+ first_loc = bisect.bisect_left(cur_files, first_info_dir_and_name)
+
+ # These exist before the current location
+ pre = cur_files[:first_loc]
+ # These occur after the current location, which may be in the data
+ # we read, or might be after the last entry
+ middle_files = cur_files[first_loc:]
+ # These are only after the last entry
+ post = []
+
+ if middle_files:
+ # We have something to look for
+
+ # Parse the last entry
+ last_entry_num = len(entries)-1
+ last_info = entries[last_entry_num].split('\0')
+ if len(last_info) < entry_size:
+ # The very last hunk was not complete,
+ # read the previous hunk
+ # TODO: jam 20070217 There may be an edge case if there are
+ # not enough entries that were read.
+ after -= len(entries[-1])
+ last_entry_num -= 1
+ last_info = entries[last_entry_num].split('\0')
+
+ last_info_dir_and_name = tuple(last_info[1:3])
+ last_loc = bisect.bisect_right(middle_files, last_info_dir_and_name)
+
+ post = middle_files[last_loc:]
+ middle_files = middle_files[:last_loc]
+
+ if middle_files:
+ # We have files that should occur in this block
+ # (>= first, <= last)
+ # Either we will find them here, or we can mark them as
+ # missing.
+
+ # Find out what paths we have
+ paths = dict((tuple(entries[num].split('\0', 3)[1:3]), num)
+ for num in xrange(len(middle_files)))
+
+ for fname in middle_files:
+ num = paths.get(fname, None)
+ if num is not None:
+ add_one_record(num)
+
+ # Now we have split up everything into pre, middle, and post, and
+ # we have handled everything that fell in 'middle'.
+ # We add 'post' first, so that we prefer to seek towards the
+ # beginning, so that we will tend to go as early as we need, and
+ # then only seek forward after that.
+ if post:
+ pending.append((after, high, post))
+ if pre:
+ pending.append((low, start-1, pre))
+
+ return [found.get(path_key, None) for path_key in self._path_name_list]
+
+
class DirState(object):
"""Record directory and metadata state for fast access.
@@ -269,7 +421,8 @@
self._ghosts = []
self._parents = []
self._root_entries = None
- self._state_file=None
+ self._state_file = None
+ self._end_of_header = None
def add(self, path, file_id, kind, stat, link_or_sha1):
"""Add a path to be tracked.
@@ -985,6 +1138,8 @@
assert num_ghosts == len(info)-3, 'incorrect ghost info line'
self._ghosts = info[2:-1]
self._header_state = DirState.IN_MEMORY_UNMODIFIED
+ #self._end_of_header = os.fstat(self._state_file.fileno()).st_size
+ self._end_of_header = self._state_file.tell()
def _read_header_if_needed(self):
"""Read the header of the dirstate file if needed."""
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2007-02-20 08:44:10 +0000
+++ b/bzrlib/tests/test_dirstate.py 2007-02-22 15:44:10 +0000
@@ -732,3 +732,33 @@
self.assertEqual(dirstate.DirState.NOT_IN_MEMORY, state._header_state)
self.assertEqual(dirstate.DirState.NOT_IN_MEMORY, state._dirblock_state)
self.assertEntryEqual('', '', 'a-root-value', state, '', 0)
+
+
+class TestBisect(TestCaseWithTransport):
+ """Test the ability to bisect into the disk format."""
+
+ def create_basic_dirstate(self):
+ """Create a dirstate with a few files and directories.
+
+ a
+ b/
+ c
+ d/
+ e
+ f
+ """
+ tree = self.make_branch_and_tree('tree')
+ paths = ['a', 'b/', 'b/c', 'b/d/', 'b/d/e', 'f']
+ self.build_tree(['tree/' + p for p in paths])
+ tree.add([p.rstrip('/') for p in paths])
+ tree.commit('initial')
+ return dirstate.DirState.from_tree(tree, 'dirstate')
+
+ def test_bisect_each(self):
+ """Find a single record using bisect."""
+ self.create_basic_dirstate()
+ # Use a different object, to make sure nothing is pre-cached in memory.
+ state = dirstate.DirState.on_file('dirstate')
+
+ # Bisect should return the rows for the specified files.
+ self.assertEqual(
More information about the bazaar-commits mailing list