Rev 2385: Add a very similar function which grabs everything for a particular directory block. in http://bzr.arbash-meinel.com/branches/bzr/dirstate

John Arbash Meinel john at arbash-meinel.com
Sat Feb 24 16:01:19 GMT 2007


At http://bzr.arbash-meinel.com/branches/bzr/dirstate

------------------------------------------------------------
revno: 2385
revision-id: john at arbash-meinel.com-20070224160033-rr51vbz5f91z6jvw
parent: john at arbash-meinel.com-20070223225101-92gi3mujiugb0tk0
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Sat 2007-02-24 10:00:33 -0600
message:
  Add a very similar function which grabs everything for a particular directory block.
modified:
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-02-23 22:51:01 +0000
+++ b/bzrlib/dirstate.py	2007-02-24 16:00:33 +0000
@@ -403,14 +403,8 @@
         while pending:
             low, high, cur_files = pending.pop()
 
-            if not cur_files:
-                # No files to look for
-                continue
-
-            if low >= high:
-                # Did not find cur_files, these will be returned as None
-                # However, other logic should probably prevent this from ever
-                # happening.
+            if not cur_files or low >= high:
+                # Nothing to find
                 continue
 
             count += 1
@@ -533,8 +527,191 @@
             if pre:
                 pending.append((low, start-1, pre))
 
+        # Consider that we may want to return the directory entries in sorted
+        # order. For now, we just return them in whatever order we found them,
+        # and leave it up to the caller if they care if it is ordered or not.
         return [found.get(dir_name, None) for dir_name in dir_name_list]
 
+    def _bisect_dirblocks(self, dir_list):
+        """Bisect through the disk structure to find entries in given dirs.
+
+        _bisect_dirblocks is meant to find the contents of directories, which
+        differs from _bisect, which only finds individual entries.
+
+        :param dir_list: A sorted list of directory names ['', 'dir', 'foo'].
+        :return: The contents of each directory.
+        """
+        # TODO: jam 20070223 A lot of the bisecting logic could be shared
+        #       between this and _bisect. It would require parameterizing the
+        #       inner loop with a function, though. We should evaluate the
+        #       performance difference.
+        self._requires_lock()
+        # We need the file pointer to be right after the initial header block
+        self._read_header_if_needed()
+        # If _dirblock_state was in memory, we should just return info from
+        # there, this function is only meant to handle when we want to read
+        # part of the disk.
+        assert self._dirblock_state == DirState.NOT_IN_MEMORY
+
+        # The disk representation is generally info + '\0\n\0' at the end. But
+        # for bisecting, it is easier to treat this as '\0' + info + '\0\n'
+        # Because it means we can sync on the '\n'
+        state_file = self._state_file
+        file_size = os.fstat(state_file.fileno()).st_size
+        # We end up with 2 extra fields, we should have a trailing '\n' to
+        # ensure that we read the whole record, and we should have a precursor
+        # '' which ensures that we start after the previous '\n'
+        entry_field_count = self._fields_per_entry() + 1
+
+        low = self._end_of_header
+        high = file_size - 1 # Ignore the final '\0'
+        # Map from dir => entry
+        found = {}
+
+        # Avoid infinite seeking
+        max_count = 30*len(dir_list)
+        count = 0
+        # pending is a list of places to look.
+        # each entry is a tuple of low, high, dir_names
+        #   low -> the first byte offset to read (inclusive)
+        #   high -> the last byte offset (inclusive)
+        #   dirs -> The list of directories that should be found in
+        #                the [low, high] range
+        pending = [(low, high, dir_list)]
+
+        page_size = self._bisect_page_size
+
+        fields_to_entry = self._get_fields_to_entry()
+
+        while pending:
+            low, high, cur_dirs = pending.pop()
+
+            if not cur_dirs or low >= high:
+                # Nothing to find
+                continue
+
+            count += 1
+            if count > max_count:
+                raise errors.BzrError('Too many seeks, most likely a bug.')
+
+            mid = max(low, (low+high-page_size)/2)
+
+            state_file.seek(mid)
+            # limit the read size, so we don't end up reading data that we have
+            # already read.
+            read_size = min(page_size, (high-mid)+1)
+            block = state_file.read(read_size)
+
+            start = mid
+            entries = block.split('\n')
+
+            if len(entries) < 2:
+                # We didn't find a '\n', so we cannot have found any records.
+                # So put this range back and try again. But we know we have to
+                # increase the page size, because a single read did not contain
+                # a record break (so records must be larger than page_size)
+                page_size *= 2
+                pending.append((low, high, cur_dirs))
+                continue
+
+            # Check the first and last entries, in case they are partial, or if
+            # we don't care about the rest of this page
+            first_entry_num = 0
+            first_fields = entries[0].split('\0')
+            if len(first_fields) < entry_field_count:
+                # We didn't get the complete first entry
+                # so move start, and grab the next, which
+                # should be a full entry
+                start += len(entries[0])+1
+                first_fields = entries[1].split('\0')
+                first_entry_num = 1
+
+            if len(first_fields) <= 1:
+                # We didn't even get a dirname here... what do we do?
+                # Try a large page size and repeat this query
+                page_size *= 2
+                pending.append((low, high, cur_dirs))
+                continue
+            else:
+                # Find what entries we are looking for, which occur before and
+                # after this first record.
+                after = start
+                first_dir = first_fields[1]
+                first_loc = bisect.bisect_left(cur_dirs, first_dir)
+
+                # These exist before the current location
+                pre = cur_dirs[:first_loc]
+                # These occur after the current location, which may be in the
+                # data we read, or might be after the last entry
+                post = cur_dirs[first_loc:]
+
+            if post and len(first_fields) >= entry_field_count:
+                # We have records to look at after the first entry
+
+                # Parse the last entry
+                last_entry_num = len(entries)-1
+                last_fields = entries[last_entry_num].split('\0')
+                if len(last_fields) < entry_field_count:
+                    # The very last hunk was not complete,
+                    # read the previous hunk
+                    after = mid + len(block) - len(entries[-1])
+                    last_entry_num -= 1
+                    last_fields = entries[last_entry_num].split('\0')
+                else:
+                    after = mid + len(block)
+
+                last_dir = last_fields[1]
+                last_loc = bisect.bisect_right(post, last_dir)
+
+                middle_files = post[:last_loc]
+                post = post[last_loc:]
+
+                if middle_files:
+                    # We have files that should occur in this block
+                    # (>= first, <= last)
+                    # Either we will find them here, or we can mark them as
+                    # missing.
+
+                    if middle_files[0] == first_dir:
+                        # We might need to go before this location
+                        pre.append(first_dir)
+                    if middle_files[-1] == last_dir:
+                        post.insert(0, last_dir)
+
+                    # Find out what paths we have
+                    paths = {first_dir:[first_fields]}
+                    # last_dir might == first_dir so we need to be
+                    # careful if we should append rather than overwrite
+                    if last_entry_num != first_entry_num:
+                        paths.setdefault(last_dir, []).append(last_fields)
+                    for num in xrange(first_entry_num+1, last_entry_num):
+                        # TODO: jam 20070223 We are already splitting here, so
+                        #       shouldn't we just split the whole thing rather
+                        #       than doing the split again in add_one_record?
+                        fields = entries[num].split('\0')
+                        paths.setdefault(fields[1], []).append(fields)
+
+                    for cur_dir in middle_files:
+                        for fields in paths.get(cur_dir, []):
+                            # offset by 1 because of the opening '\0'
+                            # consider changing fields_to_entry to avoid the
+                            # extra list slice
+                            entry = fields_to_entry(fields[1:])
+                            found.setdefault(cur_dir, []).append(entry)
+
+            # Now we have split up everything into pre, middle, and post, and
+            # we have handled everything that fell in 'middle'.
+            # We add 'post' first, so that we prefer to seek towards the
+            # beginning, so that we will tend to go as early as we need, and
+            # then only seek forward after that.
+            if post:
+                pending.append((after, high, post))
+            if pre:
+                pending.append((low, start-1, pre))
+
+        return [found.get(cur_dir, None) for cur_dir in dir_list]
+
+
     def _empty_parent_info(self):
         return [DirState.NULL_PARENT_DETAILS] * (len(self._parents) -
                                                     len(self._ghosts))

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2007-02-23 22:51:01 +0000
+++ b/bzrlib/tests/test_dirstate.py	2007-02-24 16:00:33 +0000
@@ -1072,6 +1072,28 @@
             self.assertEqual(expected_entries, actual_entries)
         self.assertEqual(len(expected), len(result))
 
+    def assertBisectDirBlocks(self, expected_map, entries, state, paths):
+        """Assert that bisecting for dirbblocks returns the right result.
+
+        :param expected_map: A map from path => expected values
+        :param entries: A nested list of paths we expect to be returned.
+            Something like [['a', 'b', 'f'], ['b/c', 'b/d']]
+        :param state: The DirState object.
+        :param paths: A list of directories
+        """
+        result = state._bisect_dirblocks(paths)
+        # For now, results are just returned in whatever order we read them.
+        # We could sort by (dir, name, file_id) or something like that, but
+        # for now we leave the ordering up to the caller.
+        for subentries, actual in zip(entries, result):
+            if subentries is None:
+                expected = None
+            else:
+                expected = sorted(expected_map[e] for e in subentries)
+            if actual is not None:
+                actual.sort()
+            self.assertEqual(expected, actual)
+        self.assertEqual(len(entries), len(result))
+
     def test_bisect_each(self):
         """Find a single record using bisect."""
         tree, state, expected = self.create_basic_dirstate()
@@ -1157,3 +1179,28 @@
         # Search for the pre and post renamed entries
         self.assertBisect([[expected['a']]], state, ['a'])
         self.assertBisect([[expected['b/g']]], state, ['b/g'])
+
+    def test_bisect_dirblocks(self):
+        tree, state, expected = self.create_duplicated_dirstate()
+        self.assertBisectDirBlocks(expected,
+            [['', 'a', 'a2', 'b', 'b2', 'f', 'f2']], state, [''])
+        self.assertBisectDirBlocks(expected,
+            [['b/c', 'b/c2', 'b/d', 'b/d2']], state, ['b'])
+        self.assertBisectDirBlocks(expected,
+            [['b/d/e', 'b/d/e2']], state, ['b/d'])
+        self.assertBisectDirBlocks(expected,
+            [['', 'a', 'a2', 'b', 'b2', 'f', 'f2'],
+             ['b/c', 'b/c2', 'b/d', 'b/d2'],
+             ['b/d/e', 'b/d/e2'],
+            ], state, ['', 'b', 'b/d'])
+
+    def test_bisect_dirblocks_missing(self):
+        tree, state, expected = self.create_basic_dirstate()
+        self.assertBisectDirBlocks(expected, [['b/d/e'], None],
+            state, ['b/d', 'b/e'])
+        # Files don't show up in this search
+        self.assertBisectDirBlocks(expected, [None], state, ['a'])
+        self.assertBisectDirBlocks(expected, [None], state, ['b/c'])
+        self.assertBisectDirBlocks(expected, [None], state, ['c'])
+        self.assertBisectDirBlocks(expected, [None], state, ['b/d/e'])
+        self.assertBisectDirBlocks(expected, [None], state, ['f'])



More information about the bazaar-commits mailing list