Rev 5363: Change the DirState id_index structure to use tuples instead of sets. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-dirstate-index
John Arbash Meinel
john at arbash-meinel.com
Mon Aug 2 22:43:24 BST 2010
At http://bazaar.launchpad.net/~jameinel/bzr/2.3-dirstate-index
------------------------------------------------------------
revno: 5363
revision-id: john at arbash-meinel.com-20100802214315-r0henszv95y5uo1c
parent: john at arbash-meinel.com-20100802214007-7wdfs47bss26ksfb
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-dirstate-index
timestamp: Mon 2010-08-02 16:43:15 -0500
message:
Change the DirState id_index structure to use tuples instead of sets.
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2010-08-02 21:40:07 +0000
+++ b/bzrlib/dirstate.py 2010-08-02 21:43:15 +0000
@@ -548,7 +548,7 @@
self._ensure_block(block_index, entry_index, utf8path)
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
if self._id_index:
- self._id_index.setdefault(entry_key[2], set()).add(entry_key)
+ self._add_to_id_index(self._id_index, entry_key)
def _bisect(self, paths):
"""Bisect through the disk structure for specific rows.
@@ -2143,14 +2143,47 @@
yield entry
def _get_id_index(self):
- """Get an id index of self._dirblocks."""
+ """Get an id index of self._dirblocks.
+
+ This maps from file_id => [(directory, name, file_id)] entries where
+ that file_id appears in one of the trees.
+ """
if self._id_index is None:
id_index = {}
for key, tree_details in self._iter_entries():
- id_index.setdefault(key[2], set()).add(key)
+ self._add_to_id_index(id_index, key)
self._id_index = id_index
return self._id_index
+ def _add_to_id_index(self, id_index, entry_key):
+ """Add this entry to the _id_index mapping."""
+ # This code used to use a set for every entry in the id_index. However,
+ # it is *rare* to have more than one entry. So a set is serious
+ # overkill. And even when we do, we won't ever have more than the
+ # number of parent trees. Which is still a small number (rarely >2). As
+ # such, we use a simple tuple, and do our own uniqueness checks. While
+ # the 'in' check is O(N), since N is nicely bounded it shouldn't ever
+ # cause quadratic failure.
+ # TODO: This should use StaticTuple
+ file_id = entry_key[2]
+ if file_id not in id_index:
+ id_index[file_id] = (entry_key,)
+ else:
+ entry_keys = id_index[file_id]
+ if entry_key not in entry_keys:
+ id_index[file_id] = entry_keys + (entry_key,)
+
+ def _remove_from_id_index(self, id_index, entry_key):
+ """Remove this entry from the _id_index mapping.
+
+ It is a programming error to call this when the entry_key is not
+ already present.
+ """
+ file_id = entry_key[2]
+ entry_keys = id_index[file_id]
+ idx = entry_keys.index(entry_key)
+ id_index[file_id] = entry_keys[:idx] + entry_keys[idx+1:]
+
def _get_output_lines(self, lines):
"""Format lines for final output.
@@ -2414,7 +2447,9 @@
continue
by_path[entry[0]] = [entry[1][0]] + \
[DirState.NULL_PARENT_DETAILS] * parent_count
- id_index[entry[0][2]] = set([entry[0]])
+ # TODO: Possibly inline this, since we know it isn't present yet
+ # id_index[entry[0][2]] = (entry[0],)
+ self._add_to_id_index(id_index, entry[0])
# now the parent trees:
for tree_index, tree in enumerate(parent_trees):
@@ -2442,7 +2477,7 @@
new_entry_key = (dirname, basename, file_id)
# tree index consistency: All other paths for this id in this tree
# index must point to the correct path.
- for entry_key in id_index.setdefault(file_id, set()):
+ for entry_key in id_index.get(file_id, ()):
# TODO:PROFILING: It might be faster to just update
# rather than checking if we need to, and then overwrite
# the one we are located at.
@@ -2454,7 +2489,8 @@
by_path[entry_key][tree_index] = ('r', path_utf8, 0, False, '')
# by path consistency: Insert into an existing path record (trivial), or
# add a new one with relocation pointers for the other tree indexes.
- if new_entry_key in id_index[file_id]:
+ entry_keys = id_index.get(file_id, ())
+ if new_entry_key in entry_keys:
# there is already an entry where this data belongs, just insert it.
by_path[new_entry_key][tree_index] = \
self._inv_entry_to_details(entry)
@@ -2465,16 +2501,16 @@
new_details = []
for lookup_index in xrange(tree_index):
# boundary case: this is the first occurrence of file_id
- # so there are no id_indexs, possibly take this out of
+ # so there are no id_indexes, possibly take this out of
# the loop?
- if not len(id_index[file_id]):
+ if not len(entry_keys):
new_details.append(DirState.NULL_PARENT_DETAILS)
else:
# grab any one entry, use it to find the right path.
# TODO: optimise this to reduce memory use in highly
# fragmented situations by reusing the relocation
# records.
- a_key = iter(id_index[file_id]).next()
+ a_key = iter(entry_keys).next()
if by_path[a_key][lookup_index][0] in ('r', 'a'):
# its a pointer or missing statement, use it as is.
new_details.append(by_path[a_key][lookup_index])
@@ -2485,7 +2521,7 @@
new_details.append(self._inv_entry_to_details(entry))
new_details.extend(new_location_suffix)
by_path[new_entry_key] = new_details
- id_index[file_id].add(new_entry_key)
+ self._add_to_id_index(id_index, new_entry_key)
# --- end generation of full tree mappings
# sort and output all the entries
@@ -2673,7 +2709,7 @@
block[1].pop(entry_index)
# if we have an id_index in use, remove this key from it for this id.
if self._id_index is not None:
- self._id_index[current_old[0][2]].remove(current_old[0])
+ self._remove_from_id_index(self._id_index, current_old[0])
# update all remaining keys for this id to record it as absent. The
# existing details may either be the record we are marking as deleted
# (if there were other trees with the id present at this path), or may
@@ -2748,7 +2784,7 @@
else:
break
# new entry, synthesis cross reference here,
- existing_keys = id_index.setdefault(key[2], set())
+ existing_keys = id_index.get(key[2], ())
if not existing_keys:
# not currently in the state, simplest case
new_entry = key, [new_details] + self._empty_parent_info()
@@ -2797,6 +2833,8 @@
# - or by creating a new pointer to the right row inside that column
num_present_parents = self._num_present_parents()
if num_present_parents:
+ # TODO: This re-evaluates the existing_keys set; do we need
+ # to do that ourselves?
other_key = list(existing_keys)[0]
for lookup_index in xrange(1, num_present_parents + 1):
# grab any one entry, use it to find the right path.
@@ -2821,7 +2859,7 @@
pointer_path = osutils.pathjoin(*other_key[0:2])
new_entry[1].append(('r', pointer_path, 0, False, ''))
block.insert(entry_index, new_entry)
- existing_keys.add(key)
+ self._add_to_id_index(id_index, key)
else:
# Does the new state matter?
block[entry_index][1][0] = new_details
@@ -2836,8 +2874,12 @@
# converted to relocated.
if path_utf8 is None:
raise AssertionError('no path')
- assert key in id_index[key[2]]
- for entry_key in id_index.get(key[2], ()):
+ existing_keys = id_index[key[2]]
+ if key not in existing_keys:
+ raise AssertionError('We found the entry in the blocks, but'
+ ' the key is not in the id_index.'
+ ' key: %s, existing_keys: %s' % (key, existing_keys))
+ for entry_key in id_index[key[2]]:
# TODO:PROFILING: It might be faster to just update
# rather than checking if we need to, and then overwrite
# the one we are located at.
@@ -2854,7 +2896,6 @@
raise AssertionError('not present: %r', entry_key)
self._dirblocks[block_index][1][entry_index][1][0] = \
('r', path_utf8, 0, False, '')
- assert key in id_index[key[2]]
# add a containing dirblock if needed.
if new_details[0] == 'd':
subdir_key = (osutils.pathjoin(*key[0:2]), '', '')
@@ -2878,7 +2919,7 @@
break
if not present_in_row:
block.pop(index)
- id_index[entry[0][2]].remove(entry[0])
+ self._remove_from_id_index(id_index, entry[0])
return True
return False
@@ -3028,6 +3069,10 @@
raise AssertionError(
'file_id %r did not match entry key %s'
% (file_id, entry_key))
+ if len(entry_keys) != len(set(entry_keys)):
+ raise AssertionError(
+ 'id_index contained non-unique data for %s'
+ % (entry_keys,))
def _wipe_state(self):
"""Forget all state information about the dirstate."""
More information about the bazaar-commits
mailing list