[MERGE] use a serial number and one random value for fileids
Martin Pool
mbp at canonical.com
Fri May 19 06:12:29 BST 2006
On 19 May 2006, Robert Collins <robertc at robertcollins.net> wrote:
> This patch uses a serial number and a single random value for 'bzr add'
> rather than brand new randomness each time.
>
> Its clearly still unique within a single process.
>
> Globally it still needs a collision on the timestamp and 64 bits of
> entropy to have a collision.
>
> The only hole I can see is someone doing fork() after calling add, and
> then calling add in the child. We probably should add an atfork hook to
> reset the seed in this case.
OK, I'd just suggest putting an XXX comment about this - similarly for
threading.
We should possibly also
- squash filenames to lowercase to avoid stores needing to escape the
name
- truncate the filename for similar reasons
- similarly, possibly avoid generating ids that would look like illegal
filenames on windows (con.txt, etc) - though perhaps that should be
done in the store
> === modified file 'bzrlib/inventory.py'
> --- bzrlib/inventory.py
> +++ bzrlib/inventory.py
> @@ -999,18 +999,19 @@
> The immediate parent must already be versioned.
>
> Returns the new entry object."""
> - from bzrlib.workingtree import gen_file_id
>
> parts = bzrlib.osutils.splitpath(relpath)
>
> - if file_id == None:
> - file_id = gen_file_id(relpath)
> -
> if len(parts) == 0:
> + if file_id is None:
> + file_id = bzrlib.workingtree.gen_root_id()
> self.root = RootEntry(file_id)
> self._byid = {self.root.file_id: self.root}
> return
> else:
> + if file_id is None:
> + file_id = bzrlib.workingtree.gen_file_id(parts[-1])
> +
> parent_path = parts[:-1]
> parent_id = self.path2id(parent_path)
> if parent_id == None:
Could delete that blank line
>
> === modified file 'bzrlib/tests/test_workingtree.py'
> --- bzrlib/tests/test_workingtree.py
> +++ bzrlib/tests/test_workingtree.py
> @@ -234,3 +234,28 @@
> os.mkdir('lala.OTHER')
> expected = ContentsConflict('lala', file_id='lala-id')
> self.assertEqual(list(tree.conflicts()), [expected])
> +
> +
> +class TestNonFormatSpecific(TestCaseWithTransport):
> +
> + def test_gen_file_id(self):
> + self.assertStartsWith(bzrlib.workingtree.gen_file_id('bar'), 'bar-')
> + self.assertStartsWith(bzrlib.workingtree.gen_file_id('Mwoo oof\t m'), 'Mwoooofm-')
> + self.assertStartsWith(bzrlib.workingtree.gen_file_id('..gam.py'), 'gam.py-')
> + self.assertStartsWith(bzrlib.workingtree.gen_file_id('..Mwoo oof\t m'), 'Mwoooofm-')
> +
> + def test_next_id_suffix(self):
> + bzrlib.workingtree._gen_id_suffix = None
> + bzrlib.workingtree._next_id_suffix()
> + self.assertNotEqual(None, bzrlib.workingtree._gen_id_suffix)
> + bzrlib.workingtree._gen_id_suffix = "foo-"
> + bzrlib.workingtree._gen_id_serial = 1
> + self.assertEqual("foo-2", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-3", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-4", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-5", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-6", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-7", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-8", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-9", bzrlib.workingtree._next_id_suffix())
> + self.assertEqual("foo-10", bzrlib.workingtree._next_id_suffix())
>
> === modified file 'bzrlib/workingtree.py'
> --- bzrlib/workingtree.py
> +++ bzrlib/workingtree.py
> @@ -39,13 +39,15 @@
> # At the moment they may alias the inventory and have old copies of it in
> # memory. (Now done? -- mbp 20060309)
>
> +from binascii import hexlify
> from copy import deepcopy
> from cStringIO import StringIO
> import errno
> import fnmatch
> import os
> +import re
> import stat
> -
> +from time import time
>
> from bzrlib.atomicfile import AtomicFile
> from bzrlib.branch import (Branch,
> @@ -101,33 +103,39 @@
> import bzrlib.xml5
>
>
> +# the regex here does the following:
> +# 1) remove any wierd characters; we don't escape them but rather
> +# just pull them out
'weird'
> + # 2) match leading '.'s to make it not hidden
> +_gen_file_id_re = re.compile(r'[^\w.]|(^\.*)')
> +_gen_id_suffix = None
> +_gen_id_serial = 0
> +
> +
> +def _next_id_suffix():
> + """Create a new file id suffix that is reasonably unique.
> +
> + On the first call we combine the current time with 64 bits of randomness
> + to give a highly probably globally unique number. Then each call in the same
> + process adds 1 to a serial number we append to that unique value.
> + """
> + global _gen_id_suffix, _gen_id_serial
> + if _gen_id_suffix is None:
> + _gen_id_suffix = "-%s-%s-" % (compact_date(time()), hexlify(rand_bytes(8)))
> + _gen_id_serial += 1
> + return _gen_id_suffix + str(_gen_id_serial)
Since we're changing it I'd prefer to use rand_chars, which gets more
entropy per byte.
> +
> +
> def gen_file_id(name):
> - """Return new file id.
> + """Return new file id for the basename 'name'.
>
> This should probably generate proper UUIDs, but for the moment we
> cope with just randomness because running uuidgen every time is
> - slow."""
This sentence should be deleted.
> - import re
> - from binascii import hexlify
> - from time import time
> -
> - # get last component
> - idx = name.rfind('/')
> - if idx != -1:
> - name = name[idx+1 : ]
> - idx = name.rfind('\\')
> - if idx != -1:
> - name = name[idx+1 : ]
> -
> - # make it not a hidden file
> - name = name.lstrip('.')
> -
> - # remove any wierd characters; we don't escape them but rather
> - # just pull them out
> - name = re.sub(r'[^\w.]', '', name)
> -
> - s = hexlify(rand_bytes(8))
> - return '-'.join((name, compact_date(time()), s))
> + slow.
> +
> + The uniqueness is supplied from _next_id_suffix.
> + """
> + return _gen_file_id_re.sub('', name) + _next_id_suffix()
>
>
> def gen_root_id():
> @@ -588,10 +596,10 @@
> 'i.e. regular file, symlink or directory): %s' % quotefn(f))
>
> if file_id is None:
> - file_id = gen_file_id(f)
> - inv.add_path(f, kind=kind, file_id=file_id)
> -
> - mutter("add file %s file_id:{%s} kind=%r" % (f, file_id, kind))
> + inv.add_path(f, kind=kind)
> + else:
> + inv.add_path(f, kind=kind, file_id=file_id)
> +
> self._write_inventory(inv)
>
> @needs_write_lock
>
--
Martin
More information about the bazaar
mailing list