[MERGE] use a serial number and one random value for fileids

Fri May 19 06:12:29 BST 2006

On 19 May 2006, Robert Collins <robertc at robertcollins.net> wrote:
> This patch uses a serial number and a single random value for 'bzr add'
> rather than brand new randomness each time.
> 
> Its clearly still unique within a single process.
> 
> Globally it still needs a collision on the timestamp and 64 bits of
> entropy to have a collision.
> 
> The only hole I can see is someone doing fork() after calling add, and
> then calling add in the child. We probably should add an atfork hook to
> reset the seed in this case.

OK, I'd just suggest putting an XXX comment about this - similarly for
threading.

We should possibly also

 - squash filenames to lowercase to avoid stores needing to escape the
   name
 - truncate the filename for similar reasons
 - similarly, possibly avoid generating ids that would look like illegal 
   filenames on windows (con.txt, etc) - though perhaps that should be
   done in the store

> === modified file 'bzrlib/inventory.py'
> --- bzrlib/inventory.py	
> +++ bzrlib/inventory.py	
> @@ -999,18 +999,19 @@
>          The immediate parent must already be versioned.
>  
>          Returns the new entry object."""
> -        from bzrlib.workingtree import gen_file_id
>          
>          parts = bzrlib.osutils.splitpath(relpath)
>  
> -        if file_id == None:
> -            file_id = gen_file_id(relpath)
> -
>          if len(parts) == 0:
> +            if file_id is None:
> +                file_id = bzrlib.workingtree.gen_root_id()
>              self.root = RootEntry(file_id)
>              self._byid = {self.root.file_id: self.root}
>              return
>          else:
> +            if file_id is None:
> +                file_id = bzrlib.workingtree.gen_file_id(parts[-1])
> +
>              parent_path = parts[:-1]
>              parent_id = self.path2id(parent_path)
>              if parent_id == None:

Could delete that blank line

> 
> === modified file 'bzrlib/tests/test_workingtree.py'
> --- bzrlib/tests/test_workingtree.py	
> +++ bzrlib/tests/test_workingtree.py	
> @@ -234,3 +234,28 @@
>          os.mkdir('lala.OTHER')
>          expected = ContentsConflict('lala', file_id='lala-id')
>          self.assertEqual(list(tree.conflicts()), [expected])
> +
> +
> +class TestNonFormatSpecific(TestCaseWithTransport):
> +    
> +    def test_gen_file_id(self):
> +        self.assertStartsWith(bzrlib.workingtree.gen_file_id('bar'), 'bar-')
> +        self.assertStartsWith(bzrlib.workingtree.gen_file_id('Mwoo oof\t m'), 'Mwoooofm-')
> +        self.assertStartsWith(bzrlib.workingtree.gen_file_id('..gam.py'), 'gam.py-')
> +        self.assertStartsWith(bzrlib.workingtree.gen_file_id('..Mwoo oof\t m'), 'Mwoooofm-')
> +
> +    def test_next_id_suffix(self):
> +        bzrlib.workingtree._gen_id_suffix = None
> +        bzrlib.workingtree._next_id_suffix()
> +        self.assertNotEqual(None, bzrlib.workingtree._gen_id_suffix)
> +        bzrlib.workingtree._gen_id_suffix = "foo-"
> +        bzrlib.workingtree._gen_id_serial = 1
> +        self.assertEqual("foo-2", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-3", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-4", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-5", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-6", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-7", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-8", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-9", bzrlib.workingtree._next_id_suffix())
> +        self.assertEqual("foo-10", bzrlib.workingtree._next_id_suffix())
> 
> === modified file 'bzrlib/workingtree.py'
> --- bzrlib/workingtree.py	
> +++ bzrlib/workingtree.py	
> @@ -39,13 +39,15 @@
>  # At the moment they may alias the inventory and have old copies of it in
>  # memory.  (Now done? -- mbp 20060309)
>  
> +from binascii import hexlify
>  from copy import deepcopy
>  from cStringIO import StringIO
>  import errno
>  import fnmatch
>  import os
> +import re
>  import stat
> - 
> +from time import time
>  
>  from bzrlib.atomicfile import AtomicFile
>  from bzrlib.branch import (Branch,
> @@ -101,33 +103,39 @@
>  import bzrlib.xml5
>  
>  
> +# the regex here does the following:
> +# 1) remove any wierd characters; we don't escape them but rather
> +# just pull them out

'weird'

> + # 2) match leading '.'s to make it not hidden
> +_gen_file_id_re = re.compile(r'[^\w.]|(^\.*)')
> +_gen_id_suffix = None
> +_gen_id_serial = 0
> +
> +
> +def _next_id_suffix():
> +    """Create a new file id suffix that is reasonably unique.
> +    
> +    On the first call we combine the current time with 64 bits of randomness
> +    to give a highly probably globally unique number. Then each call in the same
> +    process adds 1 to a serial number we append to that unique value.
> +    """
> +    global _gen_id_suffix, _gen_id_serial
> +    if _gen_id_suffix is None:
> +        _gen_id_suffix = "-%s-%s-" % (compact_date(time()), hexlify(rand_bytes(8)))
> +    _gen_id_serial += 1
> +    return _gen_id_suffix + str(_gen_id_serial)

Since we're changing it I'd prefer to use rand_chars, which gets more
entropy per byte.

> +
> +
>  def gen_file_id(name):
> -    """Return new file id.
> +    """Return new file id for the basename 'name'.
>  
>      This should probably generate proper UUIDs, but for the moment we
>      cope with just randomness because running uuidgen every time is
> -    slow."""

This sentence should be deleted.

> -    import re
> -    from binascii import hexlify
> -    from time import time
> -
> -    # get last component
> -    idx = name.rfind('/')
> -    if idx != -1:
> -        name = name[idx+1 : ]
> -    idx = name.rfind('\\')
> -    if idx != -1:
> -        name = name[idx+1 : ]
> -
> -    # make it not a hidden file
> -    name = name.lstrip('.')
> -
> -    # remove any wierd characters; we don't escape them but rather
> -    # just pull them out
> -    name = re.sub(r'[^\w.]', '', name)
> -
> -    s = hexlify(rand_bytes(8))
> -    return '-'.join((name, compact_date(time()), s))
> +    slow.
> +
> +    The uniqueness is supplied from _next_id_suffix.
> +    """
> +    return _gen_file_id_re.sub('', name) + _next_id_suffix()
>  
>  
>  def gen_root_id():
> @@ -588,10 +596,10 @@
>                                 'i.e. regular file, symlink or directory): %s' % quotefn(f))
>  
>              if file_id is None:
> -                file_id = gen_file_id(f)
> -            inv.add_path(f, kind=kind, file_id=file_id)
> -
> -            mutter("add file %s file_id:{%s} kind=%r" % (f, file_id, kind))
> +                inv.add_path(f, kind=kind)
> +            else:
> +                inv.add_path(f, kind=kind, file_id=file_id)
> +
>          self._write_inventory(inv)
>  
>      @needs_write_lock
> 

-- 
Martin