Prefixed stores

Gustavo Niemeyer gustavo at niemeyer.net
Wed Oct 5 19:08:09 BST 2005


Greetings!

The attached patch will add support for prefixed stores. This
means that instead of using something like

  .bzr/store/fileid1
  .bzr/store/fileid2

it will use something like

  .bzr/store/ab/fileid1
  .bzr/store/fe/fileid2

and so on. The prefixes are derived from the lower 8 bits of
an adler32 checksum of the file id (not the filename, meaning that
"foobar.gz" and "foobar" go into the same slot).

The patch introduces branch format v6, but it does not force
an upgrade. Existing v5 branches stay at v5 until the user runs
"bzr upgrade" on them. Upgrading an older format (v0.0.4) will
turn the branch into a v6 branch as well.

Have fun!

-- 
Gustavo Niemeyer
http://niemeyer.net
-------------- next part --------------
=== modified file 'bzrlib/branch.py'
--- bzrlib/branch.py
+++ bzrlib/branch.py
@@ -49,6 +49,7 @@
 
 BZR_BRANCH_FORMAT_4 = "Bazaar-NG branch, format 0.0.4\n"
 BZR_BRANCH_FORMAT_5 = "Bazaar-NG branch, format 5\n"
+BZR_BRANCH_FORMAT_6 = "Bazaar-NG branch, format 6\n"
 ## TODO: Maybe include checks for common corruption of newlines, etc?
 
 
@@ -236,25 +237,27 @@
             self._make_control()
         self._check_format(relax_version_check)
 
-        def get_store(name, compressed=True):
+        def get_store(name, compressed=True, prefixed=False):
             # FIXME: This approach of assuming stores are all entirely compressed
             # or entirely uncompressed is tidy, but breaks upgrade from 
             # some existing branches where there's a mixture; we probably 
             # still want the option to look for both.
             relpath = self._rel_controlfilename(name)
             if compressed:
-                store = CompressedTextStore(self._transport.clone(relpath))
+                store = CompressedTextStore(self._transport.clone(relpath),
+                                            prefixed=prefixed)
             else:
-                store = TextStore(self._transport.clone(relpath))
+                store = TextStore(self._transport.clone(relpath),
+                                  prefixed=prefixed)
             if self._transport.should_cache():
                 from meta_store import CachedStore
                 cache_path = os.path.join(self.cache_root, name)
                 os.mkdir(cache_path)
                 store = CachedStore(store, cache_path)
             return store
-        def get_weave(name):
+        def get_weave(name, prefixed=False):
             relpath = self._rel_controlfilename(name)
-            ws = WeaveStore(self._transport.clone(relpath))
+            ws = WeaveStore(self._transport.clone(relpath), prefixed=prefixed)
             if self._transport.should_cache():
                 ws.enable_cache = True
             return ws
@@ -267,6 +270,11 @@
             self.control_weaves = get_weave([])
             self.weave_store = get_weave('weaves')
             self.revision_store = get_store('revision-store', compressed=False)
+        elif self._branch_format == 6:
+            self.control_weaves = get_weave([])
+            self.weave_store = get_weave('weaves', prefixed=True)
+            self.revision_store = get_store('revision-store', compressed=False,
+                                            prefixed=True)
 
     def __str__(self):
         return '%s(%r)' % (self.__class__.__name__, self._transport.base)
@@ -438,7 +446,7 @@
         files = [('README', 
             "This is a Bazaar-NG control directory.\n"
             "Do not change any files in this directory.\n"),
-            ('branch-format', BZR_BRANCH_FORMAT_5),
+            ('branch-format', BZR_BRANCH_FORMAT_6),
             ('revision-history', ''),
             ('branch-name', ''),
             ('branch-lock', ''),
@@ -466,13 +474,15 @@
         except NoSuchFile:
             raise NotBranchError(self.base)
 
-        if fmt == BZR_BRANCH_FORMAT_5:
+        if fmt == BZR_BRANCH_FORMAT_6:
+            self._branch_format = 6
+        elif fmt == BZR_BRANCH_FORMAT_5:
             self._branch_format = 5
         elif fmt == BZR_BRANCH_FORMAT_4:
             self._branch_format = 4
 
         if (not relax_version_check
-            and self._branch_format != 5):
+            and self._branch_format not in (5, 6)):
             raise BzrError('sorry, branch format %r not supported' % fmt,
                            ['use a different bzr version',
                             'or remove the .bzr directory'

=== modified file 'bzrlib/selftest/test_upgrade.py'
--- bzrlib/selftest/test_upgrade.py
+++ bzrlib/selftest/test_upgrade.py
@@ -37,12 +37,12 @@
         self.assertTrue(os.path.exists('.bzr/README'))
 
     def test_upgrade_simple(self):
-        """Upgrade simple v0.0.4 format to v5"""
+        """Upgrade simple v0.0.4 format to v6"""
         eq = self.assertEquals
         build_tree_contents(_upgrade1_template)
         upgrade('.')
         b = Branch.open('.')
-        eq(b._branch_format, 5)
+        eq(b._branch_format, 6)
         rh = b.revision_history()
         eq(rh,
            ['mbp at sourcefrog.net-20051004035611-176b16534b086b3c',

=== modified file 'bzrlib/store/__init__.py'
--- bzrlib/store/__init__.py
+++ bzrlib/store/__init__.py
@@ -25,6 +25,7 @@
 """
 
 from cStringIO import StringIO
+from zlib import adler32
 
 from bzrlib.errors import BzrError, UnlistableStore, TransportNotPossible
 from bzrlib.trace import mutter
@@ -302,3 +303,6 @@
         raise UnlistableStore(store_from)
     store_to.copy_multi(store_from, ids)
 
+def hash_prefix(file_id):
+    return "%02x/" % (adler32(file_id) & 0xff)
+

=== modified file 'bzrlib/store/compressed_text.py'
--- bzrlib/store/compressed_text.py
+++ bzrlib/store/compressed_text.py
@@ -24,11 +24,12 @@
 import os, tempfile, gzip
 
 import bzrlib.store
+from bzrlib.store import hash_prefix
 from bzrlib.trace import mutter
-from bzrlib.errors import BzrError
+from bzrlib.errors import BzrError, FileExists
 
 from StringIO import StringIO
-from stat import ST_SIZE
+from stat import ST_SIZE, ST_MODE, S_ISDIR
 
 class CompressedTextStore(bzrlib.store.TransportStore):
     """Store that holds files indexed by unique names.
@@ -56,8 +57,9 @@
     'goodbye'
     """
 
-    def __init__(self, transport):
+    def __init__(self, transport, prefixed=False):
         super(CompressedTextStore, self).__init__(transport)
+        self._prefixed = prefixed
 
     def _check_fileid(self, fileid):
         if '\\' in fileid or '/' in fileid:
@@ -65,7 +67,10 @@
 
     def _relpath(self, fileid):
         self._check_fileid(fileid)
-        return fileid + '.gz'
+        if self._prefixed:
+            return hash_prefix(fileid) + fileid + ".gz"
+        else:
+            return fileid + ".gz"
 
     def add(self, f, fileid):
         """Add contents of a file into the store.
@@ -88,6 +93,11 @@
         if self._transport.has(fn):
             raise BzrError("store %r already contains id %r" % (self._transport.base, fileid))
 
+        if self._prefixed:
+            try:
+                self._transport.mkdir(hash_prefix(fileid))
+            except FileExists:
+                pass
 
         sio = StringIO()
         gf = gzip.GzipFile(mode='wb', fileobj=sio)
@@ -196,17 +206,27 @@
             yield None
             count += 1
 
+    def _iter_relpaths(self):
+        transport = self._transport
+        queue = list(transport.list_dir('.'))
+        while queue:
+            relpath = queue.pop(0)
+            st = transport.stat(relpath)
+            if S_ISDIR(st[ST_MODE]):
+                for i, basename in enumerate(transport.list_dir(relpath)):
+                    queue.insert(i, relpath+'/'+basename)
+            else:
+                yield relpath, st
+
     def __iter__(self):
-        # TODO: case-insensitive?
-        for f in self._transport.list_dir('.'):
-            if f[-3:] == '.gz':
-                yield f[:-3]
+        for relpath, st in self._iter_relpaths():
+            if relpath.endswith(".gz"):
+                yield os.path.basename(relpath)[:-3]
             else:
-                yield f
+                yield os.path.basename(relpath)
 
     def __len__(self):
-        return len([f for f in self._transport.list_dir('.')])
-
+        return len(list(self._iter_relpaths()))
 
     def __getitem__(self, fileid):
         """Returns a file reading from a particular entry."""
@@ -235,8 +255,7 @@
         the content."""
         total = 0
         count = 0
-        relpaths = [self._relpath(fid) for fid in self]
-        for st in self._transport.stat_multi(relpaths):
+        for relpath, st in self._iter_relpaths():
             count += 1
             total += st[ST_SIZE]
                 

=== modified file 'bzrlib/store/text.py'
--- bzrlib/store/text.py
+++ bzrlib/store/text.py
@@ -24,11 +24,12 @@
 import os, tempfile
 
 import bzrlib.store
+from bzrlib.store import hash_prefix
 from bzrlib.trace import mutter
-from bzrlib.errors import BzrError
+from bzrlib.errors import BzrError, FileExists
 
 from cStringIO import StringIO
-from stat import ST_SIZE
+from stat import ST_SIZE, ST_MODE, S_ISDIR
 
 
 class TextStore(bzrlib.store.TransportStore):
@@ -41,8 +42,9 @@
     Files are stored uncompressed, with no delta compression.
     """
 
-    def __init__(self, transport):
+    def __init__(self, transport, prefixed=False):
         super(TextStore, self).__init__(transport)
+        self._prefixed = prefixed
 
     def _check_fileid(self, fileid):
         if not isinstance(fileid, basestring):
@@ -52,7 +54,10 @@
 
     def _relpath(self, fileid):
         self._check_fileid(fileid)
-        return fileid
+        if self._prefixed:
+            return hash_prefix(fileid) + fileid
+        else:
+            return fileid
 
     def add(self, f, fileid):
         """Add contents of a file into the store.
@@ -65,37 +70,13 @@
         if self._transport.has(fn):
             raise BzrError("store %r already contains id %r" % (self._transport.base, fileid))
 
+        if self._prefixed:
+            try:
+                self._transport.mkdir(hash_prefix(fileid))
+            except FileExists:
+                pass
+
         self._transport.put(fn, f)
-
-    def _do_copy(self, other, to_copy, pb, permit_failure=False):
-        if isinstance(other, TextStore):
-            return self._copy_multi_text(other, to_copy, pb,
-                    permit_failure=permit_failure)
-        return super(TextStore, self)._do_copy(other, to_copy,
-                pb, permit_failure=permit_failure)
-
-    def _copy_multi_text(self, other, to_copy, pb,
-            permit_failure=False):
-        # Because of _transport, we can no longer assume
-        # that they are on the same filesystem, we can, however
-        # assume that we only need to copy the exact bytes,
-        # we don't need to process the files.
-
-        failed = set()
-        if permit_failure:
-            new_to_copy = set()
-            for fileid, has in zip(to_copy, other.has(to_copy)):
-                if has:
-                    new_to_copy.add(fileid)
-                else:
-                    failed.add(fileid)
-            to_copy = new_to_copy
-            #mutter('_copy_multi_text copying %s, failed %s' % (to_copy, failed))
-
-        paths = [self._relpath(fileid) for fileid in to_copy]
-        count = other._transport.copy_to(paths, self._transport, pb=pb)
-        assert count == len(to_copy)
-        return count, failed
 
     def __contains__(self, fileid):
         """"""
@@ -156,14 +137,24 @@
             yield None
             count += 1
 
+    def _iter_relpaths(self):
+        transport = self._transport
+        queue = list(transport.list_dir('.'))
+        while queue:
+            relpath = queue.pop(0)
+            st = transport.stat(relpath)
+            if S_ISDIR(st[ST_MODE]):
+                for i, basename in enumerate(transport.list_dir(relpath)):
+                    queue.insert(i, relpath+'/'+basename)
+            else:
+                yield relpath, st
+
     def __iter__(self):
-        # TODO: case-insensitive?
-        for f in self._transport.list_dir('.'):
-            yield f
+        for relpath, st in self._iter_relpaths():
+            yield os.path.basename(relpath)
 
     def __len__(self):
-        return len([f for f in self._transport.list_dir('.')])
-
+        return len(list(self._iter_relpaths()))
 
     def __getitem__(self, fileid):
         """Returns a file reading from a particular entry."""
@@ -184,8 +175,7 @@
         the content."""
         total = 0
         count = 0
-        relpaths = [self._relpath(fid) for fid in self]
-        for st in self._transport.stat_multi(relpaths):
+        for relpath, st in self._iter_relpaths():
             count += 1
             total += st[ST_SIZE]
                 

=== modified file 'bzrlib/store/weave.py'
--- bzrlib/store/weave.py
+++ bzrlib/store/weave.py
@@ -21,14 +21,15 @@
 
 
 from cStringIO import StringIO
+from stat import ST_MODE, S_ISDIR
 import os
 import errno
 
 from bzrlib.weavefile import read_weave, write_weave_v5
 from bzrlib.weave import Weave
-from bzrlib.store import Store
+from bzrlib.store import Store, hash_prefix
 from bzrlib.atomicfile import AtomicFile
-from bzrlib.errors import NoSuchFile
+from bzrlib.errors import NoSuchFile, FileExists
 from bzrlib.trace import mutter
 
 
@@ -41,22 +42,37 @@
     """
     FILE_SUFFIX = '.weave'
 
-    def __init__(self, transport):
+    def __init__(self, transport, prefixed=False):
         self._transport = transport
+        self._prefixed = prefixed
         self._cache = {}
-	self.enable_cache = False
+        self.enable_cache = False
 
 
     def filename(self, file_id):
         """Return the path relative to the transport root."""
-        return file_id + WeaveStore.FILE_SUFFIX
+        if self._prefixed:
+            return hash_prefix(file_id) + file_id + WeaveStore.FILE_SUFFIX
+        else:
+            return file_id + WeaveStore.FILE_SUFFIX
+
+    def _iter_relpaths(self):
+        transport = self._transport
+        queue = list(transport.list_dir('.'))
+        while queue:
+            relpath = queue.pop(0)
+            st = transport.stat(relpath)
+            if S_ISDIR(st[ST_MODE]):
+                for i, basename in enumerate(transport.list_dir(relpath)):
+                    queue.insert(i, relpath+'/'+basename)
+            else:
+                yield relpath, st
 
     def __iter__(self):
         l = len(WeaveStore.FILE_SUFFIX)
-        for f in self._transport.list_dir('.'):
-            if f.endswith(WeaveStore.FILE_SUFFIX):
-                f = f[:-l]
-                yield f
+        for relpath, st in self._iter_relpaths():
+            if relpath.endswith(WeaveStore.FILE_SUFFIX):
+                yield os.path.basename(relpath[:-l])
 
     def __contains__(self, fileid):
         """"""
@@ -66,6 +82,11 @@
         return self._transport.get(self.filename(file_id))
 
     def _put(self, file_id, f):
+        if self._prefixed:
+            try:
+                self._transport.mkdir(hash_prefix(file_id))
+            except FileExists:
+                pass
         return self._transport.put(self.filename(file_id), f)
 
 

=== modified file 'bzrlib/upgrade.py'
--- bzrlib/upgrade.py
+++ bzrlib/upgrade.py
@@ -85,7 +85,8 @@
 import logging
 import shutil
 
-from bzrlib.branch import Branch, find_branch, BZR_BRANCH_FORMAT_5
+from bzrlib.branch import Branch, find_branch
+from bzrlib.branch import BZR_BRANCH_FORMAT_5, BZR_BRANCH_FORMAT_6
 from bzrlib.revfile import Revfile
 from bzrlib.weave import Weave
 from bzrlib.weavefile import read_weave, write_weave
@@ -113,9 +114,38 @@
             return
         note('starting upgrade of %s', os.path.abspath(self.base))
         self._backup_control_dir()
-        note('starting upgrade')
+        self.pb = ui_factory.progress_bar()
+        if self.old_format == 4:
+            note('starting upgrade from format 4 to 5')
+            self._convert_to_weaves()
+            self._open_branch()
+        if self.old_format == 5:
+            note('starting upgrade from format 5 to 6')
+            self._convert_to_prefixed()
+            self._open_branch()
+        note("finished")
+
+
+    def _convert_to_prefixed(self):
+        from bzrlib.store import hash_prefix
+        for store_name in ["weaves", "revision-store"]:
+            note("adding prefixes to %s" % store_name) 
+            store_dir = os.path.join(self.base, ".bzr", store_name)
+            for filename in os.listdir(store_dir):
+                if filename.endswith(".weave") or filename.endswith(".gz"):
+                    file_id = os.path.splitext(filename)[0]
+                else:
+                    file_id = filename
+                prefix_dir = os.path.join(store_dir, hash_prefix(file_id))
+                if not os.path.isdir(prefix_dir):
+                    os.mkdir(prefix_dir)
+                os.rename(os.path.join(store_dir, filename),
+                          os.path.join(prefix_dir, filename))
+        self._set_new_format(BZR_BRANCH_FORMAT_6)
+
+
+    def _convert_to_weaves(self):
         note('note: upgrade may be faster if all store files are ungzipped first')
-        self.pb = ui_factory.progress_bar()
         if not os.path.isdir(self.base + '/.bzr/weaves'):
             os.mkdir(self.base + '/.bzr/weaves')
         self.inv_weave = Weave('inventory')
@@ -147,23 +177,24 @@
         note('  %6d texts' % self.text_count)
         self._write_all_weaves()
         self._write_all_revs()
-        self._set_new_format()
         self._cleanup_spare_files()
+        self._set_new_format(BZR_BRANCH_FORMAT_5)
 
 
     def _open_branch(self):
         self.branch = Branch.open_downlevel(self.base)
-        if self.branch._branch_format == 5:
-            note('this branch is already in the most current format')
+        self.old_format = self.branch._branch_format
+        if self.old_format == 6:
+            note('this branch is in the most current format')
             return False
-        if self.branch._branch_format != 4:
+        if self.old_format not in (4, 5):
             raise BzrError("cannot upgrade from branch format %r" %
                            self.branch._branch_format)
         return True
 
 
-    def _set_new_format(self):
-        self.branch.put_controlfile('branch-format', BZR_BRANCH_FORMAT_5)
+    def _set_new_format(self, format):
+        self.branch.put_controlfile('branch-format', format)
 
 
     def _cleanup_spare_files(self):



More information about the bazaar mailing list