Rev 14: Replace stat names by indices in file for another 30% size in file:///v/home/vila/.bazaar/plugins/transportstats/

Sun Oct 7 20:52:42 BST 2007

At file:///v/home/vila/.bazaar/plugins/transportstats/

------------------------------------------------------------
revno: 14
revision-id: v.ladeuil+lp at free.fr-20071007195240-cf3yoc88xn9sb2ty
parent: v.ladeuil+lp at free.fr-20071007141530-v9ln5myvecno4v6w
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: transportstats
timestamp: Sun 2007-10-07 21:52:40 +0200
message:
  Replace stat names by indices in file for another 30% size
  reduction.  Stats file for bzr branch http://bzr.dev is now under
  1M for ~70M downloaded.
  
  * tests/test_statsfile.py: 
  Cosmetic changes.
  
  * tests/test_stats.py:
  (TestStats.setUp): Remove dead code.
  
  * statsfile.py:
  (UniqueByteLongStringsContainer): Renamed from
  ByteLongUniqueStringsContainer.
  (UniqueByteLongStringsContainer.add): Add write_only parameter to
  be able to use the container in read/write mode.
  (UniqueByteLongString.__init__): Allows the container to be
  specified at construction time again.
  
  * stats.py:
  (StatsRegistry): New class. Centralize stats declaration to ensure
  proper declarations and allow replacing stat names in the file by
  indices.
  
  * serialize.py: 
  Global replace fs_ by format_specifier for readability.
  
  * TODO:
  Add results after replacing stat names with indices.
  Document next obvious steps.
modified:
  TODO                           todo-20070929082541-nqthugj4zxx7hufy-1
  serialize.py                   serialize.py-20071005112454-5v72oa7pqcdhjytc-1
  stats.py                       stats.py-20070928061304-7i3r2h4gg6rbi03e-1
  statsfile.py                   structuredfile.py-20070927123433-krruwbx4mkalu3xs-1
  tests/test_stats.py            test_stats.py-20071005112458-fpbdqbcq3pr2r2c0-1
  tests/test_statsfile.py        test_tsfile.py-20070929082603-rmpewiprgrp8d4ej-1
-------------- next part --------------
=== modified file 'TODO'

--- a/TODO	2007-10-07 14:15:30 +0000
+++ b/TODO	2007-10-07 19:52:40 +0000
@@ -1,3 +1,17 @@
+- implement missing transport methods in the decorator
+
+- exploit statistics for:
+* latency (measured by start/stop around requests)
+* bytes written
+* number of requests
+* number of files (watch for renames to avoid pollution)
+
+Requests here roughly means any operation involving sending
+something on the network.
+
+- rename stats to transport.<method>
+  No need to restrict the stats collection to transports.
+
 - Implements pack/unpack for formats
 
 The aim is to provide a robust, cross-platform, reasonably fast
@@ -14,7 +28,7 @@
 * i: int
 * I: unsigned int
 * l: long
-* L: unsigned long
+* L: unsigned long /done
 * f: float
 * d: double
 * s: length-prefixed (one byte) string / done
@@ -22,7 +36,7 @@
 
 In a second step, prefixes for string specifiers (sS):
 
-* u: index (one byte) to a unique string
+* u: index (one byte) to a unique string / done
 * U: index (two bytes) to a unique string
 
 Note: to keep the resulting encoding the most streamable as
@@ -67,16 +81,6 @@
     # format: "%(name)us%(base)us%(relpath)us%(start)f%(end)f"
     def stat(self, relpath):
 
-- share strings by replacing all strings by indices
-
-The constraints are that several collections may define different
-indices for the same string (since we don't want to reload
-previously collected strings before starting a new
-collection). At exploitation time, indices should be associated
-with the currently define set of strings.
-
-The aim is to reduce the StatsFile size.
-
 - change the stats file format to:
 
 file: header marker sections*
@@ -98,7 +102,7 @@
 stat: %(stat_id)u stat_specific_content
 
 
-Format specifiers needed: BHsuU
+Format specifiers needed: BHLsuU
 
 - add method decorators on all transport decorator methods
 
@@ -112,6 +116,9 @@
 * define the method to be executed when collecting
 * define the method to be executed when exploiting
 
+Note: this looks like overkill; the available mechanisms seem pretty
+efficient already.
+
 - revision 11:
   bzr branch   stats+http+urllib://bazaar.launchpad.net/~bzr/bzr.webdav/webdav/
   -rw-r--r--  1 vila vila 15317 2007-09-30 15:35 .transport_stats_for_bzr
@@ -120,3 +127,24 @@
    bzr branch stats+http+urllib://bazaar-vcs.org/bzr/bzr.dev/ tagada
   -rw-r--r--  1 vila vila 4441317 2007-09-30 15:41 .transport_stats_for_bzr
   Bytes read: 68289613
+
+- revision 13:
+  bzr branch   stats+http+urllib://bazaar.launchpad.net/~bzr/bzr.webdav/webdav/
+Branched 40 revision(s).
+  -rw-r--r--  1 vila vila  5378 2007-10-07 16:16 .transport_stats_for_bzr
+Bytes read: 189785
+
+   bzr branch stats+http+urllib://bazaar-vcs.org/bzr/bzr.dev/ tagada
+Branched 2892 revision(s).
+  -rw-r--r--  1 vila vila 1328776 2007-10-07 16:23 .transport_stats_for_bzr
+Bytes read: 69083460
+
+- revision 14
+  bzr branch stats+http+urllib://bazaar.launchpad.net/~bzr/bzr.webdav/webdav/ -r 40
+Branched 40 revision(s).
+-rw-r--r-- 1 vila vila 4157 2007-10-07 18:36 .transport_stats_for_bzr
+Bytes read: 189785
+ bzr branch stats+http+urllib://bazaar-vcs.org/bzr/bzr.dev/ -r 2892
+Branched 2892 revision(s).
+-rw-r--r-- 1 vila vila 969647 2007-10-07 18:41 .transport_stats_for_bzr
+Bytes read: 69083460

=== modified file 'serialize.py'
--- a/serialize.py	2007-10-07 14:15:30 +0000
+++ b/serialize.py	2007-10-07 19:52:40 +0000
@@ -22,14 +22,12 @@
 from bzrlib.plugins.transportstats import errors
 
 
-fs_registry = registry.Registry()
-"""Registry of format specifiers."""
-
 class FormatSpecifier(object):
 
     def __init__(self, name):
         self.name = name
 
+
 class UnsignedByte(FormatSpecifier):
 
     def pack(self, file, byte):
@@ -73,10 +71,14 @@
         return str
 
 
-fs_registry.register('B', UnsignedByte)
-fs_registry.register('H', UnsignedShort)
-fs_registry.register('L', UnsignedLong)
-fs_registry.register('s', ByteLongString)
+format_specifier_registry = registry.Registry()
+"""Registry of format specifiers."""
+
+
+format_specifier_registry.register('B', UnsignedByte)
+format_specifier_registry.register('H', UnsignedShort)
+format_specifier_registry.register('L', UnsignedLong)
+format_specifier_registry.register('s', ByteLongString)
 
 class Format(object):
 
@@ -102,10 +104,11 @@
             format = part[closing+1:].rstrip() # Get rid of optional spaces
 
             try:
-                fmt_spec_class = fs_registry.get(format)
+                fmt_spec_class = format_specifier_registry.get(format)
             except KeyError:
                 # FIXME: Report a bug on bzrlib.registry, we should have a way
-                # to use fs_registry.get(format, None) as on any python dict
+                # to use format_specifier_registry.get(format, None) as on any
+                # python dict
                 fmt_spec_class = None
             if fmt_spec_class is None:
                 raise errors.MalformedFormat(

=== modified file 'stats.py'
--- a/stats.py	2007-10-07 14:15:30 +0000
+++ b/stats.py	2007-10-07 19:52:40 +0000
@@ -89,16 +89,37 @@
         self.relpath = relpath
 
 
-stats_registry = registry.Registry()
-"""Registry of statictics."""
-
-
-def register_stat(name, description):
-    # Unique strings container will be provided later
-    format = statsfile.StatsFormat(description)
-    stats_registry.register(name, format)
-
-
+class StatsRegistry(registry.Registry):
+    """Statistics registry."""
+
+    def __init__(self):
+        super(StatsRegistry, self).__init__()
+        self._stat_names = statsfile.UniqueByteLongStringsContainer()
+        self._stat_name_serializer = \
+            serialize.format_specifier_registry.get('us')('stat_name',
+                                                          self._stat_names)
+
+    def register(self, key, description,
+                 help=None, info=None, override_existing=False):
+        format = statsfile.StatsFormat(description)
+        super(StatsRegistry, self).register(key, format, help, info,
+                                            override_existing)
+        self._stat_names.add(key, write_only=False)
+
+    @property
+    def stat_names(self):
+        return self._stat_names
+
+    @property
+    def stat_name_serializer(self):
+        return self._stat_name_serializer
+
+
+stats_registry = StatsRegistry()
+
+
+# Shortcut
+register_stat = stats_registry.register
 register_stat('has', '%(base)us%(relpath)us')
 register_stat('get', '%(base)us%(relpath)us%(bytes_read)L')
 register_stat('get_bytes', '%(base)us%(relpath)us%(bytes_read)L')
@@ -109,13 +130,12 @@
 class Stats(object):
 
     _default_name = '.transport_stats_for_bzr'
-    _name_serializer = serialize.fs_registry.get('s')('_stat_name')
+    _name_serializer = stats_registry.stat_name_serializer
 
     def __init__(self):
         self._sfile = None
         self._mode = None
         self._opened = 0
-#        self._stat_names = statsfile.ByteLongUniqueStringsContainer()
 
     def _ensure_mode(self, mode):
         if self._mode != mode:
@@ -191,8 +211,8 @@
     def __iter__(self):
         self._ensure_mode('exploiting')
         for stat in self._sfile:
-            name, f, stat_format = self._sfile.read_stat_name(
-                stat, self._name_serializer,stats_registry)
+            name, file, stat_format = self._sfile.read_stat(
+                stat, self._name_serializer, stats_registry)
 
             class_name = '_' + name + '_stat'
             stat_class = globals().get(class_name, None)
@@ -200,7 +220,7 @@
                 trace.warning(
                     'Unknown method in stats file: %s' % class_name )
             else:
-                args = stat_format.unpack(f)
+                args = stat_format.unpack(file)
                 yield stat_class(*args)
 
     def nb_bytes_read(self):

=== modified file 'statsfile.py'
--- a/statsfile.py	2007-10-07 14:15:30 +0000
+++ b/statsfile.py	2007-10-07 19:52:40 +0000
@@ -27,10 +27,11 @@
 
 
 # FIXME: implement array format specifier and use it
-class ByteLongUniqueStringsContainer(object):
+class UniqueByteLongStringsContainer(object):
 
-    _ubyte_serializer = serialize.fs_registry.get('B')('_nb_ustrings')
-    _string_serializer = serialize.fs_registry.get('s')('_string')
+    _ubyte_serializer = serialize.format_specifier_registry.get('B')(
+        '_nb_ustrings')
+    _string_serializer = serialize.format_specifier_registry.get('s')('_string')
 
     def __init__(self, size=256):
         if size > 256:
@@ -50,13 +51,15 @@
     def free_slots(self):
         return self._size - len(self.idx_of)
 
-    def add(self, str):
+    def add(self, str, write_only=True):
         idx = self.idx_of.get(str, None)
         if idx is None:
             if self.free_slots <= 0:
                 raise errors.Overflow(self.size_max)
             idx = len(self.idx_of)
             self.idx_of[str] = idx
+            if not write_only:
+                self.strings.append(str)
         return idx
 
     def get(self, idx):
@@ -86,17 +89,24 @@
 
 class UniqueByteLongString(serialize.UnsignedByte):
 
+    def __init__(self, name, container=None):
+        super(UniqueByteLongString, self).__init__(name)
+        if container is not None:
+            self._get_unique_strings_container = lambda: container
+
     @property
     def strings(self):
         """Unique strings container accessor.
 
         Users are responsible to define a proper implementation before pack or
-        unpack is called.
+        unpack is called. See __init__ for an example of container provided at
+        build time and StatsFormat.compile for an example of container provided
+        after the UniqueByteLongString has been built.
         """
         return self._get_unique_strings_container()
 
     def _get_unique_strings_container(self):
-        raise NotImplementedError(self._get_unique_strings_container)
+        return None
 
     def pack(self, file, str):
         idx = self.strings.add(str)
@@ -107,7 +117,7 @@
         return self.strings.get(idx)
 
 
-serialize.fs_registry.register('us', UniqueByteLongString)
+serialize.format_specifier_registry.register('us', UniqueByteLongString)
 
 
 class StatsFormat(serialize.Format):
@@ -180,8 +190,9 @@
 # FIXME: implement array format specifier and use it
 class StatsContainer(list):
 
-    _ubyte_serializer = serialize.fs_registry.get('H')('_nb_stats')
-    _string_serializer = serialize.fs_registry.get('s')('_stat')
+    _ubyte_serializer = serialize.format_specifier_registry.get('H')(
+        '_nb_stats')
+    _string_serializer = serialize.format_specifier_registry.get('s')('_stat')
 
     def __init__(self, max_size=65536):
         super(StatsContainer, self).__init__()
@@ -238,7 +249,7 @@
 
     def __init__(self, file):
         super(StatsFile, self).__init__(file)
-        self.unique_strings = ByteLongUniqueStringsContainer()
+        self.unique_strings = UniqueByteLongStringsContainer()
         self.stats = StatsContainer()
 
     @classmethod
@@ -294,9 +305,16 @@
     def read_header(self, header_name):
         return self._header_of[header_name].read(self._file)
 
-    # FIXME: should use a true format instead of a format_specifier for name
     def write_stat(self, name_fmt_spec, name, stat_fmt, *args):
+        """Writes a statistic to the file.
+
+
+        The name and the body are encoded separately so that a true object can
+        be built at read time, deducing the class from the name.
+        """
         if self.stats.is_full():
+            # Look Before You Leap (LBLY), otherwise unique strings and stats
+            # may be desynchronized
             self.flush()
         f = StringIO()
         name_fmt_spec.pack(f, name)
@@ -304,16 +322,24 @@
         try:
             stat_fmt.pack(f, *args)
         except errors.Overflow:
+            # No enough rooms in the unique strings
             self.flush()
+            # Try again, that should succeed
             stat_fmt.pack(f, *args)
         self.stats.append(f.getvalue())
 
-    def read_stat_name(self, raw_stat, name_fmt, stat_fmt_of):
-        f = StringIO(raw_stat)
-        name = name_fmt.unpack(f)
+    def read_stat(self, raw_stat, name_fmt, stat_fmt_of):
+        """Read a statistic name and provides body data.
+
+        :returns: name, file, stat_format. name is the statistic name, file a
+          file-like object containing the stat body data and stat_format the
+          format able to decode the body.
+        """
+        file = StringIO(raw_stat)
+        name = name_fmt.unpack(file)
         stat_format = stat_fmt_of.get(name)
         stat_format.set_unique_strings_container(self.unique_strings)
-        return name, f, stat_format
+        return name, file, stat_format
 
     def read_stats_section(self):
         # We may have to read a ustrings section

=== modified file 'tests/test_stats.py'
--- a/tests/test_stats.py	2007-10-07 14:15:30 +0000
+++ b/tests/test_stats.py	2007-10-07 19:52:40 +0000
@@ -34,8 +34,6 @@
         self._orig_dir = os.getcwdu()
         os.chdir(self._test_dir)
 
-        self.st = stats.get_stats()
-
     def tearDown(self):
         os.chdir(self._orig_dir)
         osutils.rmtree(self._test_dir)

=== modified file 'tests/test_statsfile.py'
--- a/tests/test_statsfile.py	2007-10-07 14:15:30 +0000
+++ b/tests/test_statsfile.py	2007-10-07 19:52:40 +0000
@@ -28,7 +28,7 @@
 class TestUniqueStrings(tests.TestCase):
 
     def test_add(self):
-        us = statsfile.ByteLongUniqueStringsContainer(2)
+        us = statsfile.UniqueByteLongStringsContainer(2)
         self.assertEquals(0, us.add('a'))
         self.assertEquals(1, us.add('b'))
         self.assertEquals(0, us.add('a'))
@@ -36,7 +36,7 @@
 
     def test_write(self):
         f = StringIO()
-        us = statsfile.ByteLongUniqueStringsContainer(2)
+        us = statsfile.UniqueByteLongStringsContainer(2)
         self.assertEquals(0, us.add('first'))
         self.assertEquals(1, us.add('second'))
         us.write(f)
@@ -45,35 +45,35 @@
 
     def test_read(self):
         f = StringIO('\x03\x05first\x06second\x05third')
-        us = statsfile.ByteLongUniqueStringsContainer(3)
+        us = statsfile.UniqueByteLongStringsContainer(3)
         us.read(f)
         self.assertEquals('first', us.get(0))
         self.assertEquals('second', us.get(1))
         self.assertEquals('third', us.get(2))
 
     def test_pack(self):
-        fs_class = serialize.fs_registry.get('us')
-        container = statsfile.ByteLongUniqueStringsContainer()
+        fs_class = serialize.format_specifier_registry.get('us')
+        container = statsfile.UniqueByteLongStringsContainer()
         fs = fs_class('format')
         f = StringIO()
-        self.assertRaises(NotImplementedError, fs.pack, f, 'foo')
+        self.assertRaises(AttributeError, fs.pack, f, 'foo')
         fs._get_unique_strings_container = lambda: container
         fs.pack(f, 'foo')
         self.assertEquals('\x00', f.getvalue())
         self.assertEquals(0, container.add('foo'))
 
     def test_unpack(self):
-        fs_class = serialize.fs_registry.get('us')
-        container = statsfile.ByteLongUniqueStringsContainer()
+        fs_class = serialize.format_specifier_registry.get('us')
+        container = statsfile.UniqueByteLongStringsContainer()
         fs = fs_class('format')
         container.read(StringIO('\x01\x03foo'))
         self.assertEquals('foo', container.get(0))
-        self.assertRaises(NotImplementedError, fs.unpack, StringIO('\x00'))
+        self.assertRaises(AttributeError, fs.unpack, StringIO('\x00'))
         fs._get_unique_strings_container = lambda: container
         self.assertEquals('foo', fs.unpack(StringIO('\x00')))
 
     def test_empty(self):
-        us = statsfile.ByteLongUniqueStringsContainer()
+        us = statsfile.UniqueByteLongStringsContainer()
         self.assertEquals(0, len(us))
         us.add('a')
         self.assertEquals(1, len(us))
@@ -125,7 +125,8 @@
     def test_write_stat(self):
         f = StringIO()
         sf = statsfile.StatsFile(f)
-        name_fmt_spec = serialize.fs_registry.get('s')('_stat_name')
+        name_fmt_spec = serialize.format_specifier_registry.get('s')(
+            '_stat_name')
         stat_fmt = statsfile.StatsFormat('%(str)s')
         sf.write_stat(name_fmt_spec, 'foo', stat_fmt, 'bar')
         sf.flush()
@@ -136,7 +137,8 @@
     def test_write_stat_with_strings(self):
         f = StringIO()
         sf = statsfile.StatsFile(f)
-        name_fmt_spec = serialize.fs_registry.get('s')('_stat_name')
+        name_fmt_spec = serialize.format_specifier_registry.get('s')(
+            '_stat_name')
         stat_fmt = statsfile.StatsFormat('%(str)us')
         sf.write_stat(name_fmt_spec, 'foo', stat_fmt, 'bar')
         sf.write_stat(name_fmt_spec, 'bar', stat_fmt, 'bar')