Rev 14: Replace stat names by indices in file for another 30% size in file:///v/home/vila/.bazaar/plugins/transportstats/
Vincent Ladeuil
v.ladeuil+lp at free.fr
Sun Oct 7 20:52:42 BST 2007
At file:///v/home/vila/.bazaar/plugins/transportstats/
------------------------------------------------------------
revno: 14
revision-id: v.ladeuil+lp at free.fr-20071007195240-cf3yoc88xn9sb2ty
parent: v.ladeuil+lp at free.fr-20071007141530-v9ln5myvecno4v6w
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: transportstats
timestamp: Sun 2007-10-07 21:52:40 +0200
message:
Replace stat names by indices in file for another 30% size
reduction. Stats file for bzr branch http://bzr.dev is now under
1M for ~70M downloaded.
* tests/test_statsfile.py:
Cosmetic changes.
* tests/test_stats.py:
(TestStats.setUp): Remove dead code.
* statsfile.py:
(UniqueByteLongStringsContainer): Renamed from
ByteLongUniqueStringsContainer.
(UniqueByteLongStringsContainer.add): Add write_only parameter to
be able to use the container in read/write mode.
(UniqueByteLongString.__init__): Allows the container to be
specified at construction time again.
* stats.py:
(StatsRegistry): New class. Centralize stats declaration to ensure
proper declarations and allow replacing stat names in the file by
indices.
* serialize.py:
Global replace fs_ by format_specifier for readability.
* TODO:
Add results after replacing stat names with indices.
Document next obvious steps.
modified:
TODO todo-20070929082541-nqthugj4zxx7hufy-1
serialize.py serialize.py-20071005112454-5v72oa7pqcdhjytc-1
stats.py stats.py-20070928061304-7i3r2h4gg6rbi03e-1
statsfile.py structuredfile.py-20070927123433-krruwbx4mkalu3xs-1
tests/test_stats.py test_stats.py-20071005112458-fpbdqbcq3pr2r2c0-1
tests/test_statsfile.py test_tsfile.py-20070929082603-rmpewiprgrp8d4ej-1
-------------- next part --------------
=== modified file 'TODO'
--- a/TODO 2007-10-07 14:15:30 +0000
+++ b/TODO 2007-10-07 19:52:40 +0000
@@ -1,3 +1,17 @@
+- implement missing transport methods in the decorator
+
+- exploit statistics for:
+* latency (measured by start/stop around requests)
+* bytes written
+* number of requests
+* number of files (watch for renames to avoid pollution)
+
+Requests here roughly means any operation involving sending
+something on the network.
+
+- rename stats to transport.<method>
+ No need to restrict the stats collection to transports.
+
- Implements pack/unpack for formats
The aim is to provide a robust, cross-platform, reasonably fast
@@ -14,7 +28,7 @@
* i: int
* I: unsigned int
* l: long
-* L: unsigned long
+* L: unsigned long /done
* f: float
* d: double
* s: length-prefixed (one byte) string / done
@@ -22,7 +36,7 @@
In a second step, prefixes for string specifiers (sS):
-* u: index (one byte) to a unique string
+* u: index (one byte) to a unique string / done
* U: index (two bytes) to a unique string
Note: to keep the resulting encoding the most streamable as
@@ -67,16 +81,6 @@
# format: "%(name)us%(base)us%(relpath)us%(start)f%(end)f"
def stat(self, relpath):
-- share strings by replacing all strings by indices
-
-The constraints are that several collections may define different
-indices for the same string (since we don't want to reload
-previously collected strings before starting a new
-collection). At exploitation time, indices should be associated
-with the currently define set of strings.
-
-The aim is to reduce the StatsFile size.
-
- change the stats file format to:
file: header marker sections*
@@ -98,7 +102,7 @@
stat: %(stat_id)u stat_specific_content
-Format specifiers needed: BHsuU
+Format specifiers needed: BHLsuU
- add method decorators on all transport decorator methods
@@ -112,6 +116,9 @@
* define the method to be executed when collecting
* define the method to be executed when exploiting
+Note: looks overkill, the available mechanisms seems pretty
+efficient already.
+
- revision 11:
bzr branch stats+http+urllib://bazaar.launchpad.net/~bzr/bzr.webdav/webdav/
-rw-r--r-- 1 vila vila 15317 2007-09-30 15:35 .transport_stats_for_bzr
@@ -120,3 +127,24 @@
bzr branch stats+http+urllib://bazaar-vcs.org/bzr/bzr.dev/ tagada
-rw-r--r-- 1 vila vila 4441317 2007-09-30 15:41 .transport_stats_for_bzr
Bytes read: 68289613
+
+- revision 13:
+ bzr branch stats+http+urllib://bazaar.launchpad.net/~bzr/bzr.webdav/webdav/
+Branched 40 revision(s).
+ -rw-r--r-- 1 vila vila 5378 2007-10-07 16:16 .transport_stats_for_bzr
+Bytes read: 189785
+
+ bzr branch stats+http+urllib://bazaar-vcs.org/bzr/bzr.dev/ tagada
+Branched 2892 revision(s).
+ -rw-r--r-- 1 vila vila 1328776 2007-10-07 16:23 .transport_stats_for_bzr
+Bytes read: 69083460
+
+- revision 14
+ bzr branch stats+http+urllib://bazaar.launchpad.net/~bzr/bzr.webdav/webdav/ -r 40
+Branched 40 revision(s).
+-rw-r--r-- 1 vila vila 4157 2007-10-07 18:36 .transport_stats_for_bzr
+Bytes read: 189785
+ bzr branch stats+http+urllib://bazaar-vcs.org/bzr/bzr.dev/ -r 2892
+Branched 2892 revision(s).
+-rw-r--r-- 1 vila vila 969647 2007-10-07 18:41 .transport_stats_for_bzr
+Bytes read: 69083460
=== modified file 'serialize.py'
--- a/serialize.py 2007-10-07 14:15:30 +0000
+++ b/serialize.py 2007-10-07 19:52:40 +0000
@@ -22,14 +22,12 @@
from bzrlib.plugins.transportstats import errors
-fs_registry = registry.Registry()
-"""Registry of format specifiers."""
-
class FormatSpecifier(object):
def __init__(self, name):
self.name = name
+
class UnsignedByte(FormatSpecifier):
def pack(self, file, byte):
@@ -73,10 +71,14 @@
return str
-fs_registry.register('B', UnsignedByte)
-fs_registry.register('H', UnsignedShort)
-fs_registry.register('L', UnsignedLong)
-fs_registry.register('s', ByteLongString)
+format_specifier_registry = registry.Registry()
+"""Registry of format specifiers."""
+
+
+format_specifier_registry.register('B', UnsignedByte)
+format_specifier_registry.register('H', UnsignedShort)
+format_specifier_registry.register('L', UnsignedLong)
+format_specifier_registry.register('s', ByteLongString)
class Format(object):
@@ -102,10 +104,11 @@
format = part[closing+1:].rstrip() # Get rid of optional spaces
try:
- fmt_spec_class = fs_registry.get(format)
+ fmt_spec_class = format_specifier_registry.get(format)
except KeyError:
# FIXME: Report a bug on bzrlib.registry, we should have a way
- # to use fs_registry.get(format, None) as on any python dict
+ # to use format_specifier_registry.get(format, None) as on any
+ # python dict
fmt_spec_class = None
if fmt_spec_class is None:
raise errors.MalformedFormat(
=== modified file 'stats.py'
--- a/stats.py 2007-10-07 14:15:30 +0000
+++ b/stats.py 2007-10-07 19:52:40 +0000
@@ -89,16 +89,37 @@
self.relpath = relpath
-stats_registry = registry.Registry()
-"""Registry of statictics."""
-
-
-def register_stat(name, description):
- # Unique strings container will be provided later
- format = statsfile.StatsFormat(description)
- stats_registry.register(name, format)
-
-
+class StatsRegistry(registry.Registry):
+ """Statitics registry."""
+
+ def __init__(self):
+ super(StatsRegistry, self).__init__()
+ self._stat_names = statsfile.UniqueByteLongStringsContainer()
+ self._stat_name_serializer = \
+ serialize.format_specifier_registry.get('us')('stat_name',
+ self._stat_names)
+
+ def register(self, key, description,
+ help=None, info=None, override_existing=False):
+ format = statsfile.StatsFormat(description)
+ super(StatsRegistry, self).register(key, format, help, info,
+ override_existing)
+ self._stat_names.add(key, write_only=False)
+
+ @property
+ def stat_names(self):
+ return self._stat_names
+
+ @property
+ def stat_name_serializer(self):
+ return self._stat_name_serializer
+
+
+stats_registry = StatsRegistry()
+
+
+# Shortcut
+register_stat = stats_registry.register
register_stat('has', '%(base)us%(relpath)us')
register_stat('get', '%(base)us%(relpath)us%(bytes_read)L')
register_stat('get_bytes', '%(base)us%(relpath)us%(bytes_read)L')
@@ -109,13 +130,12 @@
class Stats(object):
_default_name = '.transport_stats_for_bzr'
- _name_serializer = serialize.fs_registry.get('s')('_stat_name')
+ _name_serializer = stats_registry.stat_name_serializer
def __init__(self):
self._sfile = None
self._mode = None
self._opened = 0
-# self._stat_names = statsfile.ByteLongUniqueStringsContainer()
def _ensure_mode(self, mode):
if self._mode != mode:
@@ -191,8 +211,8 @@
def __iter__(self):
self._ensure_mode('exploiting')
for stat in self._sfile:
- name, f, stat_format = self._sfile.read_stat_name(
- stat, self._name_serializer,stats_registry)
+ name, file, stat_format = self._sfile.read_stat(
+ stat, self._name_serializer, stats_registry)
class_name = '_' + name + '_stat'
stat_class = globals().get(class_name, None)
@@ -200,7 +220,7 @@
trace.warning(
'Unknown method in stats file: %s' % class_name )
else:
- args = stat_format.unpack(f)
+ args = stat_format.unpack(file)
yield stat_class(*args)
def nb_bytes_read(self):
=== modified file 'statsfile.py'
--- a/statsfile.py 2007-10-07 14:15:30 +0000
+++ b/statsfile.py 2007-10-07 19:52:40 +0000
@@ -27,10 +27,11 @@
# FIXME: implement array format specifier and use it
-class ByteLongUniqueStringsContainer(object):
+class UniqueByteLongStringsContainer(object):
- _ubyte_serializer = serialize.fs_registry.get('B')('_nb_ustrings')
- _string_serializer = serialize.fs_registry.get('s')('_string')
+ _ubyte_serializer = serialize.format_specifier_registry.get('B')(
+ '_nb_ustrings')
+ _string_serializer = serialize.format_specifier_registry.get('s')('_string')
def __init__(self, size=256):
if size > 256:
@@ -50,13 +51,15 @@
def free_slots(self):
return self._size - len(self.idx_of)
- def add(self, str):
+ def add(self, str, write_only=True):
idx = self.idx_of.get(str, None)
if idx is None:
if self.free_slots <= 0:
raise errors.Overflow(self.size_max)
idx = len(self.idx_of)
self.idx_of[str] = idx
+ if not write_only:
+ self.strings.append(str)
return idx
def get(self, idx):
@@ -86,17 +89,24 @@
class UniqueByteLongString(serialize.UnsignedByte):
+ def __init__(self, name, container=None):
+ super(UniqueByteLongString, self).__init__(name)
+ if container is not None:
+ self._get_unique_strings_container = lambda: container
+
@property
def strings(self):
"""Unique strings container accessor.
Users are responsible to define a proper implementation before pack or
- unpack is called.
+ unpack is called. See __init__ for an example of container provided at
+ build time and StatsFormat.compile for an example of container provided
+ after the UniqueByteLongString have been built.
"""
return self._get_unique_strings_container()
def _get_unique_strings_container(self):
- raise NotImplementedError(self._get_unique_strings_container)
+ return None
def pack(self, file, str):
idx = self.strings.add(str)
@@ -107,7 +117,7 @@
return self.strings.get(idx)
-serialize.fs_registry.register('us', UniqueByteLongString)
+serialize.format_specifier_registry.register('us', UniqueByteLongString)
class StatsFormat(serialize.Format):
@@ -180,8 +190,9 @@
# FIXME: implement array format specifier and use it
class StatsContainer(list):
- _ubyte_serializer = serialize.fs_registry.get('H')('_nb_stats')
- _string_serializer = serialize.fs_registry.get('s')('_stat')
+ _ubyte_serializer = serialize.format_specifier_registry.get('H')(
+ '_nb_stats')
+ _string_serializer = serialize.format_specifier_registry.get('s')('_stat')
def __init__(self, max_size=65536):
super(StatsContainer, self).__init__()
@@ -238,7 +249,7 @@
def __init__(self, file):
super(StatsFile, self).__init__(file)
- self.unique_strings = ByteLongUniqueStringsContainer()
+ self.unique_strings = UniqueByteLongStringsContainer()
self.stats = StatsContainer()
@classmethod
@@ -294,9 +305,16 @@
def read_header(self, header_name):
return self._header_of[header_name].read(self._file)
- # FIXME: should use a true format instead of a format_specifier for name
def write_stat(self, name_fmt_spec, name, stat_fmt, *args):
+ """Writes a statistic to the file.
+
+
+ The name and the body are encoded separately to that a true object can
+ be build at read time, deducing the class from the name.
+ """
if self.stats.is_full():
+ # Look Before You Leap (LBLY), otherwise unique strings and stats
+ # may be desynchronized
self.flush()
f = StringIO()
name_fmt_spec.pack(f, name)
@@ -304,16 +322,24 @@
try:
stat_fmt.pack(f, *args)
except errors.Overflow:
+ # No enough rooms in the unique strings
self.flush()
+ # Try again, that should succeed
stat_fmt.pack(f, *args)
self.stats.append(f.getvalue())
- def read_stat_name(self, raw_stat, name_fmt, stat_fmt_of):
- f = StringIO(raw_stat)
- name = name_fmt.unpack(f)
+ def read_stat(self, raw_stat, name_fmt, stat_fmt_of):
+ """Read a statistic name and provides body data.
+
+ :returns: name, file, stat_format. name is the statistic name, file a
+ file-like object containing the stat body data and stat_format the
+ format able to decode the body.
+ """
+ file = StringIO(raw_stat)
+ name = name_fmt.unpack(file)
stat_format = stat_fmt_of.get(name)
stat_format.set_unique_strings_container(self.unique_strings)
- return name, f, stat_format
+ return name, file, stat_format
def read_stats_section(self):
# We may have to read a ustrings section
=== modified file 'tests/test_stats.py'
--- a/tests/test_stats.py 2007-10-07 14:15:30 +0000
+++ b/tests/test_stats.py 2007-10-07 19:52:40 +0000
@@ -34,8 +34,6 @@
self._orig_dir = os.getcwdu()
os.chdir(self._test_dir)
- self.st = stats.get_stats()
-
def tearDown(self):
os.chdir(self._orig_dir)
osutils.rmtree(self._test_dir)
=== modified file 'tests/test_statsfile.py'
--- a/tests/test_statsfile.py 2007-10-07 14:15:30 +0000
+++ b/tests/test_statsfile.py 2007-10-07 19:52:40 +0000
@@ -28,7 +28,7 @@
class TestUniqueStrings(tests.TestCase):
def test_add(self):
- us = statsfile.ByteLongUniqueStringsContainer(2)
+ us = statsfile.UniqueByteLongStringsContainer(2)
self.assertEquals(0, us.add('a'))
self.assertEquals(1, us.add('b'))
self.assertEquals(0, us.add('a'))
@@ -36,7 +36,7 @@
def test_write(self):
f = StringIO()
- us = statsfile.ByteLongUniqueStringsContainer(2)
+ us = statsfile.UniqueByteLongStringsContainer(2)
self.assertEquals(0, us.add('first'))
self.assertEquals(1, us.add('second'))
us.write(f)
@@ -45,35 +45,35 @@
def test_read(self):
f = StringIO('\x03\x05first\x06second\x05third')
- us = statsfile.ByteLongUniqueStringsContainer(3)
+ us = statsfile.UniqueByteLongStringsContainer(3)
us.read(f)
self.assertEquals('first', us.get(0))
self.assertEquals('second', us.get(1))
self.assertEquals('third', us.get(2))
def test_pack(self):
- fs_class = serialize.fs_registry.get('us')
- container = statsfile.ByteLongUniqueStringsContainer()
+ fs_class = serialize.format_specifier_registry.get('us')
+ container = statsfile.UniqueByteLongStringsContainer()
fs = fs_class('format')
f = StringIO()
- self.assertRaises(NotImplementedError, fs.pack, f, 'foo')
+ self.assertRaises(AttributeError, fs.pack, f, 'foo')
fs._get_unique_strings_container = lambda: container
fs.pack(f, 'foo')
self.assertEquals('\x00', f.getvalue())
self.assertEquals(0, container.add('foo'))
def test_unpack(self):
- fs_class = serialize.fs_registry.get('us')
- container = statsfile.ByteLongUniqueStringsContainer()
+ fs_class = serialize.format_specifier_registry.get('us')
+ container = statsfile.UniqueByteLongStringsContainer()
fs = fs_class('format')
container.read(StringIO('\x01\x03foo'))
self.assertEquals('foo', container.get(0))
- self.assertRaises(NotImplementedError, fs.unpack, StringIO('\x00'))
+ self.assertRaises(AttributeError, fs.unpack, StringIO('\x00'))
fs._get_unique_strings_container = lambda: container
self.assertEquals('foo', fs.unpack(StringIO('\x00')))
def test_empty(self):
- us = statsfile.ByteLongUniqueStringsContainer()
+ us = statsfile.UniqueByteLongStringsContainer()
self.assertEquals(0, len(us))
us.add('a')
self.assertEquals(1, len(us))
@@ -125,7 +125,8 @@
def test_write_stat(self):
f = StringIO()
sf = statsfile.StatsFile(f)
- name_fmt_spec = serialize.fs_registry.get('s')('_stat_name')
+ name_fmt_spec = serialize.format_specifier_registry.get('s')(
+ '_stat_name')
stat_fmt = statsfile.StatsFormat('%(str)s')
sf.write_stat(name_fmt_spec, 'foo', stat_fmt, 'bar')
sf.flush()
@@ -136,7 +137,8 @@
def test_write_stat_with_strings(self):
f = StringIO()
sf = statsfile.StatsFile(f)
- name_fmt_spec = serialize.fs_registry.get('s')('_stat_name')
+ name_fmt_spec = serialize.format_specifier_registry.get('s')(
+ '_stat_name')
stat_fmt = statsfile.StatsFormat('%(str)us')
sf.write_stat(name_fmt_spec, 'foo', stat_fmt, 'bar')
sf.write_stat(name_fmt_spec, 'bar', stat_fmt, 'bar')
More information about the bazaar-commits
mailing list