Rev 48: Improve the parser so that it consumes less memory by caching repeated strings in http://bzr.arbash-meinel.com/plugins/cvsps_import_trunk
John Arbash Meinel
john at arbash-meinel.com
Fri Feb 2 00:12:12 GMT 2007
At http://bzr.arbash-meinel.com/plugins/cvsps_import_trunk
------------------------------------------------------------
revno: 48
revision-id: john at arbash-meinel.com-20070202001204-ar9r63nvyqdqznph
parent: john at arbash-meinel.com-20070201230133-trt6hpkupwbbxo98
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: cvsps_import_trunk
timestamp: Thu 2007-02-01 18:12:04 -0600
message:
Improve the parser so that it consumes less memory by caching repeated strings
modified:
cvsps/parser.py parser.py-20061121005859-rf85jkcpq6bbb3g7-3
-------------- next part --------------
=== modified file 'cvsps/parser.py'
--- a/cvsps/parser.py 2006-11-30 22:26:46 +0000
+++ b/cvsps/parser.py 2007-02-02 00:12:04 +0000
@@ -93,6 +93,19 @@
self._patchsets = []
+ # Just keep a dict of strings that have been parsed, so we can save
+ # some memory for all of the duplicated strings.
+ self._string_cache = {}
+ self._cache_hits = 0
+
+ def _cache(self, s):
+ """Get a cached version of the string to decrease memory overhead."""
+ if s not in self._string_cache:
+ self._string_cache[s] = s
+ return s
+ self._cache_hits += 1
+ return self._string_cache[s]
+
def _handle_ancestor_branch(self, line):
"""Handle and 'Author:' line"""
assert line.startswith(self.PREFIX_ANCESTOR_BRANCH)
@@ -104,14 +117,14 @@
assert line.startswith(self.PREFIX_AUTHOR)
assert self._patchset is not None
author = line[len(self.PREFIX_AUTHOR):].rstrip()
- self._patchset.author = author.decode(self._encoding)
+ self._patchset.author = self._cache(author.decode(self._encoding))
def _handle_branch(self, line):
"""Handle a 'Branch:' line"""
assert line.startswith(self.PREFIX_BRANCH)
assert self._patchset is not None
branch = line[len(self.PREFIX_BRANCH):].rstrip()
- self._patchset.branch = branch
+ self._patchset.branch = self._cache(branch)
def _handle_dashes(self, line):
"""Parse an all dashes line"""
@@ -130,7 +143,7 @@
time_tuple = time.strptime(date, '%Y/%m/%d %H:%M:%S')
timestamp = round(time.mktime(time_tuple), 3)
- self._patchset.date = date
+ self._patchset.date = self._cache(date)
self._patchset.timestamp = timestamp
self._patchset.time_offset = 0
@@ -165,9 +178,11 @@
if ':' not in line:
return
fname, version = line[1:].rsplit(':', 1)
+ fname = self._cache(fname)
versions = version.split('->')
assert len(versions) == 2
- self._patchset.members.append((fname, versions[-1].strip()))
+ version = self._cache(versions[-1].strip())
+ self._patchset.members.append((fname, version))
def _handle_patchset(self, line):
"""Parse a PatchSet line"""
@@ -204,6 +219,8 @@
self._pb = pb
+ self._string_cache.clear()
+
handlers = {
self.PREFIX_ANCESTOR_BRANCH:self._handle_ancestor_branch,
self.PREFIX_AUTHOR:self._handle_author,
@@ -247,13 +264,21 @@
break
else:
assert False, 'Failed to process: %r' % (line,)
-
+
# We've processed all the lines
if self._patchset is not None:
self._patchsets.append(self._patchset)
self._patchset = None
if self._pb is not None:
self._pb.update('reading patchsets', 0, len(self._patchsets))
+
+ self._pb.note('Read %s patchsets (string cache hits: %s, total: %s)',
+ len(self._patchsets), self._cache_hits,
+ len(self._string_cache))
+
+ self._string_cache.clear()
+ self._cache_hits = 0
+
self._pb = None
return self._patchsets
More information about the bazaar-commits mailing list