Rev 48: Improve the parser so that it consumes less memory by caching repeated strings (branch: http://bzr.arbash-meinel.com/plugins/cvsps_import_trunk)

Fri Feb 2 00:12:12 GMT 2007

At http://bzr.arbash-meinel.com/plugins/cvsps_import_trunk

------------------------------------------------------------
revno: 48
revision-id: john at arbash-meinel.com-20070202001204-ar9r63nvyqdqznph
parent: john at arbash-meinel.com-20070201230133-trt6hpkupwbbxo98
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: cvsps_import_trunk
timestamp: Thu 2007-02-01 18:12:04 -0600
message:
  Improve the parser so that it consumes less memory by caching repeated strings
modified:
  cvsps/parser.py                parser.py-20061121005859-rf85jkcpq6bbb3g7-3
-------------- next part --------------
=== modified file 'cvsps/parser.py'

--- a/cvsps/parser.py	2006-11-30 22:26:46 +0000
+++ b/cvsps/parser.py	2007-02-02 00:12:04 +0000
@@ -93,6 +93,19 @@
 
         self._patchsets = []
 
+        # Just keep a dict of strings that have been parsed, so we can save
+        # some memory for all of the duplicated strings.
+        self._string_cache = {}
+        self._cache_hits = 0
+
+    def _cache(self, s):
+        """Get a cached version of the string to decrease memory overhead."""
+        if s not in self._string_cache:
+            self._string_cache[s] = s
+            return s
+        self._cache_hits += 1
+        return self._string_cache[s]
+
     def _handle_ancestor_branch(self, line):
         """Handle and 'Author:' line"""
         assert line.startswith(self.PREFIX_ANCESTOR_BRANCH)
@@ -104,14 +117,14 @@
         assert line.startswith(self.PREFIX_AUTHOR)
         assert self._patchset is not None
         author = line[len(self.PREFIX_AUTHOR):].rstrip()
-        self._patchset.author = author.decode(self._encoding)
+        self._patchset.author = self._cache(author.decode(self._encoding))
 
     def _handle_branch(self, line):
         """Handle a 'Branch:' line"""
         assert line.startswith(self.PREFIX_BRANCH)
         assert self._patchset is not None
         branch = line[len(self.PREFIX_BRANCH):].rstrip()
-        self._patchset.branch = branch
+        self._patchset.branch = self._cache(branch)
 
     def _handle_dashes(self, line):
         """Parse an all dashes line"""
@@ -130,7 +143,7 @@
         time_tuple = time.strptime(date, '%Y/%m/%d %H:%M:%S')
         timestamp = round(time.mktime(time_tuple), 3)
 
-        self._patchset.date = date
+        self._patchset.date = self._cache(date)
         self._patchset.timestamp = timestamp
         self._patchset.time_offset = 0
 
@@ -165,9 +178,11 @@
         if ':' not in line:
             return
         fname, version = line[1:].rsplit(':', 1)
+        fname = self._cache(fname)
         versions = version.split('->')
         assert len(versions) == 2
-        self._patchset.members.append((fname, versions[-1].strip()))
+        version = self._cache(versions[-1].strip())
+        self._patchset.members.append((fname, version))
 
     def _handle_patchset(self, line):
         """Parse a PatchSet line"""
@@ -204,6 +219,8 @@
 
         self._pb = pb
 
+        self._string_cache.clear()
+
         handlers = {
             self.PREFIX_ANCESTOR_BRANCH:self._handle_ancestor_branch,
             self.PREFIX_AUTHOR:self._handle_author,
@@ -247,13 +264,21 @@
                     break
             else:
                 assert False, 'Failed to process: %r' % (line,)
-                    
+
         # We've processed all the lines
         if self._patchset is not None:
             self._patchsets.append(self._patchset)
             self._patchset = None
         if self._pb is not None:
             self._pb.update('reading patchsets', 0, len(self._patchsets))
+
+            self._pb.note('Read %s patchsets (string cache hits: %s, total: %s)',
+                len(self._patchsets), self._cache_hits,
+                len(self._string_cache))
+
+        self._string_cache.clear()
+        self._cache_hits = 0
+
         self._pb = None
         return self._patchsets