Rev 67: Fix/limit memory use in mpdiff generation for dealing with busy trees. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Wed Jan 21 04:43:08 GMT 2009


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 67
revision-id: robertc at robertcollins.net-20090121044253-cm1ogklzp31qpkvj
parent: robertc at robertcollins.net-20081202223633-et3bqd5i8d3qnu94
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2009-01-21 15:42:53 +1100
message:
  Fix/limit memory use in mpdiff generation for dealing with busy trees.
=== modified file 'index.py'
--- a/index.py	2008-12-02 22:36:33 +0000
+++ b/index.py	2009-01-21 04:42:53 +0000
@@ -775,18 +775,23 @@
             for file_version in item[2]:
                 file_versions.add((item[1], file_version))
         for file_id, file_keys in files.iteritems():
-            for diff, key in zip(repository.texts.make_mpdiffs(file_keys),
-                file_keys):
-                document_key = ('f',) + key
-                for hunk in diff.hunks:
-                    if type(hunk) == NewText:
-                        for line in hunk.lines:
-                            line_terms = _tokeniser_re.split(line)
-                            for term in line_terms:
-                                if not term:
-                                    continue
-                                posting_list = terms.setdefault((term,), set())
-                                posting_list.add(document_key)
+            file_keys = list(file_keys)
+            group_size = 100
+            groups = len(file_keys) / group_size + 1
+            for offset in range(groups):
+                file_key_group = file_keys[offset * group_size:(offset + 1) * group_size]
+                for diff, key in zip(repository.texts.make_mpdiffs(file_key_group),
+                    file_key_group):
+                    document_key = ('f',) + key
+                    for hunk in diff.hunks:
+                        if type(hunk) == NewText:
+                            for line in hunk.lines:
+                                line_terms = _tokeniser_re.split(line)
+                                for term in line_terms:
+                                    if not term:
+                                        continue
+                                    posting_list = terms.setdefault((term,), set())
+                                    posting_list.add(document_key)
         return terms.items()
 
 




More information about the bazaar-commits mailing list