Rev 5122: (mbp, for jspashett) cope with non-utf8 data in the ignore file in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Tue Mar 30 06:12:30 BST 2010

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5122 [merge]
revision-id: pqm at pqm.ubuntu.com-20100330051224-a8ubi9w2cd0upyb5
parent: pqm at pqm.ubuntu.com-20100329080616-84azimjwafaukcey
parent: mbp at sourcefrog.net-20100330040915-waagxocidz7mr07h
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2010-03-30 06:12:24 +0100
message:
  (mbp, for jspashett) cope with non-utf8 data in the ignore file
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/ignores.py              ignores.py-20060712153832-2von9l0t7p43ixsv-1
  bzrlib/tests/test_ignores.py   test_ignores.py-20060712172354-vqq9ln0t8di27v53-1
=== modified file 'NEWS'

--- a/NEWS	2010-03-29 06:37:23 +0000
+++ b/NEWS	2010-03-30 04:09:15 +0000
@@ -139,6 +139,9 @@
   ``add``.
   (Parth Malwankar, #335033, #300001)
 
+* Cope with non-utf8 characters inside ``.bzrignore``.
+  (Jason Spashett, #183504)
+
 * Correctly interpret "451 Rename/move failure: Directory not empty" from
   ftp servers while trying to take a lock.
   (Martin Pool, #528722)

=== modified file 'bzrlib/ignores.py'
--- a/bzrlib/ignores.py	2009-03-23 14:59:43 +0000
+++ b/bzrlib/ignores.py	2010-03-29 00:54:27 +0000
@@ -25,6 +25,8 @@
     globbing,
     )
 
+from trace import warning
+
 # This was the full ignore list for bzr 0.8
 # please keep these sorted (in C locale order) to aid merging
 OLD_DEFAULTS = [
@@ -100,10 +102,34 @@
 ]
 
 
+
 def parse_ignore_file(f):
-    """Read in all of the lines in the file and turn it into an ignore list"""
+    """Read in all of the lines in the file and turn it into an ignore list
+    
+    Continue in the case of utf8 decoding errors, and emit a warning when 
+    such an error is found. Optimise for the common case -- no decoding
+    errors.
+    """
     ignored = set()
-    for line in f.read().decode('utf8').split('\n'):
+    ignore_file = f.read()
+    try:
+        # Try and parse whole ignore file at once.
+        unicode_lines = ignore_file.decode('utf8').split('\n')
+    except UnicodeDecodeError:
+        # Otherwise go through line by line and pick out the 'good'
+        # decodable lines
+        lines = ignore_file.split('\n')
+        unicode_lines = []    
+        for line_number, line in enumerate(lines):
+            try:
+                unicode_lines.append(line.decode('utf-8'))
+            except UnicodeDecodeError:
+                # report error about line (line_number+1)
+                warning('.bzrignore: On Line #%d, malformed utf8 character. '
+                        'Ignoring line.' % (line_number+1))
+    
+    # Append each line to ignore list if it's not a comment line
+    for line in unicode_lines:
         line = line.rstrip('\r\n')
         if not line or line.startswith('#'):
             continue

=== modified file 'bzrlib/tests/test_ignores.py'
--- a/bzrlib/tests/test_ignores.py	2010-02-23 07:43:11 +0000
+++ b/bzrlib/tests/test_ignores.py	2009-12-26 21:31:56 +0000
@@ -50,6 +50,18 @@
     def test_parse_empty(self):
         ignored = ignores.parse_ignore_file(StringIO(''))
         self.assertEqual(set([]), ignored)
+        
+    def test_parse_non_utf8(self):
+        """Lines with non utf 8 characters should be discarded."""
+        ignored = ignores.parse_ignore_file(StringIO(
+                'utf8filename_a\n'
+                'invalid utf8\x80\n'
+                'utf8filename_b\n'
+                ))
+        self.assertEqual(set([
+                        'utf8filename_a',
+                        'utf8filename_b',
+                       ]), ignored)
 
 
 class TestUserIgnores(TestCaseInTempDir):