[MERGE REVIEW] Binary file handling

Tue Apr 18 07:42:39 BST 2006

+1 with some comments.

On 16 Apr 2006, Aaron Bentley <aaron.bentley at utoronto.ca> wrote:
> Hi all,
>
> This patch adds support for binary file handling in diff and merge.
> That is, diffing a binary file will produce 'Binary files differ'.
>
> Text merges won't be performed on binary files-- if a scalar merge can't
> be performed, foo.BASE, foo.THIS and foo.OTHER will be emitted, with no
> foo.  (foo.OTHER will be versioned).

We may at some point want to allow you to force it by remerging the file.

> Binary files are defined as files containing the NUL character (\x00) in
> their first 1024 bytes.  Reportedly, this is the heuristic used by diff.
> ~ This does, unfortunately, mean that UTF-16 files will be treated as
> binary.
>
> It is tempting to test for ESC as well, since that would prevent bzr
> diff from messing up the user's terminal.  Text files containing ESC are
> relatively rare, but they do exist, e.g. shell scripts that set terminal
> colors.

If people have them it's probably OK to actually display them?

> === added file 'b/bzrlib/tests/test_patch.py'
> --- /dev/null
> +++ b/bzrlib/tests/test_patch.py
> @@ -0,0 +1,28 @@
> +# Copyright (C) 2006 by Canonical Ltd
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> +
> +from bzrlib.errors import BinaryFile
> +from bzrlib.patch import diff3
> +from bzrlib.tests import TestCaseInTempDir
> +
> +
> +class TestPatch(TestCaseInTempDir):
> +
> +    def test_diff3_binaries(self):
> +        file('this', 'wb').write('a')
> +        file('other', 'wb').write('a')
> +        file('base', 'wb').write('\x00')
> +        self.assertRaises(BinaryFile, diff3, 'unused', 'this', 'other', 'base')
>
> === added file 'b/bzrlib/tests/test_textfile.py'
> --- /dev/null
> +++ b/bzrlib/tests/test_textfile.py
> @@ -0,0 +1,51 @@
> +# Copyright (C) 2006 by Canonical Ltd
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> +
> +from StringIO import StringIO
> +
> +from bzrlib.errors import BinaryFile
> +from bzrlib.tests import TestCase, TestCaseInTempDir
> +from bzrlib.textfile import *
> +
> +
> +class TextFile(TestCase):
> +
> +    def test_text_file(self):
> +        s = StringIO('ab' * 2048)
> +        self.assertEqual(text_file(s).read(), s.getvalue())
> +        s = StringIO('a' * 1023 + '\x00')
> +        self.assertRaises(BinaryFile, text_file, s)
> +        s = StringIO('a' * 1024 + '\x00')
> +        self.assertEqual(text_file(s).read(), s.getvalue())
> +
> +    def test_check_text_lines(self):
> +        lines = ['ab' * 2048]
> +        check_text_lines(lines)
> +        lines = ['a' * 1023 + '\x00']
> +        self.assertRaises(BinaryFile, check_text_lines, lines)
> +        lines = ['a' * 1024 + '\x00']
> +        check_text_lines(lines)
> +
> +
> +class TextPath(TestCaseInTempDir):
> +
> +    def test_text_file(self):
> +        file('boo', 'wb').write('ab' * 2048)
> +        check_text_path('boo')
> +        file('boo', 'wb').write('a' * 1023 + '\x00')
> +        self.assertRaises(BinaryFile, check_text_path, 'boo')
> +        file('boo', 'wb').write('a' * 1024 + '\x00')
> +        check_text_path('boo')
>

> === added file 'b/bzrlib/textfile.py'
> --- /dev/null
> +++ b/bzrlib/textfile.py
> @@ -0,0 +1,47 @@
> +# Copyright (C) 2006 by Canonical Ltd
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> +
> +from itertools import chain
> +
> +from bzrlib.errors import BinaryFile
> +from bzrlib.iterablefile import IterableFile
> +from bzrlib.osutils import file_iterator

Could do with a module docstring, e.g. "Utilities for
distinguishing binary from text files".

> +
> +
> +def text_file(input):
> +    """Produce a file iterator that is guaranteed to be text, without seeking.
> +    BinaryFile is raised if the file contains a NUL in the first 1024 bytes.
> +    """
> +    first_chunk = input.read(1024)
> +    if '\x00' in first_chunk:
> +        raise BinaryFile()
> +    return IterableFile(chain((first_chunk,), file_iterator(input)))

> +
> +
> +def check_text_lines(lines):
> +    """Raise BinaryFile if the supplied lines contain NULs.
> +    Only the first 1024 characters are checked.
> +    """
> +    f = IterableFile(lines)
> +    if '\x00' in f.read(1024):
> +        raise BinaryFile()
> +
> +
> +def check_text_path(path):
> +    """Check whether the supplied path is a text, not binary file.
> +    Raise BinaryFile if a NUL occurs in the first 1024 bytes.
> +    """
> +    text_file(open(path, 'rb'))
>
> === modified file 'a/bzrlib/diff.py'
> --- a/bzrlib/diff.py
> +++ b/bzrlib/diff.py
> @@ -17,6 +17,7 @@
>  from bzrlib.delta import compare_trees
>  from bzrlib.errors import BzrError
>  from bzrlib.symbol_versioning import *
> +from bzrlib.textfile import check_text_lines
>  from bzrlib.trace import mutter
>
>  # TODO: Rather than building a changeset object, we should probably
> @@ -41,6 +42,9 @@
>      # both sequences are empty.
>      if not oldlines and not newlines:
>          return
> +
> +    check_text_lines(oldlines)
> +    check_text_lines(newlines)
>
>      ud = difflib.unified_diff(oldlines, newlines,
>                                fromfile=old_filename+'\t',

It might be good to have a parameter allowing you to see the diff even
if it looks binary - or will difflib fail?

>
> === modified file 'a/bzrlib/errors.py'
> --- a/bzrlib/errors.py
> +++ b/bzrlib/errors.py
> @@ -839,3 +839,7 @@
>          self.method = method
>          self.mname = method.__name__
>          self.tname = type(method_self).__name__
> +
> +
> +class BinaryFile(BzrNewError):
> +    """File is binary but should be text."""

Perhaps it should include a string for the filename?

>
> === modified file 'a/bzrlib/inventory.py'
> --- a/bzrlib/inventory.py
> +++ b/bzrlib/inventory.py
> @@ -37,9 +37,9 @@
>  import bzrlib
>  from bzrlib.osutils import (pumpfile, quotefn, splitpath, joinpath,
>                              pathjoin, sha_strings)
> +from bzrlib.errors import (NotVersionedError, InvalidEntryName,
> +                           BzrError, BzrCheckError, BinaryFile)
>  from bzrlib.trace import mutter
> -from bzrlib.errors import (NotVersionedError, InvalidEntryName,
> -                           BzrError, BzrCheckError)
>
>
>  class InventoryEntry(object):
> @@ -572,17 +572,20 @@
>      def _diff(self, text_diff, from_label, tree, to_label, to_entry, to_tree,
>               output_to, reverse=False):
>          """See InventoryEntry._diff."""
> -        from_text = tree.get_file(self.file_id).readlines()
> -        if to_entry:
> -            to_text = to_tree.get_file(to_entry.file_id).readlines()
> -        else:
> -            to_text = []
> -        if not reverse:
> -            text_diff(from_label, from_text,
> -                      to_label, to_text, output_to)
> -        else:
> -            text_diff(to_label, to_text,
> -                      from_label, from_text, output_to)
> +        try:
> +            from_text = tree.get_file(self.file_id).readlines()
> +            if to_entry:
> +                to_text = to_tree.get_file(to_entry.file_id).readlines()
> +            else:
> +                to_text = []
> +            if not reverse:
> +                text_diff(from_label, from_text,
> +                          to_label, to_text, output_to)
> +            else:
> +                text_diff(to_label, to_text,
> +                          from_label, from_text, output_to)
> +        except BinaryFile:
> +            print >> output_to, "Binary files differ"

Should this include the filenames?  I suppose it will come after the
heading and so be clear from that?

>
>      def has_text(self):
>          """See InventoryEntry.has_text."""
>
> === modified file 'a/bzrlib/merge.py'
> --- a/bzrlib/merge.py
> +++ b/bzrlib/merge.py
> @@ -35,6 +35,7 @@
>                             UnrelatedBranches,
>                             UnsupportedOperation,
>                             WorkingTreeNotRevision,
> +                           BinaryFile,
>                             )
>  from bzrlib.merge3 import Merge3
>  import bzrlib.osutils
> @@ -42,6 +43,7 @@
>  from progress import DummyProgress, ProgressPhase
>  from bzrlib.revision import common_ancestor, is_ancestor, NULL_REVISION
>  from bzrlib.symbol_versioning import *
> +from bzrlib.textfile import check_text_lines
>  from bzrlib.trace import mutter, warning, note
>  from bzrlib.transform import (TreeTransform, resolve_conflicts, cook_conflicts,
>                                FinalPaths, create_by_entry, unique_add)
> @@ -517,6 +519,18 @@
>              else:
>                  contents = None
>              return kind, contents
> +
> +        def contents_conflict():
> +            trans_id = self.tt.trans_id_file_id(file_id)
> +            name = self.tt.final_name(trans_id)
> +            parent_id = self.tt.final_parent(trans_id)
> +            if file_id in self.this_tree.inventory:
> +                self.tt.unversion_file(trans_id)
> +                self.tt.delete_contents(trans_id)
> +            file_group = self._dump_conflicts(name, parent_id, file_id,
> +                                              set_version=True)
> +            self._raw_conflicts.append(('contents conflict', file_group))
> +
>          # See SPOT run.  run, SPOT, run.
>          # So we're not QUITE repeating ourselves; we do tricky things with
>          # file kind...
> @@ -553,9 +567,12 @@
>                  # THIS and OTHER are both files, so text merge.  Either
>                  # BASE is a file, or both converted to files, so at least we
>                  # have agreement that output should be a file.
> +                try:
> +                    self.text_merge(file_id, trans_id)
> +                except BinaryFile:
> +                    return contents_conflict()
>                  if file_id not in self.this_tree.inventory:
>                      self.tt.version_file(file_id, trans_id)
> -                self.text_merge(file_id, trans_id)
>                  try:
>                      self.tt.tree_kind(trans_id)
>                      self.tt.delete_contents(trans_id)

> @@ -564,15 +581,7 @@
>                  return "modified"
>              else:
>                  # Scalar conflict, can't text merge.  Dump conflicts
> -                trans_id = self.tt.trans_id_file_id(file_id)
> -                name = self.tt.final_name(trans_id)
> -                parent_id = self.tt.final_parent(trans_id)
> -                if file_id in self.this_tree.inventory:
> -                    self.tt.unversion_file(trans_id)
> -                    self.tt.delete_contents(trans_id)
> -                file_group = self._dump_conflicts(name, parent_id, file_id,
> -                                                  set_version=True)
> -                self._raw_conflicts.append(('contents conflict', file_group))
> +                return contents_conflict()
>
>      def get_lines(self, tree, file_id):
>          """Return the lines in a file, or an empty list."""
> @@ -808,6 +817,9 @@
>          self._check_file(file_id)
>          lines = list(self._merged_lines(file_id))
>          conflicts = '<<<<<<< TREE\n' in lines
> +        # Note we're checking whether the OUTPUT is binary in this case,
> +        # because we don't want to get into weave merge guts.
> +        check_text_lines(lines)
>          self.tt.create_file(lines, trans_id)
>          if conflicts:
>              self._raw_conflicts.append(('text conflict', trans_id))
>
> === modified file 'a/bzrlib/merge3.py'
> --- a/bzrlib/merge3.py
> +++ b/bzrlib/merge3.py
> @@ -21,6 +21,7 @@
>
>  from difflib import SequenceMatcher
>  from bzrlib.errors import CantReprocessAndShowBase
> +from textfile import check_text_lines

Should be bzrlib.textfile.

>
>  def intersect(ra, rb):
>      """Given two ranges return the range where they intersect or None.
> @@ -66,6 +67,9 @@
>      incorporating the changes from both BASE->OTHER and BASE->THIS.
>      All three will typically be sequences of lines."""
>      def __init__(self, base, a, b):
> +        check_text_lines(base)
> +        check_text_lines(a)
> +        check_text_lines(b)
>          self.base = base
>          self.a = a
>          self.b = b
>
> === modified file 'a/bzrlib/patch.py'
> --- a/bzrlib/patch.py
> +++ b/bzrlib/patch.py
> @@ -3,6 +3,7 @@
>  from subprocess import Popen, PIPE
>
>  from bzrlib.errors import NoDiff3
> +from bzrlib.textfile import check_text_path
>  """
>  Diff and patch functionality
>  """
> @@ -51,6 +52,9 @@
>  def diff3(out_file, mine_path, older_path, yours_path):
>      def add_label(args, label):
>          args.extend(("-L", label))
> +    check_text_path(mine_path)
> +    check_text_path(older_path)
> +    check_text_path(yours_path)
>      args = ['diff3', "-E", "--merge"]
>      add_label(args, "TREE")
>      add_label(args, "ANCESTOR")
>
> === modified file 'a/bzrlib/tests/__init__.py'
> --- a/bzrlib/tests/__init__.py
> +++ b/bzrlib/tests/__init__.py
> @@ -954,6 +954,7 @@
>                     'bzrlib.tests.test_nonascii',
>                     'bzrlib.tests.test_options',
>                     'bzrlib.tests.test_osutils',
> +                   'bzrlib.tests.test_patch',
>                     'bzrlib.tests.test_permissions',
>                     'bzrlib.tests.test_plugins',
>                     'bzrlib.tests.test_progress',
> @@ -972,6 +973,7 @@
>                     'bzrlib.tests.test_store',
>                     'bzrlib.tests.test_symbol_versioning',
>                     'bzrlib.tests.test_testament',
> +                   'bzrlib.tests.test_textfile',
>                     'bzrlib.tests.test_trace',
>                     'bzrlib.tests.test_transactions',
>                     'bzrlib.tests.test_transform',
>
> === modified file 'a/bzrlib/tests/test_diff.py'
> --- a/bzrlib/tests/test_diff.py
> +++ b/bzrlib/tests/test_diff.py
> @@ -1,6 +1,10 @@
> +from cStringIO import StringIO
> +
> +from bzrlib.diff import internal_diff
> +from bzrlib.errors import BinaryFile
>  from bzrlib.tests import TestCase
> -from bzrlib.diff import internal_diff
> -from cStringIO import StringIO
> +
> +
>  def udiff_lines(old, new):
>      output = StringIO()
>      internal_diff('old', old, 'new', new, output)
> @@ -47,3 +51,8 @@
>          self.assert_('@@' in lines[2][2:])
>              ## "Unterminated hunk header for patch:\n%s" % "".join(lines)
>
> +    def test_binary_lines(self):
> +        self.assertRaises(BinaryFile, udiff_lines, [1023 * 'a' + '\x00'], [])
> +        self.assertRaises(BinaryFile, udiff_lines, [], [1023 * 'a' + '\x00'])
> +        udiff_lines([1024 * 'a' + '\x00'], [])
> +        udiff_lines([], [1024 * 'a' + '\x00'])

In several of these tests you check whether a \0 after the first kb will
be undetected.  Is this really a good test -- is it something that
should be a *requirement* of the program in future?  It seems to me that
it should not be tested; rather you should just make sure that a
file with no nuls at all will be OK.

> === modified file 'a/bzrlib/tests/test_inv.py'
> --- a/bzrlib/tests/test_inv.py
> +++ b/bzrlib/tests/test_inv.py
> @@ -139,21 +139,26 @@
>          self.wt = self.make_branch_and_tree('.')
>          self.branch = self.wt.branch
>          print >> open('file', 'wb'), 'foo'
> +        print >> open('binfile', 'wb'), 'foo'
>          self.wt.add(['file'], ['fileid'])
> +        self.wt.add(['binfile'], ['binfileid'])
>          if has_symlinks():
>              os.symlink('target1', 'symlink')
>              self.wt.add(['symlink'], ['linkid'])
>          self.wt.commit('message_1', rev_id = '1')
>          print >> open('file', 'wb'), 'bar'
> +        print >> open('binfile', 'wb'), 'x' * 1023 + '\x00'
>          if has_symlinks():
>              os.unlink('symlink')
>              os.symlink('target2', 'symlink')
>          self.tree_1 = self.branch.repository.revision_tree('1')
>          self.inv_1 = self.branch.repository.get_inventory('1')
>          self.file_1 = self.inv_1['fileid']
> +        self.file_1b = self.inv_1['binfileid']
>          self.tree_2 = self.wt
>          self.inv_2 = self.tree_2.read_working_inventory()
>          self.file_2 = self.inv_2['fileid']
> +        self.file_2b = self.inv_2['binfileid']
>          if has_symlinks():
>              self.link_1 = self.inv_1['linkid']
>              self.link_2 = self.inv_2['linkid']
> @@ -195,6 +200,13 @@
>                                              "+bar\n"
>                                              "\n")
>
> +    def test_file_diff_binary(self):
> +        output = StringIO()
> +        self.file_1.diff(internal_diff,
> +                          "/dev/null", self.tree_1,
> +                          "new_label", self.file_2b, self.tree_2,
> +                          output)
> +        self.assertEqual(output.getvalue(), "Binary files differ\n")
>      def test_link_diff_deleted(self):
>          if not has_symlinks():
>              return
>
> === modified file 'a/bzrlib/tests/test_merge3.py'
> --- a/bzrlib/tests/test_merge3.py
> +++ b/bzrlib/tests/test_merge3.py
> @@ -17,7 +17,7 @@
>
>  from bzrlib.tests import TestCaseInTempDir, TestCase
>  from bzrlib.merge3 import Merge3
> -from bzrlib.errors import CantReprocessAndShowBase
> +from bzrlib.errors import CantReprocessAndShowBase, BinaryFile
>
>  def split_lines(t):
>      from cStringIO import StringIO
> @@ -324,3 +324,6 @@
>          m_lines = m3.merge_lines('OTHER', 'THIS', reprocess=True,
>                                   base_marker='|||||||')
>          self.assertRaises(CantReprocessAndShowBase, list, m_lines)
> +
> +    def test_binary(self):
> +        self.assertRaises(BinaryFile, Merge3, ['\x00'], ['a'], ['b'])
>
> === modified file 'a/bzrlib/tests/test_merge_core.py'
> --- a/bzrlib/tests/test_merge_core.py
> +++ b/bzrlib/tests/test_merge_core.py
> @@ -280,8 +280,12 @@
>          builder = MergeBuilder()
>          builder.add_file("1", "TREE_ROOT", "name1", "text1", True)
>          builder.change_contents("1", other="text4", this="text3")
> +        builder.add_file("2", "TREE_ROOT", "name2", "text1", True)
> +        builder.change_contents("2", other="\x00", this="text3")
>          conflicts = builder.merge(merge_factory)
> -        self.assertEqual(conflicts, [TextConflict('name1', file_id='1')])
> +        self.assertEqual(conflicts, [TextConflict('name1', file_id='1'),
> +                                     ContentsConflict('name2', file_id='2')])
> +        self.assertEqual(builder.this.get_file('2').read(), '\x00')
>          builder.cleanup()
>
>      def test_symlink_conflicts(self):
>

-- 
Martin