[MERGE REVIEW] Binary file handling
Martin Pool
mbp at sourcefrog.net
Tue Apr 18 07:42:39 BST 2006
+1 with some comments.
On 16 Apr 2006, Aaron Bentley <aaron.bentley at utoronto.ca> wrote:
> Hi all,
>
> This patch adds support for binary file handling in diff and merge.
> That is, diffing a binary file will produce 'Binary files differ'.
>
> Text merges won't be performed on binary files-- if a scalar merge can't
> be performed, foo.BASE, foo.THIS and foo.OTHER will be emitted, with no
> foo. (foo.OTHER will be versioned).
We may at some point want to allow you to force it by remerging the file.
> Binary files are defined as files containing the NUL character (\x00) in
> their first 1024 bytes. Reportedly, this is the heuristic used by diff.
> ~ This does, unfortunately, mean that UTF-16 files will be treated as
> binary.
>
> It is tempting to test for ESC as well, since that would prevent bzr
> diff from messing up the user's terminal. Text files containing ESC are
> relatively rare, but they do exist, e.g. shell scripts that set terminal
> colors.
If people have them it's probably OK to actually display them?
> === added file 'b/bzrlib/tests/test_patch.py'
> --- /dev/null
> +++ b/bzrlib/tests/test_patch.py
> @@ -0,0 +1,28 @@
> +# Copyright (C) 2006 by Canonical Ltd
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> +
> +from bzrlib.errors import BinaryFile
> +from bzrlib.patch import diff3
> +from bzrlib.tests import TestCaseInTempDir
> +
> +
> +class TestPatch(TestCaseInTempDir):
> +
> + def test_diff3_binaries(self):
> + file('this', 'wb').write('a')
> + file('other', 'wb').write('a')
> + file('base', 'wb').write('\x00')
> + self.assertRaises(BinaryFile, diff3, 'unused', 'this', 'other', 'base')
>
> === added file 'b/bzrlib/tests/test_textfile.py'
> --- /dev/null
> +++ b/bzrlib/tests/test_textfile.py
> @@ -0,0 +1,51 @@
> +# Copyright (C) 2006 by Canonical Ltd
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> +
> +from StringIO import StringIO
> +
> +from bzrlib.errors import BinaryFile
> +from bzrlib.tests import TestCase, TestCaseInTempDir
> +from bzrlib.textfile import *
> +
> +
> +class TextFile(TestCase):
> +
> + def test_text_file(self):
> + s = StringIO('ab' * 2048)
> + self.assertEqual(text_file(s).read(), s.getvalue())
> + s = StringIO('a' * 1023 + '\x00')
> + self.assertRaises(BinaryFile, text_file, s)
> + s = StringIO('a' * 1024 + '\x00')
> + self.assertEqual(text_file(s).read(), s.getvalue())
> +
> + def test_check_text_lines(self):
> + lines = ['ab' * 2048]
> + check_text_lines(lines)
> + lines = ['a' * 1023 + '\x00']
> + self.assertRaises(BinaryFile, check_text_lines, lines)
> + lines = ['a' * 1024 + '\x00']
> + check_text_lines(lines)
> +
> +
> +class TextPath(TestCaseInTempDir):
> +
> + def test_text_file(self):
> + file('boo', 'wb').write('ab' * 2048)
> + check_text_path('boo')
> + file('boo', 'wb').write('a' * 1023 + '\x00')
> + self.assertRaises(BinaryFile, check_text_path, 'boo')
> + file('boo', 'wb').write('a' * 1024 + '\x00')
> + check_text_path('boo')
>
> === added file 'b/bzrlib/textfile.py'
> --- /dev/null
> +++ b/bzrlib/textfile.py
> @@ -0,0 +1,47 @@
> +# Copyright (C) 2006 by Canonical Ltd
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> +
> +from itertools import chain
> +
> +from bzrlib.errors import BinaryFile
> +from bzrlib.iterablefile import IterableFile
> +from bzrlib.osutils import file_iterator
Could do with a module docstring, e.g. "Utilities for
distinguishing binary from text files".
> +
> +
> +def text_file(input):
> + """Produce a file iterator that is guaranteed to be text, without seeking.
> + BinaryFile is raised if the file contains a NUL in the first 1024 bytes.
> + """
> + first_chunk = input.read(1024)
> + if '\x00' in first_chunk:
> + raise BinaryFile()
> + return IterableFile(chain((first_chunk,), file_iterator(input)))
> +
> +
> +def check_text_lines(lines):
> + """Raise BinaryFile if the supplied lines contain NULs.
> + Only the first 1024 characters are checked.
> + """
> + f = IterableFile(lines)
> + if '\x00' in f.read(1024):
> + raise BinaryFile()
> +
> +
> +def check_text_path(path):
> + """Check whether the supplied path is a text, not binary file.
> + Raise BinaryFile if a NUL occurs in the first 1024 bytes.
> + """
> + text_file(open(path, 'rb'))
>
> === modified file 'a/bzrlib/diff.py'
> --- a/bzrlib/diff.py
> +++ b/bzrlib/diff.py
> @@ -17,6 +17,7 @@
> from bzrlib.delta import compare_trees
> from bzrlib.errors import BzrError
> from bzrlib.symbol_versioning import *
> +from bzrlib.textfile import check_text_lines
> from bzrlib.trace import mutter
>
> # TODO: Rather than building a changeset object, we should probably
> @@ -41,6 +42,9 @@
> # both sequences are empty.
> if not oldlines and not newlines:
> return
> +
> + check_text_lines(oldlines)
> + check_text_lines(newlines)
>
> ud = difflib.unified_diff(oldlines, newlines,
> fromfile=old_filename+'\t',
It might be good to have a parameter allowing you to see the diff even
if it looks binary - or will difflib fail?
>
> === modified file 'a/bzrlib/errors.py'
> --- a/bzrlib/errors.py
> +++ b/bzrlib/errors.py
> @@ -839,3 +839,7 @@
> self.method = method
> self.mname = method.__name__
> self.tname = type(method_self).__name__
> +
> +
> +class BinaryFile(BzrNewError):
> + """File is binary but should be text."""
Perhaps it should include a string for the filename?
>
> === modified file 'a/bzrlib/inventory.py'
> --- a/bzrlib/inventory.py
> +++ b/bzrlib/inventory.py
> @@ -37,9 +37,9 @@
> import bzrlib
> from bzrlib.osutils import (pumpfile, quotefn, splitpath, joinpath,
> pathjoin, sha_strings)
> +from bzrlib.errors import (NotVersionedError, InvalidEntryName,
> + BzrError, BzrCheckError, BinaryFile)
> from bzrlib.trace import mutter
> -from bzrlib.errors import (NotVersionedError, InvalidEntryName,
> - BzrError, BzrCheckError)
>
>
> class InventoryEntry(object):
> @@ -572,17 +572,20 @@
> def _diff(self, text_diff, from_label, tree, to_label, to_entry, to_tree,
> output_to, reverse=False):
> """See InventoryEntry._diff."""
> - from_text = tree.get_file(self.file_id).readlines()
> - if to_entry:
> - to_text = to_tree.get_file(to_entry.file_id).readlines()
> - else:
> - to_text = []
> - if not reverse:
> - text_diff(from_label, from_text,
> - to_label, to_text, output_to)
> - else:
> - text_diff(to_label, to_text,
> - from_label, from_text, output_to)
> + try:
> + from_text = tree.get_file(self.file_id).readlines()
> + if to_entry:
> + to_text = to_tree.get_file(to_entry.file_id).readlines()
> + else:
> + to_text = []
> + if not reverse:
> + text_diff(from_label, from_text,
> + to_label, to_text, output_to)
> + else:
> + text_diff(to_label, to_text,
> + from_label, from_text, output_to)
> + except BinaryFile:
> + print >> output_to, "Binary files differ"
Should this include the filenames? I suppose it will come after the
heading and so be clear from that?
>
> def has_text(self):
> """See InventoryEntry.has_text."""
>
> === modified file 'a/bzrlib/merge.py'
> --- a/bzrlib/merge.py
> +++ b/bzrlib/merge.py
> @@ -35,6 +35,7 @@
> UnrelatedBranches,
> UnsupportedOperation,
> WorkingTreeNotRevision,
> + BinaryFile,
> )
> from bzrlib.merge3 import Merge3
> import bzrlib.osutils
> @@ -42,6 +43,7 @@
> from progress import DummyProgress, ProgressPhase
> from bzrlib.revision import common_ancestor, is_ancestor, NULL_REVISION
> from bzrlib.symbol_versioning import *
> +from bzrlib.textfile import check_text_lines
> from bzrlib.trace import mutter, warning, note
> from bzrlib.transform import (TreeTransform, resolve_conflicts, cook_conflicts,
> FinalPaths, create_by_entry, unique_add)
> @@ -517,6 +519,18 @@
> else:
> contents = None
> return kind, contents
> +
> + def contents_conflict():
> + trans_id = self.tt.trans_id_file_id(file_id)
> + name = self.tt.final_name(trans_id)
> + parent_id = self.tt.final_parent(trans_id)
> + if file_id in self.this_tree.inventory:
> + self.tt.unversion_file(trans_id)
> + self.tt.delete_contents(trans_id)
> + file_group = self._dump_conflicts(name, parent_id, file_id,
> + set_version=True)
> + self._raw_conflicts.append(('contents conflict', file_group))
> +
> # See SPOT run. run, SPOT, run.
> # So we're not QUITE repeating ourselves; we do tricky things with
> # file kind...
> @@ -553,9 +567,12 @@
> # THIS and OTHER are both files, so text merge. Either
> # BASE is a file, or both converted to files, so at least we
> # have agreement that output should be a file.
> + try:
> + self.text_merge(file_id, trans_id)
> + except BinaryFile:
> + return contents_conflict()
> if file_id not in self.this_tree.inventory:
> self.tt.version_file(file_id, trans_id)
> - self.text_merge(file_id, trans_id)
> try:
> self.tt.tree_kind(trans_id)
> self.tt.delete_contents(trans_id)
> @@ -564,15 +581,7 @@
> return "modified"
> else:
> # Scalar conflict, can't text merge. Dump conflicts
> - trans_id = self.tt.trans_id_file_id(file_id)
> - name = self.tt.final_name(trans_id)
> - parent_id = self.tt.final_parent(trans_id)
> - if file_id in self.this_tree.inventory:
> - self.tt.unversion_file(trans_id)
> - self.tt.delete_contents(trans_id)
> - file_group = self._dump_conflicts(name, parent_id, file_id,
> - set_version=True)
> - self._raw_conflicts.append(('contents conflict', file_group))
> + return contents_conflict()
>
> def get_lines(self, tree, file_id):
> """Return the lines in a file, or an empty list."""
> @@ -808,6 +817,9 @@
> self._check_file(file_id)
> lines = list(self._merged_lines(file_id))
> conflicts = '<<<<<<< TREE\n' in lines
> + # Note we're checking whether the OUTPUT is binary in this case,
> + # because we don't want to get into weave merge guts.
> + check_text_lines(lines)
> self.tt.create_file(lines, trans_id)
> if conflicts:
> self._raw_conflicts.append(('text conflict', trans_id))
>
> === modified file 'a/bzrlib/merge3.py'
> --- a/bzrlib/merge3.py
> +++ b/bzrlib/merge3.py
> @@ -21,6 +21,7 @@
>
> from difflib import SequenceMatcher
> from bzrlib.errors import CantReprocessAndShowBase
> +from textfile import check_text_lines
Should be bzrlib.textfile.
>
> def intersect(ra, rb):
> """Given two ranges return the range where they intersect or None.
> @@ -66,6 +67,9 @@
> incorporating the changes from both BASE->OTHER and BASE->THIS.
> All three will typically be sequences of lines."""
> def __init__(self, base, a, b):
> + check_text_lines(base)
> + check_text_lines(a)
> + check_text_lines(b)
> self.base = base
> self.a = a
> self.b = b
>
> === modified file 'a/bzrlib/patch.py'
> --- a/bzrlib/patch.py
> +++ b/bzrlib/patch.py
> @@ -3,6 +3,7 @@
> from subprocess import Popen, PIPE
>
> from bzrlib.errors import NoDiff3
> +from bzrlib.textfile import check_text_path
> """
> Diff and patch functionality
> """
> @@ -51,6 +52,9 @@
> def diff3(out_file, mine_path, older_path, yours_path):
> def add_label(args, label):
> args.extend(("-L", label))
> + check_text_path(mine_path)
> + check_text_path(older_path)
> + check_text_path(yours_path)
> args = ['diff3', "-E", "--merge"]
> add_label(args, "TREE")
> add_label(args, "ANCESTOR")
>
> === modified file 'a/bzrlib/tests/__init__.py'
> --- a/bzrlib/tests/__init__.py
> +++ b/bzrlib/tests/__init__.py
> @@ -954,6 +954,7 @@
> 'bzrlib.tests.test_nonascii',
> 'bzrlib.tests.test_options',
> 'bzrlib.tests.test_osutils',
> + 'bzrlib.tests.test_patch',
> 'bzrlib.tests.test_permissions',
> 'bzrlib.tests.test_plugins',
> 'bzrlib.tests.test_progress',
> @@ -972,6 +973,7 @@
> 'bzrlib.tests.test_store',
> 'bzrlib.tests.test_symbol_versioning',
> 'bzrlib.tests.test_testament',
> + 'bzrlib.tests.test_textfile',
> 'bzrlib.tests.test_trace',
> 'bzrlib.tests.test_transactions',
> 'bzrlib.tests.test_transform',
>
> === modified file 'a/bzrlib/tests/test_diff.py'
> --- a/bzrlib/tests/test_diff.py
> +++ b/bzrlib/tests/test_diff.py
> @@ -1,6 +1,10 @@
> +from cStringIO import StringIO
> +
> +from bzrlib.diff import internal_diff
> +from bzrlib.errors import BinaryFile
> from bzrlib.tests import TestCase
> -from bzrlib.diff import internal_diff
> -from cStringIO import StringIO
> +
> +
> def udiff_lines(old, new):
> output = StringIO()
> internal_diff('old', old, 'new', new, output)
> @@ -47,3 +51,8 @@
> self.assert_('@@' in lines[2][2:])
> ## "Unterminated hunk header for patch:\n%s" % "".join(lines)
>
> + def test_binary_lines(self):
> + self.assertRaises(BinaryFile, udiff_lines, [1023 * 'a' + '\x00'], [])
> + self.assertRaises(BinaryFile, udiff_lines, [], [1023 * 'a' + '\x00'])
> + udiff_lines([1024 * 'a' + '\x00'], [])
> + udiff_lines([], [1024 * 'a' + '\x00'])
In several of these tests you check whether a \0 after the first kb will
be undetected. Is this really a good test -- is it something that
should be a *requirement* of the program in future? It seems to me that
it should not be tested; rather you should just make sure that a
file with no nuls at all will be OK.
> === modified file 'a/bzrlib/tests/test_inv.py'
> --- a/bzrlib/tests/test_inv.py
> +++ b/bzrlib/tests/test_inv.py
> @@ -139,21 +139,26 @@
> self.wt = self.make_branch_and_tree('.')
> self.branch = self.wt.branch
> print >> open('file', 'wb'), 'foo'
> + print >> open('binfile', 'wb'), 'foo'
> self.wt.add(['file'], ['fileid'])
> + self.wt.add(['binfile'], ['binfileid'])
> if has_symlinks():
> os.symlink('target1', 'symlink')
> self.wt.add(['symlink'], ['linkid'])
> self.wt.commit('message_1', rev_id = '1')
> print >> open('file', 'wb'), 'bar'
> + print >> open('binfile', 'wb'), 'x' * 1023 + '\x00'
> if has_symlinks():
> os.unlink('symlink')
> os.symlink('target2', 'symlink')
> self.tree_1 = self.branch.repository.revision_tree('1')
> self.inv_1 = self.branch.repository.get_inventory('1')
> self.file_1 = self.inv_1['fileid']
> + self.file_1b = self.inv_1['binfileid']
> self.tree_2 = self.wt
> self.inv_2 = self.tree_2.read_working_inventory()
> self.file_2 = self.inv_2['fileid']
> + self.file_2b = self.inv_2['binfileid']
> if has_symlinks():
> self.link_1 = self.inv_1['linkid']
> self.link_2 = self.inv_2['linkid']
> @@ -195,6 +200,13 @@
> "+bar\n"
> "\n")
>
> + def test_file_diff_binary(self):
> + output = StringIO()
> + self.file_1.diff(internal_diff,
> + "/dev/null", self.tree_1,
> + "new_label", self.file_2b, self.tree_2,
> + output)
> + self.assertEqual(output.getvalue(), "Binary files differ\n")
> def test_link_diff_deleted(self):
> if not has_symlinks():
> return
>
> === modified file 'a/bzrlib/tests/test_merge3.py'
> --- a/bzrlib/tests/test_merge3.py
> +++ b/bzrlib/tests/test_merge3.py
> @@ -17,7 +17,7 @@
>
> from bzrlib.tests import TestCaseInTempDir, TestCase
> from bzrlib.merge3 import Merge3
> -from bzrlib.errors import CantReprocessAndShowBase
> +from bzrlib.errors import CantReprocessAndShowBase, BinaryFile
>
> def split_lines(t):
> from cStringIO import StringIO
> @@ -324,3 +324,6 @@
> m_lines = m3.merge_lines('OTHER', 'THIS', reprocess=True,
> base_marker='|||||||')
> self.assertRaises(CantReprocessAndShowBase, list, m_lines)
> +
> + def test_binary(self):
> + self.assertRaises(BinaryFile, Merge3, ['\x00'], ['a'], ['b'])
>
> === modified file 'a/bzrlib/tests/test_merge_core.py'
> --- a/bzrlib/tests/test_merge_core.py
> +++ b/bzrlib/tests/test_merge_core.py
> @@ -280,8 +280,12 @@
> builder = MergeBuilder()
> builder.add_file("1", "TREE_ROOT", "name1", "text1", True)
> builder.change_contents("1", other="text4", this="text3")
> + builder.add_file("2", "TREE_ROOT", "name2", "text1", True)
> + builder.change_contents("2", other="\x00", this="text3")
> conflicts = builder.merge(merge_factory)
> - self.assertEqual(conflicts, [TextConflict('name1', file_id='1')])
> + self.assertEqual(conflicts, [TextConflict('name1', file_id='1'),
> + ContentsConflict('name2', file_id='2')])
> + self.assertEqual(builder.this.get_file('2').read(), '\x00')
> builder.cleanup()
>
> def test_symlink_conflicts(self):
>
--
Martin
More information about the bazaar
mailing list