Rev 5874: (vila) Add support for double width characters to textwrap. (INADA Naoki) in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Mon May 16 23:19:34 UTC 2011

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5874 [merge]
revision-id: pqm at pqm.ubuntu.com-20110516231929-aenh18a18r12mvp2
parent: pqm at pqm.ubuntu.com-20110516222836-prryayncmfqqh6w3
parent: songofacandy at gmail.com-20110514145906-xj3xa06jihd4wapq
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2011-05-16 23:19:29 +0000
message:
  (vila) Add support for double width characters to textwrap. (INADA Naoki)
added:
  bzrlib/tests/test_utextwrap.py test_utextwrap.py-20110504151300-vdvrs19wd20a5cy0-1
  bzrlib/utextwrap.py            utextwrap.py-20110504064158-lx1sswjckyz5vyi3-1
modified:
  bzrlib/tests/__init__.py       selftest.py-20050531073622-8d0e3c8845c97a64
=== modified file 'bzrlib/tests/__init__.py'

--- a/bzrlib/tests/__init__.py	2011-05-16 22:28:36 +0000
+++ b/bzrlib/tests/__init__.py	2011-05-16 23:19:29 +0000
@@ -3900,6 +3900,7 @@
         'bzrlib.tests.test_upgrade',
         'bzrlib.tests.test_upgrade_stacked',
         'bzrlib.tests.test_urlutils',
+        'bzrlib.tests.test_utextwrap',
         'bzrlib.tests.test_version',
         'bzrlib.tests.test_version_info',
         'bzrlib.tests.test_versionedfile',

=== added file 'bzrlib/tests/test_utextwrap.py'
--- a/bzrlib/tests/test_utextwrap.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_utextwrap.py	2011-05-14 14:59:06 +0000
@@ -0,0 +1,207 @@
+# Copyright (C) 2011 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+"""Tests of the bzrlib.utextwrap."""
+
+from bzrlib import tests, utextwrap
+from bzrlib.tests import TestSkipped
+
+# Japanese "Good morning".
+# Each character is double width, so the total console width is 8.
+_str_D = u'\u304a\u306f\u3088\u3046'
+
+_str_S = u"hello"
+
+# Combine single width characters and double width characters.
+_str_SD = _str_S + _str_D
+_str_DS = _str_D + _str_S
+
+class TestUTextWrap(tests.TestCase):
+
+    def check_width(self, text, expected_width):
+        w = utextwrap.UTextWrapper()
+        self.assertEqual(
+                w._width(text),
+                expected_width,
+                "Width of %r should be %d" % (text, expected_width))
+
+    def test_width(self):
+        self.check_width(_str_D, 8)
+        self.check_width(_str_SD, 13)
+
+    def check_cut(self, text, width, pos):
+        w = utextwrap.UTextWrapper()
+        self.assertEqual((text[:pos], text[pos:]), w._cut(text, width))
+
+    def test_cut(self):
+        s = _str_SD
+        self.check_cut(s, 0, 0)
+        self.check_cut(s, 1, 1)
+        self.check_cut(s, 5, 5)
+        self.check_cut(s, 6, 5)
+        self.check_cut(s, 7, 6)
+        self.check_cut(s, 12, 8)
+        self.check_cut(s, 13, 9)
+        self.check_cut(s, 14, 9)
+        self.check_cut(u'A'*5, 3, 3)
+
+    def test_split(self):
+        w = utextwrap.UTextWrapper()
+        self.assertEqual(list(_str_D), w._split(_str_D))
+        self.assertEqual([_str_S]+list(_str_D), w._split(_str_SD))
+        self.assertEqual(list(_str_D)+[_str_S], w._split(_str_DS))
+
+    def test_wrap(self):
+        self.assertEqual(list(_str_D), utextwrap.wrap(_str_D, 1))
+        self.assertEqual(list(_str_D), utextwrap.wrap(_str_D, 2))
+        self.assertEqual(list(_str_D), utextwrap.wrap(_str_D, 3))
+        self.assertEqual(list(_str_D),
+                         utextwrap.wrap(_str_D, 3, break_long_words=False))
+
+class TestUTextFill(tests.TestCase):
+
+    def test_fill_simple(self):
+        # Only check that fill() works; it is just '\n'.join(wrap(text)).
+        self.assertEqual("%s\n%s" % (_str_D[:2], _str_D[2:]),
+                         utextwrap.fill(_str_D, 4))
+
+    def test_fill_with_breaks(self):
+        # Demonstrate complicated case.
+        text = u"spam ham egg spamhamegg" + _str_D + u" spam" + _str_D*2
+        self.assertEqual(u'\n'.join(["spam ham",
+                                     "egg spam",
+                                     "hamegg" + _str_D[0],
+                                     _str_D[1:],
+                                     "spam" + _str_D[:2],
+                                     _str_D[2:]+_str_D[:2],
+                                     _str_D[2:]]),
+                         utextwrap.fill(text, 8))
+
+    def test_fill_without_breaks(self):
+        text = u"spam ham egg spamhamegg" + _str_D + u" spam" + _str_D*2
+        self.assertEqual(u'\n'.join(["spam ham",
+                                     "egg",
+                                     "spamhamegg", 
+                                     # border between single width and double
+                                     # width.
+                                     _str_D,
+                                     "spam" + _str_D[:2],
+                                     _str_D[2:]+_str_D[:2],
+                                     _str_D[2:]]),
+                         utextwrap.fill(text, 8, break_long_words=False))
+
+    def test_fill_indent_with_breaks(self):
+        w = utextwrap.UTextWrapper(8, initial_indent=' '*4,
+                                   subsequent_indent=' '*4)
+        self.assertEqual(u'\n'.join(["    hell",
+                                     "    o" + _str_D[0],
+                                     "    " + _str_D[1:3],
+                                     "    " + _str_D[3]
+                                     ]),
+                         w.fill(_str_SD))
+
+    def test_fill_indent_without_breaks(self):
+        w = utextwrap.UTextWrapper(8, initial_indent=' '*4,
+                                   subsequent_indent=' '*4)
+        w.break_long_words = False
+        self.assertEqual(u'\n'.join(["    hello",
+                                     "    " + _str_D[:2],
+                                     "    " + _str_D[2:],
+                                     ]),
+                         w.fill(_str_SD))
+
+    def test_fill_indent_without_breaks_with_fixed_width(self):
+        w = utextwrap.UTextWrapper(8, initial_indent=' '*4,
+                                   subsequent_indent=' '*4)
+        w.break_long_words = False
+        w.width = 3
+        self.assertEqual(u'\n'.join(["    hello",
+                                     "    " + _str_D[0],
+                                     "    " + _str_D[1],
+                                     "    " + _str_D[2],
+                                     "    " + _str_D[3],
+                                     ]),
+                         w.fill(_str_SD))
+
+class TestUTextWrapAmbiWidth(tests.TestCase):
+    _cyrill_char = u"\u0410" # east_asian_width() == 'A'
+
+    def test_ambiwidth1(self):
+        w = utextwrap.UTextWrapper(4, ambiguous_width=1)
+        s = self._cyrill_char*8
+        self.assertEqual([self._cyrill_char*4]*2, w.wrap(s))
+
+    def test_ambiwidth2(self):
+        w = utextwrap.UTextWrapper(4, ambiguous_width=2)
+        s = self._cyrill_char*8
+        self.assertEqual([self._cyrill_char*2]*4, w.wrap(s))
+
+
+# Regression tests reusing Python's own test_textwrap.
+# Note that some distributions, including Ubuntu, don't install
+# Python's test suite.
+try:
+    from test import test_textwrap
+
+    def override_textwrap_symbols(testcase):
+        # Override the symbols imported by test_textwrap so it uses our own
+        # replacements.
+        testcase.overrideAttr(test_textwrap, 'TextWrapper',
+                              utextwrap.UTextWrapper)
+        testcase.overrideAttr(test_textwrap, 'wrap', utextwrap.wrap)
+        testcase.overrideAttr(test_textwrap, 'fill', utextwrap.fill)
+
+
+    def setup_both(testcase, base_class, reused_class):
+        super(base_class, testcase).setUp()
+        override_textwrap_symbols(testcase)
+        reused_class.setUp(testcase)
+
+
+    class TestWrap(tests.TestCase, test_textwrap.WrapTestCase):
+
+        def setUp(self):
+            setup_both(self, TestWrap, test_textwrap.WrapTestCase)
+
+
+    class TestLongWord(tests.TestCase, test_textwrap.LongWordTestCase):
+
+        def setUp(self):
+            setup_both(self, TestLongWord, test_textwrap.LongWordTestCase)
+
+
+    class TestIndent(tests.TestCase, test_textwrap.IndentTestCases):
+
+        def setUp(self):
+            setup_both(self, TestIndent, test_textwrap.IndentTestCases)
+
+except ImportError:
+
+    class TestWrap(tests.TestCase):
+
+        def test_wrap(self):
+            raise TestSkipped("test.test_textwrap is not avialable.")
+
+    class TestLongWord(tests.TestCase):
+
+        def test_longword(self):
+            raise TestSkipped("test.test_textwrap is not avialable.")
+
+    class TestIndent(tests.TestCase):
+
+        def test_indent(self):
+            raise TestSkipped("test.test_textwrap is not avialable.")

=== added file 'bzrlib/utextwrap.py'
--- a/bzrlib/utextwrap.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/utextwrap.py	2011-05-14 14:32:02 +0000
@@ -0,0 +1,264 @@
+# Copyright (C) 2011 Canonical Ltd
+#
+# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,
+# UTextWrapper._fix_sentence_endings, wrap and fill are copied from Python's
+# textwrap module (under PSF license) and modified to support CJK.
+# Original Copyright for these functions:
+#
+# Copyright (C) 1999-2001 Gregory P. Ward.
+# Copyright (C) 2002, 2003 Python Software Foundation.
+#
+# Written by Greg Ward <gward at python.net>
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+import sys
+import textwrap
+from unicodedata import east_asian_width as _eawidth
+
+from bzrlib import osutils
+
+__all__ = ["UTextWrapper", "fill", "wrap"]
+
+class UTextWrapper(textwrap.TextWrapper):
+    """
+    Extend TextWrapper for Unicode.
+
+    This text wrapper handles East Asian double-width characters and
+    splits a word, even when break_long_words is false, if the word
+    contains double-width characters.
+
+    :param ambiguous_width: (keyword argument) width for character when
+                            unicodedata.east_asian_width(c) == 'A'
+                            (default: 1)
+
+    Limitations:
+    * expand_tabs is not fixed. It uses len() to calculate the width
+      of the string to the left of a TAB.
+    * Handles one code unit as a single character of width 1 or 2.
+      This is not correct when there are surrogate pairs, combining
+      characters or zero-width characters.
+    * Treats all Asian characters as line breakable. This is not
+      true because line breaking is prohibited around some characters.
+      (For example, breaking before a punctuation mark is prohibited.)
+      See UAX #14 "Unicode Line Breaking Algorithm".
+    """
+
+    def __init__(self, width=None, **kwargs):
+        if width is None:
+            width = (osutils.terminal_width() or
+                        osutils.default_terminal_width) - 1
+
+        ambi_width = kwargs.pop('ambiguous_width', 1)
+        if ambi_width == 1:
+            self._east_asian_doublewidth = 'FW'
+        elif ambi_width == 2:
+            self._east_asian_doublewidth = 'FWA'
+        else:
+            raise ValueError("ambiguous_width should be 1 or 2")
+
+        # No drop_whitespace param before Python 2.6; it was always dropped.
+        if sys.version_info < (2, 6):
+            self.drop_whitespace = kwargs.pop("drop_whitespace", True)
+            if not self.drop_whitespace:
+                raise ValueError("TextWrapper version must drop whitespace")
+        textwrap.TextWrapper.__init__(self, width, **kwargs)
+
+    def _unicode_char_width(self, uc):
+        """Return width of character `uc`.
+
+        :param:     uc      Single unicode character.
+        """
+        # 'A' means the width of the character cannot be determined.
+        # It counts as double width only when ambiguous_width=2 was
+        # given; with the default (ambiguous_width=1) it is single width.
+        return (_eawidth(uc) in self._east_asian_doublewidth and 2) or 1
+
+    def _width(self, s):
+        """Returns width for s.
+
+        When s is unicode, take care of east asian width.
+        When s is bytes, treat all byte is single width character.
+        """
+        charwidth = self._unicode_char_width
+        return sum(charwidth(c) for c in s)
+
+    def _cut(self, s, width):
+        """Returns head and rest of s. (head+rest == s)
+
+        Head is the longest prefix of s with _width(head) <= width.
+        """
+        w = 0
+        charwidth = self._unicode_char_width
+        for pos, c in enumerate(s):
+            w += charwidth(c)
+            if w > width:
+                return s[:pos], s[pos:]
+        return s, u''
+
+    def _fix_sentence_endings(self, chunks):
+        """_fix_sentence_endings(chunks : [string])
+
+        Correct for sentence endings buried in 'chunks'.  Eg. when the
+        original text contains "... foo.\nBar ...", munge_whitespace()
+        and split() will convert that to [..., "foo.", " ", "Bar", ...]
+        which has one too few spaces; this method simply changes the one
+        space to two.
+
+        Note: This function is copied from textwrap.TextWrapper and
+        modified to always use unicode.
+        """
+        i = 0
+        L = len(chunks)-1
+        patsearch = self.sentence_end_re.search
+        while i < L:
+            if chunks[i+1] == u" " and patsearch(chunks[i]):
+                chunks[i+1] = u"  "
+                i += 2
+            else:
+                i += 1
+
+    def _handle_long_word(self, chunks, cur_line, cur_len, width):
+        # Figure out when indent is larger than the specified width, and make
+        # sure at least one character is stripped off on every pass
+        if width < 2:
+            space_left = chunks[-1] and self._width(chunks[-1][0]) or 1
+        else:
+            space_left = width - cur_len
+
+        # If we're allowed to break long words, then do so: put as much
+        # of the next chunk onto the current line as will fit.
+        if self.break_long_words:
+            head, rest = self._cut(chunks[-1], space_left)
+            cur_line.append(head)
+            if rest:
+                chunks[-1] = rest
+            else:
+                del chunks[-1]
+
+        # Otherwise, we have to preserve the long word intact.  Only add
+        # it to the current line if there's nothing already there --
+        # that minimizes how much we violate the width constraint.
+        elif not cur_line:
+            cur_line.append(chunks.pop())
+
+        # If we're not allowed to break long words, and there's already
+        # text on the current line, do nothing.  Next time through the
+        # main loop of _wrap_chunks(), we'll wind up here again, but
+        # cur_len will be zero, so the next line will be entirely
+        # devoted to the long word that we can't handle right now.
+
+    def _wrap_chunks(self, chunks):
+        lines = []
+        if self.width <= 0:
+            raise ValueError("invalid width %r (must be > 0)" % self.width)
+
+        # Arrange in reverse order so items can be efficiently popped
+        # from a stack of chunks.
+        chunks.reverse()
+
+        while chunks:
+
+            # Start the list of chunks that will make up the current line.
+            # cur_len is just the length of all the chunks in cur_line.
+            cur_line = []
+            cur_len = 0
+
+            # Figure out which static string will prefix this line.
+            if lines:
+                indent = self.subsequent_indent
+            else:
+                indent = self.initial_indent
+
+            # Maximum width for this line.
+            width = self.width - len(indent)
+
+            # First chunk on line is whitespace -- drop it, unless this
+            # is the very beginning of the text (ie. no lines started yet).
+            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
+                del chunks[-1]
+
+            while chunks:
+                # Use _width instead of len for east asian width
+                l = self._width(chunks[-1])
+
+                # Can at least squeeze this chunk onto the current line.
+                if cur_len + l <= width:
+                    cur_line.append(chunks.pop())
+                    cur_len += l
+
+                # Nope, this line is full.
+                else:
+                    break
+
+            # The current line is full, and the next chunk is too big to
+            # fit on *any* line (not just this one).
+            if chunks and self._width(chunks[-1]) > width:
+                self._handle_long_word(chunks, cur_line, cur_len, width)
+
+            # If the last chunk on this line is all whitespace, drop it.
+            if self.drop_whitespace and cur_line and not cur_line[-1].strip():
+                del cur_line[-1]
+
+            # Convert current line back to a string and store it in list
+            # of all lines (return value).
+            if cur_line:
+                lines.append(indent + u''.join(cur_line))
+
+        return lines
+
+    def _split(self, text):
+        chunks = textwrap.TextWrapper._split(self, unicode(text))
+        cjk_split_chunks = []
+        for chunk in chunks:
+            prev_pos = 0
+            for pos, char in enumerate(chunk):
+                if self._unicode_char_width(char) == 2:
+                    if prev_pos < pos:
+                        cjk_split_chunks.append(chunk[prev_pos:pos])
+                    cjk_split_chunks.append(char)
+                    prev_pos = pos+1
+            if prev_pos < len(chunk):
+                cjk_split_chunks.append(chunk[prev_pos:])
+        return cjk_split_chunks
+
+    def wrap(self, text):
+        # ensure text is unicode
+        return textwrap.TextWrapper.wrap(self, unicode(text))
+
+# -- Convenience interface ---------------------------------------------
+
+def wrap(text, width=None, **kwargs):
+    """Wrap a single paragraph of text, returning a list of wrapped lines.
+
+    Reformat the single paragraph in 'text' so it fits in lines of no
+    more than 'width' columns, and return a list of wrapped lines.  By
+    default, tabs in 'text' are expanded with string.expandtabs(), and
+    all other whitespace characters (including newline) are converted to
+    space.  See TextWrapper class for available keyword args to customize
+    wrapping behaviour.
+    """
+    return UTextWrapper(width=width, **kwargs).wrap(text)
+
+def fill(text, width=None, **kwargs):
+    """Fill a single paragraph of text, returning a new string.
+
+    Reformat the single paragraph in 'text' to fit in lines of no more
+    than 'width' columns, and return a new string containing the entire
+    wrapped paragraph.  As with wrap(), tabs are expanded and other
+    whitespace characters converted to space.  See TextWrapper class for
+    available keyword args to customize wrapping behaviour.
+    """
+    return UTextWrapper(width=width, **kwargs).fill(text)
+