Rev 5874: (vila) Add support for double width characters to textwrap. (INADA Naoki) in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Mon May 16 23:19:34 UTC 2011
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 5874 [merge]
revision-id: pqm at pqm.ubuntu.com-20110516231929-aenh18a18r12mvp2
parent: pqm at pqm.ubuntu.com-20110516222836-prryayncmfqqh6w3
parent: songofacandy at gmail.com-20110514145906-xj3xa06jihd4wapq
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2011-05-16 23:19:29 +0000
message:
(vila) Add support for double width characters to textwrap. (INADA Naoki)
added:
bzrlib/tests/test_utextwrap.py test_utextwrap.py-20110504151300-vdvrs19wd20a5cy0-1
bzrlib/utextwrap.py utextwrap.py-20110504064158-lx1sswjckyz5vyi3-1
modified:
bzrlib/tests/__init__.py selftest.py-20050531073622-8d0e3c8845c97a64
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2011-05-16 22:28:36 +0000
+++ b/bzrlib/tests/__init__.py 2011-05-16 23:19:29 +0000
@@ -3900,6 +3900,7 @@
'bzrlib.tests.test_upgrade',
'bzrlib.tests.test_upgrade_stacked',
'bzrlib.tests.test_urlutils',
+ 'bzrlib.tests.test_utextwrap',
'bzrlib.tests.test_version',
'bzrlib.tests.test_version_info',
'bzrlib.tests.test_versionedfile',
=== added file 'bzrlib/tests/test_utextwrap.py'
--- a/bzrlib/tests/test_utextwrap.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_utextwrap.py 2011-05-14 14:59:06 +0000
@@ -0,0 +1,207 @@
+# Copyright (C) 2011 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+"""Tests of the bzrlib.utextwrap."""
+
+from bzrlib import tests, utextwrap
+from bzrlib.tests import TestSkipped
+
+# Japanese "Good morning".
+# Each character has double width, so the total width on a console is 8.
+_str_D = u'\u304a\u306f\u3088\u3046'
+
+_str_S = u"hello"
+
+# Combine single width characters and double width characters.
+_str_SD = _str_S + _str_D
+_str_DS = _str_D + _str_S
+
+class TestUTextWrap(tests.TestCase):
+
+ def check_width(self, text, expected_width):
+ w = utextwrap.UTextWrapper()
+ self.assertEqual(
+ w._width(text),
+ expected_width,
+ "Width of %r should be %d" % (text, expected_width))
+
+ def test_width(self):
+ self.check_width(_str_D, 8)
+ self.check_width(_str_SD, 13)
+
+ def check_cut(self, text, width, pos):
+ w = utextwrap.UTextWrapper()
+ self.assertEqual((text[:pos], text[pos:]), w._cut(text, width))
+
+ def test_cut(self):
+ s = _str_SD
+ self.check_cut(s, 0, 0)
+ self.check_cut(s, 1, 1)
+ self.check_cut(s, 5, 5)
+ self.check_cut(s, 6, 5)
+ self.check_cut(s, 7, 6)
+ self.check_cut(s, 12, 8)
+ self.check_cut(s, 13, 9)
+ self.check_cut(s, 14, 9)
+ self.check_cut(u'A'*5, 3, 3)
+
+ def test_split(self):
+ w = utextwrap.UTextWrapper()
+ self.assertEqual(list(_str_D), w._split(_str_D))
+ self.assertEqual([_str_S]+list(_str_D), w._split(_str_SD))
+ self.assertEqual(list(_str_D)+[_str_S], w._split(_str_DS))
+
+ def test_wrap(self):
+ self.assertEqual(list(_str_D), utextwrap.wrap(_str_D, 1))
+ self.assertEqual(list(_str_D), utextwrap.wrap(_str_D, 2))
+ self.assertEqual(list(_str_D), utextwrap.wrap(_str_D, 3))
+ self.assertEqual(list(_str_D),
+ utextwrap.wrap(_str_D, 3, break_long_words=False))
+
+class TestUTextFill(tests.TestCase):
+
+ def test_fill_simple(self):
+ # Only test that fill() can be called, because it's just '\n'.join(wrap(text)).
+ self.assertEqual("%s\n%s" % (_str_D[:2], _str_D[2:]),
+ utextwrap.fill(_str_D, 4))
+
+ def test_fill_with_breaks(self):
+ # Demonstrate complicated case.
+ text = u"spam ham egg spamhamegg" + _str_D + u" spam" + _str_D*2
+ self.assertEqual(u'\n'.join(["spam ham",
+ "egg spam",
+ "hamegg" + _str_D[0],
+ _str_D[1:],
+ "spam" + _str_D[:2],
+ _str_D[2:]+_str_D[:2],
+ _str_D[2:]]),
+ utextwrap.fill(text, 8))
+
+ def test_fill_without_breaks(self):
+ text = u"spam ham egg spamhamegg" + _str_D + u" spam" + _str_D*2
+ self.assertEqual(u'\n'.join(["spam ham",
+ "egg",
+ "spamhamegg",
+ # border between single width and double
+ # width.
+ _str_D,
+ "spam" + _str_D[:2],
+ _str_D[2:]+_str_D[:2],
+ _str_D[2:]]),
+ utextwrap.fill(text, 8, break_long_words=False))
+
+ def test_fill_indent_with_breaks(self):
+ w = utextwrap.UTextWrapper(8, initial_indent=' '*4,
+ subsequent_indent=' '*4)
+ self.assertEqual(u'\n'.join([" hell",
+ " o" + _str_D[0],
+ " " + _str_D[1:3],
+ " " + _str_D[3]
+ ]),
+ w.fill(_str_SD))
+
+ def test_fill_indent_without_breaks(self):
+ w = utextwrap.UTextWrapper(8, initial_indent=' '*4,
+ subsequent_indent=' '*4)
+ w.break_long_words = False
+ self.assertEqual(u'\n'.join([" hello",
+ " " + _str_D[:2],
+ " " + _str_D[2:],
+ ]),
+ w.fill(_str_SD))
+
+ def test_fill_indent_without_breaks_with_fixed_width(self):
+ w = utextwrap.UTextWrapper(8, initial_indent=' '*4,
+ subsequent_indent=' '*4)
+ w.break_long_words = False
+ w.width = 3
+ self.assertEqual(u'\n'.join([" hello",
+ " " + _str_D[0],
+ " " + _str_D[1],
+ " " + _str_D[2],
+ " " + _str_D[3],
+ ]),
+ w.fill(_str_SD))
+
+class TestUTextWrapAmbiWidth(tests.TestCase):
+ _cyrill_char = u"\u0410" # east_asian_width() == 'A'
+
+ def test_ambiwidth1(self):
+ w = utextwrap.UTextWrapper(4, ambiguous_width=1)
+ s = self._cyrill_char*8
+ self.assertEqual([self._cyrill_char*4]*2, w.wrap(s))
+
+ def test_ambiwidth2(self):
+ w = utextwrap.UTextWrapper(4, ambiguous_width=2)
+ s = self._cyrill_char*8
+ self.assertEqual([self._cyrill_char*2]*4, w.wrap(s))
+
+
+# Regression test with Python's test_textwrap
+# Note that some distributions, including Ubuntu, don't install
+# Python's test suite.
+try:
+ from test import test_textwrap
+
+ def override_textwrap_symbols(testcase):
+ # Override the symbols imported by test_textwrap so it uses our own
+ # replacements.
+ testcase.overrideAttr(test_textwrap, 'TextWrapper',
+ utextwrap.UTextWrapper)
+ testcase.overrideAttr(test_textwrap, 'wrap', utextwrap.wrap)
+ testcase.overrideAttr(test_textwrap, 'fill', utextwrap.fill)
+
+
+ def setup_both(testcase, base_class, reused_class):
+ super(base_class, testcase).setUp()
+ override_textwrap_symbols(testcase)
+ reused_class.setUp(testcase)
+
+
+ class TestWrap(tests.TestCase, test_textwrap.WrapTestCase):
+
+ def setUp(self):
+ setup_both(self, TestWrap, test_textwrap.WrapTestCase)
+
+
+ class TestLongWord(tests.TestCase, test_textwrap.LongWordTestCase):
+
+ def setUp(self):
+ setup_both(self, TestLongWord, test_textwrap.LongWordTestCase)
+
+
+ class TestIndent(tests.TestCase, test_textwrap.IndentTestCases):
+
+ def setUp(self):
+ setup_both(self, TestIndent, test_textwrap.IndentTestCases)
+
+except ImportError:
+
+ class TestWrap(tests.TestCase):
+
+ def test_wrap(self):
+ raise TestSkipped("test.test_textwrap is not avialable.")
+
+ class TestLongWord(tests.TestCase):
+
+ def test_longword(self):
+ raise TestSkipped("test.test_textwrap is not avialable.")
+
+ class TestIndent(tests.TestCase):
+
+ def test_indent(self):
+ raise TestSkipped("test.test_textwrap is not avialable.")
=== added file 'bzrlib/utextwrap.py'
--- a/bzrlib/utextwrap.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/utextwrap.py 2011-05-14 14:32:02 +0000
@@ -0,0 +1,264 @@
+# Copyright (C) 2011 Canonical Ltd
+#
+# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,
+# UTextWrapper._fix_sentence_endings, wrap and fill are copied from Python's
+# textwrap module (under PSF license) and modified to support CJK.
+# Original Copyright for these functions:
+#
+# Copyright (C) 1999-2001 Gregory P. Ward.
+# Copyright (C) 2002, 2003 Python Software Foundation.
+#
+# Written by Greg Ward <gward at python.net>
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+import sys
+import textwrap
+from unicodedata import east_asian_width as _eawidth
+
+from bzrlib import osutils
+
+__all__ = ["UTextWrapper", "fill", "wrap"]
+
+class UTextWrapper(textwrap.TextWrapper):
+ """
+ Extend TextWrapper for Unicode.
+
+ This text wrapper handles East Asian double-width characters, and
+ splits a word even when break_long_words is false if the word
+ contains double-width characters.
+
+ :param ambiguous_width: (keyword argument) width for character when
+ unicodedata.east_asian_width(c) == 'A'
+ (default: 1)
+
+ Limitations:
+ * expand_tabs is not fixed. It uses len() to calculate the width
+ of the string to the left of a TAB.
+ * Treats each code unit as a single character of width 1 or 2.
+ This is not correct when there are surrogate pairs, combining
+ characters or zero-width characters.
+ * Treats all Asian characters as line-breakable. This is not
+ strictly true, because line breaking is prohibited around some
+ characters (for example, breaking before a punctuation mark).
+ See UAX #14, "Unicode Line Breaking Algorithm".
+ """
+
+ def __init__(self, width=None, **kwargs):
+ if width is None:
+ width = (osutils.terminal_width() or
+ osutils.default_terminal_width) - 1
+
+ ambi_width = kwargs.pop('ambiguous_width', 1)
+ if ambi_width == 1:
+ self._east_asian_doublewidth = 'FW'
+ elif ambi_width == 2:
+ self._east_asian_doublewidth = 'FWA'
+ else:
+ raise ValueError("ambiguous_width should be 1 or 2")
+
+ # There was no drop_whitespace param before Python 2.6; whitespace was always dropped.
+ if sys.version_info < (2, 6):
+ self.drop_whitespace = kwargs.pop("drop_whitespace", True)
+ if not self.drop_whitespace:
+ raise ValueError("TextWrapper version must drop whitespace")
+ textwrap.TextWrapper.__init__(self, width, **kwargs)
+
+ def _unicode_char_width(self, uc):
+ """Return width of character `uc`.
+
+ :param uc: Single unicode character.
+ """
+ # 'A' means the width of the character is ambiguous (cannot be determined).
+ # It is counted as 2 only when ambiguous_width=2 was requested: a line that
+ # is too long may overflow the terminal, while a shorter one is acceptable.
+ return (_eawidth(uc) in self._east_asian_doublewidth and 2) or 1
+
+ def _width(self, s):
+ """Returns width for s.
+
+ When s is unicode, take care of east asian width.
+ When s is bytes, treat all byte is single width character.
+ """
+ charwidth = self._unicode_char_width
+ return sum(charwidth(c) for c in s)
+
+ def _cut(self, s, width):
+ """Returns head and rest of s. (head+rest == s)
+
+ Head is as long as possible while keeping _width(head) <= width.
+ """
+ w = 0
+ charwidth = self._unicode_char_width
+ for pos, c in enumerate(s):
+ w += charwidth(c)
+ if w > width:
+ return s[:pos], s[pos:]
+ return s, u''
+
+ def _fix_sentence_endings(self, chunks):
+ """_fix_sentence_endings(chunks : [string])
+
+ Correct for sentence endings buried in 'chunks'. Eg. when the
+ original text contains "... foo.\nBar ...", munge_whitespace()
+ and split() will convert that to [..., "foo.", " ", "Bar", ...]
+ which has one too few spaces; this method simply changes the one
+ space to two.
+
+ Note: This function is copied from textwrap.TextWrapper and modified
+ to always use unicode.
+ """
+ i = 0
+ L = len(chunks)-1
+ patsearch = self.sentence_end_re.search
+ while i < L:
+ if chunks[i+1] == u" " and patsearch(chunks[i]):
+ chunks[i+1] = u" "
+ i += 2
+ else:
+ i += 1
+
+ def _handle_long_word(self, chunks, cur_line, cur_len, width):
+ # Figure out when indent is larger than the specified width, and make
+ # sure at least one character is stripped off on every pass
+ if width < 2:
+ space_left = chunks[-1] and self._width(chunks[-1][0]) or 1
+ else:
+ space_left = width - cur_len
+
+ # If we're allowed to break long words, then do so: put as much
+ # of the next chunk onto the current line as will fit.
+ if self.break_long_words:
+ head, rest = self._cut(chunks[-1], space_left)
+ cur_line.append(head)
+ if rest:
+ chunks[-1] = rest
+ else:
+ del chunks[-1]
+
+ # Otherwise, we have to preserve the long word intact. Only add
+ # it to the current line if there's nothing already there --
+ # that minimizes how much we violate the width constraint.
+ elif not cur_line:
+ cur_line.append(chunks.pop())
+
+ # If we're not allowed to break long words, and there's already
+ # text on the current line, do nothing. Next time through the
+ # main loop of _wrap_chunks(), we'll wind up here again, but
+ # cur_len will be zero, so the next line will be entirely
+ # devoted to the long word that we can't handle right now.
+
+ def _wrap_chunks(self, chunks):
+ lines = []
+ if self.width <= 0:
+ raise ValueError("invalid width %r (must be > 0)" % self.width)
+
+ # Arrange in reverse order so items can be efficiently popped
+ # from a stack of chunks.
+ chunks.reverse()
+
+ while chunks:
+
+ # Start the list of chunks that will make up the current line.
+ # cur_len is just the length of all the chunks in cur_line.
+ cur_line = []
+ cur_len = 0
+
+ # Figure out which static string will prefix this line.
+ if lines:
+ indent = self.subsequent_indent
+ else:
+ indent = self.initial_indent
+
+ # Maximum width for this line.
+ width = self.width - len(indent)
+
+ # First chunk on line is whitespace -- drop it, unless this
+ # is the very beginning of the text (ie. no lines started yet).
+ if self.drop_whitespace and chunks[-1].strip() == '' and lines:
+ del chunks[-1]
+
+ while chunks:
+ # Use _width instead of len for east asian width
+ l = self._width(chunks[-1])
+
+ # Can at least squeeze this chunk onto the current line.
+ if cur_len + l <= width:
+ cur_line.append(chunks.pop())
+ cur_len += l
+
+ # Nope, this line is full.
+ else:
+ break
+
+ # The current line is full, and the next chunk is too big to
+ # fit on *any* line (not just this one).
+ if chunks and self._width(chunks[-1]) > width:
+ self._handle_long_word(chunks, cur_line, cur_len, width)
+
+ # If the last chunk on this line is all whitespace, drop it.
+ if self.drop_whitespace and cur_line and not cur_line[-1].strip():
+ del cur_line[-1]
+
+ # Convert current line back to a string and store it in list
+ # of all lines (return value).
+ if cur_line:
+ lines.append(indent + u''.join(cur_line))
+
+ return lines
+
+ def _split(self, text):
+ chunks = textwrap.TextWrapper._split(self, unicode(text))
+ cjk_split_chunks = []
+ for chunk in chunks:
+ prev_pos = 0
+ for pos, char in enumerate(chunk):
+ if self._unicode_char_width(char) == 2:
+ if prev_pos < pos:
+ cjk_split_chunks.append(chunk[prev_pos:pos])
+ cjk_split_chunks.append(char)
+ prev_pos = pos+1
+ if prev_pos < len(chunk):
+ cjk_split_chunks.append(chunk[prev_pos:])
+ return cjk_split_chunks
+
+ def wrap(self, text):
+ # ensure text is unicode
+ return textwrap.TextWrapper.wrap(self, unicode(text))
+
+# -- Convenience interface ---------------------------------------------
+
+def wrap(text, width=None, **kwargs):
+ """Wrap a single paragraph of text, returning a list of wrapped lines.
+
+ Reformat the single paragraph in 'text' so it fits in lines of no
+ more than 'width' columns, and return a list of wrapped lines. By
+ default, tabs in 'text' are expanded with string.expandtabs(), and
+ all other whitespace characters (including newline) are converted to
+ space. See TextWrapper class for available keyword args to customize
+ wrapping behaviour.
+ """
+ return UTextWrapper(width=width, **kwargs).wrap(text)
+
+def fill(text, width=None, **kwargs):
+ """Fill a single paragraph of text, returning a new string.
+
+ Reformat the single paragraph in 'text' to fit in lines of no more
+ than 'width' columns, and return a new string containing the entire
+ wrapped paragraph. As with wrap(), tabs are expanded and other
+ whitespace characters converted to space. See TextWrapper class for
+ available keyword args to customize wrapping behaviour.
+ """
+ return UTextWrapper(width=width, **kwargs).fill(text)
+
More information about the bazaar-commits
mailing list