Rev 4787: (jam) Fix bugs (#425510, #426410, #194450), in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Thu Nov 5 16:02:17 GMT 2009

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 4787 [merge]
revision-id: pqm at pqm.ubuntu.com-20091105160215-rjpz3n473ixk4p3i
parent: pqm at pqm.ubuntu.com-20091104160630-zeuyqfu2frdr4vob
parent: john at arbash-meinel.com-20091104223213-foo7qmu39b26zdac
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Thu 2009-11-05 16:02:15 +0000
message:
  (jam) Fix bugs (#425510, #426410, #194450),
  	add a custom win32 command line parser.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/builtins.py             builtins.py-20050830033751-fc01482b9ca23183
  bzrlib/commands.py             bzr.py-20050309040720-d10f4714595cf8c3
  bzrlib/tests/test_win32utils.py test_win32utils.py-20070713181630-8xsrjymd3e8mgw23-108
  bzrlib/win32utils.py           win32console.py-20051021033308-123c6c929d04973d
=== modified file 'NEWS'

--- a/NEWS	2009-11-04 09:52:44 +0000
+++ b/NEWS	2009-11-04 22:29:40 +0000
@@ -29,6 +29,15 @@
   allow those because XML store silently translate it anyway. (The parser
   auto-translates \r\n => \n in ways that are hard for us to catch.)
 
+* On Windows, do glob expansion at the command-line level (as is usually
+  done in bash, etc.) This means that *all* commands get glob expansion
+  (bzr status, bzr add, bzr mv, etc). It uses a custom command line
+  parser, which allows us to know if a given section was quoted. It means
+  you can now do ``bzr ignore "*.py"``. It also means that single-quoted
+  arguments are now treated as quoted, e.g. ``bzr ignore '*.py'``.
+  (John Arbash Meinel, #425510, #426410, #194450)
+
+
 Improvements
 ************
 

=== modified file 'bzrlib/builtins.py'
--- a/bzrlib/builtins.py	2009-11-03 20:24:25 +0000
+++ b/bzrlib/builtins.py	2009-11-04 22:32:13 +0000
@@ -655,7 +655,6 @@
         if base_tree:
             base_tree.lock_read()
         try:
-            file_list = self._maybe_expand_globs(file_list)
             tree, file_list = tree_files_for_add(file_list)
             added, ignored = tree.smart_add(file_list, not
                 no_recurse, action=action, save=not dry_run)

=== modified file 'bzrlib/commands.py'
--- a/bzrlib/commands.py	2009-10-14 20:02:28 +0000
+++ b/bzrlib/commands.py	2009-11-04 22:32:13 +0000
@@ -56,6 +56,7 @@
 from bzrlib.symbol_versioning import (
     deprecated_function,
     deprecated_in,
+    deprecated_method,
     suppress_deprecation_warnings,
     )
 
@@ -383,18 +384,18 @@
         # List of standard options directly supported
         self.supported_std_options = []
 
+    @deprecated_method(deprecated_in((2, 1, 0)))
     def _maybe_expand_globs(self, file_list):
         """Glob expand file_list if the platform does not do that itself.
 
+        Not used anymore, now that the bzr command-line parser globs on
+        Windows.
+
         :return: A possibly empty list of unicode paths.
 
         Introduced in bzrlib 0.18.
         """
-        if not file_list:
-            file_list = []
-        if sys.platform == 'win32':
-            file_list = win32utils.glob_expand(file_list)
-        return list(file_list)
+        return file_list
 
     def _usage(self):
         """Return single-line grammar for this command.

=== modified file 'bzrlib/tests/test_win32utils.py'
--- a/bzrlib/tests/test_win32utils.py	2009-07-03 14:26:34 +0000
+++ b/bzrlib/tests/test_win32utils.py	2009-11-04 22:12:46 +0000
@@ -17,7 +17,11 @@
 import os
 import sys
 
-from bzrlib import osutils
+from bzrlib import (
+    osutils,
+    tests,
+    win32utils,
+    )
 from bzrlib.tests import (
     Feature,
     TestCase,
@@ -26,7 +30,6 @@
     UnicodeFilenameFeature,
     )
 from bzrlib.win32utils import glob_expand, get_app_path
-from bzrlib import win32utils
 
 
 # Features
@@ -261,3 +264,90 @@
         os.makedirs(u'\u1234\\.bzr')
         path = osutils.abspath(u'\u1234\\.bzr')
         win32utils.set_file_attr_hidden(path)
+
+
+
+class TestUnicodeShlex(tests.TestCase):
+
+    def assertAsTokens(self, expected, line):
+        s = win32utils.UnicodeShlex(line)
+        self.assertEqual(expected, list(s))
+
+    def test_simple(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar'), (False, u'baz')],
+                            u'foo bar baz')
+
+    def test_ignore_multiple_spaces(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar')], u'foo  bar')
+
+    def test_ignore_leading_space(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar')], u'  foo bar')
+
+    def test_ignore_trailing_space(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar')], u'foo bar  ')
+
+    def test_posix_quotations(self):
+        self.assertAsTokens([(True, u'foo bar')], u'"foo bar"')
+        self.assertAsTokens([(True, u'foo bar')], u"'foo bar'")
+        self.assertAsTokens([(True, u'foo bar')], u"'fo''o b''ar'")
+        self.assertAsTokens([(True, u'foo bar')], u'"fo""o b""ar"')
+        self.assertAsTokens([(True, u'foo bar')], u'"fo"\'o b\'"ar"')
+
+    def test_nested_quotations(self):
+        self.assertAsTokens([(True, u'foo"" bar')], u"'foo\"\" bar'")
+        self.assertAsTokens([(True, u'foo\'\' bar')], u"\"foo'' bar\"")
+
+    def test_empty_result(self):
+        self.assertAsTokens([], u'')
+        self.assertAsTokens([], u'    ')
+
+    def test_quoted_empty(self):
+        self.assertAsTokens([(True, '')], u'""')
+        self.assertAsTokens([(True, '')], u"''")
+
+    def test_unicode_chars(self):
+        self.assertAsTokens([(False, u'f\xb5\xee'), (False, u'\u1234\u3456')],
+                             u'f\xb5\xee \u1234\u3456')
+
+    def test_newline_in_quoted_section(self):
+        self.assertAsTokens([(True, u'foo\nbar\nbaz\n')], u'"foo\nbar\nbaz\n"')
+        self.assertAsTokens([(True, u'foo\nbar\nbaz\n')], u"'foo\nbar\nbaz\n'")
+
+    def test_escape_chars(self):
+        self.assertAsTokens([(False, u'foo\\bar')], u'foo\\bar')
+
+    def test_escape_quote(self):
+        self.assertAsTokens([(True, u'foo"bar')], u'"foo\\"bar"')
+        self.assertAsTokens([(True, u'foo\\"bar')], u"'foo\\\"bar'")
+
+    def test_double_escape(self):
+        self.assertAsTokens([(True, u'foo\\bar')], u'"foo\\\\bar"')
+        self.assertAsTokens([(True, u'foo\\\\bar')], u"'foo\\\\bar'")
+        self.assertAsTokens([(False, u'foo\\\\bar')], u"foo\\\\bar")
+
+
+class Test_CommandLineToArgv(tests.TestCaseInTempDir):
+
+    def assertCommandLine(self, expected, line):
+        self.assertEqual(expected, win32utils._command_line_to_argv(line))
+
+    def test_glob_paths(self):
+        self.build_tree(['a/', 'a/b.c', 'a/c.c', 'a/c.h'])
+        self.assertCommandLine([u'a/b.c', u'a/c.c'], 'a/*.c')
+        self.build_tree(['b/', 'b/b.c', 'b/d.c', 'b/d.h'])
+        self.assertCommandLine([u'a/b.c', u'b/b.c'], '*/b.c')
+        self.assertCommandLine([u'a/b.c', u'a/c.c', u'b/b.c', u'b/d.c'],
+                               '*/*.c')
+        # Bash style, just pass through the argument if nothing matches
+        self.assertCommandLine([u'*/*.qqq'], '*/*.qqq')
+
+    def test_quoted_globs(self):
+        self.build_tree(['a/', 'a/b.c', 'a/c.c', 'a/c.h'])
+        self.assertCommandLine([u'a/*.c'], '"a/*.c"')
+        self.assertCommandLine([u'a/*.c'], "'a/*.c'")
+
+    def test_slashes_changed(self):
+        self.assertCommandLine([u'a/*.c'], '"a\\*.c"')
+        # Expands the glob, but nothing matches
+        self.assertCommandLine([u'a/*.c'], 'a\\*.c')
+        self.assertCommandLine([u'a/foo.c'], 'a\\foo.c')

=== modified file 'bzrlib/win32utils.py'
--- a/bzrlib/win32utils.py	2009-07-08 14:37:25 +0000
+++ b/bzrlib/win32utils.py	2009-11-04 22:26:25 +0000
@@ -19,8 +19,12 @@
 Only one dependency: ctypes should be installed.
 """
 
+import glob
 import os
+import re
+import shlex
 import struct
+import StringIO
 import sys
 
 
@@ -422,6 +426,26 @@
 
 
 
+def glob_one(possible_glob):
+    """Same as glob.glob().
+
+    work around bugs in glob.glob()
+    - Python bug #1001604 ("glob doesn't return unicode with ...")
+    - failing expansion for */* with non-iso-8859-* chars
+    """
+    corrected_glob, corrected = _ensure_with_dir(possible_glob)
+    glob_files = glob.glob(corrected_glob)
+
+    if not glob_files:
+        # special case to let the normal code path handle
+        # files that do not exist, etc.
+        glob_files = [possible_glob]
+    elif corrected:
+        glob_files = [_undo_ensure_with_dir(elem, corrected)
+                      for elem in glob_files]
+    return [elem.replace(u'\\', u'/') for elem in glob_files]
+
+
 def glob_expand(file_list):
     """Replacement for glob expansion by the shell.
 
@@ -435,25 +459,10 @@
     """
     if not file_list:
         return []
-    import glob
     expanded_file_list = []
     for possible_glob in file_list:
-        # work around bugs in glob.glob()
-        # - Python bug #1001604 ("glob doesn't return unicode with ...")
-        # - failing expansion for */* with non-iso-8859-* chars
-        possible_glob, corrected = _ensure_with_dir(possible_glob)
-        glob_files = glob.glob(possible_glob)
-
-        if glob_files == []:
-            # special case to let the normal code path handle
-            # files that do not exists
-            expanded_file_list.append(
-                _undo_ensure_with_dir(possible_glob, corrected))
-        else:
-            glob_files = [_undo_ensure_with_dir(elem, corrected) for elem in glob_files]
-            expanded_file_list += glob_files
-
-    return [elem.replace(u'\\', u'/') for elem in expanded_file_list]
+        expanded_file_list.extend(glob_one(possible_glob))
+    return expanded_file_list
 
 
 def get_app_path(appname):
@@ -511,6 +520,124 @@
             trace.mutter('Unable to set hidden attribute on %r: %s', path, e)
 
 
+
+class UnicodeShlex(object):
+    """This is a very simplified version of shlex.shlex.
+
+    The main change is that it supports non-ascii input streams. The internal
+    structure is quite simplified relative to shlex.shlex, since we aren't
+    trying to handle multiple input streams, etc. In fact, we don't use a
+    file-like api either.
+    """
+
+    def __init__(self, uni_string):
+        self._input = uni_string
+        self._input_iter = iter(self._input)
+        self._whitespace_match = re.compile(u'\s').match
+        self._word_match = re.compile(u'\S').match
+        self._quote_chars = u'\'"'
+        # self._quote_match = re.compile(u'[\'"]').match
+        self._escape_match = lambda x: None # Never matches
+        self._escape = '\\'
+        # State can be
+        #   ' ' - after whitespace, starting a new token
+        #   'a' - after text, currently working on a token
+        #   '"' - after ", currently in a "-delimited quoted section
+        #   "'" - after ', currently in a '-delimited quoted section
+        #   "\" - after '\', checking the next char
+        self._state = ' '
+        self._token = [] # Current token being parsed
+
+    def _get_token(self):
+        # Were there quote chars as part of this token?
+        quoted = False
+        quoted_state = None
+        for nextchar in self._input_iter:
+            if self._state == ' ':
+                if self._whitespace_match(nextchar):
+                    # if self._token: return token
+                    continue
+                elif nextchar in self._quote_chars:
+                    self._state = nextchar # quoted state
+                elif self._word_match(nextchar):
+                    self._token.append(nextchar)
+                    self._state = 'a'
+                else:
+                    raise AssertionError('wtttf?')
+            elif self._state in self._quote_chars:
+                quoted = True
+                if nextchar == self._state: # End of quote
+                    self._state = 'a' # posix allows 'foo'bar to translate to
+                                      # foobar
+                elif self._state == '"' and nextchar == self._escape:
+                    quoted_state = self._state
+                    self._state = nextchar
+                else:
+                    self._token.append(nextchar)
+            elif self._state == self._escape:
+                if nextchar == '\\':
+                    self._token.append('\\')
+                elif nextchar == '"':
+                    self._token.append(nextchar)
+                else:
+                    self._token.append('\\' + nextchar)
+                self._state = quoted_state
+            elif self._state == 'a':
+                if self._whitespace_match(nextchar):
+                    if self._token:
+                        break # emit this token
+                    else:
+                        continue # no token to emit
+                elif nextchar in self._quote_chars:
+                    # Start a new quoted section
+                    self._state = nextchar
+                # escape?
+                elif (self._word_match(nextchar)
+                      or nextchar in self._quote_chars
+                      # or whitespace_split?
+                      ):
+                    self._token.append(nextchar)
+                else:
+                    raise AssertionError('state == "a", char: %r'
+                                         % (nextchar,))
+            else:
+                raise AssertionError('unknown state: %r' % (self._state,))
+        result = ''.join(self._token)
+        self._token = []
+        if not quoted and result == '':
+            result = None
+        return quoted, result
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        quoted, token = self._get_token()
+        if token is None:
+            raise StopIteration
+        return quoted, token
+
+
+def _command_line_to_argv(command_line):
+    """Convert a Unicode command line into a set of argv arguments.
+
+    This does wildcard expansion, etc. It is intended to make wildcards act
+    closer to how they work in posix shells, versus how they work by default on
+    Windows.
+    """
+    s = UnicodeShlex(command_line)
+    # Now that we've split the content, expand globs
+    # TODO: Use 'globbing' instead of 'glob.glob', this gives us stuff like
+    #       '**/' style globs
+    args = []
+    for is_quoted, arg in s:
+        if is_quoted or not glob.has_magic(arg):
+            args.append(arg.replace(u'\\', u'/'))
+        else:
+            args.extend(glob_one(arg))
+    return args
+
+
 if has_ctypes and winver != 'Windows 98':
     def get_unicode_argv():
         LPCWSTR = ctypes.c_wchar_p
@@ -520,21 +647,19 @@
         GetCommandLine = prototype(("GetCommandLineW",
                                     ctypes.windll.kernel32))
         prototype = ctypes.WINFUNCTYPE(POINTER(LPCWSTR), LPCWSTR, POINTER(INT))
-        CommandLineToArgv = prototype(("CommandLineToArgvW",
-                                       ctypes.windll.shell32))
-        c = INT(0)
-        pargv = CommandLineToArgv(GetCommandLine(), ctypes.byref(c))
+        command_line = GetCommandLine()
         # Skip the first argument, since we only care about parameters
-        argv = [pargv[i] for i in range(1, c.value)]
+        argv = _command_line_to_argv(GetCommandLine())[1:]
         if getattr(sys, 'frozen', None) is None:
             # Invoked via 'python.exe' which takes the form:
             #   python.exe [PYTHON_OPTIONS] C:\Path\bzr [BZR_OPTIONS]
             # we need to get only BZR_OPTIONS part,
-            # so let's using sys.argv[1:] as reference to get the tail
-            # of unicode argv
-            tail_len = len(sys.argv[1:])
-            ix = len(argv) - tail_len
-            argv = argv[ix:]
+            # We already removed 'python.exe' so we remove everything up to and
+            # including the first non-option ('-') argument.
+            for idx in xrange(len(argv)):
+                if argv[idx][:1] != '-':
+                    break
+            argv = argv[idx+1:]
         return argv
 else:
     get_unicode_argv = None