Rev 4787: Work on doing globbing, etc for all commands on Windows. in http://bazaar.launchpad.net/~jameinel/bzr/2.1.0b3-win32-shell-completion

John Arbash Meinel john at arbash-meinel.com
Wed Nov 4 22:04:07 GMT 2009


At http://bazaar.launchpad.net/~jameinel/bzr/2.1.0b3-win32-shell-completion

------------------------------------------------------------
revno: 4787
revision-id: john at arbash-meinel.com-20091104220346-ncxbtnp4d0fc6glb
parent: pqm at pqm.ubuntu.com-20091104160630-zeuyqfu2frdr4vob
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.1.0b3-win32-shell-completion
timestamp: Wed 2009-11-04 16:03:46 -0600
message:
  Work on doing globbing, etc for all commands on Windows.
  
  Implement a UnicodeShlex class that borrows the key bits of shlex.shlex.
  Most notably, it works on a Unicode string rather than a collection of
  input files, and it handles \ slightly differently, because backslashes
  are frequently part of Windows paths.
  
  It also reports whether a given section was quoted, so that we know
  whether to skip glob expansion for it.
  
  Then start expanding globs as appropriate.
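  
  A rough sketch of the intended tokenizer behaviour (output shown is
  illustrative, mirroring the new tests in the patch below). Iterating a
  UnicodeShlex yields (quoted, token) pairs:
  
      from bzrlib import win32utils
  
      # Whitespace splits unquoted words; quoted sections keep their
      # spaces and are flagged as quoted.
      list(win32utils.UnicodeShlex(u'foo "bar baz" qux'))
      # => [(False, u'foo'), (True, u'bar baz'), (False, u'qux')]
  
      # Backslashes in unquoted words are preserved, since they are
      # usually Windows path separators.
      list(win32utils.UnicodeShlex(u'x:\\a\\b.txt'))
      # => [(False, u'x:\\a\\b.txt')]
  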
-------------- next part --------------
=== modified file 'bzrlib/tests/test_win32utils.py'
--- a/bzrlib/tests/test_win32utils.py	2009-07-03 14:26:34 +0000
+++ b/bzrlib/tests/test_win32utils.py	2009-11-04 22:03:46 +0000
@@ -17,7 +17,11 @@
 import os
 import sys
 
-from bzrlib import osutils
+from bzrlib import (
+    osutils,
+    tests,
+    win32utils,
+    )
 from bzrlib.tests import (
     Feature,
     TestCase,
@@ -26,7 +30,6 @@
     UnicodeFilenameFeature,
     )
 from bzrlib.win32utils import glob_expand, get_app_path
-from bzrlib import win32utils
 
 
 # Features
@@ -261,3 +264,84 @@
         os.makedirs(u'\u1234\\.bzr')
         path = osutils.abspath(u'\u1234\\.bzr')
         win32utils.set_file_attr_hidden(path)
+
+
+
+class TestUnicodeShlex(tests.TestCase):
+
+    def assertAsTokens(self, expected, line):
+        s = win32utils.UnicodeShlex(line)
+        self.assertEqual(expected, list(s))
+
+    def test_simple(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar'), (False, u'baz')],
+                            u'foo bar baz')
+
+    def test_ignore_multiple_spaces(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar')], u'foo  bar')
+
+    def test_ignore_leading_space(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar')], u'  foo bar')
+
+    def test_ignore_trailing_space(self):
+        self.assertAsTokens([(False, u'foo'), (False, u'bar')], u'foo bar  ')
+
+    def test_posix_quotations(self):
+        self.assertAsTokens([(True, u'foo bar')], u'"foo bar"')
+        self.assertAsTokens([(True, u'foo bar')], u"'foo bar'")
+        self.assertAsTokens([(True, u'foo bar')], u"'fo''o b''ar'")
+        self.assertAsTokens([(True, u'foo bar')], u'"fo""o b""ar"')
+        self.assertAsTokens([(True, u'foo bar')], u'"fo"\'o b\'"ar"')
+
+    def test_nested_quotations(self):
+        self.assertAsTokens([(True, u'foo"" bar')], u"'foo\"\" bar'")
+        self.assertAsTokens([(True, u'foo\'\' bar')], u"\"foo'' bar\"")
+
+    def test_empty_result(self):
+        self.assertAsTokens([], u'')
+        self.assertAsTokens([], u'    ')
+
+    def test_quoted_empty(self):
+        self.assertAsTokens([(True, '')], u'""')
+        self.assertAsTokens([(True, '')], u"''")
+
+    def test_unicode_chars(self):
+        self.assertAsTokens([(False, u'f\xb5\xee'), (False, u'\u1234\u3456')],
+                             u'f\xb5\xee \u1234\u3456')
+
+    def test_newline_in_quoted_section(self):
+        self.assertAsTokens([(True, u'foo\nbar\nbaz\n')], u'"foo\nbar\nbaz\n"')
+        self.assertAsTokens([(True, u'foo\nbar\nbaz\n')], u"'foo\nbar\nbaz\n'")
+
+    def test_escape_chars(self):
+        self.assertAsTokens([(False, u'foo\\bar')], u'foo\\bar')
+
+    def test_escape_quote(self):
+        self.assertAsTokens([(True, u'foo"bar')], u'"foo\\"bar"')
+        self.assertAsTokens([(True, u'foo\\"bar')], u"'foo\\\"bar'")
+
+    def test_double_escape(self):
+        self.assertAsTokens([(True, u'foo\\bar')], u'"foo\\\\bar"')
+        self.assertAsTokens([(True, u'foo\\\\bar')], u"'foo\\\\bar'")
+        self.assertAsTokens([(False, u'foo\\\\bar')], u"foo\\\\bar")
+
+
+class Test_CommandLineToArgv(tests.TestCaseInTempDir):
+
+    def assertCommandLine(self, expected, line):
+        self.assertEqual(expected, win32utils._command_line_to_argv(line))
+
+    def test_glob_paths(self):
+        self.build_tree(['a/', 'a/b.c', 'a/c.c', 'a/c.h'])
+        self.assertCommandLine([u'a/b.c', u'a/c.c'], 'a/*.c')
+        self.build_tree(['b/', 'b/b.c', 'b/d.c', 'b/d.h'])
+        self.assertCommandLine([u'a/b.c', u'b/b.c'], '*/b.c')
+        self.assertCommandLine([u'a/b.c', u'a/c.c', u'b/b.c', u'b/d.c'],
+                               '*/*.c')
+        # Bash style, just pass through the argument if nothing matches
+        self.assertCommandLine([u'*/*.qqq'], '*/*.qqq')
+
+    def test_quoted_globs(self):
+        self.build_tree(['a/', 'a/b.c', 'a/c.c', 'a/c.h'])
+        self.assertCommandLine([u'a/*.c'], '"a/*.c"')
+        self.assertCommandLine([u'a/*.c'], "'a/*.c'")

=== modified file 'bzrlib/win32utils.py'
--- a/bzrlib/win32utils.py	2009-07-08 14:37:25 +0000
+++ b/bzrlib/win32utils.py	2009-11-04 22:03:46 +0000
@@ -19,8 +19,12 @@
 Only one dependency: ctypes should be installed.
 """
 
+import glob
 import os
+import re
+import shlex
 import struct
+import StringIO
 import sys
 
 
@@ -435,7 +439,6 @@
     """
     if not file_list:
         return []
-    import glob
     expanded_file_list = []
     for possible_glob in file_list:
         # work around bugs in glob.glob()
@@ -511,6 +514,124 @@
             trace.mutter('Unable to set hidden attribute on %r: %s', path, e)
 
 
+
+class UnicodeShlex(object):
+    """This is a very simplified version of shlex.shlex.
+
+    The main change is that it supports non-ascii input streams. The internal
+    structure is quite simplified relative to shlex.shlex, since we aren't
+    trying to handle multiple input streams, etc. In fact, we don't use a
+    file-like api either.
+    """
+
+    def __init__(self, uni_string):
+        self._input = uni_string
+        self._input_iter = iter(self._input)
+        self._whitespace_match = re.compile(u'\s').match
+        self._word_match = re.compile(u'\S').match
+        self._quote_chars = u'\'"'
+        # self._quote_match = re.compile(u'[\'"]').match
+        self._escape_match = lambda x: None # Never matches
+        self._escape = '\\'
+        # State can be
+        #   ' ' - after whitespace, starting a new token
+        #   'a' - after text, currently working on a token
+        #   '"' - after ", currently in a "-delimited quoted section
+        #   "'" - after ', currently in a '-delimited quoted section
+        #   "\" - after '\', checking the next char
+        self._state = ' '
+        self._token = [] # Current token being parsed
+
+    def _get_token(self):
+        # Were there quote chars as part of this token?
+        quoted = False
+        quoted_state = None
+        for nextchar in self._input_iter:
+            if self._state == ' ':
+                if self._whitespace_match(nextchar):
+                    # if self._token: return token
+                    continue
+                elif nextchar in self._quote_chars:
+                    self._state = nextchar # quoted state
+                elif self._word_match(nextchar):
+                    self._token.append(nextchar)
+                    self._state = 'a'
+                else:
+                    raise AssertionError('unexpected character: %r'
+                                         % (nextchar,))
+            elif self._state in self._quote_chars:
+                quoted = True
+                if nextchar == self._state: # End of quote
+                    self._state = 'a' # posix allows 'foo'bar to translate to
+                                      # foobar
+                elif self._state == '"' and nextchar == self._escape:
+                    quoted_state = self._state
+                    self._state = nextchar
+                else:
+                    self._token.append(nextchar)
+            elif self._state == self._escape:
+                if nextchar == '\\':
+                    self._token.append('\\')
+                elif nextchar == '"':
+                    self._token.append(nextchar)
+                else:
+                    self._token.append('\\' + nextchar)
+                self._state = quoted_state
+            elif self._state == 'a':
+                if self._whitespace_match(nextchar):
+                    if self._token:
+                        break # emit this token
+                    else:
+                        continue # no token to emit
+                elif nextchar in self._quote_chars:
+                    # Start a new quoted section
+                    self._state = nextchar
+                # escape?
+                elif (self._word_match(nextchar)
+                      or nextchar in self._quote_chars
+                      # or whitespace_split?
+                      ):
+                    self._token.append(nextchar)
+                else:
+                    raise AssertionError('state == "a", char: %r'
+                                         % (nextchar,))
+            else:
+                raise AssertionError('unknown state: %r' % (self._state,))
+        result = ''.join(self._token)
+        self._token = []
+        if not quoted and result == '':
+            result = None
+        return quoted, result
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        quoted, token = self._get_token()
+        if token is None:
+            raise StopIteration
+        return quoted, token
+
+
+def _command_line_to_argv(command_line):
+    """Convert a Unicode command line into a set of argv arguments.
+
+    This does wildcard expansion, etc. It is intended to make wildcards act
+    closer to how they work in posix shells, versus how they work by default on
+    Windows.
+    """
+    s = UnicodeShlex(command_line)
+    # Now that we've split the content, expand globs
+    # TODO: Use 'globbing' instead of 'glob.glob', this gives us stuff like
+    #       '**/' style globs
+    args = []
+    for is_quoted, arg in s:
+        if is_quoted:
+            args.append(arg)
+        else:
+            args.extend(glob_expand([arg]))
+    return args
+
+
 if has_ctypes and winver != 'Windows 98':
     def get_unicode_argv():
         LPCWSTR = ctypes.c_wchar_p
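
A minimal sketch of how the new _command_line_to_argv is expected to
behave, based on the Test_CommandLineToArgv tests above (output shown is
illustrative):

    from bzrlib import win32utils

    # Unquoted arguments are glob-expanded, bash-style: a pattern that
    # matches nothing is passed through unchanged. Quoted arguments are
    # never expanded.
    win32utils._command_line_to_argv(u'*.c "*.c" *.qqq')
    # => e.g. [u'a.c', u'b.c', u'*.c', u'*.qqq']
    #    (assuming a.c and b.c exist and nothing matches *.qqq)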


