[MERGE] Improvements to is_ignored, factored out glob stuff.

Tue Jan 17 22:44:32 GMT 2006

Jan Hudec wrote:
> Hello All,
> 
> Commited http://drak.ucw.cz/~bulb/bzr/bzr.ignore revision 1527, which factors out
> the glob stuff into a separate file.
> 

...

> 
> With factoring out the glob stuff, I made it independent from the ignore
> logic (with helper function for it), but this feature remained. The point is,
> that *//*, */./* and */* match the same files in shell. So they are
> canonicalized, because otherwise the former two would usually match nothing
> in bzr. But to avoid overcomplicating the patterns, they must be matched
> against canonical paths -- which never starts with './' (I have described
> this in help for bzrlib.glob.translate function).

I'm fine with this.

> 
> I attach a patch for review again (cumulative for the whole change -- use the
> branch to see individual steps). Please merge from above url
> (http://drak.ucw.cz/~bulb/bzr/bzr.ignore) 
> 
> 
> === added file 'bzrlib/glob.py'
> --- /dev/null	
> +++ bzrlib/glob.py	
> @@ -0,0 +1,171 @@
...

> +class replacer(object):
> +    """Do a multiple-pattern substitution.
> +
> +    The patterns and substitutions are combined into one, so the result of
> +    one replacement is never substituted again. Add the patterns and
> +    replacements via the add method and then call the object. The patterns
> +    must not contain capturing groups.
> +    """
> +    _expand = re.compile(ur'\\&')
> +    def __init__(self):
> +        self._pat = None
> +        self._pats = []
> +        self._funs = []
> +

I think PEP8 says there should be a blank line between the final """ of
the doc string, and the _expand, and then another one before def. But
Robert is the PEP8 guru/grammer nazi :)

> +    def add(self, pat, fun):
> +        r"""Add a pattern and replacement.
> +
> +        The pattern must not contain capturing groups.
> +        The replacement might be either a string template in which \& will be
> +        replaced with the match, or a function that will get the matching text as
> +        argument. It does not get match object, because capturing is forbidden
> +        anyway.
> +        """
> +        self._pat = None
> +        self._pats.append(pat)
> +        self._funs.append(fun)
> +
> +    def __call__(self, text):
> +        if self._pat is None:
> +            self._pat = re.compile(
> +                    u'|'.join([u'(%s)' % p for p in self._pats]),
> +                    re.UNICODE)
> +        return self._pat.sub(self._do_sub, text)
> +
> +    def _do_sub(self, m):
> +        fun = self._funs[m.lastindex - 1]
> +        if hasattr(fun, '__call__'):
> +            return fun(m.group(0))
> +        else:
> +            return self._expand.sub(m.group(0), fun)
> +
> +
> +_sub_named = replacer()
> +_sub_named.add(u'\[:digit:\]', ur'\d')
> +_sub_named.add(u'\[:space:\]', ur'\s')
> +_sub_named.add(u'\[:alnum:\]', ur'\w')
> +_sub_named.add(u'\[:ascii:\]', ur'\0-\x7f')
> +_sub_named.add(u'\[:blank:\]', ur' \t')
> +_sub_named.add(u'\[:cntrl:\]', ur'\0-\x1f\x7f-\x9f')
> +# I would gladly support many others like [:alpha:], [:graph:] and [:print:]
> +# but python regular expression engine does not provide their equivalents.
> +
> +def _sub_group(m):
> +    if m[1] == u'!':
> +        m[1] = u'^'
> +    return u'[' + _sub_named(m[1:-1]) + u']'
> +
> +def _sub_re(m):
> +    return m[3:]
> +
> +_sub_glob = replacer()
> +_sub_glob.add(ur'^RE:.*', _sub_re) # RE:<anything> is a regex.

Interesting. I like the idea, I'm not sure if it will ever be used.
Since colons are uncommon, I doubt people will try to match "RE:*" normally.

> +_sub_glob.add(ur'(?:(?<=/)|^)(?:\.?/)+', u'') # Canonicalize
> +_sub_glob.add(ur'\\.', ur'\&') # keep anything backslashed...
> +_sub_glob.add(ur'[(){}|^$+.]', ur'\\&') # escape specials...
> +_sub_glob.add(ur'(?:(?<=/)|^)\*\*\*/', ur'(?:[^/]+/)*') # ***, includes .*
> +_sub_glob.add(ur'(?:(?<=/)|^)\*\*/', ur'(?:[^./][^/]*/)*') # **, zsh-style
> +_sub_glob.add(ur'(?:(?<=/)|^)\*\.', ur'(?:[^./][^/]*)\.') # *. after / or at start
> +_sub_glob.add(ur'(?:(?<=/)|^)\*', ur'(?:[^./][^/]*)?') # * after / or at start
> +_sub_glob.add(ur'\*', ur'[^/]*') # * elsewhere
> +_sub_glob.add(ur'(?:(?<=/)|^)\?', ur'[^./]') # ? after / or at start
> +_sub_glob.add(ur'\?', ur'[^/]') # ? elsewhere
> +_sub_glob.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # character group
> +
> +
> +def translate(pat):
> +    r"""Convert a unix glob to regular expression.
> +    
> +    Globs implement *, ?, [] character groups (both ! and ^ negate), named
> +    character classes [:digit:], [:space:], [:alnum:], [:ascii:], [:blank:],
> +    [:cntrl:] (use /inside/ []), zsh-style **/, ***/ which includes hidden
> +    directories and escapes regular expression special characters.
> +
> +    If a pattern starts with RE:, the rest is considered to be regular
> +    expression.
> +
> +    During conversion the regexp is canonicalized and must be matched against
> +    canonical path. The path must NOT start with '/' and must not contain
> +    '.' components nor multiple '/'es.
> +
> +    Pattern is returned as string.
> +    """
> +    return _sub_glob(pat) + u'$'
> +
> +
> +def translate_list(pats, wrap=u'(?:%s)'):
> +    """Convert a list of unix globs to a regular expression.
> +
> +    The pattern is returned as string. The wrap is % format applied to each
> +    individual glob pattern. It has to apply group.
> +
> +    See translate for glob semantics.
> +    """
> +    return u'|'.join([wrap % translate(x) for x in pats])
> +
> +    """
> +    Patterns containing '/' need to match whole path; others match
> +    against only the last component - as per requirement of
> +    WorkingTree.is_ignored().
> +    """
> +

Why is this docstring, code docstring? Did you mean to write this as a
comment, or should it be part of the rest of the doc string?

> +def compile(pat):
> +    """Convert a unix glob to regular expression and compile it.
> +
> +    This converts a glob to regex via translate and compiles the regex. See
> +    translate for glob semantics.
> +    """
> +    return re.compile(translate(pat), re.UNICODE)
> +
> +
> +def compile_list(pats, wrap=u'(?:%s)'):
> +    """Convert a list of unix globs to a regular expression and compile it.
> +
> +    The pattern is returned as compiled regex object. The wrap is % format
> +    applied to each individual glob pattern. It has to apply group.
> +    """
> +    return re.compile(translate_list(pats, wrap), re.UNICODE)
> +
> +
> +def anchor_glob(pat):
> +    if '/' in pat:
> +        return pat
> +    else:
> +        return u'***/' + pat
> +
> +
> +def anchor_globs(pats):
> +    """Convert file-globs to path globs as used in ignore patterns.
> +
> +    If a pattern contains '/' or starts with './', it should match whole path
> +    from root (optional './' is stripped), otherwise it should match the
> +    filename only. Thus such patterns are prefixed with '***/'.
> +
> +    """
> +    return [anchor_glob(pat) for pat in pats]
> 

You probably don't need to say 'If a pattern contains '/' or starts with
'./' since './' contains '/'. Just
'If a pattern contains '/' it should match the whole path from root'

I'm a little curious that you say './' is stripped when as near as I can
tell, no such thing is done.

> === added file 'bzrlib/tests/test_ignore.py'
> --- /dev/null	
> +++ bzrlib/tests/test_ignore.py	
> @@ -0,0 +1,191 @@
> +# Copyright (C) 2006 by Jan Hudec
> +
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> +
> +import re
> +
> +import logging
> +from cStringIO import StringIO
> +import bzrlib.trace
> +
> +from bzrlib.branch import Branch
> +from bzrlib.tests import TestCase, TestCaseInTempDir
> +
> +from bzrlib.glob import anchor_glob, compile
> +
> +class TestGlobs(TestCase):
> +
> +    def assertMatch(self, glob, names):
> +        rx = compile(anchor_glob(glob))
> +        for name in names:
> +            if not rx.match(name):
> +                raise AssertionError(repr(
> +                        u'name "%s" does not match glob "%s" (rx="%s")' %
> +                        (name, glob, rx.pattern)))
> +
> +    def assertNotMatch(self, glob, names):
> +        rx = compile(anchor_glob(glob))
> +        for name in names:
> +            if rx.match(name):
> +                raise AssertionError(repr(
> +                        u'name "%s" does match glob "%s" (rx="%s")' %
> +                        (name, glob, rx.pattern)))

For clarity, I wrote an assertMatch that takes a list of matching
patterns and a list of non-matching patterns.
So that instead of writing:

self.assertMatch('pattern', [a, b, c])
self.assertNotMatch('pattern', [d, e, f])
You could just write
self.assertMatch('pattern', [a, b, c], [d, e, f])

I also wouldn't raise AssertionError. I would tend to write it as:

def assertMatching(self, glob, matching, non_matching):
  rx = compile(anchor_glob(glob))
  for path in matching:
    self.failUnless(rx.match(path),
        'name %r does not match glob %r (rx=%r)'
        % (path, glob, rx.pattern))
  for path in non_matching:
    self.failIf(rx.match(path),
        'name %r does match glob %r (rx=%r)'
        % (path, glob, rx.pattern))

I'm not going to require you to rewrite all of the
assertMatching/assertNotMatching, but I would like to see the
'self.failIf' rather than raise AssertionError.

> +
> +    def test_char_groups(self):
> +        # The definition of digit this uses includes arabic digits from
> +        # non-latin scripts (arabic, indic, etc.) and subscript/superscript
> +        # digits, but neither roman numerals nor vulgar fractions.
> +        self.assertMatch(u'[[:digit:]]', [u'0', u'5', u'\u0663', u'\u06f9',
> +                u'\u0f21', u'\xb9'])
> +        self.assertNotMatch(u'[[:digit:]]', [u'T', u'q', u' ', u'\u8336'])
> +
> +        self.assertMatch(u'[[:space:]]', [u' ', u'\t', u'\n', u'\xa0',
> +                u'\u2000', u'\u2002'])
> +        self.assertMatch(u'[^[:space:]]', [u'a', u'-', u'\u8336'])
> +        self.assertNotMatch(u'[[:space:]]', [u'a', u'-', u'\u8336'])
> +
> +        self.assertMatch(u'[[:alnum:]]', [u'a', u'Z', u'\u017e', u'\u8336'])
> +        self.assertMatch(u'[^[:alnum:]]', [u':', u'-', u'\u25cf'])
> +        self.assertNotMatch(u'[[:alnum:]]', [u':', u'-', u'\u25cf'])
> +
> +        self.assertMatch(u'[[:ascii:]]', [u'a', u'Q', u'^'])
> +        self.assertNotMatch(u'[^[:ascii:]]', [u'a', u'Q', u'^'])
> +        self.assertNotMatch(u'[[:ascii:]]', [u'\xcc', u'\u8336'])
> +
> +        self.assertMatch(u'[[:blank:]]', [u'\t'])
> +        self.assertNotMatch(u'[^[:blank:]]', [u'\t'])
> +        self.assertNotMatch(u'[[:blank:]]', [u'x', u'y', u'z'])
> +
> +        self.assertMatch(u'[[:cntrl:]]', [u'\b', u'\t', '\x7f'])
> +        self.assertNotMatch(u'[[:cntrl:]]', [u'a', u'Q', u'\u8336'])
> +
> +        self.assertMatch(u'[a-z]', [u'a', u'q', u'f'])
> +        self.assertNotMatch(u'[a-z]', [u'A', u'Q', u'F'])
> +        self.assertMatch(u'[^a-z]', [u'A', u'Q', u'F'])
> +        self.assertNotMatch(u'[^a-z]', [u'a', u'q', u'f'])
> +
> +        self.assertMatch(ur'[\x20-\x30\u8336]', [u'\040', u'\044', u'\u8336'])
> +        self.assertNotMatch(ur'[^\x20-\x30\u8336]', [u'\040', u'\044', u'\u8336'])
> +
> +    def test_question_mark(self):
> +        self.assertMatch(u'?foo', [u'xfoo', u'bar/xfoo', u'bar/\u8336foo'])
> +        self.assertNotMatch(u'?foo', [u'.foo', u'bar/.foo', u'bar/foo',
> +                u'foo'])
> +
> +        self.assertMatch(u'foo?bar', [u'fooxbar', u'foo.bar',
> +                u'foo\u8336bar', u'qyzzy/foo.bar'])
> +        self.assertNotMatch(u'foo?bar', [u'foo/bar'])
> +
> +        self.assertMatch(u'foo/?bar', [u'foo/xbar'])
> +        self.assertNotMatch(u'foo/?bar', [u'foo/.bar', u'foo/bar',
> +                u'bar/foo/xbar'])
> +
> +    def test_asterisk(self):
> +        self.assertMatch(u'*.x', [u'foo/bar/baz.x', u'\u8336/Q.x'])
> +        self.assertNotMatch(u'*.x', [u'.foo.x', u'bar/.foo.x', u'.x'])
> +
> +        self.assertMatch(u'x*x', [u'xx', u'x.x', u'x\u8336..x',
> +                u'\u8336/x.x'])
> +        self.assertNotMatch(u'x*x', [u'x/x', u'bar/x/bar/x', u'bax/abaxab'])
> +
> +        self.assertMatch(u'*/*x', [u'\u8336/x', u'foo/bax', u'x/a.x'])
> +        self.assertNotMatch(u'*/*x', [u'.foo/x', u'\u8336/.x', u'foo/.q.x',
> +                u'foo/bar/bax'])
> +
> +    def test_double_asterisk(self):
> +        self.assertMatch(u'**/\u8336', [u'\u8336', u'foo/\u8336',
> +                u'q/y/z/z/y/\u8336'])
> +        self.assertNotMatch(u'**/\u8336', [u'that\u8336',
> +                u'q/y/z/.z/y/\u8336', u'a/b\u8336'])
> +
> +        self.assertMatch(u'x**x', [u'xaaargx', u'boo/x-x'])
> +        self.assertNotMatch(u'x**x', [u'x/y/z/bax', u'boo/x/x'])
> +        # ... because it's not **, but rather **/ and it is only recognized
> +        # after / or at the begining.
> +
> +        self.assertMatch(u'x**/x', [u'xaarg/x', u'x/x'])
> +        self.assertNotMatch(u'x**/x', [u'xa/b/x', u'foo/xfoo/x'])
> +
> +    def test_leading_dotslash(self):
> +        self.assertMatch(u'./foo', [u'foo'])
> +        self.assertNotMatch(u'./foo', [u'\u8336/foo', u'barfoo', u'x/y/foo'])
> +
> +    def test_end_anchor(self):
> +        self.assertMatch(u'*.333', [u'foo.333'])
> +        self.assertNotMatch(u'*.3', [u'foo.333'])
> +
> +class TestBzrignore(TestCaseInTempDir):
> +
> +    shape = None
> +

PEP8: Two whitespaces between classes.

> +    def setUp(self):
> +        super(TestBzrignore, self).setUp()
> +        self.branch = Branch.initialize(u'.')
> +        self.wt = self.branch.working_tree()
> +
> +    def putIgnores(self, ignores):
> +        bzrignore = file(u'.bzrignore', 'wb')
> +        bzrignore.write(ignores)
> +

The rest looks good to me.

John
=:->
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 256 bytes
Desc: OpenPGP digital signature
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20060117/01fc3a2b/attachment.pgp