Improved .dockerignore pattern processing to better match Docker CLI behavior (branch: c6024-improved_excludes)

Signed-off-by: Joffrey F <joffrey@docker.com>
author: Joffrey F <joffrey@docker.com> 2018-06-21 00:15:55 -0700
committer: Joffrey F <joffrey@docker.com> 2018-06-21 00:19:22 -0700
commit: 5af89904c9ad33b632305b53058ddc69190f56a9 (patch)
tree: de0201e6e7a945503da14a0e21f21c6d491d7816
parent: 5a85cad54785bae45787b2584476c286301329e2 (diff)
download: docker-py-c6024-improved_excludes.tar.gz
3 files changed, 123 insertions, 89 deletions
diff --git a/docker/utils/build.py b/docker/utils/build.py
index b644c9f..9ce0095 100644
--- a/docker/utils/build.py
+++ b/docker/utils/build.py
@@ -6,8 +6,7 @@ import tarfile
 import tempfile
 
 from ..constants import IS_WINDOWS_PLATFORM
-from fnmatch import fnmatch
-from itertools import chain
+from .fnmatch import fnmatch
 
 
 _SEP = re.compile('/|\\\\') if IS_WINDOWS_PLATFORM else re.compile('/')
@@ -44,92 +43,9 @@ def exclude_paths(root, patterns, dockerfile=None):
     if dockerfile is None:
         dockerfile = 'Dockerfile'
 
-    def split_path(p):
-        return [pt for pt in re.split(_SEP, p) if pt and pt != '.']
-
-    def normalize(p):
-        # Leading and trailing slashes are not relevant. Yes,
-        # "foo.py/" must exclude the "foo.py" regular file. "."
-        # components are not relevant either, even if the whole
-        # pattern is only ".", as the Docker reference states: "For
-        # historical reasons, the pattern . is ignored."
-        # ".." component must be cleared with the potential previous
-        # component, regardless of whether it exists: "A preprocessing
-        # step [...]  eliminates . and .. elements using Go's
-        # filepath.".
-        i = 0
-        split = split_path(p)
-        while i < len(split):
-            if split[i] == '..':
-                del split[i]
-                if i > 0:
-                    del split[i - 1]
-                    i -= 1
-            else:
-                i += 1
-        return split
-
-    patterns = (
-        (True, normalize(p[1:]))
-        if p.startswith('!') else
-        (False, normalize(p))
-        for p in patterns)
-    patterns = list(reversed(list(chain(
-        # Exclude empty patterns such as "." or the empty string.
-        filter(lambda p: p[1], patterns),
-        # Always include the Dockerfile and .dockerignore
-        [(True, split_path(dockerfile)), (True, ['.dockerignore'])]))))
-    return set(walk(root, patterns))
-
-
-def walk(root, patterns, default=True):
-    """
-    A collection of file lying below root that should be included according to
-    patterns.
-    """
-
-    def match(p):
-        if p[1][0] == '**':
-            rec = (p[0], p[1][1:])
-            return [p] + (match(rec) if rec[1] else [rec])
-        elif fnmatch(f, p[1][0]):
-            return [(p[0], p[1][1:])]
-        else:
-            return []
-
-    for f in os.listdir(root):
-        cur = os.path.join(root, f)
-        # The patterns if recursing in that directory.
-        sub = list(chain(*(match(p) for p in patterns)))
-        # Whether this file is explicitely included / excluded.
-        hit = next((p[0] for p in sub if not p[1]), None)
-        # Whether this file is implicitely included / excluded.
-        matched = default if hit is None else hit
-        sub = list(filter(lambda p: p[1], sub))
-        if os.path.isdir(cur) and not os.path.islink(cur):
-            # Entirely skip directories if there are no chance any subfile will
-            # be included.
-            if all(not p[0] for p in sub) and not matched:
-                continue
-            # I think this would greatly speed up dockerignore handling by not
-            # recursing into directories we are sure would be entirely
-            # included, and only yielding the directory itself, which will be
-            # recursively archived anyway. However the current unit test expect
-            # the full list of subfiles and I'm not 100% sure it would make no
-            # difference yet.
-            # if all(p[0] for p in sub) and matched:
-            #     yield f
-            #     continue
-            children = False
-            for r in (os.path.join(f, p) for p in walk(cur, sub, matched)):
-                yield r
-                children = True
-            # The current unit tests expect directories only under those
-            # conditions. It might be simplifiable though.
-            if (not sub or not children) and hit or hit is None and default:
-                yield f
-        elif matched:
-            yield f
+    patterns.append('!' + dockerfile)
+    pm = PatternMatcher(patterns)
+    return set(pm.walk(root))
 
 
 def build_file_list(root):
@@ -217,3 +133,110 @@ def mkbuildcontext(dockerfile):
     t.close()
     f.seek(0)
     return f
+
+
+def split_path(p):  # components of p, minus empty ones and "."
+    return [pt for pt in re.split(_SEP, p) if pt and pt != '.']
+
+
+# Heavily based on
+# https://github.com/moby/moby/blob/master/pkg/fileutils/fileutils.go
+class PatternMatcher(object):
+    def __init__(self, patterns):
+        self.patterns = list(filter(
+            lambda p: p.dirs, [Pattern(p) for p in patterns]
+        ))  # patterns that normalize to nothing (e.g. ".") are dropped
+        self.patterns.append(Pattern('!.dockerignore'))
+
+    def matches(self, filepath):
+        matched = False
+        parent_path = os.path.dirname(filepath)
+        parent_path_dirs = split_path(parent_path)
+
+        for pattern in self.patterns:
+            negative = pattern.exclusion
+            match = pattern.match(filepath)
+            if not match and parent_path != '':  # else try an ancestor dir
+                if len(pattern.dirs) <= len(parent_path_dirs):
+                    match = pattern.match(
+                        os.path.sep.join(parent_path_dirs[:len(pattern.dirs)])
+                    )
+
+            if match:
+                matched = not negative  # the last matching pattern wins
+
+        return matched
+
+    def walk(self, root):
+        def rec_walk(current_dir):
+            for f in os.listdir(current_dir):
+                fpath = os.path.join(
+                    os.path.relpath(current_dir, root), f
+                )
+                if fpath.startswith('.' + os.path.sep):  # current_dir is root
+                    fpath = fpath[2:]
+                match = self.matches(fpath)
+                if not match:
+                    yield fpath
+
+                cur = os.path.join(root, fpath)
+                if not os.path.isdir(cur) or os.path.islink(cur):
+                    continue  # only real directories are descended into
+
+                if match:
+                    # If this entry is excluded and it's a directory, we
+                    # must first check whether any exclusion pattern
+                    # (e.g. !dir/file) starts with this directory's path.
+                    # If one does, we can't skip the directory.
+                    skip = True
+
+                    for pat in self.patterns:
+                        if not pat.exclusion:
+                            continue
+                        if pat.cleaned_pattern.startswith(fpath):
+                            skip = False
+                            break
+                    if skip:
+                        continue
+                for sub in rec_walk(cur):
+                    yield sub
+
+        return rec_walk(root)
+
+
+class Pattern(object):
+    def __init__(self, pattern_str):
+        self.exclusion = False
+        if pattern_str.startswith('!'):
+            self.exclusion = True  # "!" prefix marks an exception rule
+            pattern_str = pattern_str[1:]
+
+        self.dirs = self.normalize(pattern_str)
+        self.cleaned_pattern = '/'.join(self.dirs)  # always "/"-separated
+
+    @classmethod
+    def normalize(cls, p):
+
+        # Leading and trailing slashes are not relevant. Yes,
+        # "foo.py/" must exclude the "foo.py" regular file. "."
+        # components are not relevant either, even if the whole
+        # pattern is only ".", as the Docker reference states: "For
+        # historical reasons, the pattern . is ignored."
+        # A ".." component is removed together with the preceding
+        # component (if any), regardless of whether it exists: "A
+        # preprocessing step [...] eliminates . and .. elements using
+        # Go's filepath.Clean."
+        i = 0
+        split = split_path(p)
+        while i < len(split):
+            if split[i] == '..':
+                del split[i]
+                if i > 0:
+                    del split[i - 1]
+                    i -= 1
+            else:
+                i += 1
+        return split
+
+    def match(self, filepath):  # NOTE(review): os.sep path vs "/" pattern
+        return fnmatch(filepath, self.cleaned_pattern)
diff --git a/docker/utils/fnmatch.py b/docker/utils/fnmatch.py
index 42461dd..cc940a2 100644
--- a/docker/utils/fnmatch.py
+++ b/docker/utils/fnmatch.py
@@ -111,4 +111,5 @@ def translate(pat):
                 res = '%s[%s]' % (res, stuff)
         else:
             res = res + re.escape(c)
+
     return res + '$'
diff --git a/tests/unit/utils_test.py b/tests/unit/utils_test.py
index 00456e8..467e835 100644
--- a/tests/unit/utils_test.py
+++ b/tests/unit/utils_test.py
@@ -887,12 +887,22 @@ class ExcludePathsTest(unittest.TestCase):
             )
         )
 
+    def test_double_wildcard_with_exception(self):
+        assert self.exclude(['**', '!bar', '!foo/bar']) == convert_paths(
+            set([
+                'foo/bar', 'foo/bar/a.py', 'bar', 'bar/a.py', 'Dockerfile',
+                '.dockerignore',
+            ])
+        )
+
     def test_include_wildcard(self):
+        # This may be surprising but it matches the CLI's behavior
+        # (tested with 18.05.0-ce on linux)
         base = make_tree(['a'], ['a/b.py'])
         assert exclude_paths(
             base,
             ['*', '!*/b.py']
-        ) == convert_paths(['a/b.py'])
+        ) == set()
 
     def test_last_line_precedence(self):
         base = make_tree(
author	Joffrey F <joffrey@docker.com>	2018-06-21 00:15:55 -0700
committer	Joffrey F <joffrey@docker.com>	2018-06-21 00:19:22 -0700
commit	5af89904c9ad33b632305b53058ddc69190f56a9 (patch)
tree	de0201e6e7a945503da14a0e21f21c6d491d7816
parent	5a85cad54785bae45787b2584476c286301329e2 (diff)
download	docker-py-c6024-improved_excludes.tar.gz