bpo-45582: Port getpath[p].c to Python (GH-29041)

The getpath.py file is frozen at build time and executed as code over a namespace. It is never imported, nor is it meant to be importable or reusable. However, it should be easier to read, modify, and patch than the previous code. This commit attempts to preserve every previously tested quirk, but these may be changed in the future to better align platforms.
author: Steve Dower <steve.dower@python.org> 2021-12-03 00:08:42 +0000
committer: GitHub <noreply@github.com> 2021-12-03 00:08:42 +0000
commit: 99fcf1505218464c489d419d4500f126b6d6dc28 (patch)
tree: a9d607d854e943b3651248eadbe2f31f8c410021 /Python/fileutils.c
parent: 9f2f7e42269db74a89fc8cd74d82a875787f01d7 (diff)
download: cpython-git-99fcf1505218464c489d419d4500f126b6d6dc28.tar.gz
1 files changed, 144 insertions, 91 deletions
diff --git a/Python/fileutils.c b/Python/fileutils.c
index ac0046cdac..cae6b75b6a 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -2000,13 +2000,28 @@ _Py_wrealpath(const wchar_t *path,
 #endif
 
 
-#ifndef MS_WINDOWS
 int
 _Py_isabs(const wchar_t *path)
 {
+#ifdef MS_WINDOWS
+    const wchar_t *tail;
+    HRESULT hr = PathCchSkipRoot(path, &tail);
+    if (FAILED(hr) || path == tail) {
+        return 0;
+    }
+    if (tail == &path[1] && (path[0] == SEP || path[0] == ALTSEP)) {
+        // Exclude paths with leading SEP
+        return 0;
+    }
+    if (tail == &path[2] && path[1] == L':') {
+        // Exclude drive-relative paths (e.g. C:filename.ext)
+        return 0;
+    }
+    return 1;
+#else
     return (path[0] == SEP);
-}
 #endif
+}
 
 
 /* Get an absolute path.
@@ -2017,6 +2032,22 @@ _Py_isabs(const wchar_t *path)
 int
 _Py_abspath(const wchar_t *path, wchar_t **abspath_p)
 {
+    if (path[0] == '\0' || !wcscmp(path, L".")) {
+        wchar_t cwd[MAXPATHLEN + 1];
+        cwd[Py_ARRAY_LENGTH(cwd) - 1] = 0;
+        if (!_Py_wgetcwd(cwd, Py_ARRAY_LENGTH(cwd) - 1)) {
+            /* unable to get the current directory */
+            return -1;
+        }
+        *abspath_p = _PyMem_RawWcsdup(cwd);
+        return 0;
+    }
+
+    if (_Py_isabs(path)) {
+        *abspath_p = _PyMem_RawWcsdup(path);
+        return 0;
+    }
+
 #ifdef MS_WINDOWS
     wchar_t woutbuf[MAX_PATH], *woutbufp = woutbuf;
     DWORD result;
@@ -2028,7 +2059,7 @@ _Py_abspath(const wchar_t *path, wchar_t **abspath_p)
         return -1;
     }
 
-    if (result > Py_ARRAY_LENGTH(woutbuf)) {
+    if (result >= Py_ARRAY_LENGTH(woutbuf)) {
         if ((size_t)result <= (size_t)PY_SSIZE_T_MAX / sizeof(wchar_t)) {
             woutbufp = PyMem_RawMalloc((size_t)result * sizeof(wchar_t));
         }
@@ -2055,11 +2086,6 @@ _Py_abspath(const wchar_t *path, wchar_t **abspath_p)
     *abspath_p = _PyMem_RawWcsdup(woutbufp);
     return 0;
 #else
-    if (_Py_isabs(path)) {
-        *abspath_p = _PyMem_RawWcsdup(path);
-        return 0;
-    }
-
     wchar_t cwd[MAXPATHLEN + 1];
     cwd[Py_ARRAY_LENGTH(cwd) - 1] = 0;
     if (!_Py_wgetcwd(cwd, Py_ARRAY_LENGTH(cwd) - 1)) {
@@ -2102,7 +2128,8 @@ join_relfile(wchar_t *buffer, size_t bufsize,
              const wchar_t *dirname, const wchar_t *relfile)
 {
 #ifdef MS_WINDOWS
-    if (FAILED(PathCchCombineEx(buffer, bufsize, dirname, relfile, 0))) {
+    if (FAILED(PathCchCombineEx(buffer, bufsize, dirname, relfile, 
+        PATHCCH_ALLOW_LONG_PATHS | PATHCCH_FORCE_ENABLE_LONG_NAME_PROCESS))) {
         return -1;
     }
 #else
@@ -2180,99 +2207,125 @@ _Py_find_basename(const wchar_t *filename)
     return 0;
 }
 
-
-/* Remove navigation elements such as "." and "..".
-
-   This is mostly a C implementation of posixpath.normpath().
-   Return 0 on success.  Return -1 if "orig" is too big for the buffer. */
-int
-_Py_normalize_path(const wchar_t *path, wchar_t *buf, const size_t buf_len)
+/* In-place path normalisation. Returns the start of the normalized
+   path, which will be within the original buffer. Guaranteed to not
+   make the path longer, and will not fail. 'size' is the length of
+   the path, if known. If -1, the first null character will be assumed
+   to be the end of the path. */
+wchar_t *
+_Py_normpath(wchar_t *path, Py_ssize_t size)
 {
-    assert(path && *path != L'\0');
-    assert(*path == SEP);  // an absolute path
-    if (wcslen(path) + 1 >= buf_len) {
-        return -1;
+    if (!path[0] || size == 0) {
+        return path;
     }
+    wchar_t lastC = L'\0';
+    wchar_t *p1 = path;
+    wchar_t *pEnd = size >= 0 ? &path[size] : NULL;
+    wchar_t *p2 = path;
+    wchar_t *minP2 = path;
 
-    int dots = -1;
-    int check_leading = 1;
-    const wchar_t *buf_start = buf;
-    wchar_t *buf_next = buf;
-    // The resulting filename will never be longer than path.
-    for (const wchar_t *remainder = path; *remainder != L'\0'; remainder++) {
-        wchar_t c = *remainder;
-        buf_next[0] = c;
-        buf_next++;
-        if (c == SEP) {
-            assert(dots <= 2);
-            if (dots == 2) {
-                // Turn "/x/y/../z" into "/x/z".
-                buf_next -= 4;  // "/../"
-                assert(*buf_next == SEP);
-                // We cap it off at the root, so "/../spam" becomes "/spam".
-                if (buf_next == buf_start) {
-                    buf_next++;
-                }
-                else {
-                    // Move to the previous SEP in the buffer.
-                    while (*(buf_next - 1) != SEP) {
-                        assert(buf_next != buf_start);
-                        buf_next--;
-                    }
-                }
-            }
-            else if (dots == 1) {
-                // Turn "/./" into "/".
-                buf_next -= 2;  // "./"
-                assert(*(buf_next - 1) == SEP);
-            }
-            else if (dots == 0) {
-                // Turn "//" into "/".
-                buf_next--;
-                assert(*(buf_next - 1) == SEP);
-                if (check_leading) {
-                    if (buf_next - 1 == buf && *(remainder + 1) != SEP) {
-                        // Leave a leading "//" alone, unless "///...".
-                        buf_next++;
-                        buf_start++;
-                    }
-                    check_leading = 0;
-                }
-            }
-            dots = 0;
+#define IS_END(x) (pEnd ? (x) == pEnd : !*(x))
+#ifdef ALTSEP
+#define IS_SEP(x) (*(x) == SEP || *(x) == ALTSEP)
+#else
+#define IS_SEP(x) (*(x) == SEP)
+#endif
+#define SEP_OR_END(x) (IS_SEP(x) || IS_END(x))
+
+    // Skip leading '.\'
+    if (p1[0] == L'.' && IS_SEP(&p1[1])) {
+        path = &path[2];
+        while (IS_SEP(path) && !IS_END(path)) {
+            path++;
         }
-        else {
-            check_leading = 0;
-            if (dots >= 0) {
-                if (c == L'.' && dots < 2) {
-                    dots++;
-                }
-                else {
-                    dots = -1;
-                }
+        p1 = p2 = minP2 = path;
+        lastC = SEP;
+    }
+#ifdef MS_WINDOWS
+    // Skip past drive segment and update minP2
+    else if (p1[0] && p1[1] == L':') {
+        *p2++ = *p1++;
+        *p2++ = *p1++;
+        minP2 = p2;
+        lastC = L':';
+    }
+    // Skip past all \\-prefixed paths, including \\?\, \\.\,
+    // and network paths, including the first segment.
+    else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1])) {
+        int sepCount = 2;
+        *p2++ = SEP;
+        *p2++ = SEP;
+        p1 += 2;
+        for (; !IS_END(p1) && sepCount; ++p1) {
+            if (IS_SEP(p1)) {
+                --sepCount;
+                *p2++ = lastC = SEP;
+            } else {
+                *p2++ = lastC = *p1;
             }
         }
+        minP2 = p2;
     }
-    if (dots >= 0) {
-        // Strip any trailing dots and trailing slash.
-        buf_next -= dots + 1;  // "/" or "/." or "/.."
-        assert(*buf_next == SEP);
-        if (buf_next == buf_start) {
-            // Leave the leading slash for root.
-            buf_next++;
+#else
+    // Skip past two leading SEPs
+    else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1]) && !IS_SEP(&p1[2])) {
+        *p2++ = *p1++;
+        *p2++ = *p1++;
+        minP2 = p2;
+        lastC = SEP;
+    }
+#endif /* MS_WINDOWS */
+
+    /* if pEnd is specified, check that. Else, check for null terminator */
+    for (; !IS_END(p1); ++p1) {
+        wchar_t c = *p1;
+#ifdef ALTSEP
+        if (c == ALTSEP) {
+            c = SEP;
         }
-        else {
-            if (dots == 2) {
-                // Move to the previous SEP in the buffer.
-                do {
-                    assert(buf_next != buf_start);
-                    buf_next--;
-                } while (*(buf_next) != SEP);
+#endif
+        if (lastC == SEP) {
+            if (c == L'.') {
+                int sep_at_1 = SEP_OR_END(&p1[1]);
+                int sep_at_2 = !sep_at_1 && SEP_OR_END(&p1[2]);
+                if (sep_at_2 && p1[1] == L'.') {
+                    wchar_t *p3 = p2;
+                    while (p3 != minP2 && *--p3 == SEP) { }
+                    while (p3 != minP2 && *(p3 - 1) != SEP) { --p3; }
+                    if (p3[0] == L'.' && p3[1] == L'.' && IS_SEP(&p3[2])) {
+                        // Previous segment is also ../, so append instead
+                        *p2++ = L'.';
+                        *p2++ = L'.';
+                        lastC = L'.';
+                    } else if (p3[0] == SEP) {
+                        // Absolute path, so absorb segment
+                        p2 = p3 + 1;
+                    } else {
+                        p2 = p3;
+                    }
+                    p1 += 1;
+                } else if (sep_at_1) {
+                } else {
+                    *p2++ = lastC = c;
+                }
+            } else if (c == SEP) {
+            } else {
+                *p2++ = lastC = c;
             }
+        } else {
+            *p2++ = lastC = c;
+        } 
+    }
+    *p2 = L'\0';
+    if (p2 != minP2) {
+        while (--p2 != minP2 && *p2 == SEP) {
+            *p2 = L'\0';
         }
     }
-    *buf_next = L'\0';
-    return 0;
+#undef SEP_OR_END
+#undef IS_SEP
+#undef IS_END
+    return path;
 }
author	Steve Dower <steve.dower@python.org>	2021-12-03 00:08:42 +0000
committer	GitHub <noreply@github.com>	2021-12-03 00:08:42 +0000
commit	99fcf1505218464c489d419d4500f126b6d6dc28 (patch)
tree	a9d607d854e943b3651248eadbe2f31f8c410021 /Python/fileutils.c
parent	9f2f7e42269db74a89fc8cd74d82a875787f01d7 (diff)
download	cpython-git-99fcf1505218464c489d419d4500f126b6d6dc28.tar.gz