diff options
author | Tom Marshall <tdm@cyngn.com> | 2016-04-01 14:57:21 -0700 |
---|---|---|
committer | Tom Marshall <tdm@cyngn.com> | 2016-04-13 11:23:18 -0700 |
commit | 98ea0860a43eb4d830f44062fd1130a68f8fc59c (patch) | |
tree | d0a07d5dff3c344959758c324cf3f0acfbb86f3a | |
parent | f6164e6c266759ab0cd9e7e491326d3c0974dfbd (diff) | |
download | bionic-98ea0860a43eb4d830f44062fd1130a68f8fc59c.zip bionic-98ea0860a43eb4d830f44062fd1130a68f8fc59c.tar.gz bionic-98ea0860a43eb4d830f44062fd1130a68f8fc59c.tar.bz2 |
bionic: Teach fnmatch(3) to handle UTF-8 characters in patterns
This is NOT comprehensive UTF-8 support. It is just a quick hack to
make alternation work in bracket expressions so that the system file
manager can find files with non-ASCII names in root mode. Bracket
expressions that contain non-ASCII ranges are explicitly avoided to
avoid the complexities of unicode collation rules.
Things like the following will now work:
fnmatch("те[с][т].jpg", "тест.jpg", 0);
fnmatch("test[αβγ].txt", "testβ.txt", 0);
Things like the following will still fail:
fnmatch("тес[а-я].txt", "тест.txt", 0);
Jira: CYNGNOS-2336
Change-Id: If38dc6692bc22d20128b0cd8a7632754a496d7fb
-rw-r--r-- | libc/upstream-openbsd/lib/libc/gen/fnmatch.c | 114 |
1 files changed, 90 insertions, 24 deletions
diff --git a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c index 2c860f7..62c58f8 100644 --- a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c +++ b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c @@ -90,12 +90,72 @@ #include <ctype.h> #include <limits.h> +#include <stdint.h> + #include "charclass.h" #define RANGE_MATCH 1 #define RANGE_NOMATCH 0 #define RANGE_ERROR (-1) +static unsigned int +utf8_len(const char *s) +{ + const unsigned char *b = (const unsigned char *)s; + unsigned int len = 1; + unsigned char c; + + if ((b[0] & 0xc0) != 0xc0) { + return 1; + } + c = b[0] << 1; + while (len < 6 && (c & 0x80)) { + if ((b[len] & 0xc0) != 0x80) { + return 1; + } + c <<= 1; + ++len; + } + + return len; +} + +static void +utf8_inc(const char **s) +{ + *s += utf8_len(*s); +} + +static uint32_t +utf8_get_inc(const char **s) +{ + unsigned int len = utf8_len(*s); + const unsigned char *b = (const unsigned char *)(*s); + unsigned int n; + uint32_t ch; + + *s += len; + + if (len == 1) { + return b[0]; + } + + ch = b[0] & (0xff >> len); + for (n = 1; n < len; ++n) { + ch <<= 6; + ch |= (b[n] & 0x3f); + } + + return ch; +} + +static uint32_t +utf8_get(const char *s) +{ + const char *tmp = s; + return utf8_get_inc(&tmp); +} + static int classmatch(const char *pattern, char test, int foldcase, const char **ep) { @@ -150,7 +210,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags) const int escape = !(flags & FNM_NOESCAPE); const int slash = !!(flags & FNM_PATHNAME); int result = FNM_NOMATCH; - const char *startch; + uint32_t startch, endch, compch; int negate; if (**pattern == '[') @@ -171,7 +231,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags) if (**pattern == ']') { ++*pattern; /* XXX: Fix for MBCS character width */ - ++*string; + utf8_inc(string); return (result ^ negate); } @@ -199,10 +259,13 @@ leadingclosebrace: * "x-]" is not allowed unless escaped ("x-\]") * XXX: Fix for locale/MBCS character width */ - if (((*pattern)[1] == '-') && ((*pattern)[2] != ']')) + startch = utf8_get_inc(pattern); + compch = utf8_get(*string); + if (((*pattern)[0] == '-') && ((*pattern)[1] != ']')) { - startch = *pattern; - *pattern += (escape && ((*pattern)[2] == '\\')) ? 3 : 2; + *pattern += 1; + if (escape && **pattern == '\\') + *pattern += 1; /* NOT a properly balanced [expr] pattern, EOS terminated * or ranges containing a slash in FNM_PATHNAME mode pattern @@ -211,32 +274,35 @@ leadingclosebrace: if (!**pattern || (slash && (**pattern == '/'))) break; + endch = utf8_get_inc(pattern); + + /* Refuse to attempt collation for non-ASCII chars */ + if (startch >= 0x80 || endch >= 0x80) + continue; + /* XXX: handle locale/MBCS comparison, advance by MBCS char width */ - if ((**string >= *startch) && (**string <= **pattern)) + if ((compch >= startch) && (compch <= endch)) result = 0; - else if (nocase && (isupper((unsigned char)**string) || - isupper((unsigned char)*startch) || - isupper((unsigned char)**pattern)) - && (tolower((unsigned char)**string) >= - tolower((unsigned char)*startch)) - && (tolower((unsigned char)**string) <= - tolower((unsigned char)**pattern))) + else if (nocase && (isupper(compch) || + isupper(startch) || + isupper(endch)) + && (tolower(compch) >= + tolower(startch)) + && (tolower(compch) <= + tolower(endch))) result = 0; - ++*pattern; continue; } /* XXX: handle locale/MBCS comparison, advance by MBCS char width */ - if ((**string == **pattern)) + if (compch == startch) result = 0; - else if (nocase && (isupper((unsigned char)**string) || - isupper((unsigned char)**pattern)) - && (tolower((unsigned char)**string) == - tolower((unsigned char)**pattern))) + else if (nocase && (isupper(compch) || + isupper(startch)) + && (tolower(compch) == + tolower(startch))) result = 0; - - ++*pattern; } /* NOT a properly balanced [expr] pattern; Rewind @@ -257,7 +323,7 @@ leadingclosebrace: } /* XXX: handle locale/MBCS comparison, advance by the MBCS char width */ - if (**string == **pattern) + if (utf8_get(*string) == utf8_get(*pattern)) result = 0; else if (nocase && (isupper((unsigned char)**string) || isupper((unsigned char)**pattern)) @@ -271,8 +337,8 @@ leadingclosebrace: return result; fnmatch_ch_success: - ++*pattern; - ++*string; + utf8_inc(pattern); + utf8_inc(string); return result; } |