From 98ea0860a43eb4d830f44062fd1130a68f8fc59c Mon Sep 17 00:00:00 2001 From: Tom Marshall Date: Fri, 1 Apr 2016 14:57:21 -0700 Subject: bionic: Teach fnmatch(3) to handle UTF-8 characters in patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is NOT comprehensive UTF-8 support. It is just a quick hack to make alternation work in bracket expressions so that the system file manager can find files with non-ASCII names in root mode. Bracket expressions that contain non-ASCII ranges are explicitly avoided to avoid the complexities of unicode collation rules. Things like the following will now work: fnmatch("те[с][т].jpg", "тест.jpg", 0); fnmatch("test[αβγ].txt", "testβ.txt", 0); Things like the following will still fail: fnmatch("тес[а-я].txt", "тест.txt", 0); Jira: CYNGNOS-2336 Change-Id: If38dc6692bc22d20128b0cd8a7632754a496d7fb --- libc/upstream-openbsd/lib/libc/gen/fnmatch.c | 114 +++++++++++++++++++++------ 1 file changed, 90 insertions(+), 24 deletions(-) diff --git a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c index 2c860f7..62c58f8 100644 --- a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c +++ b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c @@ -90,12 +90,72 @@ #include #include +#include + #include "charclass.h" #define RANGE_MATCH 1 #define RANGE_NOMATCH 0 #define RANGE_ERROR (-1) +static unsigned int +utf8_len(const char *s) +{ + const unsigned char *b = (const unsigned char *)s; + unsigned int len = 1; + unsigned char c; + + if ((b[0] & 0xc0) != 0xc0) { + return 1; + } + c = b[0] << 1; + while (len < 6 && (c & 0x80)) { + if ((b[len] & 0xc0) != 0x80) { + return 1; + } + c <<= 1; + ++len; + } + + return len; +} + +static void +utf8_inc(const char **s) +{ + *s += utf8_len(*s); +} + +static uint32_t +utf8_get_inc(const char **s) +{ + unsigned int len = utf8_len(*s); + const unsigned char *b = (const unsigned char *)(*s); + unsigned int n; + uint32_t ch; + + *s += len; + + if (len == 1) { + return b[0]; + } + + ch = b[0] & (0xff >> len); + for (n = 1; n < len; ++n) { + ch <<= 6; + ch |= (b[n] & 0x3f); + } + + return ch; +} + +static uint32_t +utf8_get(const char *s) +{ + const char *tmp = s; + return utf8_get_inc(&tmp); +} + static int classmatch(const char *pattern, char test, int foldcase, const char **ep) { @@ -150,7 +210,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags) const int escape = !(flags & FNM_NOESCAPE); const int slash = !!(flags & FNM_PATHNAME); int result = FNM_NOMATCH; - const char *startch; + uint32_t startch, endch, compch; int negate; if (**pattern == '[') @@ -171,7 +231,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags) if (**pattern == ']') { ++*pattern; /* XXX: Fix for MBCS character width */ - ++*string; + utf8_inc(string); return (result ^ negate); } @@ -199,10 +259,13 @@ leadingclosebrace: * "x-]" is not allowed unless escaped ("x-\]") * XXX: Fix for locale/MBCS character width */ - if (((*pattern)[1] == '-') && ((*pattern)[2] != ']')) + startch = utf8_get_inc(pattern); + compch = utf8_get(*string); + if (((*pattern)[0] == '-') && ((*pattern)[1] != ']')) { - startch = *pattern; - *pattern += (escape && ((*pattern)[2] == '\\')) ? 3 : 2; + *pattern += 1; + if (escape && **pattern == '\\') + *pattern += 1; /* NOT a properly balanced [expr] pattern, EOS terminated * or ranges containing a slash in FNM_PATHNAME mode pattern @@ -211,32 +274,35 @@ leadingclosebrace: if (!**pattern || (slash && (**pattern == '/'))) break; + endch = utf8_get_inc(pattern); + + /* Refuse to attempt collation for non-ASCII chars */ + if (startch >= 0x80 || endch >= 0x80) + continue; + /* XXX: handle locale/MBCS comparison, advance by MBCS char width */ - if ((**string >= *startch) && (**string <= **pattern)) + if ((compch >= startch) && (compch <= endch)) result = 0; - else if (nocase && (isupper((unsigned char)**string) || - isupper((unsigned char)*startch) || - isupper((unsigned char)**pattern)) - && (tolower((unsigned char)**string) >= - tolower((unsigned char)*startch)) - && (tolower((unsigned char)**string) <= - tolower((unsigned char)**pattern))) + else if (nocase && (isupper(compch) || + isupper(startch) || + isupper(endch)) + && (tolower(compch) >= + tolower(startch)) + && (tolower(compch) <= + tolower(endch))) result = 0; - ++*pattern; continue; } /* XXX: handle locale/MBCS comparison, advance by MBCS char width */ - if ((**string == **pattern)) + if (compch == startch) result = 0; - else if (nocase && (isupper((unsigned char)**string) || - isupper((unsigned char)**pattern)) - && (tolower((unsigned char)**string) == - tolower((unsigned char)**pattern))) + else if (nocase && (isupper(compch) || + isupper(startch)) + && (tolower(compch) == + tolower(startch))) result = 0; - - ++*pattern; } /* NOT a properly balanced [expr] pattern; Rewind @@ -257,7 +323,7 @@ leadingclosebrace: } /* XXX: handle locale/MBCS comparison, advance by the MBCS char width */ - if (**string == **pattern) + if (utf8_get(*string) == utf8_get(*pattern)) result = 0; else if (nocase && (isupper((unsigned char)**string) || isupper((unsigned char)**pattern)) @@ -271,8 +337,8 @@ leadingclosebrace: return result; fnmatch_ch_success: - ++*pattern; - ++*string; + utf8_inc(pattern); + utf8_inc(string); return result; } -- cgit v1.1