summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTom Marshall <tdm@cyngn.com>2016-04-01 14:57:21 -0700
committerTom Marshall <tdm@cyngn.com>2016-04-13 11:23:18 -0700
commit98ea0860a43eb4d830f44062fd1130a68f8fc59c (patch)
treed0a07d5dff3c344959758c324cf3f0acfbb86f3a
parentf6164e6c266759ab0cd9e7e491326d3c0974dfbd (diff)
downloadbionic-98ea0860a43eb4d830f44062fd1130a68f8fc59c.zip
bionic-98ea0860a43eb4d830f44062fd1130a68f8fc59c.tar.gz
bionic-98ea0860a43eb4d830f44062fd1130a68f8fc59c.tar.bz2
bionic: Teach fnmatch(3) to handle UTF-8 characters in patterns
This is NOT comprehensive UTF-8 support. It is just a quick hack to make alternation work in bracket expressions so that the system file manager can find files with non-ASCII names in root mode. Bracket expressions that contain non-ASCII ranges are explicitly avoided to avoid the complexities of unicode collation rules. Things like the following will now work: fnmatch("те[с][т].jpg", "тест.jpg", 0); fnmatch("test[αβγ].txt", "testβ.txt", 0); Things like the following will still fail: fnmatch("тес[а-я].txt", "тест.txt", 0); Jira: CYNGNOS-2336 Change-Id: If38dc6692bc22d20128b0cd8a7632754a496d7fb
-rw-r--r--libc/upstream-openbsd/lib/libc/gen/fnmatch.c114
1 files changed, 90 insertions, 24 deletions
diff --git a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c
index 2c860f7..62c58f8 100644
--- a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c
+++ b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c
@@ -90,12 +90,72 @@
#include <ctype.h>
#include <limits.h>
+#include <stdint.h>
+
#include "charclass.h"
#define RANGE_MATCH 1
#define RANGE_NOMATCH 0
#define RANGE_ERROR (-1)
+static unsigned int
+utf8_len(const char *s)
+{
+ const unsigned char *b = (const unsigned char *)s;
+ unsigned int len = 1;
+ unsigned char c;
+
+ if ((b[0] & 0xc0) != 0xc0) {
+ return 1;
+ }
+ c = b[0] << 1;
+ while (len < 6 && (c & 0x80)) {
+ if ((b[len] & 0xc0) != 0x80) {
+ return 1;
+ }
+ c <<= 1;
+ ++len;
+ }
+
+ return len;
+}
+
+static void
+utf8_inc(const char **s)
+{
+ *s += utf8_len(*s);
+}
+
+static uint32_t
+utf8_get_inc(const char **s)
+{
+ unsigned int len = utf8_len(*s);
+ const unsigned char *b = (const unsigned char *)(*s);
+ unsigned int n;
+ uint32_t ch;
+
+ *s += len;
+
+ if (len == 1) {
+ return b[0];
+ }
+
+ ch = b[0] & (0xff >> len);
+ for (n = 1; n < len; ++n) {
+ ch <<= 6;
+ ch |= (b[n] & 0x3f);
+ }
+
+ return ch;
+}
+
+static uint32_t
+utf8_get(const char *s)
+{
+ const char *tmp = s;
+ return utf8_get_inc(&tmp);
+}
+
static int
classmatch(const char *pattern, char test, int foldcase, const char **ep)
{
@@ -150,7 +210,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags)
const int escape = !(flags & FNM_NOESCAPE);
const int slash = !!(flags & FNM_PATHNAME);
int result = FNM_NOMATCH;
- const char *startch;
+ uint32_t startch, endch, compch;
int negate;
if (**pattern == '[')
@@ -171,7 +231,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags)
if (**pattern == ']') {
++*pattern;
/* XXX: Fix for MBCS character width */
- ++*string;
+ utf8_inc(string);
return (result ^ negate);
}
@@ -199,10 +259,13 @@ leadingclosebrace:
* "x-]" is not allowed unless escaped ("x-\]")
* XXX: Fix for locale/MBCS character width
*/
- if (((*pattern)[1] == '-') && ((*pattern)[2] != ']'))
+ startch = utf8_get_inc(pattern);
+ compch = utf8_get(*string);
+ if (((*pattern)[0] == '-') && ((*pattern)[1] != ']'))
{
- startch = *pattern;
- *pattern += (escape && ((*pattern)[2] == '\\')) ? 3 : 2;
+ *pattern += 1;
+ if (escape && **pattern == '\\')
+ *pattern += 1;
/* NOT a properly balanced [expr] pattern, EOS terminated
* or ranges containing a slash in FNM_PATHNAME mode pattern
@@ -211,32 +274,35 @@ leadingclosebrace:
if (!**pattern || (slash && (**pattern == '/')))
break;
+ endch = utf8_get_inc(pattern);
+
+ /* Refuse to attempt collation for non-ASCII chars */
+ if (startch >= 0x80 || endch >= 0x80)
+ continue;
+
/* XXX: handle locale/MBCS comparison, advance by MBCS char width */
- if ((**string >= *startch) && (**string <= **pattern))
+ if ((compch >= startch) && (compch <= endch))
result = 0;
- else if (nocase && (isupper((unsigned char)**string) ||
- isupper((unsigned char)*startch) ||
- isupper((unsigned char)**pattern))
- && (tolower((unsigned char)**string) >=
- tolower((unsigned char)*startch))
- && (tolower((unsigned char)**string) <=
- tolower((unsigned char)**pattern)))
+ else if (nocase && (isupper(compch) ||
+ isupper(startch) ||
+ isupper(endch))
+ && (tolower(compch) >=
+ tolower(startch))
+ && (tolower(compch) <=
+ tolower(endch)))
result = 0;
- ++*pattern;
continue;
}
/* XXX: handle locale/MBCS comparison, advance by MBCS char width */
- if ((**string == **pattern))
+ if (compch == startch)
result = 0;
- else if (nocase && (isupper((unsigned char)**string) ||
- isupper((unsigned char)**pattern))
- && (tolower((unsigned char)**string) ==
- tolower((unsigned char)**pattern)))
+ else if (nocase && (isupper(compch) ||
+ isupper(startch))
+ && (tolower(compch) ==
+ tolower(startch)))
result = 0;
-
- ++*pattern;
}
/* NOT a properly balanced [expr] pattern; Rewind
@@ -257,7 +323,7 @@ leadingclosebrace:
}
/* XXX: handle locale/MBCS comparison, advance by the MBCS char width */
- if (**string == **pattern)
+ if (utf8_get(*string) == utf8_get(*pattern))
result = 0;
else if (nocase && (isupper((unsigned char)**string) ||
isupper((unsigned char)**pattern))
@@ -271,8 +337,8 @@ leadingclosebrace:
return result;
fnmatch_ch_success:
- ++*pattern;
- ++*string;
+ utf8_inc(pattern);
+ utf8_inc(string);
return result;
}