This is a dump from Google's source control system of the change that removed UCS-2 support from RE2. As the explanation below says, UCS-2 mode is fundamentally at odds with things like ^ and $, so it never really worked very well. But if you are interested in using it without those operators, it did work for that. It assumed that the UCS-2 data was in the native host byte order. If you are interested in adding UCS-2 mode back, this patch might be a good starting point. Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15 Retire UCS-2 mode. I added it as an experiment for V8, but it requires 2-byte lookahead to do completely, and RE2 has 1-byte lookahead (enough for UTF-8) as a fairly deep fundamental assumption, so it did not support ^ or $. ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ==== re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319 cap_[0] = p; if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. return true; - if (prog_->flags() & Regexp::UCS2) - p++; } return false; } ==== re2/compile.cc#17 - re2/compile.cc#18 ==== re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100 // Input encodings. enum Encoding { kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) - kEncodingUCS2, // UCS-2 (0-FFFF), native byte order kEncodingLatin1, // Latin1 (0-FF) }; re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); void Add_80_10ffff(); - void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase); - void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, - uint8 lo2, uint8 hi2, bool fold2); // New suffix that matches the byte range lo-hi, then goes to next. Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next); re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477 // Converts rune range lo-hi into a fragment that recognizes // the bytes that would make up those runes in the current - // encoding (Latin 1, UTF-8, or UCS-2). 
+ // encoding (Latin 1 or UTF-8). // This lets the machine work byte-by-byte even when // using multibyte encodings. re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489 case kEncodingLatin1: AddRuneRangeLatin1(lo, hi, foldcase); break; - case kEncodingUCS2: - AddRuneRangeUCS2(lo, hi, foldcase); - break; } } re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501 AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL)); } - // Test whether 16-bit values are big or little endian. - static bool BigEndian() { - union { - char byte[2]; - int16 endian; - } u; - - u.byte[0] = 1; - u.byte[1] = 2; - return u.endian == 0x0102; - } - - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, - uint8 lo2, uint8 hi2, bool fold2) { - Inst* ip; - if (reversed_) { - ip = RuneByteSuffix(lo1, hi1, fold1, NULL); - ip = RuneByteSuffix(lo2, hi2, fold2, ip); - } else { - ip = RuneByteSuffix(lo2, hi2, fold2, NULL); - ip = RuneByteSuffix(lo1, hi1, fold1, ip); - } - AddSuffix(ip); - } - - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) { - if (lo > hi || lo > 0xFFFF) - return; - if (hi > 0xFFFF) - hi = 0xFFFF; - - // We'll assemble a pattern assuming big endian. - // If the machine isn't, tell Cat to reverse its arguments. - bool oldreversed = reversed_; - if (!BigEndian()) { - reversed_ = !oldreversed; - } - - // Split into bytes. - int lo1 = lo >> 8; - int lo2 = lo & 0xFF; - int hi1 = hi >> 8; - int hi2 = hi & 0xFF; - - if (lo1 == hi1) { - // Easy case: high bits are same in both. - // Only do ASCII case folding on the second byte if the top byte is 00. - AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase); - } else { - // Harder case: different second byte ranges depending on first byte. - - // Initial fragment. - if (lo2 > 0) { - AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase); - lo1++; - } - - // Trailing fragment. - if (hi2 < 0xFF) { - AddUCS2Pair(hi1, hi1, false, 0, hi2, false); - hi1--; - } - - // Inner ranges. 
- if (lo1 <= hi1) { - AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false); - } - } - - // Restore reverse setting. - reversed_ = oldreversed; - } - // Table describing how to make a UTF-8 matching machine // for the rune range 80-10FFFF (Runeself-Runemax). // This range happens frequently enough (for example /./ and /[^a-z]/) re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634 Frag Compiler::Literal(Rune r, bool foldcase) { switch (encoding_) { - default: // UCS-2 or something new - BeginRange(); - AddRuneRange(r, r, foldcase); - return EndRange(); + default: + return kNullFrag; case kEncodingLatin1: return ByteRange(r, r, foldcase); re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850 if (re->parse_flags() & Regexp::Latin1) c.encoding_ = kEncodingLatin1; - else if (re->parse_flags() & Regexp::UCS2) - c.encoding_ = kEncodingUCS2; c.reversed_ = reversed; if (max_mem <= 0) { c.max_inst_ = 100000; // more than enough re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905 c.prog_->set_start_unanchored(c.prog_->start()); } else { Frag dot; - if (c.encoding_ == kEncodingUCS2) { - dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false)); - } else { - dot = c.ByteRange(0x00, 0xFF, false); - } + dot = c.ByteRange(0x00, 0xFF, false); Frag dotloop = c.Star(dot, true); Frag unanchored = c.Cat(dotloop, all); c.prog_->set_start_unanchored(unanchored.begin); ==== re2/nfa.cc#8 - re2/nfa.cc#9 ==== re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431 const char* bp = context.begin(); int c = -1; int wasword = 0; - bool ucs2 = prog_->flags() & Regexp::UCS2; if (text.begin() > context.begin()) { c = text.begin()[-1] & 0xFF; re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497 // If there's a required first byte for an unanchored search // and we're not in the middle of any possible matches, // use memchr to search for the byte quickly. 
- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 && + if (!anchored && first_byte_ >= 0 && runq->size() == 0 && p < text.end() && (p[0] & 0xFF) != first_byte_) { p = reinterpret_cast<const char*>(memchr(p, first_byte_, text.end() - p)); re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514 flag = Prog::EmptyFlags(context, p); } - // In UCS-2 mode, if we need to start a new thread, - // make sure to do it on an even boundary. - if(ucs2 && runq->size() == 0 && - (p - context.begin()) % 2 && p < text.end()) { - p++; - flag = Prog::EmptyFlags(context, p); - } - // Steal match storage (cleared but unused as of yet) // temporarily to hold match boundaries for new thread. - // In UCS-2 mode, only start the thread on a 2-byte boundary. - if(!ucs2 || (p - context.begin()) % 2 == 0) { - match_[0] = p; - AddToThreadq(runq, start_, flag, p, match_); - match_[0] = NULL; - } + match_[0] = p; + AddToThreadq(runq, start_, flag, p, match_); + match_[0] = NULL; } // If all the threads have died, stop early. ==== re2/parse.cc#22 - re2/parse.cc#23 ==== re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165 status_(status), stacktop_(NULL), ncap_(0) { if (flags_ & Latin1) rune_max_ = 0xFF; - else if (flags & UCS2) - rune_max_ = 0xFFFF; else rune_max_ = Runemax; } re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374 bool Regexp::ParseState::PushCarat() { if (flags_ & OneLine) { return PushSimpleOp(kRegexpBeginText); - } else { - if (flags_ & UCS2) { - status_->set_code(kRegexpUnsupported); - status_->set_error_arg("multiline ^ in UCS-2 mode"); - return false; - } - return PushSimpleOp(kRegexpBeginLine); } + return PushSimpleOp(kRegexpBeginLine); } // Pushes a \b or \B onto the stack. 
bool Regexp::ParseState::PushWordBoundary(bool word) { - if (flags_ & UCS2) { - status_->set_code(kRegexpUnsupported); - status_->set_error_arg("\\b or \\B in UCS-2 mode"); - return false; - } if (word) return PushSimpleOp(kRegexpWordBoundary); return PushSimpleOp(kRegexpNoWordBoundary); re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389 bool ret = PushSimpleOp(kRegexpEndText); flags_ = oflags; return ret; - } - if (flags_ & UCS2) { - status_->set_code(kRegexpUnsupported); - status_->set_error_arg("multiline $ in UCS-2 mode"); - return false; } return PushSimpleOp(kRegexpEndLine); } ==== re2/re2.cc#34 - re2/re2.cc#35 ==== re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84 return RE2::ErrorBadUTF8; case re2::kRegexpBadNamedCapture: return RE2::ErrorBadNamedCapture; - case re2::kRegexpUnsupported: - return RE2::ErrorUnsupported; } return RE2::ErrorInternal; } re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125 break; case RE2::Options::EncodingLatin1: flags |= Regexp::Latin1; - break; - case RE2::Options::EncodingUCS2: - flags |= Regexp::UCS2; break; } ==== re2/re2.h#36 - re2/re2.h#37 ==== re2/re2.h#36:246,252 - re2/re2.h#37:246,251 ErrorBadUTF8, // invalid UTF-8 in regexp ErrorBadNamedCapture, // bad named capture group ErrorPatternTooLarge, // pattern too large (compile failed) - ErrorUnsupported, // unsupported feature (in UCS-2 mode) }; // Predefined common options. re2/re2.h#36:570,576 - re2/re2.h#37:569,574 enum Encoding { EncodingUTF8 = 1, - EncodingUCS2, // 16-bit Unicode 0-FFFF only EncodingLatin1 }; ==== re2/regexp.cc#15 - re2/regexp.cc#16 ==== re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329 // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { - // Don't even bother for UCS-2; it's time to throw that code away. - if (parse_flags_ & UCS2) - return false; - // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. 
a literal char or string ==== re2/regexp.h#20 - re2/regexp.h#21 ==== re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192 kRegexpBadPerlOp, // bad perl operator kRegexpBadUTF8, // invalid UTF-8 in regexp kRegexpBadNamedCapture, // bad named capture - kRegexpUnsupported, // unsupported operator }; // Error status for certain operations. re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314 // \Q and \E to disable/enable metacharacters // (?P<name>expr) for named captures // \C to match any single byte - UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8. - UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group // and \P{Han} for its negation. - NeverNL = 1<<12, // Never match NL, even if the regexp mentions + NeverNL = 1<<11, // Never match NL, even if the regexp mentions // it explicitly. // As close to Perl as we can get. ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ==== re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139 cap_[0] = p; if (Visit(prog_->start(), p)) // Match must be leftmost; done. 
return true; - if (prog_->flags() & Regexp::UCS2) - p++; } return false; } ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ==== re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152 static ParseMode parse_modes[] = { { single_line, "single-line" }, { single_line|Regexp::Latin1, "single-line, latin1" }, - { single_line|Regexp::UCS2, "single-line, ucs2" }, { multi_line, "multiline" }, { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, { multi_line|Regexp::Latin1, "multiline, latin1" }, - { multi_line|Regexp::UCS2, "multiline, ucs2" }, }; static string FormatMode(Regexp::ParseFlags flags) { re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185 RegexpStatus status; regexp_ = Regexp::Parse(regexp_str, flags, &status); if (regexp_ == NULL) { - if (status.code() != kRegexpUnsupported) { - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) - << " mode: " << FormatMode(flags); - error_ = true; - } + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) + << " mode: " << FormatMode(flags); + error_ = true; return; } prog_ = regexp_->CompileToProg(0); re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231 RE2::Options options; if (flags & Regexp::Latin1) options.set_encoding(RE2::Options::EncodingLatin1); - else if (flags & Regexp::UCS2) - options.set_encoding(RE2::Options::EncodingUCS2); if (kind_ == Prog::kLongestMatch) options.set_longest_match(true); re2_ = new RE2(re, options); re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280 delete re2_; } - // Converts UTF-8 string in text into UCS-2 string in new_text. 
- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) { - const char* p = text.begin(); - const char* ep = text.end(); - uint16* q = new uint16[ep - p]; - uint16* q0 = q; - - int n; - Rune r; - for (; p < ep; p += n) { - if (!fullrune(p, ep - p)) { - delete[] q0; - return false; - } - n = chartorune(&r, p); - if (r > 0xFFFF) { - delete[] q0; - return false; - } - *q++ = r; - } - *new_text = StringPiece(reinterpret_cast(q0), 2*(q - q0)); - return true; - } - - // Rewrites *sp from being a pointer into text8 (UTF-8) - // to being a pointer into text16 (equivalent text but in UCS-2). - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16, - StringPiece *sp) { - if (sp->begin() == NULL && text8.begin() != NULL) - return; - - int nrune = 0; - int n; - Rune r; - const char* p = text8.begin(); - const char* ep = text8.end(); - const char* spbegin = NULL; - const char* spend = NULL; - for (;;) { - if (p == sp->begin()) - spbegin = text16.begin() + sizeof(uint16)*nrune; - if (p == sp->end()) - spend = text16.begin() + sizeof(uint16)*nrune; - if (p >= ep) - break; - n = chartorune(&r, p); - p += n; - nrune++; - } - if (spbegin == NULL || spend == NULL) { - LOG(FATAL) << "Error in AdjustUTF8ToUCS2 " - << CEscape(text8) << " " - << (int)(sp->begin() - text8.begin()) << " " - << (int)(sp->end() - text8.begin()); - } - *sp = StringPiece(spbegin, spend - spbegin); - } - - // Rewrites *sp from begin a pointer into text16 (UCS-2) - // to being a pointer into text8 (equivalent text but in UTF-8). 
- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8, - StringPiece* sp) { - if (sp->begin() == NULL) - return; - - int nrune = 0; - int n; - Rune r; - const char* p = text8.begin(); - const char* ep = text8.end(); - const char* spbegin = NULL; - const char* spend = NULL; - for (;;) { - if (nrune == (sp->begin() - text16.begin())/2) - spbegin = p; - if (nrune == (sp->end() - text16.begin())/2) - spend = p; - if (p >= ep) - break; - n = chartorune(&r, p); - p += n; - nrune++; - } - if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) { - LOG(FATAL) << "Error in AdjustUCS2ToUTF8 " - << CEscape(text16) << " " - << (int)(sp->begin() - text16.begin()) << " " - << (int)(sp->end() - text16.begin()); - } - *sp = StringPiece(spbegin, spend - spbegin); - } - // Runs a single search using the named engine type. // This interface hides all the irregularities of the various // engine interfaces from the rest of this file. re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300 StringPiece text = orig_text; StringPiece context = orig_context; - bool ucs2 = false; - if ((flags() & Regexp::UCS2) && type != kEnginePCRE) { - if (!ConvertUTF8ToUCS2(orig_context, &context)) { - result->skipped = true; - return; - } - - // Rewrite context to refer to new text. - AdjustUTF8ToUCS2(orig_context, context, &text); - ucs2 = true; - } - switch (type) { default: LOG(FATAL) << "Bad RunSearch type: " << (int)type; re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451 } } - // If we did UCS-2 matching, rewrite the matches to refer - // to the original UTF-8 text. 
- if (ucs2) { - if (result->matched) { - if (result->have_submatch0) { - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]); - } else if (result->have_submatch) { - for (int i = 0; i < nsubmatch; i++) { - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]); - } - } - } - delete[] context.begin(); - } - if (!result->matched) memset(result->submatch, 0, sizeof result->submatch); } re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475 return true; } - // Check whether text uses only Unicode points <= 0xFFFF - // (in the BMP). - static bool IsBMP(const StringPiece& text) { - const char* p = text.begin(); - const char* ep = text.end(); - while (p < ep) { - if (!fullrune(p, ep - p)) - return false; - Rune r; - p += chartorune(&r, p); - if (r > 0xFFFF) - return false; - } - return true; - } - // Runs a single test. bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, Prog::Anchor anchor) { re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483 Result correct; RunSearch(kEngineBacktrack, text, context, anchor, &correct); if (correct.skipped) { - if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode + if (regexp_ == NULL) return true; LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) << " " << FormatMode(flags_);