diff options
author | battre <battre@chromium.org> | 2015-12-16 19:59:03 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-12-17 03:59:56 +0000 |
commit | 2bf80eeab7c142f91d45b46504b4485934e29512 (patch) | |
tree | 5d5168986b00288571236f08b10fbd89df7ace32 | |
parent | 0b9dbdd4cb09836efadb090edf7d981fb438cf50 (diff) | |
download | chromium_src-2bf80eeab7c142f91d45b46504b4485934e29512.zip chromium_src-2bf80eeab7c142f91d45b46504b4485934e29512.tar.gz chromium_src-2bf80eeab7c142f91d45b46504b4485934e29512.tar.bz2 |
Update re2 to tip of tree.
This version no longer requires any Chrome-specific patches.
This is an update of https://crrev.com/fd270ec705a75fd4e0883d091d9cf5204918c858
BUG=568119
R=thakis@chromium.org,tfarina@chromium.org
Review URL: https://codereview.chromium.org/1529143002
Cr-Commit-Position: refs/heads/master@{#365736}
117 files changed, 3169 insertions, 6036 deletions
diff --git a/third_party/re2/.gitignore b/third_party/re2/.gitignore new file mode 100644 index 0000000..a671fe2 --- /dev/null +++ b/third_party/re2/.gitignore @@ -0,0 +1,5 @@ +*.pyc +*.orig +core +obj/ +benchlog.* diff --git a/third_party/re2/AUTHORS b/third_party/re2/AUTHORS index e17d9bf..0754006 100644 --- a/third_party/re2/AUTHORS +++ b/third_party/re2/AUTHORS @@ -8,6 +8,6 @@ # Please keep the list sorted. -Brian Gunlogson <unixman83@gmail.com> Google Inc. +Samsung Electronics Stefano Rivera <stefano.rivera@gmail.com> diff --git a/third_party/re2/BUILD b/third_party/re2/BUILD new file mode 100644 index 0000000..9ab54b3 --- /dev/null +++ b/third_party/re2/BUILD @@ -0,0 +1,121 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.io/) BUILD file for RE2. + +licenses(["notice"]) + +cc_library( + name = "re2", + srcs = [ + "re2/bitstate.cc", + "re2/compile.cc", + "re2/dfa.cc", + "re2/filtered_re2.cc", + "re2/mimics_pcre.cc", + "re2/nfa.cc", + "re2/onepass.cc", + "re2/parse.cc", + "re2/perl_groups.cc", + "re2/prefilter.cc", + "re2/prefilter.h", + "re2/prefilter_tree.cc", + "re2/prefilter_tree.h", + "re2/prog.cc", + "re2/prog.h", + "re2/re2.cc", + "re2/regexp.cc", + "re2/regexp.h", + "re2/set.cc", + "re2/simplify.cc", + "re2/stringpiece.cc", + "re2/tostring.cc", + "re2/unicode_casefold.cc", + "re2/unicode_casefold.h", + "re2/unicode_groups.cc", + "re2/unicode_groups.h", + "re2/walker-inl.h", + "util/atomicops.h", + "util/flags.h", + "util/hash.cc", + "util/logging.cc", + "util/logging.h", + "util/mutex.h", + "util/rune.cc", + "util/sparse_array.h", + "util/sparse_set.h", + "util/stringprintf.cc", + "util/strutil.cc", + "util/utf.h", + "util/util.h", + "util/valgrind.cc", + "util/valgrind.h", + ], + hdrs = [ + "re2/filtered_re2.h", + "re2/re2.h", + "re2/set.h", + "re2/stringpiece.h", + "re2/variadic_function.h", + ], + includes = 
["."], + linkopts = ["-pthread"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "test", + testonly = 1, + srcs = [ + "re2/testing/backtrack.cc", + "re2/testing/dump.cc", + "re2/testing/exhaustive_tester.cc", + "re2/testing/null_walker.cc", + "re2/testing/regexp_generator.cc", + "re2/testing/string_generator.cc", + "re2/testing/tester.cc", + "util/pcre.cc", + "util/random.cc", + "util/test.cc", + "util/thread.cc", + ], + hdrs = [ + "re2/testing/exhaustive_tester.h", + "re2/testing/regexp_generator.h", + "re2/testing/string_generator.h", + "re2/testing/tester.h", + "util/pcre.h", + "util/random.h", + "util/test.h", + "util/thread.h", + ], + includes = ["."], + deps = [":re2"], +) + +load("re2_test", "re2_test") + +re2_test("charclass_test") +re2_test("compile_test") +re2_test("filtered_re2_test") +re2_test("mimics_pcre_test") +re2_test("parse_test") +re2_test("possible_match_test") +re2_test("re2_test") +re2_test("re2_arg_test") +re2_test("regexp_test") +re2_test("required_prefix_test") +re2_test("search_test") +re2_test("set_test") +re2_test("simplify_test") +re2_test("string_generator_test") + +re2_test("dfa_test") +re2_test("exhaustive1_test") +re2_test("exhaustive2_test") +re2_test("exhaustive3_test") +re2_test("exhaustive_test") +re2_test("random_test") + +# TODO: Add support for regexp_benchmark. 
diff --git a/third_party/re2/BUILD.gn b/third_party/re2/BUILD.gn index ca1f7a2..9b68933 100644 --- a/third_party/re2/BUILD.gn +++ b/third_party/re2/BUILD.gn @@ -8,7 +8,6 @@ config("re2_config") { static_library("re2") { sources = [ - "mswin/stdint.h", "re2/bitstate.cc", "re2/compile.cc", "re2/dfa.cc", @@ -32,6 +31,7 @@ static_library("re2") { "re2/set.cc", "re2/set.h", "re2/simplify.cc", + "re2/stringpiece.cc", "re2/stringpiece.h", "re2/tostring.cc", "re2/unicode_casefold.cc", @@ -40,21 +40,21 @@ static_library("re2") { "re2/unicode_groups.h", "re2/variadic_function.h", "re2/walker-inl.h", - "util/arena.cc", - "util/arena.h", "util/atomicops.h", "util/flags.h", "util/hash.cc", + "util/logging.cc", "util/logging.h", "util/mutex.h", "util/rune.cc", "util/sparse_array.h", "util/sparse_set.h", - "util/stringpiece.cc", "util/stringprintf.cc", "util/strutil.cc", "util/utf.h", "util/util.h", + "util/valgrind.cc", + "util/valgrind.h", ] configs -= [ "//build/config/compiler:chromium_code" ] @@ -66,12 +66,15 @@ static_library("re2") { ] if (is_win) { - include_dirs = [ "mswin" ] cflags = [ "/wd4018", # Signed/unsigned mismatch in comparison. "/wd4722", # Destructor never terminates. ] - } else { - sources -= [ "mswin/stdint.h" ] } + + # TODO(battre) If Dr. Memory is ever migrated to GN, a flag needs to be + # added for this that adds a MEMORY_SANITIZER define. See re2.gyp. + # if (is_drmemory) { + # defines += [ "MEMORY_SANITIZER" ] + # } } diff --git a/third_party/re2/CMakeLists.txt b/third_party/re2/CMakeLists.txt new file mode 100644 index 0000000..c6c6060 --- /dev/null +++ b/third_party/re2/CMakeLists.txt @@ -0,0 +1,112 @@ +# Copyright 2015 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Old enough to support Ubuntu Precise. 
+cmake_minimum_required(VERSION 2.8.7) + +project(RE2 CXX) +option(BUILD_SHARED_LIBS "build shared libraries" OFF) +option(USEPCRE "use PCRE in tests and benchmarks" OFF) + +set(EXTRA_TARGET_LINK_LIBRARIES) + +if(WIN32) + add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX) + set(THREADING threadwin) +else() + set(THREADING thread) + list(APPEND EXTRA_TARGET_LINK_LIBRARIES -pthread) +endif() + +if(USEPCRE) + add_definitions(-DUSEPCRE) + list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre) +endif() + +include_directories(${CMAKE_SOURCE_DIR}) + +set(RE2_LIBRARY_SOURCES + re2/bitstate.cc + re2/compile.cc + re2/dfa.cc + re2/filtered_re2.cc + re2/mimics_pcre.cc + re2/nfa.cc + re2/onepass.cc + re2/parse.cc + re2/perl_groups.cc + re2/prefilter.cc + re2/prefilter_tree.cc + re2/prog.cc + re2/re2.cc + re2/regexp.cc + re2/set.cc + re2/simplify.cc + re2/stringpiece.cc + re2/tostring.cc + re2/unicode_casefold.cc + re2/unicode_groups.cc + util/hash.cc + util/logging.cc + util/rune.cc + util/stringprintf.cc + util/strutil.cc + util/valgrind.cc + ) + +add_library(re2 ${RE2_LIBRARY_SOURCES}) + +set(TEST_LIBRARY_SOURCES + re2/testing/backtrack.cc + re2/testing/dump.cc + re2/testing/exhaustive_tester.cc + re2/testing/null_walker.cc + re2/testing/regexp_generator.cc + re2/testing/string_generator.cc + re2/testing/tester.cc + util/pcre.cc + util/random.cc + util/${THREADING}.cc + ) + +add_library(test STATIC ${TEST_LIBRARY_SOURCES} util/test.cc) +add_library(benchmark STATIC ${TEST_LIBRARY_SOURCES} util/benchmark.cc) + +set(TEST_TARGETS + charclass_test + compile_test + filtered_re2_test + mimics_pcre_test + parse_test + possible_match_test + re2_test + re2_arg_test + regexp_test + required_prefix_test + search_test + set_test + simplify_test + string_generator_test + + dfa_test + exhaustive1_test + exhaustive2_test + exhaustive3_test + exhaustive_test + random_test + ) + +set(BENCHMARK_TARGETS + regexp_benchmark + ) + +foreach(target ${TEST_TARGETS}) + add_executable(${target} 
re2/testing/${target}.cc) + target_link_libraries(${target} test re2 ${EXTRA_TARGET_LINK_LIBRARIES}) +endforeach(target) + +foreach(target ${BENCHMARK_TARGETS}) + add_executable(${target} re2/testing/${target}.cc) + target_link_libraries(${target} benchmark re2 ${EXTRA_TARGET_LINK_LIBRARIES}) +endforeach(target) diff --git a/third_party/re2/CONTRIBUTING.md b/third_party/re2/CONTRIBUTING.md new file mode 100644 index 0000000..3af2b0a --- /dev/null +++ b/third_party/re2/CONTRIBUTING.md @@ -0,0 +1,2 @@ +RE2 uses Gerrit instead of GitHub pull requests. +See the [Contributing](https://github.com/google/re2/wiki/Contribute) wiki page. diff --git a/third_party/re2/CONTRIBUTORS b/third_party/re2/CONTRIBUTORS index 7f6a93d..1a1c848 100644 --- a/third_party/re2/CONTRIBUTORS +++ b/third_party/re2/CONTRIBUTORS @@ -26,11 +26,16 @@ # Please keep the list sorted. -Brian Gunlogson <unixman83@gmail.com> Dominic Battré <battre@chromium.org> +Doug Kwan <dougkwan@google.com> +Dmitriy Vyukov <dvyukov@google.com> John Millikin <jmillikin@gmail.com> +Mike Nazarewicz <mpn@google.com> +Nico Weber <thakis@chromium.org> +Pawel Hajdan <phajdan.jr@gmail.com> Rob Pike <r@google.com> Russ Cox <rsc@swtch.com> Sanjay Ghemawat <sanjay@google.com> Stefano Rivera <stefano.rivera@gmail.com> Srinivasan Venkatachary <vsri@google.com> +Viatcheslav Ostapenko <sl.ostapenko@samsung.com> diff --git a/third_party/re2/Makefile b/third_party/re2/Makefile index 4ded8ec..5068459 100644 --- a/third_party/re2/Makefile +++ b/third_party/re2/Makefile @@ -2,21 +2,19 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -all: obj/libre2.a obj/so/libre2.so - # to build against PCRE for testing or benchmarking, # uncomment the next two lines # CCPCRE=-I/usr/local/include -DUSEPCRE # LDPCRE=-L/usr/local/lib -lpcre -CXX=g++ -CXXFLAGS=-Wall -O3 -g -pthread # can override -RE2_CXXFLAGS=-Wno-sign-compare -c -I. 
$(CCPCRE) # required -LDFLAGS=-pthread -AR=ar -ARFLAGS=rsc -NM=nm -NMFLAGS=-p +CXX?=g++ +CXXFLAGS?=-O3 -g # can override +RE2_CXXFLAGS?=-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCPCRE) # required +LDFLAGS?=-pthread +AR?=ar +ARFLAGS?=rsc +NM?=nm +NMFLAGS?=-p # Variables mandated by GNU, the arbiter of all good taste on the internet. # http://www.gnu.org/prep/standards/standards.html @@ -38,11 +36,24 @@ SONAME=0 # REBUILD_TABLES=1 ifeq ($(shell uname),Darwin) -MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin +SOEXT=dylib +SOEXTVER=$(SONAME).$(SOEXT) +SOEXTVER00=$(SONAME).0.0.$(SOEXT) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -Wl,-install_name,@rpath/libre2.$(SOEXTVER) -exported_symbols_list libre2.symbols.darwin +else ifeq ($(shell uname),SunOS) +SOEXT=so +SOEXTVER=$(SOEXT).$(SONAME) +SOEXTVER00=$(SOEXT).$(SONAME).0.0 +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(LDFLAGS) else -MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),--version-script=libre2.symbols $(LDFLAGS) +SOEXT=so +SOEXTVER=$(SOEXT).$(SONAME) +SOEXTVER00=$(SOEXT).$(SONAME).0.0 +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(LDFLAGS) endif +all: obj/libre2.a obj/so/libre2.$(SOEXT) + INSTALL_HFILES=\ re2/filtered_re2.h\ re2/re2.h\ @@ -51,7 +62,6 @@ INSTALL_HFILES=\ re2/variadic_function.h\ HFILES=\ - util/arena.h\ util/atomicops.h\ util/benchmark.h\ util/flags.h\ @@ -62,6 +72,7 @@ HFILES=\ util/sparse_array.h\ util/sparse_set.h\ util/test.h\ + util/thread.h\ util/utf.h\ util/util.h\ util/valgrind.h\ @@ -83,10 +94,9 @@ HFILES=\ re2/walker-inl.h\ OFILES=\ - obj/util/arena.o\ obj/util/hash.o\ + obj/util/logging.o\ obj/util/rune.o\ - obj/util/stringpiece.o\ obj/util/stringprintf.o\ obj/util/strutil.o\ obj/util/valgrind.o\ @@ -106,6 +116,7 @@ OFILES=\ obj/re2/regexp.o\ obj/re2/set.o\ obj/re2/simplify.o\ + 
obj/re2/stringpiece.o\ obj/re2/tostring.o\ obj/re2/unicode_casefold.o\ obj/re2/unicode_groups.o\ @@ -158,15 +169,15 @@ DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) obj/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) - $(CXX) -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc + $(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc obj/dbg/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) - $(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc + $(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc obj/so/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) - $(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc + $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc obj/libre2.a: $(OFILES) @mkdir -p obj @@ -176,10 +187,10 @@ obj/dbg/libre2.a: $(DOFILES) @mkdir -p obj/dbg $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) -obj/so/libre2.so: $(SOFILES) +obj/so/libre2.$(SOEXT): $(SOFILES) @mkdir -p obj/so - $(MAKE_SHARED_LIBRARY) -o $@.$(SONAME) $(SOFILES) - ln -sf libre2.so.$(SONAME) $@ + $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) + ln -sf libre2.$(SOEXTVER) $@ obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/test @@ -189,7 +200,7 @@ obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/ @mkdir -p obj/dbg/test $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE) -obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o +obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o @mkdir -p obj/so/test $(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE) @@ -203,6 +214,8 @@ re2/perl_groups.cc: re2/make_perl_groups.pl re2/unicode_%.cc: re2/make_unicode_%.py python $< > $@ + +.PRECIOUS: 
re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc endif distclean: clean @@ -245,23 +258,28 @@ shared-bigtest: $(STESTS) $(SBIGTESTS) benchmark: obj/test/regexp_benchmark -install: obj/libre2.a obj/so/libre2.so - mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir) +install: obj/libre2.a obj/so/libre2.$(SOEXT) + mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a - $(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0 - ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME) - ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so + $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00) + ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER) + ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT) + sed -e "s#@prefix@#${prefix}#" re2.pc >$(DESTDIR)$(libdir)/pkgconfig/re2.pc testinstall: @mkdir -p obj cp testinstall.cc obj +ifneq ($(shell uname),Darwin) + (cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -static -o testinstall) + obj/testinstall +endif (cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall) LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall benchlog: obj/test/regexp_benchmark (echo '==BENCHMARK==' `hostname` `date`; \ - (uname -a; $(CXX) --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ + (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ echo; \ ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') @@ -273,8 +291,9 @@ benchlog: obj/test/regexp_benchmark obj/test/% obj/so/test/% obj/dbg/test/% log: - make clean - make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/exhaustive{,1,2,3}_test + $(MAKE) clean + 
$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \ + $(filter obj/test/exhaustive%_test,$(BIGTESTS)) echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt echo '#' $$(date) >>re2-exhaustive.txt obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt @@ -282,7 +301,10 @@ log: obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt - make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test + $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test echo '#' RE2 basic search tests built by make $@ >re2-search.txt echo '#' $$(date) >>re2-search.txt obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt + +x: x.cc obj/libre2.a + g++ -I. -o x x.cc obj/libre2.a diff --git a/third_party/re2/README b/third_party/re2/README index 57b3181..2c23660 100644 --- a/third_party/re2/README +++ b/third_party/re2/README @@ -1,7 +1,7 @@ This is the source code repository for RE2, a regular expression library. For documentation about how to install and use RE2, -visit http://code.google.com/p/re2/. +visit https://github.com/google/re2/. The short version is: @@ -10,10 +10,23 @@ make test make install make testinstall +More information can be found on the wiki: +https://github.com/google/re2/wiki + +Issue tracker: +https://github.com/google/re2/issues + +Mailing list: +https://groups.google.com/group/re2-dev + Unless otherwise noted, the RE2 source files are distributed under the BSD-style license found in the LICENSE file. RE2's native language is C++. -An Inferno wrapper is at http://code.google.com/p/inferno-re2/. -A Python wrapper is at http://github.com/facebook/pyre2/. -A Ruby wrapper is at http://github.com/axic/rre2/. +An Erlang wrapper is at https://github.com/tuncer/re2/. +An Inferno wrapper is at https://github.com/powerman/inferno-re2/. +A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM. 
+An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM. +A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN. +A Python wrapper is at https://github.com/facebook/pyre2/. +A Ruby wrapper is at https://github.com/axic/rre2/. diff --git a/third_party/re2/README.chromium b/third_party/re2/README.chromium index 3522d4c..f7d576d 100644 --- a/third_party/re2/README.chromium +++ b/third_party/re2/README.chromium @@ -1,9 +1,8 @@ Name: re2 - an efficient, principled regular expression library Short Name: re2 URL: https://github.com/google/re2 -Version: 7f91923f3ad4 -Date: 2012-06-20 -Revision: 100:7f91923f3ad4 +Version: dba3349aba83b5588e85e5ecf2b56c97f2d259b7 +Date: 2015-12-10 License: BSD 3-Clause License File: LICENSE Security Critical: yes @@ -12,23 +11,11 @@ Description: RE2 is a fast, safe, thread-friendly alternative to backtracking regular expression engines like those used in PCRE, Perl, and Python. -Local Modifications (to be applied in this order): -- Remove valgrind specific code that exists in chromium already - (patches/remove-valgrind-code.patch) -- Support for Windows (patches/re2-msvc9-chrome.patch) -- Support Android (patches/re2-android.patch) -- Remove static initializers (patches/remove-static-initializers.patch) -- Support libcxx (patches/re2-libcxx.patch) - https://code.google.com/p/re2/issues/detail?id=76 -- Memory optimization for filtered trees - (patches/re2-memory-optimization.patch) -- Prevent unwanted reports from MemorySanitizer. Note: there's an upstream fix - for this (https://code.google.com/p/re2/issues/detail?id=77) which is rendered - ineffective by patches/remove-valgrind-code.patch - (patches/re2-msan.patch) -- Remove comparisons of this with NULL, merges upstream b92ce81f1e25 -- Let COMPILE_ASSERT use static_assert if available, merges upstream - 2225f94df8ec -- Merge upstream cc56ba02d9d2bdafa614ad5ebf564dde287625bb. -- Suppress more unwanted reports from MemorySanitizer. 
- (patches/sparse-array-valgrind.patch) +To update RE2, execute the following commands from your Chromium checkout: +$ git clone https://github.com/google/re2 third_party/re2_new +$ cp third_party/re2/OWNERS third_party/re2/BUILD.gn third_party/re2/re2.gyp \ +third_party/re2/README.chromium third_party/re2/DEPS third_party/re2_new +$ rm -rf third_party/re2_new/.git +$ rm -rf third_party/re2 +$ mv third_party/re2_new third_party/re2 +Then update third_party/re2/BUILD.gn and third_party/re2/re2.gyp. diff --git a/third_party/re2/WORKSPACE b/third_party/re2/WORKSPACE new file mode 100644 index 0000000..393f5e6 --- /dev/null +++ b/third_party/re2/WORKSPACE @@ -0,0 +1,5 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.io/) WORKSPACE file for RE2. diff --git a/third_party/re2/benchlog/benchplot.py b/third_party/re2/benchlog/benchplot.py new file mode 100755 index 0000000..104abe8 --- /dev/null +++ b/third_party/re2/benchlog/benchplot.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +import argparse # for ArgumentParser +import subprocess # for Popen +import tempfile # for NamedTemporaryFile +import os # for remove + +class gnuplot(object): + + output = "result.png" + + script = """ + set terminal png size 1024, 768 + set output "{}.png" + set title "re2 benchlog" + set datafile separator ";" + set grid x y + set ylabel "MB/s" + set autoscale + plot """ + + template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n""" + + benchdata = dict() + tempfiles = [] + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + """ + remove all temporary files + """ + + for filename in self.tempfiles: + os.remove(filename) + + def parse_re2_benchlog(self, filename): + """ + parse the input benchlog and return a dictionary contain bench data + """ + + benchdata = self.benchdata + + with 
open(filename) as f: + + for raw in f.readlines(): + + data = raw.split('\t') + + if len(data) == 4: + + data = data[0].split('/') + data[1:] + data = list(map(str.strip, data)) + + if not benchdata.get(data[0]): + benchdata[data[0]] = [ data[1:] ] + else: + benchdata[data[0]].append(data[1:]) + + def gen_csv(self): + """ + generate temporary csv files + """ + + for name, data in self.benchdata.items(): + + with tempfile.NamedTemporaryFile(delete=False) as f: + + for index, line in enumerate(data): + f.write('{};{}\n'.format(index, ';'.join(line)).encode()) + + self.tempfiles.append(f.name) + self.script = self.script + self.template.format(f.name, name) + + def run(self): + self.gen_csv() + script = self.script[:-3].format(self.output) + command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE) + command.communicate(script.encode()) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='generate plots for benchlog') + parser.add_argument('benchlog', type=str, help='benchlog generated by re2') + args = parser.parse_args() + + try: + subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE) + except FileNotFoundError: + print('you can install "gnuplot" to generate plots automatically') + exit(1) + + with gnuplot() as plot: + plot.output = args.benchlog + plot.parse_re2_benchlog(args.benchlog) + plot.run() diff --git a/third_party/re2/doc/mksyntaxgo b/third_party/re2/doc/mksyntaxgo index 42e87d6..caad9b6 100755 --- a/third_party/re2/doc/mksyntaxgo +++ b/third_party/re2/doc/mksyntaxgo @@ -1,7 +1,7 @@ #!/bin/sh set -e -out=$GOROOT/src/pkg/regexp/syntax/doc.go +out=$GOROOT/src/regexp/syntax/doc.go cp syntax.txt $out sam -d $out <<'!' 
,x g/NOT SUPPORTED/d diff --git a/third_party/re2/doc/syntax.html b/third_party/re2/doc/syntax.html index 7f5e15a..aa08b11 100644 --- a/third_party/re2/doc/syntax.html +++ b/third_party/re2/doc/syntax.html @@ -11,16 +11,15 @@ <tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr> <tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr> <tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr> -<tr><td colspan=2>See <a href="http://go/re2">http://go/re2</a> and <a href="http://go/re2quick">http://go/re2quick</a>.</td></tr> <tr><td></td></tr> <tr><td colspan=2><b>Single characters:</b></td></tr> -<tr><td><code>.</code></td><td>any character, including newline (s=true)</td></tr> +<tr><td><code>.</code></td><td>any character, possibly including newline (s=true)</td></tr> <tr><td><code>[xyz]</code></td><td>character class</td></tr> <tr><td><code>[^xyz]</code></td><td>negated character class</td></tr> <tr><td><code>\d</code></td><td>Perl character class</td></tr> <tr><td><code>\D</code></td><td>negated Perl character class</td></tr> -<tr><td><code>[:alpha:]</code></td><td>ASCII character class</td></tr> -<tr><td><code>[:^alpha:]</code></td><td>negated ASCII character class</td></tr> +<tr><td><code>[[:alpha:]]</code></td><td>ASCII character class</td></tr> +<tr><td><code>[[:^alpha:]]</code></td><td>negated ASCII character class</td></tr> <tr><td><code>\pN</code></td><td>Unicode character class (one-letter name)</td></tr> <tr><td><code>\p{Greek}</code></td><td>Unicode character class</td></tr> <tr><td><code>\PN</code></td><td>negated Unicode character class (one-letter name)</td></tr> @@ -62,7 +61,7 @@ <tr><td><code><font color=#808080>(?<name>re)</font></code></td><td>named & numbered capturing group </td></tr> <tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named & numbered capturing group </td></tr> <tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr> 
-<tr><td><code>(?flags)</code></td><td>set flags until outer paren closes; non-capturing</td></tr> +<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr> <tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr> <tr><td><code><font color=#808080>(?#text)</font></code></td><td>comment </td></tr> <tr><td><code><font color=#808080>(?|x|y|z)</font></code></td><td>branch numbering reset </td></tr> @@ -72,16 +71,16 @@ <tr><td></td></tr> <tr><td colspan=2><b>Flags:</b></td></tr> <tr><td><code>i</code></td><td>case-insensitive (default false)</td></tr> -<tr><td><code>m</code></td><td>multi-line mode (default false)</td></tr> +<tr><td><code>m</code></td><td>multi-line mode: <code>^</code> and <code>$</code> match begin/end line in addition to begin/end text (default false)</td></tr> <tr><td><code>s</code></td><td>let <code>.</code> match <code>\n</code> (default false)</td></tr> <tr><td><code>U</code></td><td>ungreedy: swap meaning of <code>x*</code> and <code>x*?</code>, <code>x+</code> and <code>x+?</code>, etc (default false)</td></tr> <tr><td colspan=2>Flag syntax is <code>xyz</code> (set) or <code>-xyz</code> (clear) or <code>xy-z</code> (set <code>xy</code>, clear <code>z</code>).</td></tr> <tr><td></td></tr> <tr><td colspan=2><b>Empty strings:</b></td></tr> <tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr> -<tr><td><code>$</code></td><td>at end of text or line (<code>m</code>=true)</td></tr> +<tr><td><code>$</code></td><td>at end of text (like <code>\z</code> not <code>\Z</code>) or line (<code>m</code>=true)</td></tr> <tr><td><code>\A</code></td><td>at beginning of text</td></tr> -<tr><td><code>\b</code></td><td>at word boundary (<code>\w</code> to left and <code>\W</code> to right or vice versa)</td></tr> +<tr><td><code>\b</code></td><td>at word boundary (<code>\w</code> on one side and <code>\W</code>, <code>\A</code>, or <code>\z</code> on the 
other)</td></tr> <tr><td><code>\B</code></td><td>not a word boundary</td></tr> <tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr> <tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr> @@ -181,20 +180,20 @@ <tr><td><code><font color=#808080>\V</font></code></td><td>not vertical space </td></tr> <tr><td></td></tr> <tr><td colspan=2><b>ASCII character classes:</b></td></tr> -<tr><td><code>[:alnum:]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr> -<tr><td><code>[:alpha:]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr> -<tr><td><code>[:ascii:]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr> -<tr><td><code>[:blank:]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr> -<tr><td><code>[:cntrl:]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr> -<tr><td><code>[:digit:]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr> -<tr><td><code>[:graph:]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]</code>)</td></tr> -<tr><td><code>[:lower:]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr> -<tr><td><code>[:print:]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr> -<tr><td><code>[:punct:]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr> -<tr><td><code>[:space:]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr> -<tr><td><code>[:upper:]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr> -<tr><td><code>[:word:]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr> -<tr><td><code>[:xdigit:]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr> +<tr><td><code>[[:alnum:]]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr> +<tr><td><code>[[:alpha:]]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr> 
+<tr><td><code>[[:ascii:]]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr> +<tr><td><code>[[:blank:]]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr> +<tr><td><code>[[:cntrl:]]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr> +<tr><td><code>[[:digit:]]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr> +<tr><td><code>[[:graph:]]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]</code>)</td></tr> +<tr><td><code>[[:lower:]]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr> +<tr><td><code>[[:print:]]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr> +<tr><td><code>[[:punct:]]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr> +<tr><td><code>[[:space:]]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr> +<tr><td><code>[[:upper:]]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr> +<tr><td><code>[[:word:]]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr> +<tr><td><code>[[:xdigit:]]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr> <tr><td></td></tr> <tr><td colspan=2><b>Unicode character class names--general category:</b></td></tr> <tr><td><code>C</code></td><td>other</td></tr> @@ -241,13 +240,17 @@ <tr><td><code>Arabic</code></td><td>Arabic</td></tr> <tr><td><code>Armenian</code></td><td>Armenian</td></tr> <tr><td><code>Balinese</code></td><td>Balinese</td></tr> +<tr><td><code>Bamum</code></td><td>Bamum</td></tr> +<tr><td><code>Batak</code></td><td>Batak</td></tr> <tr><td><code>Bengali</code></td><td>Bengali</td></tr> <tr><td><code>Bopomofo</code></td><td>Bopomofo</td></tr> +<tr><td><code>Brahmi</code></td><td>Brahmi</td></tr> <tr><td><code>Braille</code></td><td>Braille</td></tr> <tr><td><code>Buginese</code></td><td>Buginese</td></tr> <tr><td><code>Buhid</code></td><td>Buhid</td></tr> <tr><td><code>Canadian_Aboriginal</code></td><td>Canadian Aboriginal</td></tr> 
<tr><td><code>Carian</code></td><td>Carian</td></tr> +<tr><td><code>Chakma</code></td><td>Chakma</td></tr> <tr><td><code>Cham</code></td><td>Cham</td></tr> <tr><td><code>Cherokee</code></td><td>Cherokee</td></tr> <tr><td><code>Common</code></td><td>characters not specific to one script</td></tr> @@ -257,6 +260,7 @@ <tr><td><code>Cyrillic</code></td><td>Cyrillic</td></tr> <tr><td><code>Deseret</code></td><td>Deseret</td></tr> <tr><td><code>Devanagari</code></td><td>Devanagari</td></tr> +<tr><td><code>Egyptian_Hieroglyphs</code></td><td>Egyptian Hieroglyphs</td></tr> <tr><td><code>Ethiopic</code></td><td>Ethiopic</td></tr> <tr><td><code>Georgian</code></td><td>Georgian</td></tr> <tr><td><code>Glagolitic</code></td><td>Glagolitic</td></tr> @@ -269,7 +273,12 @@ <tr><td><code>Hanunoo</code></td><td>Hanunoo</td></tr> <tr><td><code>Hebrew</code></td><td>Hebrew</td></tr> <tr><td><code>Hiragana</code></td><td>Hiragana</td></tr> +<tr><td><code>Imperial_Aramaic</code></td><td>Imperial Aramaic</td></tr> <tr><td><code>Inherited</code></td><td>inherit script from previous character</td></tr> +<tr><td><code>Inscriptional_Pahlavi</code></td><td>Inscriptional Pahlavi</td></tr> +<tr><td><code>Inscriptional_Parthian</code></td><td>Inscriptional Parthian</td></tr> +<tr><td><code>Javanese</code></td><td>Javanese</td></tr> +<tr><td><code>Kaithi</code></td><td>Kaithi</td></tr> <tr><td><code>Kannada</code></td><td>Kannada</td></tr> <tr><td><code>Katakana</code></td><td>Katakana</td></tr> <tr><td><code>Kayah_Li</code></td><td>Kayah Li</td></tr> @@ -283,6 +292,11 @@ <tr><td><code>Lycian</code></td><td>Lycian</td></tr> <tr><td><code>Lydian</code></td><td>Lydian</td></tr> <tr><td><code>Malayalam</code></td><td>Malayalam</td></tr> +<tr><td><code>Mandaic</code></td><td>Mandaic</td></tr> +<tr><td><code>Meetei_Mayek</code></td><td>Meetei Mayek</td></tr> +<tr><td><code>Meroitic_Cursive</code></td><td>Meroitic Cursive</td></tr> +<tr><td><code>Meroitic_Hieroglyphs</code></td><td>Meroitic 
Hieroglyphs</td></tr> +<tr><td><code>Miao</code></td><td>Miao</td></tr> <tr><td><code>Mongolian</code></td><td>Mongolian</td></tr> <tr><td><code>Myanmar</code></td><td>Myanmar</td></tr> <tr><td><code>New_Tai_Lue</code></td><td>New Tai Lue (aka Simplified Tai Lue)</td></tr> @@ -291,6 +305,8 @@ <tr><td><code>Ol_Chiki</code></td><td>Ol Chiki</td></tr> <tr><td><code>Old_Italic</code></td><td>Old Italic</td></tr> <tr><td><code>Old_Persian</code></td><td>Old Persian</td></tr> +<tr><td><code>Old_South_Arabian</code></td><td>Old South Arabian</td></tr> +<tr><td><code>Old_Turkic</code></td><td>Old Turkic</td></tr> <tr><td><code>Oriya</code></td><td>Oriya</td></tr> <tr><td><code>Osmanya</code></td><td>Osmanya</td></tr> <tr><td><code>Phags_Pa</code></td><td>'Phags Pa</td></tr> @@ -298,14 +314,19 @@ <tr><td><code>Rejang</code></td><td>Rejang</td></tr> <tr><td><code>Runic</code></td><td>Runic</td></tr> <tr><td><code>Saurashtra</code></td><td>Saurashtra</td></tr> +<tr><td><code>Sharada</code></td><td>Sharada</td></tr> <tr><td><code>Shavian</code></td><td>Shavian</td></tr> <tr><td><code>Sinhala</code></td><td>Sinhala</td></tr> +<tr><td><code>Sora_Sompeng</code></td><td>Sora Sompeng</td></tr> <tr><td><code>Sundanese</code></td><td>Sundanese</td></tr> <tr><td><code>Syloti_Nagri</code></td><td>Syloti Nagri</td></tr> <tr><td><code>Syriac</code></td><td>Syriac</td></tr> <tr><td><code>Tagalog</code></td><td>Tagalog</td></tr> <tr><td><code>Tagbanwa</code></td><td>Tagbanwa</td></tr> <tr><td><code>Tai_Le</code></td><td>Tai Le</td></tr> +<tr><td><code>Tai_Tham</code></td><td>Tai Tham</td></tr> +<tr><td><code>Tai_Viet</code></td><td>Tai Viet</td></tr> +<tr><td><code>Takri</code></td><td>Takri</td></tr> <tr><td><code>Tamil</code></td><td>Tamil</td></tr> <tr><td><code>Telugu</code></td><td>Telugu</td></tr> <tr><td><code>Thaana</code></td><td>Thaana</td></tr> diff --git a/third_party/re2/doc/syntax.txt b/third_party/re2/doc/syntax.txt index f940750..e9c6ff4 100644 --- 
a/third_party/re2/doc/syntax.txt +++ b/third_party/re2/doc/syntax.txt @@ -7,8 +7,8 @@ Single characters: [^xyz] negated character class \d Perl character class \D negated Perl character class -[:alpha:] ASCII character class -[:^alpha:] negated ASCII character class +[[:alpha:]] ASCII character class +[[:^alpha:]] negated ASCII character class \pN Unicode character class (one-letter name) \p{Greek} Unicode character class \PN negated Unicode character class (one-letter name) @@ -36,6 +36,10 @@ x{-} (== x*?) NOT SUPPORTED vim x{-n} (== x{n}?) NOT SUPPORTED vim x= (== x?) NOT SUPPORTED vim +Implementation restriction: The counting forms «x{n,m}», «x{n,}», and «x{n}» +reject forms that create a minimum or maximum repetition count above 1000. +Unlimited repetitions are not subject to this restriction. + Possessive repetitions: x*+ zero or more «x», possessive NOT SUPPORTED x++ one or more «x», possessive NOT SUPPORTED @@ -45,10 +49,10 @@ x{n,}+ «n» or more «x», possessive NOT SUPPORTED x{n}+ exactly «n» «x», possessive NOT SUPPORTED Grouping: -(re) numbered capturing group -(?P<name>re) named & numbered capturing group -(?<name>re) named & numbered capturing group NOT SUPPORTED -(?'name're) named & numbered capturing group NOT SUPPORTED +(re) numbered capturing group (submatch) +(?P<name>re) named & numbered capturing group (submatch) +(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED +(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED (?:re) non-capturing group (?flags) set flags within current group; non-capturing (?flags:re) set flags during re; non-capturing @@ -69,8 +73,8 @@ Empty strings: ^ at beginning of text or line («m»=true) $ at end of text (like «\z» not «\Z») or line («m»=true) \A at beginning of text -\b at word boundary («\w» on one side and «\W», «\A», or «\z» on the other) -\B not a word boundary +\b at ASCII word boundary («\w» on one side and «\W», «\A», or «\z» on the other) +\B not at ASCII word boundary \G 
at beginning of subtext being searched NOT SUPPORTED pcre \G at end of last match NOT SUPPORTED perl \Z at end of text, or before newline at end of text NOT SUPPORTED @@ -155,7 +159,7 @@ Named character classes as character class elements: [\p{Name}] named Unicode property inside character class (== \p{Name}) [^\p{Name}] named Unicode property inside negated character class (== \P{Name}) -Perl character classes: +Perl character classes (all ASCII-only): \d digits (== [0-9]) \D not digits (== [^0-9]) \s whitespace (== [\t\n\f\r ]) @@ -169,20 +173,20 @@ Perl character classes: \V not vertical space NOT SUPPORTED ASCII character classes: -[:alnum:] alphanumeric (== [0-9A-Za-z]) -[:alpha:] alphabetic (== [A-Za-z]) -[:ascii:] ASCII (== [\x00-\x7F]) -[:blank:] blank (== [\t ]) -[:cntrl:] control (== [\x00-\x1F\x7F]) -[:digit:] digits (== [0-9]) -[:graph:] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) -[:lower:] lower case (== [a-z]) -[:print:] printable (== [ -~] == [ [:graph:]]) -[:punct:] punctuation (== [!-/:-@[-`{-~]) -[:space:] whitespace (== [\t\n\v\f\r ]) -[:upper:] upper case (== [A-Z]) -[:word:] word characters (== [0-9A-Za-z_]) -[:xdigit:] hex digit (== [0-9A-Fa-f]) +[[:alnum:]] alphanumeric (== [0-9A-Za-z]) +[[:alpha:]] alphabetic (== [A-Za-z]) +[[:ascii:]] ASCII (== [\x00-\x7F]) +[[:blank:]] blank (== [\t ]) +[[:cntrl:]] control (== [\x00-\x1F\x7F]) +[[:digit:]] digits (== [0-9]) +[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) +[[:lower:]] lower case (== [a-z]) +[[:print:]] printable (== [ -~] == [ [:graph:]]) +[[:punct:]] punctuation (== [!-/:-@[-`{-~]) +[[:space:]] whitespace (== [\t\n\v\f\r ]) +[[:upper:]] upper case (== [A-Z]) +[[:word:]] word characters (== [0-9A-Za-z_]) +[[:xdigit:]] hex digit (== [0-9A-Fa-f]) Unicode character class names--general category: C other @@ -229,13 +233,17 @@ Unicode character class names--scripts: Arabic Arabic Armenian Armenian Balinese Balinese +Bamum Bamum 
+Batak Batak Bengali Bengali Bopomofo Bopomofo +Brahmi Brahmi Braille Braille Buginese Buginese Buhid Buhid Canadian_Aboriginal Canadian Aboriginal Carian Carian +Chakma Chakma Cham Cham Cherokee Cherokee Common characters not specific to one script @@ -245,6 +253,7 @@ Cypriot Cypriot Cyrillic Cyrillic Deseret Deseret Devanagari Devanagari +Egyptian_Hieroglyphs Egyptian Hieroglyphs Ethiopic Ethiopic Georgian Georgian Glagolitic Glagolitic @@ -257,7 +266,12 @@ Hangul Hangul Hanunoo Hanunoo Hebrew Hebrew Hiragana Hiragana +Imperial_Aramaic Imperial Aramaic Inherited inherit script from previous character +Inscriptional_Pahlavi Inscriptional Pahlavi +Inscriptional_Parthian Inscriptional Parthian +Javanese Javanese +Kaithi Kaithi Kannada Kannada Katakana Katakana Kayah_Li Kayah Li @@ -271,6 +285,11 @@ Linear_B Linear B Lycian Lycian Lydian Lydian Malayalam Malayalam +Mandaic Mandaic +Meetei_Mayek Meetei Mayek +Meroitic_Cursive Meroitic Cursive +Meroitic_Hieroglyphs Meroitic Hieroglyphs +Miao Miao Mongolian Mongolian Myanmar Myanmar New_Tai_Lue New Tai Lue (aka Simplified Tai Lue) @@ -279,6 +298,8 @@ Ogham Ogham Ol_Chiki Ol Chiki Old_Italic Old Italic Old_Persian Old Persian +Old_South_Arabian Old South Arabian +Old_Turkic Old Turkic Oriya Oriya Osmanya Osmanya Phags_Pa 'Phags Pa @@ -286,14 +307,19 @@ Phoenician Phoenician Rejang Rejang Runic Runic Saurashtra Saurashtra +Sharada Sharada Shavian Shavian Sinhala Sinhala +Sora_Sompeng Sora Sompeng Sundanese Sundanese Syloti_Nagri Syloti Nagri Syriac Syriac Tagalog Tagalog Tagbanwa Tagbanwa Tai_Le Tai Le +Tai_Tham Tai Tham +Tai_Viet Tai Viet +Takri Takri Tamil Tamil Telugu Telugu Thaana Thaana diff --git a/third_party/re2/lib/codereview/codereview.cfg b/third_party/re2/lib/codereview/codereview.cfg deleted file mode 100644 index 9581920..0000000 --- a/third_party/re2/lib/codereview/codereview.cfg +++ /dev/null @@ -1 +0,0 @@ -defaultcc: re2-dev@googlegroups.com diff --git a/third_party/re2/lib/codereview/codereview.py 
b/third_party/re2/lib/codereview/codereview.py deleted file mode 100644 index b892727..0000000 --- a/third_party/re2/lib/codereview/codereview.py +++ /dev/null @@ -1,3565 +0,0 @@ -# coding=utf-8 -# (The line above is necessary so that I can use 世界 in the -# *comment* below without Python getting all bent out of shape.) - -# Copyright 2007-2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -'''Mercurial interface to codereview.appspot.com. - -To configure, set the following options in -your repository's .hg/hgrc file. - - [extensions] - codereview = /path/to/codereview.py - - [codereview] - server = codereview.appspot.com - -The server should be running Rietveld; see http://code.google.com/p/rietveld/. - -In addition to the new commands, this extension introduces -the file pattern syntax @nnnnnn, where nnnnnn is a change list -number, to mean the files included in that change list, which -must be associated with the current client. - -For example, if change 123456 contains the files x.go and y.go, -"hg diff @123456" is equivalent to"hg diff x.go y.go". -''' - -import sys - -if __name__ == "__main__": - print >>sys.stderr, "This is a Mercurial extension and should not be invoked directly." - sys.exit(2) - -# We require Python 2.6 for the json package. -if sys.version < '2.6': - print >>sys.stderr, "The codereview extension requires Python 2.6 or newer." 
- print >>sys.stderr, "You are running Python " + sys.version - sys.exit(2) - -import json -import os -import re -import stat -import subprocess -import threading -import time - -from mercurial import commands as hg_commands -from mercurial import util as hg_util - -defaultcc = None -codereview_disabled = None -real_rollback = None -releaseBranch = None -server = "codereview.appspot.com" -server_url_base = None - -####################################################################### -# Normally I would split this into multiple files, but it simplifies -# import path headaches to keep it all in one file. Sorry. -# The different parts of the file are separated by banners like this one. - -####################################################################### -# Helpers - -def RelativePath(path, cwd): - n = len(cwd) - if path.startswith(cwd) and path[n] == '/': - return path[n+1:] - return path - -def Sub(l1, l2): - return [l for l in l1 if l not in l2] - -def Add(l1, l2): - l = l1 + Sub(l2, l1) - l.sort() - return l - -def Intersect(l1, l2): - return [l for l in l1 if l in l2] - -####################################################################### -# RE: UNICODE STRING HANDLING -# -# Python distinguishes between the str (string of bytes) -# and unicode (string of code points) types. Most operations -# work on either one just fine, but some (like regexp matching) -# require unicode, and others (like write) require str. -# -# As befits the language, Python hides the distinction between -# unicode and str by converting between them silently, but -# *only* if all the bytes/code points involved are 7-bit ASCII. -# This means that if you're not careful, your program works -# fine on "hello, world" and fails on "hello, 世界". And of course, -# the obvious way to be careful - use static types - is unavailable. -# So the only way is trial and error to find where to put explicit -# conversions. 
-# -# Because more functions do implicit conversion to str (string of bytes) -# than do implicit conversion to unicode (string of code points), -# the convention in this module is to represent all text as str, -# converting to unicode only when calling a unicode-only function -# and then converting back to str as soon as possible. - -def typecheck(s, t): - if type(s) != t: - raise hg_util.Abort("type check failed: %s has type %s != %s" % (repr(s), type(s), t)) - -# If we have to pass unicode instead of str, ustr does that conversion clearly. -def ustr(s): - typecheck(s, str) - return s.decode("utf-8") - -# Even with those, Mercurial still sometimes turns unicode into str -# and then tries to use it as ascii. Change Mercurial's default. -def set_mercurial_encoding_to_utf8(): - from mercurial import encoding - encoding.encoding = 'utf-8' - -set_mercurial_encoding_to_utf8() - -# Even with those we still run into problems. -# I tried to do things by the book but could not convince -# Mercurial to let me check in a change with UTF-8 in the -# CL description or author field, no matter how many conversions -# between str and unicode I inserted and despite changing the -# default encoding. I'm tired of this game, so set the default -# encoding for all of Python to 'utf-8', not 'ascii'. 
-def default_to_utf8(): - import sys - stdout, __stdout__ = sys.stdout, sys.__stdout__ - reload(sys) # site.py deleted setdefaultencoding; get it back - sys.stdout, sys.__stdout__ = stdout, __stdout__ - sys.setdefaultencoding('utf-8') - -default_to_utf8() - -####################################################################### -# Status printer for long-running commands - -global_status = None - -def set_status(s): - # print >>sys.stderr, "\t", time.asctime(), s - global global_status - global_status = s - -class StatusThread(threading.Thread): - def __init__(self): - threading.Thread.__init__(self) - def run(self): - # pause a reasonable amount of time before - # starting to display status messages, so that - # most hg commands won't ever see them. - time.sleep(30) - - # now show status every 15 seconds - while True: - time.sleep(15 - time.time() % 15) - s = global_status - if s is None: - continue - if s == "": - s = "(unknown status)" - print >>sys.stderr, time.asctime(), s - -def start_status_thread(): - t = StatusThread() - t.setDaemon(True) # allowed to exit if t is still running - t.start() - -####################################################################### -# Change list parsing. -# -# Change lists are stored in .hg/codereview/cl.nnnnnn -# where nnnnnn is the number assigned by the code review server. -# Most data about a change list is stored on the code review server -# too: the description, reviewer, and cc list are all stored there. -# The only thing in the cl.nnnnnn file is the list of relevant files. -# Also, the existence of the cl.nnnnnn file marks this repository -# as the one where the change list lives. 
- -emptydiff = """Index: ~rietveld~placeholder~ -=================================================================== -diff --git a/~rietveld~placeholder~ b/~rietveld~placeholder~ -new file mode 100644 -""" - -class CL(object): - def __init__(self, name): - typecheck(name, str) - self.name = name - self.desc = '' - self.files = [] - self.reviewer = [] - self.cc = [] - self.url = '' - self.local = False - self.web = False - self.copied_from = None # None means current user - self.mailed = False - self.private = False - self.lgtm = [] - - def DiskText(self): - cl = self - s = "" - if cl.copied_from: - s += "Author: " + cl.copied_from + "\n\n" - if cl.private: - s += "Private: " + str(self.private) + "\n" - s += "Mailed: " + str(self.mailed) + "\n" - s += "Description:\n" - s += Indent(cl.desc, "\t") - s += "Files:\n" - for f in cl.files: - s += "\t" + f + "\n" - typecheck(s, str) - return s - - def EditorText(self): - cl = self - s = _change_prolog - s += "\n" - if cl.copied_from: - s += "Author: " + cl.copied_from + "\n" - if cl.url != '': - s += 'URL: ' + cl.url + ' # cannot edit\n\n' - if cl.private: - s += "Private: True\n" - s += "Reviewer: " + JoinComma(cl.reviewer) + "\n" - s += "CC: " + JoinComma(cl.cc) + "\n" - s += "\n" - s += "Description:\n" - if cl.desc == '': - s += "\t<enter description here>\n" - else: - s += Indent(cl.desc, "\t") - s += "\n" - if cl.local or cl.name == "new": - s += "Files:\n" - for f in cl.files: - s += "\t" + f + "\n" - s += "\n" - typecheck(s, str) - return s - - def PendingText(self, quick=False): - cl = self - s = cl.name + ":" + "\n" - s += Indent(cl.desc, "\t") - s += "\n" - if cl.copied_from: - s += "\tAuthor: " + cl.copied_from + "\n" - if not quick: - s += "\tReviewer: " + JoinComma(cl.reviewer) + "\n" - for (who, line) in cl.lgtm: - s += "\t\t" + who + ": " + line + "\n" - s += "\tCC: " + JoinComma(cl.cc) + "\n" - s += "\tFiles:\n" - for f in cl.files: - s += "\t\t" + f + "\n" - typecheck(s, str) - return s - - def 
Flush(self, ui, repo): - if self.name == "new": - self.Upload(ui, repo, gofmt_just_warn=True, creating=True) - dir = CodeReviewDir(ui, repo) - path = dir + '/cl.' + self.name - f = open(path+'!', "w") - f.write(self.DiskText()) - f.close() - if sys.platform == "win32" and os.path.isfile(path): - os.remove(path) - os.rename(path+'!', path) - if self.web and not self.copied_from: - EditDesc(self.name, desc=self.desc, - reviewers=JoinComma(self.reviewer), cc=JoinComma(self.cc), - private=self.private) - - def Delete(self, ui, repo): - dir = CodeReviewDir(ui, repo) - os.unlink(dir + "/cl." + self.name) - - def Subject(self): - s = line1(self.desc) - if len(s) > 60: - s = s[0:55] + "..." - if self.name != "new": - s = "code review %s: %s" % (self.name, s) - typecheck(s, str) - return s - - def Upload(self, ui, repo, send_mail=False, gofmt=True, gofmt_just_warn=False, creating=False, quiet=False): - if not self.files and not creating: - ui.warn("no files in change list\n") - if ui.configbool("codereview", "force_gofmt", True) and gofmt: - CheckFormat(ui, repo, self.files, just_warn=gofmt_just_warn) - set_status("uploading CL metadata + diffs") - os.chdir(repo.root) - form_fields = [ - ("content_upload", "1"), - ("reviewers", JoinComma(self.reviewer)), - ("cc", JoinComma(self.cc)), - ("description", self.desc), - ("base_hashes", ""), - ] - - if self.name != "new": - form_fields.append(("issue", self.name)) - vcs = None - # We do not include files when creating the issue, - # because we want the patch sets to record the repository - # and base revision they are diffs against. We use the patch - # set message for that purpose, but there is no message with - # the first patch set. Instead the message gets used as the - # new CL's overall subject. So omit the diffs when creating - # and then we'll run an immediate upload. - # This has the effect that every CL begins with an empty "Patch set 1". 
- if self.files and not creating: - vcs = MercurialVCS(upload_options, ui, repo) - data = vcs.GenerateDiff(self.files) - files = vcs.GetBaseFiles(data) - if len(data) > MAX_UPLOAD_SIZE: - uploaded_diff_file = [] - form_fields.append(("separate_patches", "1")) - else: - uploaded_diff_file = [("data", "data.diff", data)] - else: - uploaded_diff_file = [("data", "data.diff", emptydiff)] - - if vcs and self.name != "new": - form_fields.append(("subject", "diff -r " + vcs.base_rev + " " + ui.expandpath("default"))) - else: - # First upload sets the subject for the CL itself. - form_fields.append(("subject", self.Subject())) - ctype, body = EncodeMultipartFormData(form_fields, uploaded_diff_file) - response_body = MySend("/upload", body, content_type=ctype) - patchset = None - msg = response_body - lines = msg.splitlines() - if len(lines) >= 2: - msg = lines[0] - patchset = lines[1].strip() - patches = [x.split(" ", 1) for x in lines[2:]] - if response_body.startswith("Issue updated.") and quiet: - pass - else: - ui.status(msg + "\n") - set_status("uploaded CL metadata + diffs") - if not response_body.startswith("Issue created.") and not response_body.startswith("Issue updated."): - raise hg_util.Abort("failed to update issue: " + response_body) - issue = msg[msg.rfind("/")+1:] - self.name = issue - if not self.url: - self.url = server_url_base + self.name - if not uploaded_diff_file: - set_status("uploading patches") - patches = UploadSeparatePatches(issue, rpc, patchset, data, upload_options) - if vcs: - set_status("uploading base files") - vcs.UploadBaseFiles(issue, rpc, patches, patchset, upload_options, files) - if send_mail: - set_status("sending mail") - MySend("/" + issue + "/mail", payload="") - self.web = True - set_status("flushing changes to disk") - self.Flush(ui, repo) - return - - def Mail(self, ui, repo): - pmsg = "Hello " + JoinComma(self.reviewer) - if self.cc: - pmsg += " (cc: %s)" % (', '.join(self.cc),) - pmsg += ",\n" - pmsg += "\n" - repourl = 
ui.expandpath("default") - if not self.mailed: - pmsg += "I'd like you to review this change to\n" + repourl + "\n" - else: - pmsg += "Please take another look.\n" - typecheck(pmsg, str) - PostMessage(ui, self.name, pmsg, subject=self.Subject()) - self.mailed = True - self.Flush(ui, repo) - -def GoodCLName(name): - typecheck(name, str) - return re.match("^[0-9]+$", name) - -def ParseCL(text, name): - typecheck(text, str) - typecheck(name, str) - sname = None - lineno = 0 - sections = { - 'Author': '', - 'Description': '', - 'Files': '', - 'URL': '', - 'Reviewer': '', - 'CC': '', - 'Mailed': '', - 'Private': '', - } - for line in text.split('\n'): - lineno += 1 - line = line.rstrip() - if line != '' and line[0] == '#': - continue - if line == '' or line[0] == ' ' or line[0] == '\t': - if sname == None and line != '': - return None, lineno, 'text outside section' - if sname != None: - sections[sname] += line + '\n' - continue - p = line.find(':') - if p >= 0: - s, val = line[:p].strip(), line[p+1:].strip() - if s in sections: - sname = s - if val != '': - sections[sname] += val + '\n' - continue - return None, lineno, 'malformed section header' - - for k in sections: - sections[k] = StripCommon(sections[k]).rstrip() - - cl = CL(name) - if sections['Author']: - cl.copied_from = sections['Author'] - cl.desc = sections['Description'] - for line in sections['Files'].split('\n'): - i = line.find('#') - if i >= 0: - line = line[0:i].rstrip() - line = line.strip() - if line == '': - continue - cl.files.append(line) - cl.reviewer = SplitCommaSpace(sections['Reviewer']) - cl.cc = SplitCommaSpace(sections['CC']) - cl.url = sections['URL'] - if sections['Mailed'] != 'False': - # Odd default, but avoids spurious mailings when - # reading old CLs that do not have a Mailed: line. - # CLs created with this update will always have - # Mailed: False on disk. 
- cl.mailed = True - if sections['Private'] in ('True', 'true', 'Yes', 'yes'): - cl.private = True - if cl.desc == '<enter description here>': - cl.desc = '' - return cl, 0, '' - -def SplitCommaSpace(s): - typecheck(s, str) - s = s.strip() - if s == "": - return [] - return re.split(", *", s) - -def CutDomain(s): - typecheck(s, str) - i = s.find('@') - if i >= 0: - s = s[0:i] - return s - -def JoinComma(l): - for s in l: - typecheck(s, str) - return ", ".join(l) - -def ExceptionDetail(): - s = str(sys.exc_info()[0]) - if s.startswith("<type '") and s.endswith("'>"): - s = s[7:-2] - elif s.startswith("<class '") and s.endswith("'>"): - s = s[8:-2] - arg = str(sys.exc_info()[1]) - if len(arg) > 0: - s += ": " + arg - return s - -def IsLocalCL(ui, repo, name): - return GoodCLName(name) and os.access(CodeReviewDir(ui, repo) + "/cl." + name, 0) - -# Load CL from disk and/or the web. -def LoadCL(ui, repo, name, web=True): - typecheck(name, str) - set_status("loading CL " + name) - if not GoodCLName(name): - return None, "invalid CL name" - dir = CodeReviewDir(ui, repo) - path = dir + "cl." + name - if os.access(path, 0): - ff = open(path) - text = ff.read() - ff.close() - cl, lineno, err = ParseCL(text, name) - if err != "": - return None, "malformed CL data: "+err - cl.local = True - else: - cl = CL(name) - if web: - set_status("getting issue metadata from web") - d = JSONGet(ui, "/api/" + name + "?messages=true") - set_status(None) - if d is None: - return None, "cannot load CL %s from server" % (name,) - if 'owner_email' not in d or 'issue' not in d or str(d['issue']) != name: - return None, "malformed response loading CL data from code review server" - cl.dict = d - cl.reviewer = d.get('reviewers', []) - cl.cc = d.get('cc', []) - if cl.local and cl.copied_from and cl.desc: - # local copy of CL written by someone else - # and we saved a description. use that one, - # so that committers can edit the description - # before doing hg submit. 
- pass - else: - cl.desc = d.get('description', "") - cl.url = server_url_base + name - cl.web = True - cl.private = d.get('private', False) != False - cl.lgtm = [] - for m in d.get('messages', []): - if m.get('approval', False) == True: - who = re.sub('@.*', '', m.get('sender', '')) - text = re.sub("\n(.|\n)*", '', m.get('text', '')) - cl.lgtm.append((who, text)) - - set_status("loaded CL " + name) - return cl, '' - -class LoadCLThread(threading.Thread): - def __init__(self, ui, repo, dir, f, web): - threading.Thread.__init__(self) - self.ui = ui - self.repo = repo - self.dir = dir - self.f = f - self.web = web - self.cl = None - def run(self): - cl, err = LoadCL(self.ui, self.repo, self.f[3:], web=self.web) - if err != '': - self.ui.warn("loading "+self.dir+self.f+": " + err + "\n") - return - self.cl = cl - -# Load all the CLs from this repository. -def LoadAllCL(ui, repo, web=True): - dir = CodeReviewDir(ui, repo) - m = {} - files = [f for f in os.listdir(dir) if f.startswith('cl.')] - if not files: - return m - active = [] - first = True - for f in files: - t = LoadCLThread(ui, repo, dir, f, web) - t.start() - if web and first: - # first request: wait in case it needs to authenticate - # otherwise we get lots of user/password prompts - # running in parallel. - t.join() - if t.cl: - m[t.cl.name] = t.cl - first = False - else: - active.append(t) - for t in active: - t.join() - if t.cl: - m[t.cl.name] = t.cl - return m - -# Find repository root. On error, ui.warn and return None -def RepoDir(ui, repo): - url = repo.url(); - if not url.startswith('file:'): - ui.warn("repository %s is not in local file system\n" % (url,)) - return None - url = url[5:] - if url.endswith('/'): - url = url[:-1] - typecheck(url, str) - return url - -# Find (or make) code review directory. 
On error, ui.warn and return None -def CodeReviewDir(ui, repo): - dir = RepoDir(ui, repo) - if dir == None: - return None - dir += '/.hg/codereview/' - if not os.path.isdir(dir): - try: - os.mkdir(dir, 0700) - except: - ui.warn('cannot mkdir %s: %s\n' % (dir, ExceptionDetail())) - return None - typecheck(dir, str) - return dir - -# Turn leading tabs into spaces, so that the common white space -# prefix doesn't get confused when people's editors write out -# some lines with spaces, some with tabs. Only a heuristic -# (some editors don't use 8 spaces either) but a useful one. -def TabsToSpaces(line): - i = 0 - while i < len(line) and line[i] == '\t': - i += 1 - return ' '*(8*i) + line[i:] - -# Strip maximal common leading white space prefix from text -def StripCommon(text): - typecheck(text, str) - ws = None - for line in text.split('\n'): - line = line.rstrip() - if line == '': - continue - line = TabsToSpaces(line) - white = line[:len(line)-len(line.lstrip())] - if ws == None: - ws = white - else: - common = '' - for i in range(min(len(white), len(ws))+1): - if white[0:i] == ws[0:i]: - common = white[0:i] - ws = common - if ws == '': - break - if ws == None: - return text - t = '' - for line in text.split('\n'): - line = line.rstrip() - line = TabsToSpaces(line) - if line.startswith(ws): - line = line[len(ws):] - if line == '' and t == '': - continue - t += line + '\n' - while len(t) >= 2 and t[-2:] == '\n\n': - t = t[:-1] - typecheck(t, str) - return t - -# Indent text with indent. -def Indent(text, indent): - typecheck(text, str) - typecheck(indent, str) - t = '' - for line in text.split('\n'): - t += indent + line + '\n' - typecheck(t, str) - return t - -# Return the first line of l -def line1(text): - typecheck(text, str) - return text.split('\n')[0] - -_change_prolog = """# Change list. -# Lines beginning with # are ignored. -# Multi-line values should be indented. 
-""" - -desc_re = '^(.+: |(tag )?(release|weekly)\.|fix build|undo CL)' - -desc_msg = '''Your CL description appears not to use the standard form. - -The first line of your change description is conventionally a -one-line summary of the change, prefixed by the primary affected package, -and is used as the subject for code review mail; the rest of the description -elaborates. - -Examples: - - encoding/rot13: new package - - math: add IsInf, IsNaN - - net: fix cname in LookupHost - - unicode: update to Unicode 5.0.2 - -''' - -def promptyesno(ui, msg): - if hgversion >= "2.7": - return ui.promptchoice(msg + " $$ &yes $$ &no", 0) == 0 - else: - return ui.promptchoice(msg, ["&yes", "&no"], 0) == 0 - -def promptremove(ui, repo, f): - if promptyesno(ui, "hg remove %s (y/n)?" % (f,)): - if hg_commands.remove(ui, repo, 'path:'+f) != 0: - ui.warn("error removing %s" % (f,)) - -def promptadd(ui, repo, f): - if promptyesno(ui, "hg add %s (y/n)?" % (f,)): - if hg_commands.add(ui, repo, 'path:'+f) != 0: - ui.warn("error adding %s" % (f,)) - -def EditCL(ui, repo, cl): - set_status(None) # do not show status - s = cl.EditorText() - while True: - s = ui.edit(s, ui.username()) - - # We can't trust Mercurial + Python not to die before making the change, - # so, by popular demand, just scribble the most recent CL edit into - # $(hg root)/last-change so that if Mercurial does die, people - # can look there for their work. - try: - f = open(repo.root+"/last-change", "w") - f.write(s) - f.close() - except: - pass - - clx, line, err = ParseCL(s, cl.name) - if err != '': - if not promptyesno(ui, "error parsing change list: line %d: %s\nre-edit (y/n)?" % (line, err)): - return "change list not modified" - continue - - # Check description. 
- if clx.desc == '': - if promptyesno(ui, "change list should have a description\nre-edit (y/n)?"): - continue - elif re.search('<enter reason for undo>', clx.desc): - if promptyesno(ui, "change list description omits reason for undo\nre-edit (y/n)?"): - continue - elif not re.match(desc_re, clx.desc.split('\n')[0]): - if promptyesno(ui, desc_msg + "re-edit (y/n)?"): - continue - - # Check file list for files that need to be hg added or hg removed - # or simply aren't understood. - pats = ['path:'+f for f in clx.files] - changed = hg_matchPattern(ui, repo, *pats, modified=True, added=True, removed=True) - deleted = hg_matchPattern(ui, repo, *pats, deleted=True) - unknown = hg_matchPattern(ui, repo, *pats, unknown=True) - ignored = hg_matchPattern(ui, repo, *pats, ignored=True) - clean = hg_matchPattern(ui, repo, *pats, clean=True) - files = [] - for f in clx.files: - if f in changed: - files.append(f) - continue - if f in deleted: - promptremove(ui, repo, f) - files.append(f) - continue - if f in unknown: - promptadd(ui, repo, f) - files.append(f) - continue - if f in ignored: - ui.warn("error: %s is excluded by .hgignore; omitting\n" % (f,)) - continue - if f in clean: - ui.warn("warning: %s is listed in the CL but unchanged\n" % (f,)) - files.append(f) - continue - p = repo.root + '/' + f - if os.path.isfile(p): - ui.warn("warning: %s is a file but not known to hg\n" % (f,)) - files.append(f) - continue - if os.path.isdir(p): - ui.warn("error: %s is a directory, not a file; omitting\n" % (f,)) - continue - ui.warn("error: %s does not exist; omitting\n" % (f,)) - clx.files = files - - cl.desc = clx.desc - cl.reviewer = clx.reviewer - cl.cc = clx.cc - cl.files = clx.files - cl.private = clx.private - break - return "" - -# For use by submit, etc. (NOT by change) -# Get change list number or list of files from command line. -# If files are given, make a new change list. 
-def CommandLineCL(ui, repo, pats, opts, defaultcc=None): - if len(pats) > 0 and GoodCLName(pats[0]): - if len(pats) != 1: - return None, "cannot specify change number and file names" - if opts.get('message'): - return None, "cannot use -m with existing CL" - cl, err = LoadCL(ui, repo, pats[0], web=True) - if err != "": - return None, err - else: - cl = CL("new") - cl.local = True - cl.files = ChangedFiles(ui, repo, pats, taken=Taken(ui, repo)) - if not cl.files: - return None, "no files changed" - if opts.get('reviewer'): - cl.reviewer = Add(cl.reviewer, SplitCommaSpace(opts.get('reviewer'))) - if opts.get('cc'): - cl.cc = Add(cl.cc, SplitCommaSpace(opts.get('cc'))) - if defaultcc: - cl.cc = Add(cl.cc, defaultcc) - if cl.name == "new": - if opts.get('message'): - cl.desc = opts.get('message') - else: - err = EditCL(ui, repo, cl) - if err != '': - return None, err - return cl, "" - -####################################################################### -# Change list file management - -# Return list of changed files in repository that match pats. -# The patterns came from the command line, so we warn -# if they have no effect or cannot be understood. -def ChangedFiles(ui, repo, pats, taken=None): - taken = taken or {} - # Run each pattern separately so that we can warn about - # patterns that didn't do anything useful. 
- for p in pats: - for f in hg_matchPattern(ui, repo, p, unknown=True): - promptadd(ui, repo, f) - for f in hg_matchPattern(ui, repo, p, removed=True): - promptremove(ui, repo, f) - files = hg_matchPattern(ui, repo, p, modified=True, added=True, removed=True) - for f in files: - if f in taken: - ui.warn("warning: %s already in CL %s\n" % (f, taken[f].name)) - if not files: - ui.warn("warning: %s did not match any modified files\n" % (p,)) - - # Again, all at once (eliminates duplicates) - l = hg_matchPattern(ui, repo, *pats, modified=True, added=True, removed=True) - l.sort() - if taken: - l = Sub(l, taken.keys()) - return l - -# Return list of changed files in repository that match pats and still exist. -def ChangedExistingFiles(ui, repo, pats, opts): - l = hg_matchPattern(ui, repo, *pats, modified=True, added=True) - l.sort() - return l - -# Return list of files claimed by existing CLs -def Taken(ui, repo): - all = LoadAllCL(ui, repo, web=False) - taken = {} - for _, cl in all.items(): - for f in cl.files: - taken[f] = cl - return taken - -# Return list of changed files that are not claimed by other CLs -def DefaultFiles(ui, repo, pats): - return ChangedFiles(ui, repo, pats, taken=Taken(ui, repo)) - -####################################################################### -# File format checking. 
- -def CheckFormat(ui, repo, files, just_warn=False): - set_status("running gofmt") - CheckGofmt(ui, repo, files, just_warn) - CheckTabfmt(ui, repo, files, just_warn) - -# Check that gofmt run on the list of files does not change them -def CheckGofmt(ui, repo, files, just_warn): - files = gofmt_required(files) - if not files: - return - cwd = os.getcwd() - files = [RelativePath(repo.root + '/' + f, cwd) for f in files] - files = [f for f in files if os.access(f, 0)] - if not files: - return - try: - cmd = subprocess.Popen(["gofmt", "-l"] + files, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=sys.platform != "win32") - cmd.stdin.close() - except: - raise hg_util.Abort("gofmt: " + ExceptionDetail()) - data = cmd.stdout.read() - errors = cmd.stderr.read() - cmd.wait() - set_status("done with gofmt") - if len(errors) > 0: - ui.warn("gofmt errors:\n" + errors.rstrip() + "\n") - return - if len(data) > 0: - msg = "gofmt needs to format these files (run hg gofmt):\n" + Indent(data, "\t").rstrip() - if just_warn: - ui.warn("warning: " + msg + "\n") - else: - raise hg_util.Abort(msg) - return - -# Check that *.[chys] files indent using tabs. -def CheckTabfmt(ui, repo, files, just_warn): - files = [f for f in files if f.startswith('src/') and re.search(r"\.[chys]$", f) and not re.search(r"\.tab\.[ch]$", f)] - if not files: - return - cwd = os.getcwd() - files = [RelativePath(repo.root + '/' + f, cwd) for f in files] - files = [f for f in files if os.access(f, 0)] - badfiles = [] - for f in files: - try: - for line in open(f, 'r'): - # Four leading spaces is enough to complain about, - # except that some Plan 9 code uses four spaces as the label indent, - # so allow that. - if line.startswith(' ') and not re.match(' [A-Za-z0-9_]+:', line): - badfiles.append(f) - break - except: - # ignore cannot open file, etc. 
- pass - if len(badfiles) > 0: - msg = "these files use spaces for indentation (use tabs instead):\n\t" + "\n\t".join(badfiles) - if just_warn: - ui.warn("warning: " + msg + "\n") - else: - raise hg_util.Abort(msg) - return - -####################################################################### -# CONTRIBUTORS file parsing - -contributorsCache = None -contributorsURL = None - -def ReadContributors(ui, repo): - global contributorsCache - if contributorsCache is not None: - return contributorsCache - - try: - if contributorsURL is not None: - opening = contributorsURL - f = urllib2.urlopen(contributorsURL) - else: - opening = repo.root + '/CONTRIBUTORS' - f = open(repo.root + '/CONTRIBUTORS', 'r') - except: - ui.write("warning: cannot open %s: %s\n" % (opening, ExceptionDetail())) - return - - contributors = {} - for line in f: - # CONTRIBUTORS is a list of lines like: - # Person <email> - # Person <email> <alt-email> - # The first email address is the one used in commit logs. - if line.startswith('#'): - continue - m = re.match(r"([^<>]+\S)\s+(<[^<>\s]+>)((\s+<[^<>\s]+>)*)\s*$", line) - if m: - name = m.group(1) - email = m.group(2)[1:-1] - contributors[email.lower()] = (name, email) - for extra in m.group(3).split(): - contributors[extra[1:-1].lower()] = (name, email) - - contributorsCache = contributors - return contributors - -def CheckContributor(ui, repo, user=None): - set_status("checking CONTRIBUTORS file") - user, userline = FindContributor(ui, repo, user, warn=False) - if not userline: - raise hg_util.Abort("cannot find %s in CONTRIBUTORS" % (user,)) - return userline - -def FindContributor(ui, repo, user=None, warn=True): - if not user: - user = ui.config("ui", "username") - if not user: - raise hg_util.Abort("[ui] username is not configured in .hgrc") - user = user.lower() - m = re.match(r".*<(.*)>", user) - if m: - user = m.group(1) - - contributors = ReadContributors(ui, repo) - if user not in contributors: - if warn: - ui.warn("warning: cannot find 
%s in CONTRIBUTORS\n" % (user,)) - return user, None - - user, email = contributors[user] - return email, "%s <%s>" % (user, email) - -####################################################################### -# Mercurial helper functions. -# Read http://mercurial.selenic.com/wiki/MercurialApi before writing any of these. -# We use the ui.pushbuffer/ui.popbuffer + hg_commands.xxx tricks for all interaction -# with Mercurial. It has proved the most stable as they make changes. - -hgversion = hg_util.version() - -# We require Mercurial 1.9 and suggest Mercurial 2.0. -# The details of the scmutil package changed then, -# so allowing earlier versions would require extra band-aids below. -# Ubuntu 11.10 ships with Mercurial 1.9.1 as the default version. -hg_required = "1.9" -hg_suggested = "2.0" - -old_message = """ - -The code review extension requires Mercurial """+hg_required+""" or newer. -You are using Mercurial """+hgversion+""". - -To install a new Mercurial, use - - sudo easy_install mercurial=="""+hg_suggested+""" - -or visit http://mercurial.selenic.com/downloads/. 
-""" - -linux_message = """ -You may need to clear your current Mercurial installation by running: - - sudo apt-get remove mercurial mercurial-common - sudo rm -rf /etc/mercurial -""" - -if hgversion < hg_required: - msg = old_message - if os.access("/etc/mercurial", 0): - msg += linux_message - raise hg_util.Abort(msg) - -from mercurial.hg import clean as hg_clean -from mercurial import cmdutil as hg_cmdutil -from mercurial import error as hg_error -from mercurial import match as hg_match -from mercurial import node as hg_node - -class uiwrap(object): - def __init__(self, ui): - self.ui = ui - ui.pushbuffer() - self.oldQuiet = ui.quiet - ui.quiet = True - self.oldVerbose = ui.verbose - ui.verbose = False - def output(self): - ui = self.ui - ui.quiet = self.oldQuiet - ui.verbose = self.oldVerbose - return ui.popbuffer() - -def to_slash(path): - if sys.platform == "win32": - return path.replace('\\', '/') - return path - -def hg_matchPattern(ui, repo, *pats, **opts): - w = uiwrap(ui) - hg_commands.status(ui, repo, *pats, **opts) - text = w.output() - ret = [] - prefix = to_slash(os.path.realpath(repo.root))+'/' - for line in text.split('\n'): - f = line.split() - if len(f) > 1: - if len(pats) > 0: - # Given patterns, Mercurial shows relative to cwd - p = to_slash(os.path.realpath(f[1])) - if not p.startswith(prefix): - print >>sys.stderr, "File %s not in repo root %s.\n" % (p, prefix) - else: - ret.append(p[len(prefix):]) - else: - # Without patterns, Mercurial shows relative to root (what we want) - ret.append(to_slash(f[1])) - return ret - -def hg_heads(ui, repo): - w = uiwrap(ui) - hg_commands.heads(ui, repo) - return w.output() - -noise = [ - "", - "resolving manifests", - "searching for changes", - "couldn't find merge tool hgmerge", - "adding changesets", - "adding manifests", - "adding file changes", - "all local heads known remotely", -] - -def isNoise(line): - line = str(line) - for x in noise: - if line == x: - return True - return False - -def 
hg_incoming(ui, repo): - w = uiwrap(ui) - ret = hg_commands.incoming(ui, repo, force=False, bundle="") - if ret and ret != 1: - raise hg_util.Abort(ret) - return w.output() - -def hg_log(ui, repo, **opts): - for k in ['date', 'keyword', 'rev', 'user']: - if not opts.has_key(k): - opts[k] = "" - w = uiwrap(ui) - ret = hg_commands.log(ui, repo, **opts) - if ret: - raise hg_util.Abort(ret) - return w.output() - -def hg_outgoing(ui, repo, **opts): - w = uiwrap(ui) - ret = hg_commands.outgoing(ui, repo, **opts) - if ret and ret != 1: - raise hg_util.Abort(ret) - return w.output() - -def hg_pull(ui, repo, **opts): - w = uiwrap(ui) - ui.quiet = False - ui.verbose = True # for file list - err = hg_commands.pull(ui, repo, **opts) - for line in w.output().split('\n'): - if isNoise(line): - continue - if line.startswith('moving '): - line = 'mv ' + line[len('moving '):] - if line.startswith('getting ') and line.find(' to ') >= 0: - line = 'mv ' + line[len('getting '):] - if line.startswith('getting '): - line = '+ ' + line[len('getting '):] - if line.startswith('removing '): - line = '- ' + line[len('removing '):] - ui.write(line + '\n') - return err - -def hg_push(ui, repo, **opts): - w = uiwrap(ui) - ui.quiet = False - ui.verbose = True - err = hg_commands.push(ui, repo, **opts) - for line in w.output().split('\n'): - if not isNoise(line): - ui.write(line + '\n') - return err - -def hg_commit(ui, repo, *pats, **opts): - return hg_commands.commit(ui, repo, *pats, **opts) - -####################################################################### -# Mercurial precommit hook to disable commit except through this interface. - -commit_okay = False - -def precommithook(ui, repo, **opts): - if commit_okay: - return False # False means okay. 
- ui.write("\ncodereview extension enabled; use mail, upload, or submit instead of commit\n\n") - return True - -####################################################################### -# @clnumber file pattern support - -# We replace scmutil.match with the MatchAt wrapper to add the @clnumber pattern. - -match_repo = None -match_ui = None -match_orig = None - -def InstallMatch(ui, repo): - global match_repo - global match_ui - global match_orig - - match_ui = ui - match_repo = repo - - from mercurial import scmutil - match_orig = scmutil.match - scmutil.match = MatchAt - -def MatchAt(ctx, pats=None, opts=None, globbed=False, default='relpath'): - taken = [] - files = [] - pats = pats or [] - opts = opts or {} - - for p in pats: - if p.startswith('@'): - taken.append(p) - clname = p[1:] - if clname == "default": - files = DefaultFiles(match_ui, match_repo, []) - else: - if not GoodCLName(clname): - raise hg_util.Abort("invalid CL name " + clname) - cl, err = LoadCL(match_repo.ui, match_repo, clname, web=False) - if err != '': - raise hg_util.Abort("loading CL " + clname + ": " + err) - if not cl.files: - raise hg_util.Abort("no files in CL " + clname) - files = Add(files, cl.files) - pats = Sub(pats, taken) + ['path:'+f for f in files] - - # work-around for http://selenic.com/hg/rev/785bbc8634f8 - if not hasattr(ctx, 'match'): - ctx = ctx[None] - return match_orig(ctx, pats=pats, opts=opts, globbed=globbed, default=default) - -####################################################################### -# Commands added by code review extension. - -# As of Mercurial 2.1 the commands are all required to return integer -# exit codes, whereas earlier versions allowed returning arbitrary strings -# to be printed as errors. We wrap the old functions to make sure we -# always return integer exit codes now. Otherwise Mercurial dies -# with a TypeError traceback (unsupported operand type(s) for &: 'str' and 'int'). 
-# Introduce a Python decorator to convert old functions to the new -# stricter convention. - -def hgcommand(f): - def wrapped(ui, repo, *pats, **opts): - err = f(ui, repo, *pats, **opts) - if type(err) is int: - return err - if not err: - return 0 - raise hg_util.Abort(err) - wrapped.__doc__ = f.__doc__ - return wrapped - -####################################################################### -# hg change - -@hgcommand -def change(ui, repo, *pats, **opts): - """create, edit or delete a change list - - Create, edit or delete a change list. - A change list is a group of files to be reviewed and submitted together, - plus a textual description of the change. - Change lists are referred to by simple alphanumeric names. - - Changes must be reviewed before they can be submitted. - - In the absence of options, the change command opens the - change list for editing in the default editor. - - Deleting a change with the -d or -D flag does not affect - the contents of the files listed in that change. To revert - the files listed in a change, use - - hg revert @123456 - - before running hg change -d 123456. 
- """ - - if codereview_disabled: - return codereview_disabled - - dirty = {} - if len(pats) > 0 and GoodCLName(pats[0]): - name = pats[0] - if len(pats) != 1: - return "cannot specify CL name and file patterns" - pats = pats[1:] - cl, err = LoadCL(ui, repo, name, web=True) - if err != '': - return err - if not cl.local and (opts["stdin"] or not opts["stdout"]): - return "cannot change non-local CL " + name - else: - name = "new" - cl = CL("new") - if repo[None].branch() != "default": - return "cannot create CL outside default branch; switch with 'hg update default'" - dirty[cl] = True - files = ChangedFiles(ui, repo, pats, taken=Taken(ui, repo)) - - if opts["delete"] or opts["deletelocal"]: - if opts["delete"] and opts["deletelocal"]: - return "cannot use -d and -D together" - flag = "-d" - if opts["deletelocal"]: - flag = "-D" - if name == "new": - return "cannot use "+flag+" with file patterns" - if opts["stdin"] or opts["stdout"]: - return "cannot use "+flag+" with -i or -o" - if not cl.local: - return "cannot change non-local CL " + name - if opts["delete"]: - if cl.copied_from: - return "original author must delete CL; hg change -D will remove locally" - PostMessage(ui, cl.name, "*** Abandoned ***", send_mail=cl.mailed) - EditDesc(cl.name, closed=True, private=cl.private) - cl.Delete(ui, repo) - return - - if opts["stdin"]: - s = sys.stdin.read() - clx, line, err = ParseCL(s, name) - if err != '': - return "error parsing change list: line %d: %s" % (line, err) - if clx.desc is not None: - cl.desc = clx.desc; - dirty[cl] = True - if clx.reviewer is not None: - cl.reviewer = clx.reviewer - dirty[cl] = True - if clx.cc is not None: - cl.cc = clx.cc - dirty[cl] = True - if clx.files is not None: - cl.files = clx.files - dirty[cl] = True - if clx.private != cl.private: - cl.private = clx.private - dirty[cl] = True - - if not opts["stdin"] and not opts["stdout"]: - if name == "new": - cl.files = files - err = EditCL(ui, repo, cl) - if err != "": - return err - 
dirty[cl] = True - - for d, _ in dirty.items(): - name = d.name - d.Flush(ui, repo) - if name == "new": - d.Upload(ui, repo, quiet=True) - - if opts["stdout"]: - ui.write(cl.EditorText()) - elif opts["pending"]: - ui.write(cl.PendingText()) - elif name == "new": - if ui.quiet: - ui.write(cl.name) - else: - ui.write("CL created: " + cl.url + "\n") - return - -####################################################################### -# hg code-login (broken?) - -@hgcommand -def code_login(ui, repo, **opts): - """log in to code review server - - Logs in to the code review server, saving a cookie in - a file in your home directory. - """ - if codereview_disabled: - return codereview_disabled - - MySend(None) - -####################################################################### -# hg clpatch / undo / release-apply / download -# All concerned with applying or unapplying patches to the repository. - -@hgcommand -def clpatch(ui, repo, clname, **opts): - """import a patch from the code review server - - Imports a patch from the code review server into the local client. - If the local client has already modified any of the files that the - patch modifies, this command will refuse to apply the patch. - - Submitting an imported patch will keep the original author's - name as the Author: line but add your own name to a Committer: line. - """ - if repo[None].branch() != "default": - return "cannot run hg clpatch outside default branch" - return clpatch_or_undo(ui, repo, clname, opts, mode="clpatch") - -@hgcommand -def undo(ui, repo, clname, **opts): - """undo the effect of a CL - - Creates a new CL that undoes an earlier CL. - After creating the CL, opens the CL text for editing so that - you can add the reason for the undo to the description. 
- """ - if repo[None].branch() != "default": - return "cannot run hg undo outside default branch" - return clpatch_or_undo(ui, repo, clname, opts, mode="undo") - -@hgcommand -def release_apply(ui, repo, clname, **opts): - """apply a CL to the release branch - - Creates a new CL copying a previously committed change - from the main branch to the release branch. - The current client must either be clean or already be in - the release branch. - - The release branch must be created by starting with a - clean client, disabling the code review plugin, and running: - - hg update weekly.YYYY-MM-DD - hg branch release-branch.rNN - hg commit -m 'create release-branch.rNN' - hg push --new-branch - - Then re-enable the code review plugin. - - People can test the release branch by running - - hg update release-branch.rNN - - in a clean client. To return to the normal tree, - - hg update default - - Move changes since the weekly into the release branch - using hg release-apply followed by the usual code review - process and hg submit. - - When it comes time to tag the release, record the - final long-form tag of the release-branch.rNN - in the *default* branch's .hgtags file. That is, run - - hg update default - - and then edit .hgtags as you would for a weekly. - - """ - c = repo[None] - if not releaseBranch: - return "no active release branches" - if c.branch() != releaseBranch: - if c.modified() or c.added() or c.removed(): - raise hg_util.Abort("uncommitted local changes - cannot switch branches") - err = hg_clean(repo, releaseBranch) - if err: - return err - try: - err = clpatch_or_undo(ui, repo, clname, opts, mode="backport") - if err: - raise hg_util.Abort(err) - except Exception, e: - hg_clean(repo, "default") - raise e - return None - -def rev2clname(rev): - # Extract CL name from revision description. - # The last line in the description that is a codereview URL is the real one. - # Earlier lines might be part of the user-written description. 
- all = re.findall('(?m)^http://codereview.appspot.com/([0-9]+)$', rev.description()) - if len(all) > 0: - return all[-1] - return "" - -undoHeader = """undo CL %s / %s - -<enter reason for undo> - -««« original CL description -""" - -undoFooter = """ -»»» -""" - -backportHeader = """[%s] %s - -««« CL %s / %s -""" - -backportFooter = """ -»»» -""" - -# Implementation of clpatch/undo. -def clpatch_or_undo(ui, repo, clname, opts, mode): - if codereview_disabled: - return codereview_disabled - - if mode == "undo" or mode == "backport": - # Find revision in Mercurial repository. - # Assume CL number is 7+ decimal digits. - # Otherwise is either change log sequence number (fewer decimal digits), - # hexadecimal hash, or tag name. - # Mercurial will fall over long before the change log - # sequence numbers get to be 7 digits long. - if re.match('^[0-9]{7,}$', clname): - found = False - for r in hg_log(ui, repo, keyword="codereview.appspot.com/"+clname, limit=100, template="{node}\n").split(): - rev = repo[r] - # Last line with a code review URL is the actual review URL. - # Earlier ones might be part of the CL description. - n = rev2clname(rev) - if n == clname: - found = True - break - if not found: - return "cannot find CL %s in local repository" % clname - else: - rev = repo[clname] - if not rev: - return "unknown revision %s" % clname - clname = rev2clname(rev) - if clname == "": - return "cannot find CL name in revision description" - - # Create fresh CL and start with patch that would reverse the change. 
- vers = hg_node.short(rev.node()) - cl = CL("new") - desc = str(rev.description()) - if mode == "undo": - cl.desc = (undoHeader % (clname, vers)) + desc + undoFooter - else: - cl.desc = (backportHeader % (releaseBranch, line1(desc), clname, vers)) + desc + undoFooter - v1 = vers - v0 = hg_node.short(rev.parents()[0].node()) - if mode == "undo": - arg = v1 + ":" + v0 - else: - vers = v0 - arg = v0 + ":" + v1 - patch = RunShell(["hg", "diff", "--git", "-r", arg]) - - else: # clpatch - cl, vers, patch, err = DownloadCL(ui, repo, clname) - if err != "": - return err - if patch == emptydiff: - return "codereview issue %s has no diff" % clname - - # find current hg version (hg identify) - ctx = repo[None] - parents = ctx.parents() - id = '+'.join([hg_node.short(p.node()) for p in parents]) - - # if version does not match the patch version, - # try to update the patch line numbers. - if vers != "" and id != vers: - # "vers in repo" gives the wrong answer - # on some versions of Mercurial. Instead, do the actual - # lookup and catch the exception. 
- try: - repo[vers].description() - except: - return "local repository is out of date; sync to get %s" % (vers) - patch1, err = portPatch(repo, patch, vers, id) - if err != "": - if not opts["ignore_hgpatch_failure"]: - return "codereview issue %s is out of date: %s (%s->%s)" % (clname, err, vers, id) - else: - patch = patch1 - argv = ["hgpatch"] - if opts["no_incoming"] or mode == "backport": - argv += ["--checksync=false"] - try: - cmd = subprocess.Popen(argv, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None, close_fds=sys.platform != "win32") - except: - return "hgpatch: " + ExceptionDetail() + "\nInstall hgpatch with:\n$ go get code.google.com/p/go.codereview/cmd/hgpatch\n" - - out, err = cmd.communicate(patch) - if cmd.returncode != 0 and not opts["ignore_hgpatch_failure"]: - return "hgpatch failed" - cl.local = True - cl.files = out.strip().split() - if not cl.files and not opts["ignore_hgpatch_failure"]: - return "codereview issue %s has no changed files" % clname - files = ChangedFiles(ui, repo, []) - extra = Sub(cl.files, files) - if extra: - ui.warn("warning: these files were listed in the patch but not changed:\n\t" + "\n\t".join(extra) + "\n") - cl.Flush(ui, repo) - if mode == "undo": - err = EditCL(ui, repo, cl) - if err != "": - return "CL created, but error editing: " + err - cl.Flush(ui, repo) - else: - ui.write(cl.PendingText() + "\n") - -# portPatch rewrites patch from being a patch against -# oldver to being a patch against newver. -def portPatch(repo, patch, oldver, newver): - lines = patch.splitlines(True) # True = keep \n - delta = None - for i in range(len(lines)): - line = lines[i] - if line.startswith('--- a/'): - file = line[6:-1] - delta = fileDeltas(repo, file, oldver, newver) - if not delta or not line.startswith('@@ '): - continue - # @@ -x,y +z,w @@ means the patch chunk replaces - # the original file's line numbers x up to x+y with the - # line numbers z up to z+w in the new file. 
- # Find the delta from x in the original to the same - # line in the current version and add that delta to both - # x and z. - m = re.match('@@ -([0-9]+),([0-9]+) \+([0-9]+),([0-9]+) @@', line) - if not m: - return None, "error parsing patch line numbers" - n1, len1, n2, len2 = int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)) - d, err = lineDelta(delta, n1, len1) - if err != "": - return "", err - n1 += d - n2 += d - lines[i] = "@@ -%d,%d +%d,%d @@\n" % (n1, len1, n2, len2) - - newpatch = ''.join(lines) - return newpatch, "" - -# fileDelta returns the line number deltas for the given file's -# changes from oldver to newver. -# The deltas are a list of (n, len, newdelta) triples that say -# lines [n, n+len) were modified, and after that range the -# line numbers are +newdelta from what they were before. -def fileDeltas(repo, file, oldver, newver): - cmd = ["hg", "diff", "--git", "-r", oldver + ":" + newver, "path:" + file] - data = RunShell(cmd, silent_ok=True) - deltas = [] - for line in data.splitlines(): - m = re.match('@@ -([0-9]+),([0-9]+) \+([0-9]+),([0-9]+) @@', line) - if not m: - continue - n1, len1, n2, len2 = int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)) - deltas.append((n1, len1, n2+len2-(n1+len1))) - return deltas - -# lineDelta finds the appropriate line number delta to apply to the lines [n, n+len). -# It returns an error if those lines were rewritten by the patch. -def lineDelta(deltas, n, len): - d = 0 - for (old, oldlen, newdelta) in deltas: - if old >= n+len: - break - if old+len > n: - return 0, "patch and recent changes conflict" - d = newdelta - return d, "" - -@hgcommand -def download(ui, repo, clname, **opts): - """download a change from the code review server - - Download prints a description of the given change list - followed by its diff, downloaded from the code review server. 
- """ - if codereview_disabled: - return codereview_disabled - - cl, vers, patch, err = DownloadCL(ui, repo, clname) - if err != "": - return err - ui.write(cl.EditorText() + "\n") - ui.write(patch + "\n") - return - -####################################################################### -# hg file - -@hgcommand -def file(ui, repo, clname, pat, *pats, **opts): - """assign files to or remove files from a change list - - Assign files to or (with -d) remove files from a change list. - - The -d option only removes files from the change list. - It does not edit them or remove them from the repository. - """ - if codereview_disabled: - return codereview_disabled - - pats = tuple([pat] + list(pats)) - if not GoodCLName(clname): - return "invalid CL name " + clname - - dirty = {} - cl, err = LoadCL(ui, repo, clname, web=False) - if err != '': - return err - if not cl.local: - return "cannot change non-local CL " + clname - - files = ChangedFiles(ui, repo, pats) - - if opts["delete"]: - oldfiles = Intersect(files, cl.files) - if oldfiles: - if not ui.quiet: - ui.status("# Removing files from CL. To undo:\n") - ui.status("# cd %s\n" % (repo.root)) - for f in oldfiles: - ui.status("# hg file %s %s\n" % (cl.name, f)) - cl.files = Sub(cl.files, oldfiles) - cl.Flush(ui, repo) - else: - ui.status("no such files in CL") - return - - if not files: - return "no such modified files" - - files = Sub(files, cl.files) - taken = Taken(ui, repo) - warned = False - for f in files: - if f in taken: - if not warned and not ui.quiet: - ui.status("# Taking files from other CLs. 
To undo:\n") - ui.status("# cd %s\n" % (repo.root)) - warned = True - ocl = taken[f] - if not ui.quiet: - ui.status("# hg file %s %s\n" % (ocl.name, f)) - if ocl not in dirty: - ocl.files = Sub(ocl.files, files) - dirty[ocl] = True - cl.files = Add(cl.files, files) - dirty[cl] = True - for d, _ in dirty.items(): - d.Flush(ui, repo) - return - -####################################################################### -# hg gofmt - -@hgcommand -def gofmt(ui, repo, *pats, **opts): - """apply gofmt to modified files - - Applies gofmt to the modified files in the repository that match - the given patterns. - """ - if codereview_disabled: - return codereview_disabled - - files = ChangedExistingFiles(ui, repo, pats, opts) - files = gofmt_required(files) - if not files: - return "no modified go files" - cwd = os.getcwd() - files = [RelativePath(repo.root + '/' + f, cwd) for f in files] - try: - cmd = ["gofmt", "-l"] - if not opts["list"]: - cmd += ["-w"] - if os.spawnvp(os.P_WAIT, "gofmt", cmd + files) != 0: - raise hg_util.Abort("gofmt did not exit cleanly") - except hg_error.Abort, e: - raise - except: - raise hg_util.Abort("gofmt: " + ExceptionDetail()) - return - -def gofmt_required(files): - return [f for f in files if (not f.startswith('test/') or f.startswith('test/bench/')) and f.endswith('.go')] - -####################################################################### -# hg mail - -@hgcommand -def mail(ui, repo, *pats, **opts): - """mail a change for review - - Uploads a patch to the code review server and then sends mail - to the reviewer and CC list asking for a review. - """ - if codereview_disabled: - return codereview_disabled - - cl, err = CommandLineCL(ui, repo, pats, opts, defaultcc=defaultcc) - if err != "": - return err - cl.Upload(ui, repo, gofmt_just_warn=True) - if not cl.reviewer: - # If no reviewer is listed, assign the review to defaultcc. 
- # This makes sure that it appears in the - # codereview.appspot.com/user/defaultcc - # page, so that it doesn't get dropped on the floor. - if not defaultcc: - return "no reviewers listed in CL" - cl.cc = Sub(cl.cc, defaultcc) - cl.reviewer = defaultcc - cl.Flush(ui, repo) - - if cl.files == []: - return "no changed files, not sending mail" - - cl.Mail(ui, repo) - -####################################################################### -# hg p / hg pq / hg ps / hg pending - -@hgcommand -def ps(ui, repo, *pats, **opts): - """alias for hg p --short - """ - opts['short'] = True - return pending(ui, repo, *pats, **opts) - -@hgcommand -def pq(ui, repo, *pats, **opts): - """alias for hg p --quick - """ - opts['quick'] = True - return pending(ui, repo, *pats, **opts) - -@hgcommand -def pending(ui, repo, *pats, **opts): - """show pending changes - - Lists pending changes followed by a list of unassigned but modified files. - """ - if codereview_disabled: - return codereview_disabled - - quick = opts.get('quick', False) - short = opts.get('short', False) - m = LoadAllCL(ui, repo, web=not quick and not short) - names = m.keys() - names.sort() - for name in names: - cl = m[name] - if short: - ui.write(name + "\t" + line1(cl.desc) + "\n") - else: - ui.write(cl.PendingText(quick=quick) + "\n") - - if short: - return - files = DefaultFiles(ui, repo, []) - if len(files) > 0: - s = "Changed files not in any CL:\n" - for f in files: - s += "\t" + f + "\n" - ui.write(s) - -####################################################################### -# hg submit - -def need_sync(): - raise hg_util.Abort("local repository out of date; must sync before submit") - -@hgcommand -def submit(ui, repo, *pats, **opts): - """submit change to remote repository - - Submits change to remote repository. - Bails out if the local repository is not in sync with the remote one. 
- """ - if codereview_disabled: - return codereview_disabled - - # We already called this on startup but sometimes Mercurial forgets. - set_mercurial_encoding_to_utf8() - - if not opts["no_incoming"] and hg_incoming(ui, repo): - need_sync() - - cl, err = CommandLineCL(ui, repo, pats, opts, defaultcc=defaultcc) - if err != "": - return err - - user = None - if cl.copied_from: - user = cl.copied_from - userline = CheckContributor(ui, repo, user) - typecheck(userline, str) - - about = "" - if cl.reviewer: - about += "R=" + JoinComma([CutDomain(s) for s in cl.reviewer]) + "\n" - if opts.get('tbr'): - tbr = SplitCommaSpace(opts.get('tbr')) - cl.reviewer = Add(cl.reviewer, tbr) - about += "TBR=" + JoinComma([CutDomain(s) for s in tbr]) + "\n" - if cl.cc: - about += "CC=" + JoinComma([CutDomain(s) for s in cl.cc]) + "\n" - - if not cl.reviewer: - return "no reviewers listed in CL" - - if not cl.local: - return "cannot submit non-local CL" - - # upload, to sync current patch and also get change number if CL is new. - if not cl.copied_from: - cl.Upload(ui, repo, gofmt_just_warn=True) - - # check gofmt for real; allowed upload to warn in order to save CL. 
- cl.Flush(ui, repo) - CheckFormat(ui, repo, cl.files) - - about += "%s%s\n" % (server_url_base, cl.name) - - if cl.copied_from: - about += "\nCommitter: " + CheckContributor(ui, repo, None) + "\n" - typecheck(about, str) - - if not cl.mailed and not cl.copied_from: # in case this is TBR - cl.Mail(ui, repo) - - # submit changes locally - message = cl.desc.rstrip() + "\n\n" + about - typecheck(message, str) - - set_status("pushing " + cl.name + " to remote server") - - if hg_outgoing(ui, repo): - raise hg_util.Abort("local repository corrupt or out-of-phase with remote: found outgoing changes") - - old_heads = len(hg_heads(ui, repo).split()) - - global commit_okay - commit_okay = True - ret = hg_commit(ui, repo, *['path:'+f for f in cl.files], message=message, user=userline) - commit_okay = False - if ret: - return "nothing changed" - node = repo["-1"].node() - # push to remote; if it fails for any reason, roll back - try: - new_heads = len(hg_heads(ui, repo).split()) - if old_heads != new_heads and not (old_heads == 0 and new_heads == 1): - # Created new head, so we weren't up to date. - need_sync() - - # Push changes to remote. If it works, we're committed. If not, roll back. - try: - hg_push(ui, repo) - except hg_error.Abort, e: - if e.message.find("push creates new heads") >= 0: - # Remote repository had changes we missed. - need_sync() - raise - except: - real_rollback() - raise - - # We're committed. Upload final patch, close review, add commit message. 
- changeURL = hg_node.short(node) - url = ui.expandpath("default") - m = re.match("(^https?://([^@/]+@)?([^.]+)\.googlecode\.com/hg/?)" + "|" + - "(^https?://([^@/]+@)?code\.google\.com/p/([^/.]+)(\.[^./]+)?/?)", url) - if m: - if m.group(1): # prj.googlecode.com/hg/ case - changeURL = "http://code.google.com/p/%s/source/detail?r=%s" % (m.group(3), changeURL) - elif m.group(4) and m.group(7): # code.google.com/p/prj.subrepo/ case - changeURL = "http://code.google.com/p/%s/source/detail?r=%s&repo=%s" % (m.group(6), changeURL, m.group(7)[1:]) - elif m.group(4): # code.google.com/p/prj/ case - changeURL = "http://code.google.com/p/%s/source/detail?r=%s" % (m.group(6), changeURL) - else: - print >>sys.stderr, "URL: ", url - else: - print >>sys.stderr, "URL: ", url - pmsg = "*** Submitted as " + changeURL + " ***\n\n" + message - - # When posting, move reviewers to CC line, - # so that the issue stops showing up in their "My Issues" page. - PostMessage(ui, cl.name, pmsg, reviewers="", cc=JoinComma(cl.reviewer+cl.cc)) - - if not cl.copied_from: - EditDesc(cl.name, closed=True, private=cl.private) - cl.Delete(ui, repo) - - c = repo[None] - if c.branch() == releaseBranch and not c.modified() and not c.added() and not c.removed(): - ui.write("switching from %s to default branch.\n" % releaseBranch) - err = hg_clean(repo, "default") - if err: - return err - return None - -####################################################################### -# hg sync - -@hgcommand -def sync(ui, repo, **opts): - """synchronize with remote repository - - Incorporates recent changes from the remote repository - into the local repository. - """ - if codereview_disabled: - return codereview_disabled - - if not opts["local"]: - err = hg_pull(ui, repo, update=True) - if err: - return err - sync_changes(ui, repo) - -def sync_changes(ui, repo): - # Look through recent change log descriptions to find - # potential references to http://.*/our-CL-number. 
- # Double-check them by looking at the Rietveld log. - for rev in hg_log(ui, repo, limit=100, template="{node}\n").split(): - desc = repo[rev].description().strip() - for clname in re.findall('(?m)^http://(?:[^\n]+)/([0-9]+)$', desc): - if IsLocalCL(ui, repo, clname) and IsRietveldSubmitted(ui, clname, repo[rev].hex()): - ui.warn("CL %s submitted as %s; closing\n" % (clname, repo[rev])) - cl, err = LoadCL(ui, repo, clname, web=False) - if err != "": - ui.warn("loading CL %s: %s\n" % (clname, err)) - continue - if not cl.copied_from: - EditDesc(cl.name, closed=True, private=cl.private) - cl.Delete(ui, repo) - - # Remove files that are not modified from the CLs in which they appear. - all = LoadAllCL(ui, repo, web=False) - changed = ChangedFiles(ui, repo, []) - for cl in all.values(): - extra = Sub(cl.files, changed) - if extra: - ui.warn("Removing unmodified files from CL %s:\n" % (cl.name,)) - for f in extra: - ui.warn("\t%s\n" % (f,)) - cl.files = Sub(cl.files, extra) - cl.Flush(ui, repo) - if not cl.files: - if not cl.copied_from: - ui.warn("CL %s has no files; delete (abandon) with hg change -d %s\n" % (cl.name, cl.name)) - else: - ui.warn("CL %s has no files; delete locally with hg change -D %s\n" % (cl.name, cl.name)) - return - -####################################################################### -# hg upload - -@hgcommand -def upload(ui, repo, name, **opts): - """upload diffs to the code review server - - Uploads the current modifications for a given change to the server. - """ - if codereview_disabled: - return codereview_disabled - - repo.ui.quiet = True - cl, err = LoadCL(ui, repo, name, web=True) - if err != "": - return err - if not cl.local: - return "cannot upload non-local change" - cl.Upload(ui, repo) - print "%s%s\n" % (server_url_base, cl.name) - return - -####################################################################### -# Table of commands, supplied to Mercurial for installation. 
- -review_opts = [ - ('r', 'reviewer', '', 'add reviewer'), - ('', 'cc', '', 'add cc'), - ('', 'tbr', '', 'add future reviewer'), - ('m', 'message', '', 'change description (for new change)'), -] - -cmdtable = { - # The ^ means to show this command in the help text that - # is printed when running hg with no arguments. - "^change": ( - change, - [ - ('d', 'delete', None, 'delete existing change list'), - ('D', 'deletelocal', None, 'delete locally, but do not change CL on server'), - ('i', 'stdin', None, 'read change list from standard input'), - ('o', 'stdout', None, 'print change list to standard output'), - ('p', 'pending', None, 'print pending summary to standard output'), - ], - "[-d | -D] [-i] [-o] change# or FILE ..." - ), - "^clpatch": ( - clpatch, - [ - ('', 'ignore_hgpatch_failure', None, 'create CL metadata even if hgpatch fails'), - ('', 'no_incoming', None, 'disable check for incoming changes'), - ], - "change#" - ), - # Would prefer to call this codereview-login, but then - # hg help codereview prints the help for this command - # instead of the help for the extension. - "code-login": ( - code_login, - [], - "", - ), - "^download": ( - download, - [], - "change#" - ), - "^file": ( - file, - [ - ('d', 'delete', None, 'delete files from change list (but not repository)'), - ], - "[-d] change# FILE ..." - ), - "^gofmt": ( - gofmt, - [ - ('l', 'list', None, 'list files that would change, but do not edit them'), - ], - "FILE ..." 
- ), - "^pending|p": ( - pending, - [ - ('s', 'short', False, 'show short result form'), - ('', 'quick', False, 'do not consult codereview server'), - ], - "[FILE ...]" - ), - "^ps": ( - ps, - [], - "[FILE ...]" - ), - "^pq": ( - pq, - [], - "[FILE ...]" - ), - "^mail": ( - mail, - review_opts + [ - ] + hg_commands.walkopts, - "[-r reviewer] [--cc cc] [change# | file ...]" - ), - "^release-apply": ( - release_apply, - [ - ('', 'ignore_hgpatch_failure', None, 'create CL metadata even if hgpatch fails'), - ('', 'no_incoming', None, 'disable check for incoming changes'), - ], - "change#" - ), - # TODO: release-start, release-tag, weekly-tag - "^submit": ( - submit, - review_opts + [ - ('', 'no_incoming', None, 'disable initial incoming check (for testing)'), - ] + hg_commands.walkopts + hg_commands.commitopts + hg_commands.commitopts2, - "[-r reviewer] [--cc cc] [change# | file ...]" - ), - "^sync": ( - sync, - [ - ('', 'local', None, 'do not pull changes from remote repository') - ], - "[--local]", - ), - "^undo": ( - undo, - [ - ('', 'ignore_hgpatch_failure', None, 'create CL metadata even if hgpatch fails'), - ('', 'no_incoming', None, 'disable check for incoming changes'), - ], - "change#" - ), - "^upload": ( - upload, - [], - "change#" - ), -} - -####################################################################### -# Mercurial extension initialization - -def norollback(*pats, **opts): - """(disabled when using this extension)""" - raise hg_util.Abort("codereview extension enabled; use undo instead of rollback") - -codereview_init = False - -def reposetup(ui, repo): - global codereview_disabled - global defaultcc - - # reposetup gets called both for the local repository - # and also for any repository we are pulling or pushing to. - # Only initialize the first time. - global codereview_init - if codereview_init: - return - codereview_init = True - - # Read repository-specific options from lib/codereview/codereview.cfg or codereview.cfg. 
- root = '' - try: - root = repo.root - except: - # Yes, repo might not have root; see issue 959. - codereview_disabled = 'codereview disabled: repository has no root' - return - - repo_config_path = '' - p1 = root + '/lib/codereview/codereview.cfg' - p2 = root + '/codereview.cfg' - if os.access(p1, os.F_OK): - repo_config_path = p1 - else: - repo_config_path = p2 - try: - f = open(repo_config_path) - for line in f: - if line.startswith('defaultcc:'): - defaultcc = SplitCommaSpace(line[len('defaultcc:'):]) - if line.startswith('contributors:'): - global contributorsURL - contributorsURL = line[len('contributors:'):].strip() - except: - codereview_disabled = 'codereview disabled: cannot open ' + repo_config_path - return - - remote = ui.config("paths", "default", "") - if remote.find("://") < 0: - raise hg_util.Abort("codereview: default path '%s' is not a URL" % (remote,)) - - InstallMatch(ui, repo) - RietveldSetup(ui, repo) - - # Disable the Mercurial commands that might change the repository. - # Only commands in this extension are supposed to do that. - ui.setconfig("hooks", "precommit.codereview", precommithook) - - # Rollback removes an existing commit. Don't do that either. 
- global real_rollback - real_rollback = repo.rollback - repo.rollback = norollback - - -####################################################################### -# Wrappers around upload.py for interacting with Rietveld - -from HTMLParser import HTMLParser - -# HTML form parser -class FormParser(HTMLParser): - def __init__(self): - self.map = {} - self.curtag = None - self.curdata = None - HTMLParser.__init__(self) - def handle_starttag(self, tag, attrs): - if tag == "input": - key = None - value = '' - for a in attrs: - if a[0] == 'name': - key = a[1] - if a[0] == 'value': - value = a[1] - if key is not None: - self.map[key] = value - if tag == "textarea": - key = None - for a in attrs: - if a[0] == 'name': - key = a[1] - if key is not None: - self.curtag = key - self.curdata = '' - def handle_endtag(self, tag): - if tag == "textarea" and self.curtag is not None: - self.map[self.curtag] = self.curdata - self.curtag = None - self.curdata = None - def handle_charref(self, name): - self.handle_data(unichr(int(name))) - def handle_entityref(self, name): - import htmlentitydefs - if name in htmlentitydefs.entitydefs: - self.handle_data(htmlentitydefs.entitydefs[name]) - else: - self.handle_data("&" + name + ";") - def handle_data(self, data): - if self.curdata is not None: - self.curdata += data - -def JSONGet(ui, path): - try: - data = MySend(path, force_auth=False) - typecheck(data, str) - d = fix_json(json.loads(data)) - except: - ui.warn("JSONGet %s: %s\n" % (path, ExceptionDetail())) - return None - return d - -# Clean up json parser output to match our expectations: -# * all strings are UTF-8-encoded str, not unicode. -# * missing fields are missing, not None, -# so that d.get("foo", defaultvalue) works. 
-def fix_json(x): - if type(x) in [str, int, float, bool, type(None)]: - pass - elif type(x) is unicode: - x = x.encode("utf-8") - elif type(x) is list: - for i in range(len(x)): - x[i] = fix_json(x[i]) - elif type(x) is dict: - todel = [] - for k in x: - if x[k] is None: - todel.append(k) - else: - x[k] = fix_json(x[k]) - for k in todel: - del x[k] - else: - raise hg_util.Abort("unknown type " + str(type(x)) + " in fix_json") - if type(x) is str: - x = x.replace('\r\n', '\n') - return x - -def IsRietveldSubmitted(ui, clname, hex): - dict = JSONGet(ui, "/api/" + clname + "?messages=true") - if dict is None: - return False - for msg in dict.get("messages", []): - text = msg.get("text", "") - m = re.match('\*\*\* Submitted as [^*]*?([0-9a-f]+) \*\*\*', text) - if m is not None and len(m.group(1)) >= 8 and hex.startswith(m.group(1)): - return True - return False - -def IsRietveldMailed(cl): - for msg in cl.dict.get("messages", []): - if msg.get("text", "").find("I'd like you to review this change") >= 0: - return True - return False - -def DownloadCL(ui, repo, clname): - set_status("downloading CL " + clname) - cl, err = LoadCL(ui, repo, clname, web=True) - if err != "": - return None, None, None, "error loading CL %s: %s" % (clname, err) - - # Find most recent diff - diffs = cl.dict.get("patchsets", []) - if not diffs: - return None, None, None, "CL has no patch sets" - patchid = diffs[-1] - - patchset = JSONGet(ui, "/api/" + clname + "/" + str(patchid)) - if patchset is None: - return None, None, None, "error loading CL patchset %s/%d" % (clname, patchid) - if patchset.get("patchset", 0) != patchid: - return None, None, None, "malformed patchset information" - - vers = "" - msg = patchset.get("message", "").split() - if len(msg) >= 3 and msg[0] == "diff" and msg[1] == "-r": - vers = msg[2] - diff = "/download/issue" + clname + "_" + str(patchid) + ".diff" - - diffdata = MySend(diff, force_auth=False) - - # Print warning if email is not in CONTRIBUTORS file. 
- email = cl.dict.get("owner_email", "") - if not email: - return None, None, None, "cannot find owner for %s" % (clname) - him = FindContributor(ui, repo, email) - me = FindContributor(ui, repo, None) - if him == me: - cl.mailed = IsRietveldMailed(cl) - else: - cl.copied_from = email - - return cl, vers, diffdata, "" - -def MySend(request_path, payload=None, - content_type="application/octet-stream", - timeout=None, force_auth=True, - **kwargs): - """Run MySend1 maybe twice, because Rietveld is unreliable.""" - try: - return MySend1(request_path, payload, content_type, timeout, force_auth, **kwargs) - except Exception, e: - if type(e) != urllib2.HTTPError or e.code != 500: # only retry on HTTP 500 error - raise - print >>sys.stderr, "Loading "+request_path+": "+ExceptionDetail()+"; trying again in 2 seconds." - time.sleep(2) - return MySend1(request_path, payload, content_type, timeout, force_auth, **kwargs) - -# Like upload.py Send but only authenticates when the -# redirect is to www.google.com/accounts. This keeps -# unnecessary redirects from happening during testing. -def MySend1(request_path, payload=None, - content_type="application/octet-stream", - timeout=None, force_auth=True, - **kwargs): - """Sends an RPC and returns the response. - - Args: - request_path: The path to send the request to, eg /api/appversion/create. - payload: The body of the request, or None to send an empty request. - content_type: The Content-Type header to use. - timeout: timeout in seconds; default None i.e. no timeout. - (Note: for large requests on OS X, the timeout doesn't work right.) - kwargs: Any keyword arguments are converted into query string parameters. - - Returns: - The response body, as a string. - """ - # TODO: Don't require authentication. Let the server say - # whether it is necessary. 
- global rpc - if rpc == None: - rpc = GetRpcServer(upload_options) - self = rpc - if not self.authenticated and force_auth: - self._Authenticate() - if request_path is None: - return - - old_timeout = socket.getdefaulttimeout() - socket.setdefaulttimeout(timeout) - try: - tries = 0 - while True: - tries += 1 - args = dict(kwargs) - url = "http://%s%s" % (self.host, request_path) - if args: - url += "?" + urllib.urlencode(args) - req = self._CreateRequest(url=url, data=payload) - req.add_header("Content-Type", content_type) - try: - f = self.opener.open(req) - response = f.read() - f.close() - # Translate \r\n into \n, because Rietveld doesn't. - response = response.replace('\r\n', '\n') - # who knows what urllib will give us - if type(response) == unicode: - response = response.encode("utf-8") - typecheck(response, str) - return response - except urllib2.HTTPError, e: - if tries > 3: - raise - elif e.code == 401: - self._Authenticate() - elif e.code == 302: - loc = e.info()["location"] - if not loc.startswith('https://www.google.com/a') or loc.find('/ServiceLogin') < 0: - return '' - self._Authenticate() - else: - raise - finally: - socket.setdefaulttimeout(old_timeout) - -def GetForm(url): - f = FormParser() - f.feed(ustr(MySend(url))) # f.feed wants unicode - f.close() - # convert back to utf-8 to restore sanity - m = {} - for k,v in f.map.items(): - m[k.encode("utf-8")] = v.replace("\r\n", "\n").encode("utf-8") - return m - -def EditDesc(issue, subject=None, desc=None, reviewers=None, cc=None, closed=False, private=False): - set_status("uploading change to description") - form_fields = GetForm("/" + issue + "/edit") - if subject is not None: - form_fields['subject'] = subject - if desc is not None: - form_fields['description'] = desc - if reviewers is not None: - form_fields['reviewers'] = reviewers - if cc is not None: - form_fields['cc'] = cc - if closed: - form_fields['closed'] = "checked" - if private: - form_fields['private'] = "checked" - ctype, body = 
EncodeMultipartFormData(form_fields.items(), []) - response = MySend("/" + issue + "/edit", body, content_type=ctype) - if response != "": - print >>sys.stderr, "Error editing description:\n" + "Sent form: \n", form_fields, "\n", response - sys.exit(2) - -def PostMessage(ui, issue, message, reviewers=None, cc=None, send_mail=True, subject=None): - set_status("uploading message") - form_fields = GetForm("/" + issue + "/publish") - if reviewers is not None: - form_fields['reviewers'] = reviewers - if cc is not None: - form_fields['cc'] = cc - if send_mail: - form_fields['send_mail'] = "checked" - else: - del form_fields['send_mail'] - if subject is not None: - form_fields['subject'] = subject - form_fields['message'] = message - - form_fields['message_only'] = '1' # Don't include draft comments - if reviewers is not None or cc is not None: - form_fields['message_only'] = '' # Must set '' in order to override cc/reviewer - ctype = "applications/x-www-form-urlencoded" - body = urllib.urlencode(form_fields) - response = MySend("/" + issue + "/publish", body, content_type=ctype) - if response != "": - print response - sys.exit(2) - -class opt(object): - pass - -def RietveldSetup(ui, repo): - global force_google_account - global rpc - global server - global server_url_base - global upload_options - global verbosity - - if not ui.verbose: - verbosity = 0 - - # Config options. - x = ui.config("codereview", "server") - if x is not None: - server = x - - # TODO(rsc): Take from ui.username? 
- email = None - x = ui.config("codereview", "email") - if x is not None: - email = x - - server_url_base = "http://" + server + "/" - - testing = ui.config("codereview", "testing") - force_google_account = ui.configbool("codereview", "force_google_account", False) - - upload_options = opt() - upload_options.email = email - upload_options.host = None - upload_options.verbose = 0 - upload_options.description = None - upload_options.description_file = None - upload_options.reviewers = None - upload_options.cc = None - upload_options.message = None - upload_options.issue = None - upload_options.download_base = False - upload_options.revision = None - upload_options.send_mail = False - upload_options.vcs = None - upload_options.server = server - upload_options.save_cookies = True - - if testing: - upload_options.save_cookies = False - upload_options.email = "test@example.com" - - rpc = None - - global releaseBranch - tags = repo.branchmap().keys() - if 'release-branch.go10' in tags: - # NOTE(rsc): This tags.sort is going to get the wrong - # answer when comparing release-branch.go9 with - # release-branch.go10. It will be a while before we care. - raise hg_util.Abort('tags.sort needs to be fixed for release-branch.go10') - tags.sort() - for t in tags: - if t.startswith('release-branch.go'): - releaseBranch = t - -####################################################################### -# http://codereview.appspot.com/static/upload.py, heavily edited. - -#!/usr/bin/env python -# -# Copyright 2007 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tool for uploading diffs from a version control system to the codereview app. - -Usage summary: upload.py [options] [-- diff_options] - -Diff options are passed to the diff command of the underlying system. - -Supported version control systems: - Git - Mercurial - Subversion - -It is important for Git/Mercurial users to specify a tree/node/branch to diff -against by using the '--rev' option. -""" -# This code is derived from appcfg.py in the App Engine SDK (open source), -# and from ASPN recipe #146306. - -import cookielib -import getpass -import logging -import mimetypes -import optparse -import os -import re -import socket -import subprocess -import sys -import urllib -import urllib2 -import urlparse - -# The md5 module was deprecated in Python 2.5. -try: - from hashlib import md5 -except ImportError: - from md5 import md5 - -try: - import readline -except ImportError: - pass - -# The logging verbosity: -# 0: Errors only. -# 1: Status messages. -# 2: Info logs. -# 3: Debug logs. -verbosity = 1 - -# Max size of patch or base file. -MAX_UPLOAD_SIZE = 900 * 1024 - -# whitelist for non-binary filetypes which do not start with "text/" -# .mm (Objective-C) shows up as application/x-freemind on my Linux box. -TEXT_MIMETYPES = [ - 'application/javascript', - 'application/x-javascript', - 'application/x-freemind' -] - -def GetEmail(prompt): - """Prompts the user for their email address and returns it. - - The last used email address is saved to a file and offered up as a suggestion - to the user. If the user presses enter without typing in anything the last - used email address is used. If the user enters a new address, it is saved - for next time we prompt. 
- - """ - last_email_file_name = os.path.expanduser("~/.last_codereview_email_address") - last_email = "" - if os.path.exists(last_email_file_name): - try: - last_email_file = open(last_email_file_name, "r") - last_email = last_email_file.readline().strip("\n") - last_email_file.close() - prompt += " [%s]" % last_email - except IOError, e: - pass - email = raw_input(prompt + ": ").strip() - if email: - try: - last_email_file = open(last_email_file_name, "w") - last_email_file.write(email) - last_email_file.close() - except IOError, e: - pass - else: - email = last_email - return email - - -def StatusUpdate(msg): - """Print a status message to stdout. - - If 'verbosity' is greater than 0, print the message. - - Args: - msg: The string to print. - """ - if verbosity > 0: - print msg - - -def ErrorExit(msg): - """Print an error message to stderr and exit.""" - print >>sys.stderr, msg - sys.exit(1) - - -class ClientLoginError(urllib2.HTTPError): - """Raised to indicate there was an error authenticating with ClientLogin.""" - - def __init__(self, url, code, msg, headers, args): - urllib2.HTTPError.__init__(self, url, code, msg, headers, None) - self.args = args - self.reason = args["Error"] - - -class AbstractRpcServer(object): - """Provides a common interface for a simple RPC server.""" - - def __init__(self, host, auth_function, host_override=None, extra_headers={}, save_cookies=False): - """Creates a new HttpRpcServer. - - Args: - host: The host to send requests to. - auth_function: A function that takes no arguments and returns an - (email, password) tuple when called. Will be called if authentication - is required. - host_override: The host header to send to the server (defaults to host). - extra_headers: A dict of extra headers to append to every request. - save_cookies: If True, save the authentication cookies to local disk. - If False, use an in-memory cookiejar instead. Subclasses must - implement this functionality. Defaults to False. 
- """ - self.host = host - self.host_override = host_override - self.auth_function = auth_function - self.authenticated = False - self.extra_headers = extra_headers - self.save_cookies = save_cookies - self.opener = self._GetOpener() - if self.host_override: - logging.info("Server: %s; Host: %s", self.host, self.host_override) - else: - logging.info("Server: %s", self.host) - - def _GetOpener(self): - """Returns an OpenerDirector for making HTTP requests. - - Returns: - A urllib2.OpenerDirector object. - """ - raise NotImplementedError() - - def _CreateRequest(self, url, data=None): - """Creates a new urllib request.""" - logging.debug("Creating request for: '%s' with payload:\n%s", url, data) - req = urllib2.Request(url, data=data) - if self.host_override: - req.add_header("Host", self.host_override) - for key, value in self.extra_headers.iteritems(): - req.add_header(key, value) - return req - - def _GetAuthToken(self, email, password): - """Uses ClientLogin to authenticate the user, returning an auth token. - - Args: - email: The user's email address - password: The user's password - - Raises: - ClientLoginError: If there was an error authenticating with ClientLogin. - HTTPError: If there was some other form of HTTP error. - - Returns: - The authentication token returned by ClientLogin. - """ - account_type = "GOOGLE" - if self.host.endswith(".google.com") and not force_google_account: - # Needed for use inside Google. 
- account_type = "HOSTED" - req = self._CreateRequest( - url="https://www.google.com/accounts/ClientLogin", - data=urllib.urlencode({ - "Email": email, - "Passwd": password, - "service": "ah", - "source": "rietveld-codereview-upload", - "accountType": account_type, - }), - ) - try: - response = self.opener.open(req) - response_body = response.read() - response_dict = dict(x.split("=") for x in response_body.split("\n") if x) - return response_dict["Auth"] - except urllib2.HTTPError, e: - if e.code == 403: - body = e.read() - response_dict = dict(x.split("=", 1) for x in body.split("\n") if x) - raise ClientLoginError(req.get_full_url(), e.code, e.msg, e.headers, response_dict) - else: - raise - - def _GetAuthCookie(self, auth_token): - """Fetches authentication cookies for an authentication token. - - Args: - auth_token: The authentication token returned by ClientLogin. - - Raises: - HTTPError: If there was an error fetching the authentication cookies. - """ - # This is a dummy value to allow us to identify when we're successful. - continue_location = "http://localhost/" - args = {"continue": continue_location, "auth": auth_token} - req = self._CreateRequest("http://%s/_ah/login?%s" % (self.host, urllib.urlencode(args))) - try: - response = self.opener.open(req) - except urllib2.HTTPError, e: - response = e - if (response.code != 302 or - response.info()["location"] != continue_location): - raise urllib2.HTTPError(req.get_full_url(), response.code, response.msg, response.headers, response.fp) - self.authenticated = True - - def _Authenticate(self): - """Authenticates the user. - - The authentication process works as follows: - 1) We get a username and password from the user - 2) We use ClientLogin to obtain an AUTH token for the user - (see http://code.google.com/apis/accounts/AuthForInstalledApps.html). - 3) We pass the auth token to /_ah/login on the server to obtain an - authentication cookie. 
If login was successful, it tries to redirect - us to the URL we provided. - - If we attempt to access the upload API without first obtaining an - authentication cookie, it returns a 401 response (or a 302) and - directs us to authenticate ourselves with ClientLogin. - """ - for i in range(3): - credentials = self.auth_function() - try: - auth_token = self._GetAuthToken(credentials[0], credentials[1]) - except ClientLoginError, e: - if e.reason == "BadAuthentication": - print >>sys.stderr, "Invalid username or password." - continue - if e.reason == "CaptchaRequired": - print >>sys.stderr, ( - "Please go to\n" - "https://www.google.com/accounts/DisplayUnlockCaptcha\n" - "and verify you are a human. Then try again.") - break - if e.reason == "NotVerified": - print >>sys.stderr, "Account not verified." - break - if e.reason == "TermsNotAgreed": - print >>sys.stderr, "User has not agreed to TOS." - break - if e.reason == "AccountDeleted": - print >>sys.stderr, "The user account has been deleted." - break - if e.reason == "AccountDisabled": - print >>sys.stderr, "The user account has been disabled." - break - if e.reason == "ServiceDisabled": - print >>sys.stderr, "The user's access to the service has been disabled." - break - if e.reason == "ServiceUnavailable": - print >>sys.stderr, "The service is not available; try again later." - break - raise - self._GetAuthCookie(auth_token) - return - - def Send(self, request_path, payload=None, - content_type="application/octet-stream", - timeout=None, - **kwargs): - """Sends an RPC and returns the response. - - Args: - request_path: The path to send the request to, eg /api/appversion/create. - payload: The body of the request, or None to send an empty request. - content_type: The Content-Type header to use. - timeout: timeout in seconds; default None i.e. no timeout. - (Note: for large requests on OS X, the timeout doesn't work right.) - kwargs: Any keyword arguments are converted into query string parameters. 
- - Returns: - The response body, as a string. - """ - # TODO: Don't require authentication. Let the server say - # whether it is necessary. - if not self.authenticated: - self._Authenticate() - - old_timeout = socket.getdefaulttimeout() - socket.setdefaulttimeout(timeout) - try: - tries = 0 - while True: - tries += 1 - args = dict(kwargs) - url = "http://%s%s" % (self.host, request_path) - if args: - url += "?" + urllib.urlencode(args) - req = self._CreateRequest(url=url, data=payload) - req.add_header("Content-Type", content_type) - try: - f = self.opener.open(req) - response = f.read() - f.close() - return response - except urllib2.HTTPError, e: - if tries > 3: - raise - elif e.code == 401 or e.code == 302: - self._Authenticate() - else: - raise - finally: - socket.setdefaulttimeout(old_timeout) - - -class HttpRpcServer(AbstractRpcServer): - """Provides a simplified RPC-style interface for HTTP requests.""" - - def _Authenticate(self): - """Save the cookie jar after authentication.""" - super(HttpRpcServer, self)._Authenticate() - if self.save_cookies: - StatusUpdate("Saving authentication cookies to %s" % self.cookie_file) - self.cookie_jar.save() - - def _GetOpener(self): - """Returns an OpenerDirector that supports cookies and ignores redirects. - - Returns: - A urllib2.OpenerDirector object. 
- """ - opener = urllib2.OpenerDirector() - opener.add_handler(urllib2.ProxyHandler()) - opener.add_handler(urllib2.UnknownHandler()) - opener.add_handler(urllib2.HTTPHandler()) - opener.add_handler(urllib2.HTTPDefaultErrorHandler()) - opener.add_handler(urllib2.HTTPSHandler()) - opener.add_handler(urllib2.HTTPErrorProcessor()) - if self.save_cookies: - self.cookie_file = os.path.expanduser("~/.codereview_upload_cookies_" + server) - self.cookie_jar = cookielib.MozillaCookieJar(self.cookie_file) - if os.path.exists(self.cookie_file): - try: - self.cookie_jar.load() - self.authenticated = True - StatusUpdate("Loaded authentication cookies from %s" % self.cookie_file) - except (cookielib.LoadError, IOError): - # Failed to load cookies - just ignore them. - pass - else: - # Create an empty cookie file with mode 600 - fd = os.open(self.cookie_file, os.O_CREAT, 0600) - os.close(fd) - # Always chmod the cookie file - os.chmod(self.cookie_file, 0600) - else: - # Don't save cookies across runs of update.py. - self.cookie_jar = cookielib.CookieJar() - opener.add_handler(urllib2.HTTPCookieProcessor(self.cookie_jar)) - return opener - - -def GetRpcServer(options): - """Returns an instance of an AbstractRpcServer. - - Returns: - A new AbstractRpcServer, on which RPC calls can be made. - """ - - rpc_server_class = HttpRpcServer - - def GetUserCredentials(): - """Prompts the user for a username and password.""" - # Disable status prints so they don't obscure the password prompt. - global global_status - st = global_status - global_status = None - - email = options.email - if email is None: - email = GetEmail("Email (login for uploading to %s)" % options.server) - password = getpass.getpass("Password for %s: " % email) - - # Put status back. - global_status = st - return (email, password) - - # If this is the dev_appserver, use fake authentication. 
- host = (options.host or options.server).lower() - if host == "localhost" or host.startswith("localhost:"): - email = options.email - if email is None: - email = "test@example.com" - logging.info("Using debug user %s. Override with --email" % email) - server = rpc_server_class( - options.server, - lambda: (email, "password"), - host_override=options.host, - extra_headers={"Cookie": 'dev_appserver_login="%s:False"' % email}, - save_cookies=options.save_cookies) - # Don't try to talk to ClientLogin. - server.authenticated = True - return server - - return rpc_server_class(options.server, GetUserCredentials, - host_override=options.host, save_cookies=options.save_cookies) - - -def EncodeMultipartFormData(fields, files): - """Encode form fields for multipart/form-data. - - Args: - fields: A sequence of (name, value) elements for regular form fields. - files: A sequence of (name, filename, value) elements for data to be - uploaded as files. - Returns: - (content_type, body) ready for httplib.HTTP instance. 
- - Source: - http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 - """ - BOUNDARY = '-M-A-G-I-C---B-O-U-N-D-A-R-Y-' - CRLF = '\r\n' - lines = [] - for (key, value) in fields: - typecheck(key, str) - typecheck(value, str) - lines.append('--' + BOUNDARY) - lines.append('Content-Disposition: form-data; name="%s"' % key) - lines.append('') - lines.append(value) - for (key, filename, value) in files: - typecheck(key, str) - typecheck(filename, str) - typecheck(value, str) - lines.append('--' + BOUNDARY) - lines.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) - lines.append('Content-Type: %s' % GetContentType(filename)) - lines.append('') - lines.append(value) - lines.append('--' + BOUNDARY + '--') - lines.append('') - body = CRLF.join(lines) - content_type = 'multipart/form-data; boundary=%s' % BOUNDARY - return content_type, body - - -def GetContentType(filename): - """Helper to guess the content-type from the filename.""" - return mimetypes.guess_type(filename)[0] or 'application/octet-stream' - - -# Use a shell for subcommands on Windows to get a PATH search. -use_shell = sys.platform.startswith("win") - -def RunShellWithReturnCode(command, print_output=False, - universal_newlines=True, env=os.environ): - """Executes a command and returns the output from stdout and the return code. - - Args: - command: Command to execute. - print_output: If True, the output is printed to stdout. - If False, both stdout and stderr are ignored. - universal_newlines: Use universal_newlines flag (default: True). 
- - Returns: - Tuple (output, return code) - """ - logging.info("Running %s", command) - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - shell=use_shell, universal_newlines=universal_newlines, env=env) - if print_output: - output_array = [] - while True: - line = p.stdout.readline() - if not line: - break - print line.strip("\n") - output_array.append(line) - output = "".join(output_array) - else: - output = p.stdout.read() - p.wait() - errout = p.stderr.read() - if print_output and errout: - print >>sys.stderr, errout - p.stdout.close() - p.stderr.close() - return output, p.returncode - - -def RunShell(command, silent_ok=False, universal_newlines=True, - print_output=False, env=os.environ): - data, retcode = RunShellWithReturnCode(command, print_output, universal_newlines, env) - if retcode: - ErrorExit("Got error status from %s:\n%s" % (command, data)) - if not silent_ok and not data: - ErrorExit("No output from %s" % command) - return data - - -class VersionControlSystem(object): - """Abstract base class providing an interface to the VCS.""" - - def __init__(self, options): - """Constructor. - - Args: - options: Command line options. - """ - self.options = options - - def GenerateDiff(self, args): - """Return the current diff as a string. - - Args: - args: Extra arguments to pass to the diff command. - """ - raise NotImplementedError( - "abstract method -- subclass %s must override" % self.__class__) - - def GetUnknownFiles(self): - """Return a list of files unknown to the VCS.""" - raise NotImplementedError( - "abstract method -- subclass %s must override" % self.__class__) - - def CheckForUnknownFiles(self): - """Show an "are you sure?" 
prompt if there are unknown files.""" - unknown_files = self.GetUnknownFiles() - if unknown_files: - print "The following files are not added to version control:" - for line in unknown_files: - print line - prompt = "Are you sure to continue?(y/N) " - answer = raw_input(prompt).strip() - if answer != "y": - ErrorExit("User aborted") - - def GetBaseFile(self, filename): - """Get the content of the upstream version of a file. - - Returns: - A tuple (base_content, new_content, is_binary, status) - base_content: The contents of the base file. - new_content: For text files, this is empty. For binary files, this is - the contents of the new file, since the diff output won't contain - information to reconstruct the current file. - is_binary: True iff the file is binary. - status: The status of the file. - """ - - raise NotImplementedError( - "abstract method -- subclass %s must override" % self.__class__) - - - def GetBaseFiles(self, diff): - """Helper that calls GetBase file for each file in the patch. - - Returns: - A dictionary that maps from filename to GetBaseFile's tuple. Filenames - are retrieved based on lines that start with "Index:" or - "Property changes on:". - """ - files = {} - for line in diff.splitlines(True): - if line.startswith('Index:') or line.startswith('Property changes on:'): - unused, filename = line.split(':', 1) - # On Windows if a file has property changes its filename uses '\' - # instead of '/'. 
- filename = to_slash(filename.strip()) - files[filename] = self.GetBaseFile(filename) - return files - - - def UploadBaseFiles(self, issue, rpc_server, patch_list, patchset, options, - files): - """Uploads the base files (and if necessary, the current ones as well).""" - - def UploadFile(filename, file_id, content, is_binary, status, is_base): - """Uploads a file to the server.""" - set_status("uploading " + filename) - file_too_large = False - if is_base: - type = "base" - else: - type = "current" - if len(content) > MAX_UPLOAD_SIZE: - print ("Not uploading the %s file for %s because it's too large." % - (type, filename)) - file_too_large = True - content = "" - checksum = md5(content).hexdigest() - if options.verbose > 0 and not file_too_large: - print "Uploading %s file for %s" % (type, filename) - url = "/%d/upload_content/%d/%d" % (int(issue), int(patchset), file_id) - form_fields = [ - ("filename", filename), - ("status", status), - ("checksum", checksum), - ("is_binary", str(is_binary)), - ("is_current", str(not is_base)), - ] - if file_too_large: - form_fields.append(("file_too_large", "1")) - if options.email: - form_fields.append(("user", options.email)) - ctype, body = EncodeMultipartFormData(form_fields, [("data", filename, content)]) - response_body = rpc_server.Send(url, body, content_type=ctype) - if not response_body.startswith("OK"): - StatusUpdate(" --> %s" % response_body) - sys.exit(1) - - # Don't want to spawn too many threads, nor do we want to - # hit Rietveld too hard, or it will start serving 500 errors. - # When 8 works, it's no better than 4, and sometimes 8 is - # too many for Rietveld to handle. 
- MAX_PARALLEL_UPLOADS = 4 - - sema = threading.BoundedSemaphore(MAX_PARALLEL_UPLOADS) - upload_threads = [] - finished_upload_threads = [] - - class UploadFileThread(threading.Thread): - def __init__(self, args): - threading.Thread.__init__(self) - self.args = args - def run(self): - UploadFile(*self.args) - finished_upload_threads.append(self) - sema.release() - - def StartUploadFile(*args): - sema.acquire() - while len(finished_upload_threads) > 0: - t = finished_upload_threads.pop() - upload_threads.remove(t) - t.join() - t = UploadFileThread(args) - upload_threads.append(t) - t.start() - - def WaitForUploads(): - for t in upload_threads: - t.join() - - patches = dict() - [patches.setdefault(v, k) for k, v in patch_list] - for filename in patches.keys(): - base_content, new_content, is_binary, status = files[filename] - file_id_str = patches.get(filename) - if file_id_str.find("nobase") != -1: - base_content = None - file_id_str = file_id_str[file_id_str.rfind("_") + 1:] - file_id = int(file_id_str) - if base_content != None: - StartUploadFile(filename, file_id, base_content, is_binary, status, True) - if new_content != None: - StartUploadFile(filename, file_id, new_content, is_binary, status, False) - WaitForUploads() - - def IsImage(self, filename): - """Returns true if the filename has an image extension.""" - mimetype = mimetypes.guess_type(filename)[0] - if not mimetype: - return False - return mimetype.startswith("image/") - - def IsBinary(self, filename): - """Returns true if the guessed mimetyped isnt't in text group.""" - mimetype = mimetypes.guess_type(filename)[0] - if not mimetype: - return False # e.g. 
README, "real" binaries usually have an extension - # special case for text files which don't start with text/ - if mimetype in TEXT_MIMETYPES: - return False - return not mimetype.startswith("text/") - - -class FakeMercurialUI(object): - def __init__(self): - self.quiet = True - self.output = '' - - def write(self, *args, **opts): - self.output += ' '.join(args) - def copy(self): - return self - def status(self, *args, **opts): - pass - - def formatter(self, topic, opts): - from mercurial.formatter import plainformatter - return plainformatter(self, topic, opts) - - def readconfig(self, *args, **opts): - pass - def expandpath(self, *args, **opts): - return global_ui.expandpath(*args, **opts) - def configitems(self, *args, **opts): - return global_ui.configitems(*args, **opts) - def config(self, *args, **opts): - return global_ui.config(*args, **opts) - -use_hg_shell = False # set to True to shell out to hg always; slower - -class MercurialVCS(VersionControlSystem): - """Implementation of the VersionControlSystem interface for Mercurial.""" - - def __init__(self, options, ui, repo): - super(MercurialVCS, self).__init__(options) - self.ui = ui - self.repo = repo - self.status = None - # Absolute path to repository (we can be in a subdir) - self.repo_dir = os.path.normpath(repo.root) - # Compute the subdir - cwd = os.path.normpath(os.getcwd()) - assert cwd.startswith(self.repo_dir) - self.subdir = cwd[len(self.repo_dir):].lstrip(r"\/") - if self.options.revision: - self.base_rev = self.options.revision - else: - mqparent, err = RunShellWithReturnCode(['hg', 'log', '--rev', 'qparent', '--template={node}']) - if not err and mqparent != "": - self.base_rev = mqparent - else: - out = RunShell(["hg", "parents", "-q"], silent_ok=True).strip() - if not out: - # No revisions; use 0 to mean a repository with nothing. 
- out = "0:0" - self.base_rev = out.split(':')[1].strip() - def _GetRelPath(self, filename): - """Get relative path of a file according to the current directory, - given its logical path in the repo.""" - assert filename.startswith(self.subdir), (filename, self.subdir) - return filename[len(self.subdir):].lstrip(r"\/") - - def GenerateDiff(self, extra_args): - # If no file specified, restrict to the current subdir - extra_args = extra_args or ["."] - cmd = ["hg", "diff", "--git", "-r", self.base_rev] + extra_args - data = RunShell(cmd, silent_ok=True) - svndiff = [] - filecount = 0 - for line in data.splitlines(): - m = re.match("diff --git a/(\S+) b/(\S+)", line) - if m: - # Modify line to make it look like as it comes from svn diff. - # With this modification no changes on the server side are required - # to make upload.py work with Mercurial repos. - # NOTE: for proper handling of moved/copied files, we have to use - # the second filename. - filename = m.group(2) - svndiff.append("Index: %s" % filename) - svndiff.append("=" * 67) - filecount += 1 - logging.info(line) - else: - svndiff.append(line) - if not filecount: - ErrorExit("No valid patches found in output from hg diff") - return "\n".join(svndiff) + "\n" - - def GetUnknownFiles(self): - """Return a list of files unknown to the VCS.""" - args = [] - status = RunShell(["hg", "status", "--rev", self.base_rev, "-u", "."], - silent_ok=True) - unknown_files = [] - for line in status.splitlines(): - st, fn = line.split(" ", 1) - if st == "?": - unknown_files.append(fn) - return unknown_files - - def get_hg_status(self, rev, path): - # We'd like to use 'hg status -C path', but that is buggy - # (see http://mercurial.selenic.com/bts/issue3023). - # Instead, run 'hg status -C' without a path - # and skim the output for the path we want. 
- if self.status is None: - if use_hg_shell: - out = RunShell(["hg", "status", "-C", "--rev", rev]) - else: - fui = FakeMercurialUI() - ret = hg_commands.status(fui, self.repo, *[], **{'rev': [rev], 'copies': True}) - if ret: - raise hg_util.Abort(ret) - out = fui.output - self.status = out.splitlines() - for i in range(len(self.status)): - # line is - # A path - # M path - # etc - line = to_slash(self.status[i]) - if line[2:] == path: - if i+1 < len(self.status) and self.status[i+1][:2] == ' ': - return self.status[i:i+2] - return self.status[i:i+1] - raise hg_util.Abort("no status for " + path) - - def GetBaseFile(self, filename): - set_status("inspecting " + filename) - # "hg status" and "hg cat" both take a path relative to the current subdir - # rather than to the repo root, but "hg diff" has given us the full path - # to the repo root. - base_content = "" - new_content = None - is_binary = False - oldrelpath = relpath = self._GetRelPath(filename) - out = self.get_hg_status(self.base_rev, relpath) - status, what = out[0].split(' ', 1) - if len(out) > 1 and status == "A" and what == relpath: - oldrelpath = out[1].strip() - status = "M" - if ":" in self.base_rev: - base_rev = self.base_rev.split(":", 1)[0] - else: - base_rev = self.base_rev - if status != "A": - if use_hg_shell: - base_content = RunShell(["hg", "cat", "-r", base_rev, oldrelpath], silent_ok=True) - else: - base_content = str(self.repo[base_rev][oldrelpath].data()) - is_binary = "\0" in base_content # Mercurial's heuristic - if status != "R": - new_content = open(relpath, "rb").read() - is_binary = is_binary or "\0" in new_content - if is_binary and base_content and use_hg_shell: - # Fetch again without converting newlines - base_content = RunShell(["hg", "cat", "-r", base_rev, oldrelpath], - silent_ok=True, universal_newlines=False) - if not is_binary or not self.IsImage(relpath): - new_content = None - return base_content, new_content, is_binary, status - - -# NOTE: The SplitPatch function is 
duplicated in engine.py, keep them in sync. -def SplitPatch(data): - """Splits a patch into separate pieces for each file. - - Args: - data: A string containing the output of svn diff. - - Returns: - A list of 2-tuple (filename, text) where text is the svn diff output - pertaining to filename. - """ - patches = [] - filename = None - diff = [] - for line in data.splitlines(True): - new_filename = None - if line.startswith('Index:'): - unused, new_filename = line.split(':', 1) - new_filename = new_filename.strip() - elif line.startswith('Property changes on:'): - unused, temp_filename = line.split(':', 1) - # When a file is modified, paths use '/' between directories, however - # when a property is modified '\' is used on Windows. Make them the same - # otherwise the file shows up twice. - temp_filename = to_slash(temp_filename.strip()) - if temp_filename != filename: - # File has property changes but no modifications, create a new diff. - new_filename = temp_filename - if new_filename: - if filename and diff: - patches.append((filename, ''.join(diff))) - filename = new_filename - diff = [line] - continue - if diff is not None: - diff.append(line) - if filename and diff: - patches.append((filename, ''.join(diff))) - return patches - - -def UploadSeparatePatches(issue, rpc_server, patchset, data, options): - """Uploads a separate patch for each file in the diff output. - - Returns a list of [patch_key, filename] for each file. 
- """ - patches = SplitPatch(data) - rv = [] - for patch in patches: - set_status("uploading patch for " + patch[0]) - if len(patch[1]) > MAX_UPLOAD_SIZE: - print ("Not uploading the patch for " + patch[0] + - " because the file is too large.") - continue - form_fields = [("filename", patch[0])] - if not options.download_base: - form_fields.append(("content_upload", "1")) - files = [("data", "data.diff", patch[1])] - ctype, body = EncodeMultipartFormData(form_fields, files) - url = "/%d/upload_patch/%d" % (int(issue), int(patchset)) - print "Uploading patch for " + patch[0] - response_body = rpc_server.Send(url, body, content_type=ctype) - lines = response_body.splitlines() - if not lines or lines[0] != "OK": - StatusUpdate(" --> %s" % response_body) - sys.exit(1) - rv.append([lines[1], patch[0]]) - return rv diff --git a/third_party/re2/lib/git/commit-msg.hook b/third_party/re2/lib/git/commit-msg.hook new file mode 100755 index 0000000..985016b --- /dev/null +++ b/third_party/re2/lib/git/commit-msg.hook @@ -0,0 +1,104 @@ +#!/bin/sh +# From Gerrit Code Review 2.2.1 +# +# Part of Gerrit Code Review (http://code.google.com/p/gerrit/) +# +# Copyright (C) 2009 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +CHANGE_ID_AFTER="Bug|Issue" +MSG="$1" + +# Check for, and add if missing, a unique Change-Id +# +add_ChangeId() { + clean_message=`sed -e ' + /^diff --git a\/.*/{ + s/// + q + } + /^Signed-off-by:/d + /^#/d + ' "$MSG" | git stripspace` + if test -z "$clean_message" + then + return + fi + + if grep -i '^Change-Id:' "$MSG" >/dev/null + then + return + fi + + id=`_gen_ChangeId` + perl -e ' + $MSG = shift; + $id = shift; + $CHANGE_ID_AFTER = shift; + + undef $/; + open(I, $MSG); $_ = <I>; close I; + s|^diff --git a/.*||ms; + s|^#.*$||mg; + exit unless $_; + + @message = split /\n/; + $haveFooter = 0; + $startFooter = @message; + for($line = @message - 1; $line >= 0; $line--) { + $_ = $message[$line]; + + if (/^[a-zA-Z0-9-]+:/ && !m,^[a-z0-9-]+://,) { + $haveFooter++; + next; + } + next if /^[ []/; + $startFooter = $line if ($haveFooter && /^\r?$/); + last; + } + + @footer = @message[$startFooter+1..@message]; + @message = @message[0..$startFooter]; + push(@footer, "") unless @footer; + + for ($line = 0; $line < @footer; $line++) { + $_ = $footer[$line]; + next if /^($CHANGE_ID_AFTER):/i; + last; + } + splice(@footer, $line, 0, "Change-Id: I$id"); + + $_ = join("\n", @message, @footer); + open(O, ">$MSG"); print O; close O; + ' "$MSG" "$id" "$CHANGE_ID_AFTER" +} +_gen_ChangeIdInput() { + echo "tree `git write-tree`" + if parent=`git rev-parse HEAD^0 2>/dev/null` + then + echo "parent $parent" + fi + echo "author `git var GIT_AUTHOR_IDENT`" + echo "committer `git var GIT_COMMITTER_IDENT`" + echo + printf '%s' "$clean_message" +} +_gen_ChangeId() { + _gen_ChangeIdInput | + git hash-object -t commit --stdin +} + + +add_ChangeId diff --git a/third_party/re2/libre2.symbols b/third_party/re2/libre2.symbols index 1a9cae3..90a1020 100644 --- a/third_party/re2/libre2.symbols +++ b/third_party/re2/libre2.symbols @@ -10,6 +10,9 @@ _ZlsRSoRKN3re211StringPieceE; # re2::FilteredRE2* _ZN3re211FilteredRE2*; + _ZNK3re211FilteredRE2*; + # flags + _ZN3re2*FLAGS_*; local: *; }; diff 
--git a/third_party/re2/libre2.symbols.darwin b/third_party/re2/libre2.symbols.darwin index 93eab3e..4207f87 100644 --- a/third_party/re2/libre2.symbols.darwin +++ b/third_party/re2/libre2.symbols.darwin @@ -6,6 +6,14 @@ __ZNK3re23RE2* __ZN3re211StringPiece* __ZNK3re211StringPiece* # operator<<(std::ostream&, re2::StringPiece const&) -__ZlsRSoRKN3re211StringPieceE +# Seen with libstdc++ on 10.8 and below: +# __ZlsRSoRKN3re211StringPieceE +# Seen with libc++ on 10.9 and above: +# __ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE +# Note that "ls" means operator<<, so this is not overly broad. +__Zls*RKN3re211StringPieceE # re2::FilteredRE2* __ZN3re211FilteredRE2* +__ZNK3re211FilteredRE2* +# flags +__ZN3re2*FLAGS_* diff --git a/third_party/re2/patches/re2-android.patch b/third_party/re2/patches/re2-android.patch deleted file mode 100644 index 67e9816..0000000 --- a/third_party/re2/patches/re2-android.patch +++ /dev/null @@ -1,30 +0,0 @@ -diff --git a/util/util.h b/util/util.h -index 17ef824..8f54040 100644 ---- a/util/util.h -+++ b/util/util.h -@@ -29,6 +29,7 @@ - #include <utility> - #include <set> - -+#include "build/build_config.h" - #include "base/third_party/dynamic_annotations/dynamic_annotations.h" - - // Use std names. 
-@@ -45,7 +46,7 @@ using std::sort; - using std::swap; - using std::make_pair; - --#if defined(__GNUC__) && !defined(USE_CXX0X) -+#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(OS_ANDROID) - - #include <tr1/unordered_set> - using std::tr1::unordered_set; -@@ -53,7 +54,7 @@ using std::tr1::unordered_set; - #else - - #include <unordered_set> --#ifdef WIN32 -+#if defined(WIN32) || defined(OS_ANDROID) - using std::tr1::unordered_set; - #else - using std::unordered_set; diff --git a/third_party/re2/patches/re2-libcxx.patch b/third_party/re2/patches/re2-libcxx.patch deleted file mode 100644 index 54f3b6b..0000000 --- a/third_party/re2/patches/re2-libcxx.patch +++ /dev/null @@ -1,23 +0,0 @@ -diff --git a/third_party/re2/util/util.h b/third_party/re2/util/util.h -index 8f54040..de1ef5b 100644 ---- a/third_party/re2/util/util.h -+++ b/third_party/re2/util/util.h -@@ -46,7 +46,8 @@ using std::sort; - using std::swap; - using std::make_pair; - --#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(OS_ANDROID) -+#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(OS_ANDROID) && \ -+ !defined(_LIBCPP_ABI_VERSION) - - #include <tr1/unordered_set> - using std::tr1::unordered_set; -@@ -54,1 +55,1 @@ using std::tr1::unordered_set; - #else - - #include <unordered_set> --#if defined(WIN32) || defined(OS_ANDROID) -+#if defined(WIN32) || (defined(OS_ANDROID) && !defined(_LIBCPP_ABI_VERSION)) - using std::tr1::unordered_set; - #else - using std::unordered_set;
\ No newline at end of file diff --git a/third_party/re2/patches/re2-msan.patch b/third_party/re2/patches/re2-msan.patch deleted file mode 100644 index 8577669..0000000 --- a/third_party/re2/patches/re2-msan.patch +++ /dev/null @@ -1,63 +0,0 @@ -diff --git a/third_party/re2/util/sparse_array.h b/third_party/re2/util/sparse_array.h -index 3e33f89..4ee5c94 100644 ---- a/third_party/re2/util/sparse_array.h -+++ b/third_party/re2/util/sparse_array.h -@@ -231,7 +231,8 @@ class SparseArray { - - template<typename Value> - SparseArray<Value>::SparseArray() -- : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {} -+ : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), -+ valgrind_(RunningOnValgrindOrMemorySanitizer()) {} - - // IndexValue pairs: exposed in SparseArray::iterator. - template<typename Value> -@@ -418,7 +419,7 @@ void SparseArray<Value>::create_index(int i) { - template<typename Value> SparseArray<Value>::SparseArray(int max_size) { - max_size_ = max_size; - sparse_to_dense_ = new int[max_size]; -- valgrind_ = RunningOnValgrind(); -+ valgrind_ = RunningOnValgrindOrMemorySanitizer(); - dense_.resize(max_size); - // Don't need to zero the new memory, but appease Valgrind. 
- if (valgrind_) { -diff --git a/third_party/re2/util/sparse_set.h b/third_party/re2/util/sparse_set.h -index 165dd09..4a324d7 100644 ---- a/third_party/re2/util/sparse_set.h -+++ b/third_party/re2/util/sparse_set.h -@@ -54,13 +54,14 @@ namespace re2 { - class SparseSet { - public: - SparseSet() -- : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), valgrind_(RunningOnValgrind()) {} -+ : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), -+ valgrind_(RunningOnValgrindOrMemorySanitizer()) {} - - SparseSet(int max_size) { - max_size_ = max_size; - sparse_to_dense_ = new int[max_size]; - dense_ = new int[max_size]; -- valgrind_ = RunningOnValgrind(); -+ valgrind_ = RunningOnValgrindOrMemorySanitizer(); - // Don't need to zero the memory, but do so anyway - // to appease Valgrind. - if (valgrind_) { -diff --git a/third_party/re2/util/util.h b/third_party/re2/util/util.h -index de1ef5b..49159c2 100644 ---- a/third_party/re2/util/util.h -+++ b/third_party/re2/util/util.h -@@ -129,6 +129,14 @@ static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) { - return ((uint64)x << 32) | y; - } - -+inline bool RunningOnValgrindOrMemorySanitizer() { -+#if defined(MEMORY_SANITIZER) -+ return true; -+#else -+ return RunningOnValgrind(); -+#endif -+} -+ - } // namespace re2 - - #include "util/arena.h" diff --git a/third_party/re2/patches/re2-msvc9-chrome.patch b/third_party/re2/patches/re2-msvc9-chrome.patch deleted file mode 100644 index 49a2b75..0000000 --- a/third_party/re2/patches/re2-msvc9-chrome.patch +++ /dev/null @@ -1,344 +0,0 @@ -diff --git a/AUTHORS b/AUTHORS -index 3c0f928..e17d9bf 100644 ---- a/AUTHORS -+++ b/AUTHORS -@@ -8,5 +8,6 @@ - - # Please keep the list sorted. - -+Brian Gunlogson <unixman83@gmail.com> - Google Inc. 
- Stefano Rivera <stefano.rivera@gmail.com> -diff --git a/CONTRIBUTORS b/CONTRIBUTORS -index 7b44e04..7f6a93d 100644 ---- a/CONTRIBUTORS -+++ b/CONTRIBUTORS -@@ -26,6 +26,7 @@ - - # Please keep the list sorted. - -+Brian Gunlogson <unixman83@gmail.com> - Dominic Battré <battre@chromium.org> - John Millikin <jmillikin@gmail.com> - Rob Pike <r@google.com> -diff --git a/re2/compile.cc b/re2/compile.cc -index 9cddb71..adb45fd 100644 ---- a/re2/compile.cc -+++ b/re2/compile.cc -@@ -502,7 +502,7 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { - return UncachedRuneByteSuffix(lo, hi, foldcase, next); - } - -- uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; -+ uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | (foldcase ? 1ULL : 0ULL); - map<uint64, int>::iterator it = rune_cache_.find(key); - if (it != rune_cache_.end()) - return it->second; -diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc -index d8bc37a..cdcf77e 100644 ---- a/re2/prefilter_tree.cc -+++ b/re2/prefilter_tree.cc -@@ -8,6 +8,11 @@ - #include "re2/prefilter_tree.h" - #include "re2/re2.h" - -+#ifdef WIN32 -+#include <stdio.h> -+#define snprintf _snprintf -+#endif -+ - DEFINE_int32(filtered_re2_min_atom_len, - 3, - "Strings less than this length are not stored as atoms"); -diff --git a/re2/re2.cc b/re2/re2.cc -index 8d1d468..0da886d 100644 ---- a/re2/re2.cc -+++ b/re2/re2.cc -@@ -11,7 +11,13 @@ - - #include <stdio.h> - #include <string> -+#ifdef WIN32 -+#define strtoll _strtoi64 -+#define strtoull _strtoui64 -+#define strtof strtod -+#else - #include <pthread.h> -+#endif - #include <errno.h> - #include "util/util.h" - #include "util/flags.h" -@@ -31,10 +37,22 @@ const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par - const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume; - const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume; - 
--// This will trigger LNK2005 error in MSVC. --#ifndef COMPILER_MSVC --const int RE2::Options::kDefaultMaxMem; // initialized in re2.h --#endif // COMPILER_MSVC -+#define kDefaultMaxMem (8<<20) -+ -+RE2::Options::Options() -+ : encoding_(EncodingUTF8), -+ posix_syntax_(false), -+ longest_match_(false), -+ log_errors_(true), -+ max_mem_(kDefaultMaxMem), -+ literal_(false), -+ never_nl_(false), -+ never_capture_(false), -+ case_sensitive_(true), -+ perl_classes_(false), -+ word_boundary_(false), -+ one_line_(false) { -+} - - RE2::Options::Options(RE2::CannedOptions opt) - : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), -diff --git a/re2/re2.h b/re2/re2.h -index 272028b..c509853 100644 ---- a/re2/re2.h -+++ b/re2/re2.h -@@ -552,28 +552,16 @@ class RE2 { - // If this happens too often, RE2 falls back on the NFA implementation. - - // For now, make the default budget something close to Code Search. -+#ifndef WIN32 - static const int kDefaultMaxMem = 8<<20; -+#endif - - enum Encoding { - EncodingUTF8 = 1, - EncodingLatin1 - }; - -- Options() : -- encoding_(EncodingUTF8), -- posix_syntax_(false), -- longest_match_(false), -- log_errors_(true), -- max_mem_(kDefaultMaxMem), -- literal_(false), -- never_nl_(false), -- never_capture_(false), -- case_sensitive_(true), -- perl_classes_(false), -- word_boundary_(false), -- one_line_(false) { -- } -- -+ Options(); - /*implicit*/ Options(CannedOptions); - - Encoding encoding() const { return encoding_; } -diff --git a/re2/stringpiece.h b/re2/stringpiece.h -index ab9297c..38a5150 100644 ---- a/re2/stringpiece.h -+++ b/re2/stringpiece.h -@@ -23,6 +23,9 @@ - #include <cstddef> - #include <iosfwd> - #include <string> -+#ifdef WIN32 -+#include <algorithm> -+#endif - - namespace re2 { - -diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc -index b99cacf..911e868 100644 ---- a/re2/testing/re2_test.cc -+++ b/re2/testing/re2_test.cc -@@ -6,7 +6,9 @@ - // TODO: Test extractions for PartialMatch/Consume - - 
#include <sys/types.h> -+#ifndef WIN32 - #include <sys/mman.h> -+#endif - #include <sys/stat.h> - #include <errno.h> - #include <vector> -@@ -14,6 +16,11 @@ - #include "re2/re2.h" - #include "re2/regexp.h" - -+#ifdef WIN32 -+#include <stdio.h> -+#define snprintf _snprintf -+#endif -+ - DECLARE_bool(logtostderr); - - namespace re2 { -@@ -657,6 +664,7 @@ TEST(RE2, FullMatchTypedNullArg) { - CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL)); - } - -+#ifndef WIN32 - // Check that numeric parsing code does not read past the end of - // the number being parsed. - TEST(RE2, NULTerminated) { -@@ -678,6 +686,7 @@ TEST(RE2, NULTerminated) { - CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); - CHECK_EQ(x, 1); - } -+#endif - - TEST(RE2, FullMatchTypeTests) { - // Type tests -diff --git a/util/logging.h b/util/logging.h -index 4443f7c..d0a2d87 100644 ---- a/util/logging.h -+++ b/util/logging.h -@@ -7,8 +7,13 @@ - #ifndef RE2_UTIL_LOGGING_H__ - #define RE2_UTIL_LOGGING_H__ - -+#ifndef WIN32 - #include <unistd.h> /* for write */ -+#endif - #include <sstream> -+#ifdef WIN32 -+#include <io.h> -+#endif - - // Debug-only checking. - #define DCHECK(condition) assert(condition) -diff --git a/util/mutex.h b/util/mutex.h -index 9787bfb..e321fae 100644 ---- a/util/mutex.h -+++ b/util/mutex.h -@@ -12,8 +12,10 @@ - - namespace re2 { - -+#ifndef WIN32 - #define HAVE_PTHREAD 1 - #define HAVE_RWLOCK 1 -+#endif - - #if defined(NO_THREADS) - typedef int MutexType; // to keep a lock-count -@@ -32,7 +34,9 @@ namespace re2 { - # include <pthread.h> - typedef pthread_mutex_t MutexType; - #elif defined(WIN32) --# define WIN32_LEAN_AND_MEAN // We only need minimal includes -+# ifndef WIN32_LEAN_AND_MEAN -+# define WIN32_LEAN_AND_MEAN // We only need minimal includes -+# endif - # ifdef GMUTEX_TRYLOCK - // We need Windows NT or later for TryEnterCriticalSection(). 
If you - // don't need that functionality, you can remove these _WIN32_WINNT -diff --git a/util/pcre.cc b/util/pcre.cc -index 5e67e1f..1602133 100644 ---- a/util/pcre.cc -+++ b/util/pcre.cc -@@ -11,6 +11,11 @@ - #include "util/flags.h" - #include "util/pcre.h" - -+#ifdef WIN32 -+#define strtoll _strtoi64 -+#define strtoull _strtoui64 -+#endif -+ - #define PCREPORT(level) LOG(level) - - // Default PCRE limits. -diff --git a/util/pcre.h b/util/pcre.h -index 4dda95d..771ac91 100644 ---- a/util/pcre.h -+++ b/util/pcre.h -@@ -180,9 +180,15 @@ struct pcre_extra { int flags, match_limit, match_limit_recursion; }; - #define PCRE_ERROR_MATCHLIMIT 2 - #define PCRE_ERROR_RECURSIONLIMIT 3 - #define PCRE_INFO_CAPTURECOUNT 0 -+#ifndef WIN32 - #define pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); }) - #define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; }) - #define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; }) -+#else -+#define pcre_compile(a,b,c,d,e) NULL -+#define pcre_exec(a, b, c, d, e, f, g, h) NULL -+#define pcre_fullinfo(a, b, c, d) NULL -+#endif - } // namespace re2 - #endif - -diff --git a/util/test.cc b/util/test.cc -index 0644829..2fe1bfa 100644 ---- a/util/test.cc -+++ b/util/test.cc -@@ -3,7 +3,9 @@ - // license that can be found in the LICENSE file. 
- - #include <stdio.h> -+#ifndef WIN32 - #include <sys/resource.h> -+#endif - #include "util/test.h" - - DEFINE_string(test_tmpdir, "/var/tmp", "temp directory"); -@@ -23,9 +25,13 @@ void RegisterTest(void (*fn)(void), const char *name) { - - namespace re2 { - int64 VirtualProcessSize() { -+#ifndef WIN32 - struct rusage ru; - getrusage(RUSAGE_SELF, &ru); - return (int64)ru.ru_maxrss*1024; -+#else -+ return 0; -+#endif - } - } // namespace re2 - -diff --git a/util/util.h b/util/util.h -index c46ab1b..17ef824 100644 ---- a/util/util.h -+++ b/util/util.h -@@ -12,7 +12,9 @@ - #include <stddef.h> // For size_t - #include <assert.h> - #include <stdarg.h> -+#ifndef WIN32 - #include <sys/time.h> -+#endif - #include <time.h> - #include <ctype.h> // For isdigit, isalpha. - -@@ -51,7 +53,11 @@ using std::tr1::unordered_set; - #else - - #include <unordered_set> -+#ifdef WIN32 -+using std::tr1::unordered_set; -+#else - using std::unordered_set; -+#endif - - #endif - -diff --git a/util/valgrind.h b/util/valgrind.h -index ca10b1a..d097b0c 100644 ---- a/util/valgrind.h -+++ b/util/valgrind.h -@@ -4064,6 +4064,7 @@ typedef - #endif /* PLAT_ppc64_aix5 */ - - -+#ifndef WIN32 - /* ------------------------------------------------------------------ */ - /* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS. */ - /* */ -@@ -4170,7 +4171,7 @@ typedef - VG_USERREQ__DISCARD_TRANSLATIONS, \ - _qzz_addr, _qzz_len, 0, 0, 0); \ - } -- -+#endif - - /* These requests are for getting Valgrind itself to print something. - Possibly with a backtrace. This is a really ugly hack. 
The return value diff --git a/third_party/re2/patches/remove-valgrind-code.patch b/third_party/re2/patches/remove-valgrind-code.patch deleted file mode 100644 index ba6007a..0000000 --- a/third_party/re2/patches/remove-valgrind-code.patch +++ /dev/null @@ -1,35 +0,0 @@ -diff --git a/re2/dfa.cc b/re2/dfa.cc -index 2556c0f..f1fc7b0 100644 ---- a/re2/dfa.cc -+++ b/re2/dfa.cc -@@ -27,6 +27,8 @@ - #include "util/flags.h" - #include "util/sparse_set.h" - -+#define NO_THREAD_SAFETY_ANALYSIS -+ - DEFINE_bool(re2_dfa_bail_when_slow, true, - "Whether the RE2 DFA should bail out early " - "if the NFA would be faster (for testing)."); -diff --git a/util/util.h b/util/util.h -index 471c64f..c46ab1b 100644 ---- a/util/util.h -+++ b/util/util.h -@@ -27,6 +27,8 @@ - #include <utility> - #include <set> - -+#include "base/third_party/dynamic_annotations/dynamic_annotations.h" -+ - // Use std names. - using std::set; - using std::pair; -@@ -119,8 +121,6 @@ static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) { - return ((uint64)x << 32) | y; - } - --int RunningOnValgrind(); -- - } // namespace re2 - - #include "util/arena.h" diff --git a/third_party/re2/patches/sparse-array-valgrind.patch b/third_party/re2/patches/sparse-array-valgrind.patch deleted file mode 100644 index e2cf0bd..0000000 --- a/third_party/re2/patches/sparse-array-valgrind.patch +++ /dev/null @@ -1,23 +0,0 @@ -diff --git a/third_party/re2/util/sparse_array.h b/third_party/re2/util/sparse_array.h -index 4ee5c94..7bc3a86 100644 ---- a/third_party/re2/util/sparse_array.h -+++ b/third_party/re2/util/sparse_array.h -@@ -273,13 +273,13 @@ void SparseArray<Value>::resize(int new_max_size) { - int* a = new int[new_max_size]; - if (sparse_to_dense_) { - memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); -- // Don't need to zero the memory but appease Valgrind. 
-- if (valgrind_) { -- for (int i = max_size_; i < new_max_size; i++) -- a[i] = 0xababababU; -- } - delete[] sparse_to_dense_; - } -+ // Don't need to zero the memory but appease Valgrind. -+ if (valgrind_) { -+ for (int i = max_size_; i < new_max_size; i++) -+ a[i] = 0xababababU; -+ } - sparse_to_dense_ = a; - - dense_.resize(new_max_size); diff --git a/third_party/re2/re2.gyp b/third_party/re2/re2.gyp index 8ddfc91..e66504d 100644 --- a/third_party/re2/re2.gyp +++ b/third_party/re2/re2.gyp @@ -3,6 +3,9 @@ # found in the LICENSE file. { + 'variables': { + 'build_for_tool%': '', + }, 'targets': [ { 'target_name': 're2', @@ -44,6 +47,7 @@ 're2/set.cc', 're2/set.h', 're2/simplify.cc', + 're2/stringpiece.cc', 're2/stringpiece.h', 're2/tostring.cc', 're2/unicode_casefold.cc', @@ -52,26 +56,34 @@ 're2/unicode_groups.h', 're2/variadic_function.h', 're2/walker-inl.h', - 'util/arena.cc', - 'util/arena.h', 'util/atomicops.h', 'util/flags.h', 'util/hash.cc', + 'util/logging.cc', 'util/logging.h', 'util/mutex.h', 'util/rune.cc', 'util/sparse_array.h', 'util/sparse_set.h', - 'util/stringpiece.cc', 'util/stringprintf.cc', 'util/strutil.cc', 'util/utf.h', 'util/util.h', + 'util/valgrind.cc', + 'util/valgrind.h', ], 'conditions': [ ['OS=="win"', { 'msvs_disabled_warnings': [ 4018, 4722, 4267 ], - }] + }], + ['build_for_tool=="drmemory"', { + # Treat builds for Dr. Memory as builds for MSAN to prevent false + # positives created by lazily initialized memory. + # See crbug.com/568119#3 . + 'defines': [ + 'MEMORY_SANITIZER' + ], + }], ] }, ], diff --git a/third_party/re2/re2.pc b/third_party/re2/re2.pc new file mode 100644 index 0000000..9e90cdad --- /dev/null +++ b/third_party/re2/re2.pc @@ -0,0 +1,10 @@ +prefix=@prefix@ +exec_prefix=${prefix} +includedir=${prefix}/include +libdir=${exec_prefix}/lib + +Name: re2 +Description: RE2 is a fast, safe, thread-friendly regular expression engine. 
+Version: 0.0.0 +Cflags: -I${includedir} +Libs: -L${libdir} -lre2 -pthread diff --git a/third_party/re2/re2/Makefile b/third_party/re2/re2/Makefile deleted file mode 100644 index 8b13789..0000000 --- a/third_party/re2/re2/Makefile +++ /dev/null @@ -1 +0,0 @@ - diff --git a/third_party/re2/re2/bitstate.cc b/third_party/re2/re2/bitstate.cc index 518d642..5740daa 100644 --- a/third_party/re2/re2/bitstate.cc +++ b/third_party/re2/re2/bitstate.cc @@ -94,7 +94,7 @@ BitState::~BitState() { // If so, remember that it was visited so that the next time, // we don't repeat the visit. bool BitState::ShouldVisit(int id, const char* p) { - uint n = id * (text_.size() + 1) + (p - text_.begin()); + size_t n = id * (text_.size() + 1) + (p - text_.begin()); if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) return false; visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); @@ -170,6 +170,8 @@ bool BitState::TrySearch(int id0, const char* p0) { Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { case kInstFail: + return false; + default: LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg; return false; @@ -270,7 +272,8 @@ bool BitState::TrySearch(int id0, const char* p0) { if (submatch_[0].data() == NULL || (longest_ && p > submatch_[0].end())) { for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); + submatch_[i].set(cap_[2*i], + static_cast<int>(cap_[2*i+1] - cap_[2*i])); } // If going for first match, we're done. 
diff --git a/third_party/re2/re2/compile.cc b/third_party/re2/re2/compile.cc index 9a59f13..5037524 100644 --- a/third_party/re2/re2/compile.cc +++ b/third_party/re2/re2/compile.cc @@ -230,7 +230,7 @@ class Compiler : public Regexp::Walker<Frag> { RE2::Anchor anchor_; // anchor mode for RE2::Set - DISALLOW_EVIL_CONSTRUCTORS(Compiler); + DISALLOW_COPY_AND_ASSIGN(Compiler); }; Compiler::Compiler() { @@ -371,6 +371,8 @@ Frag Compiler::Plus(Frag a, bool nongreedy) { // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) Frag Compiler::Quest(Frag a, bool nongreedy) { + if (IsNoMatch(a)) + return Nop(); int id = AllocInst(1); if (id < 0) return NoMatch(); @@ -433,7 +435,10 @@ Frag Compiler::EmptyWidth(EmptyOp empty) { if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { int j; for (int i = 0; i < 256; i = j) { - for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) + for (j = i + 1; j < 256 && + Prog::IsWordChar(static_cast<uint8>(i)) == + Prog::IsWordChar(static_cast<uint8>(j)); + j++) ; prog_->MarkByteRange(i, j-1); } @@ -443,6 +448,8 @@ Frag Compiler::EmptyWidth(EmptyOp empty) { // Given a fragment a, returns a fragment with capturing parens around a. Frag Compiler::Capture(Frag a, int n) { + if (IsNoMatch(a)) + return NoMatch(); int id = AllocInst(2); if (id < 0) return NoMatch(); @@ -499,7 +506,10 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { return UncachedRuneByteSuffix(lo, hi, foldcase, next); } - uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | (foldcase ? 
1ULL : 0ULL); + uint64 key = (uint64)next << 17 | + (uint64)lo << 9 | + (uint64)hi << 1 | + (uint64)foldcase; map<uint64, int>::iterator it = rune_cache_.find(key); if (it != rune_cache_.end()) return it->second; @@ -551,7 +561,8 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { return; if (hi > 0xFF) hi = 0xFF; - AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), + foldcase, 0)); } // Table describing how to make a UTF-8 matching machine @@ -592,7 +603,8 @@ void Compiler::Add_80_10ffff() { int next = 0; if (p.next >= 0) next = inst[p.next]; - inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); + inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo), + static_cast<uint8>(p.hi), false, next); if ((p.lo & 0xC0) != 0x80) AddSuffix(inst[i]); } @@ -621,7 +633,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { // ASCII range is always a special case. if (hi < Runeself) { - AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), + foldcase, 0)); return; } @@ -749,16 +762,16 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, } case kRegexpStar: - return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpPlus: - return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpQuest: - return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpLiteral: - return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); + return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); case kRegexpLiteralString: { // Concatenation of literals. 
@@ -766,7 +779,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, return Nop(); Frag f; for (int i = 0; i < re->nrunes(); i++) { - Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); + Frag f1 = Literal(re->runes()[i], + (re->parse_flags()&Regexp::FoldCase) != 0); if (i == 0) f = f1; else @@ -811,7 +825,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, // If this range contains all of A-Za-z or none of it, // the fold flag is unnecessary; don't bother. bool fold = foldascii; - if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) + if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || + ('Z' < i->lo && i->hi < 'a')) fold = false; AddRuneRange(i->lo, i->hi, fold); @@ -954,7 +969,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, max_mem_ = max_mem; if (max_mem <= 0) { max_inst_ = 100000; // more than enough - } else if (max_mem <= sizeof(Prog)) { + } else if (max_mem <= static_cast<int64>(sizeof(Prog))) { // No room for anything. max_inst_ = 0; } else { @@ -974,7 +989,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; - max_inst_ = m; + max_inst_ = static_cast<int>(m); } anchor_ = anchor; diff --git a/third_party/re2/re2/dfa.cc b/third_party/re2/re2/dfa.cc index f1fc7b0..1f54b9f 100644 --- a/third_party/re2/re2/dfa.cc +++ b/third_party/re2/re2/dfa.cc @@ -21,13 +21,11 @@ // // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. -#include "re2/prog.h" -#include "re2/stringpiece.h" #include "util/atomicops.h" #include "util/flags.h" #include "util/sparse_set.h" - -#define NO_THREAD_SAFETY_ANALYSIS +#include "re2/prog.h" +#include "re2/stringpiece.h" DEFINE_bool(re2_dfa_bail_when_slow, true, "Whether the RE2 DFA should bail out early " @@ -96,7 +94,7 @@ class DFA { // States, linked by the next_ pointers. If in state s and reading // byte c, the next state should be s->next_[c]. 
struct State { - inline bool IsMatch() const { return flag_ & kFlagMatch; } + inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; } void SaveMatch(vector<int>* v); int* inst_; // Instruction pointers in the state. @@ -145,7 +143,7 @@ class DFA { if (sizeof(size_t) == sizeof(uint32)) return Hash32StringWithSeed(s, len, a->flag_); else - return Hash64StringWithSeed(s, len, a->flag_); + return static_cast<size_t>(Hash64StringWithSeed(s, len, a->flag_)); } #ifdef STL_MSVC // Less than operator. @@ -230,9 +228,8 @@ class DFA { // sets *ismatch to true. // L >= mutex_ void RunWorkqOnByte(Workq* q, Workq* nq, - int c, uint flag, bool* ismatch, - Prog::MatchKind kind, - int new_byte_loop); + int c, uint flag, bool* ismatch, + Prog::MatchKind kind); // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. // L >= mutex_ @@ -277,7 +274,7 @@ class DFA { vector<int>* matches; private: - DISALLOW_EVIL_CONSTRUCTORS(SearchParams); + DISALLOW_COPY_AND_ASSIGN(SearchParams); }; // Before each search, the parameters to Search are analyzed by @@ -342,7 +339,6 @@ class DFA { // Constant after initialization. Prog* prog_; // The regular expression program to run. Prog::MatchKind kind_; // The kind of DFA. 
- int start_unanchored_; // start of unanchored program bool init_failed_; // initialization failed (out of memory) Mutex mutex_; // mutex_ >= cache_mutex_.r @@ -430,7 +426,7 @@ class DFA::Workq : public SparseSet { int maxmark_; // maximum number of marks int nextmark_; // id of next mark bool last_was_mark_; // last inserted was mark - DISALLOW_EVIL_CONSTRUCTORS(Workq); + DISALLOW_COPY_AND_ASSIGN(Workq); }; DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) @@ -445,11 +441,8 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) if (DebugDFA) fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str()); int nmark = 0; - start_unanchored_ = 0; - if (kind_ == Prog::kLongestMatch) { + if (kind_ == Prog::kLongestMatch) nmark = prog->size(); - start_unanchored_ = prog->start_unanchored(); - } nastack_ = 2 * prog->size() + nmark; // Account for space needed for DFA, q0, q1, astack. @@ -458,7 +451,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) (sizeof(int)+sizeof(int)) * 2; // q0, q1 mem_budget_ -= nastack_ * sizeof(int); // astack if (mem_budget_ < 0) { - LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", + LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld", prog_->size(), max_mem); init_failed_ = true; return; @@ -473,7 +466,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + (prog_->bytemap_range()+1)*sizeof(State*); if (state_budget_ < 20*one_state) { - LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", + LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld", prog_->size(), max_mem); init_failed_ = true; return; @@ -789,7 +782,7 @@ void DFA::ClearCache() { it != state_cache_.end(); ++it) v.push_back(*it); state_cache_.clear(); - for (int i = 0; i < v.size(); i++) + for (size_t i = 0; i < v.size(); i++) delete[] reinterpret_cast<const char*>(v[i]); } @@ -871,8 
+864,10 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { break; case kInstEmptyWidth: - if ((ip->empty() & flag) == ip->empty()) - stk[nstk++] = ip->out(); + // Continue on if we have all the right flag bits. + if (ip->empty() & ~flag) + break; + stk[nstk++] = ip->out(); break; } } @@ -910,8 +905,7 @@ void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) { // regular expression program has been reached (the regexp has matched). void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, int c, uint flag, bool* ismatch, - Prog::MatchKind kind, - int new_byte_loop) { + Prog::MatchKind kind) { if (DEBUG_MODE) mutex_.AssertHeld(); @@ -990,9 +984,8 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { } // If someone else already computed this, return it. - MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering - State* ns = state->next_[ByteMap(c)]; - ANNOTATE_HAPPENS_AFTER(ns); + State* ns; + ATOMIC_LOAD_CONSUME(ns, &state->next_[ByteMap(c)]); if (ns != NULL) return ns; @@ -1022,8 +1015,8 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // The state flag kFlagLastWord says whether the last // byte processed was a word character. Use that info to // insert empty-width (non-)word boundaries. - bool islastword = state->flag_ & kFlagLastWord; - bool isword = (c != kByteEndText && Prog::IsWordChar(c)); + bool islastword = (state->flag_ & kFlagLastWord) != 0; + bool isword = (c != kByteEndText && Prog::IsWordChar(static_cast<uint8>(c))); if (isword == islastword) beforeflag |= kEmptyNonWordBoundary; else @@ -1036,8 +1029,8 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { swap(q0_, q1_); } bool ismatch = false; - RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_); - + RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_); + // Most of the time, we build the state from the output of // RunWorkqOnByte, so swap q0_ and q1_ here. 
However, so that // RE2::Set can tell exactly which match instructions @@ -1058,18 +1051,11 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { ns = WorkqToCachedState(q0_, flag); + // Flush ns before linking to it. // Write barrier before updating state->next_ so that the // main search loop can proceed without any locking, for speed. // (Otherwise it would need one mutex operation per input byte.) - // The annotations below tell race detectors that: - // a) the access to next_ should be ignored, - // b) 'ns' is properly published. - WriteMemoryBarrier(); // Flush ns before linking to it. - - ANNOTATE_IGNORE_WRITES_BEGIN(); - ANNOTATE_HAPPENS_BEFORE(ns); - state->next_[ByteMap(c)] = ns; - ANNOTATE_IGNORE_WRITES_END(); + ATOMIC_STORE_RELEASE(&state->next_[ByteMap(c)], ns); return ns; } @@ -1112,7 +1098,7 @@ class DFA::RWLocker { Mutex* mu_; bool writing_; - DISALLOW_EVIL_CONSTRUCTORS(RWLocker); + DISALLOW_COPY_AND_ASSIGN(RWLocker); }; DFA::RWLocker::RWLocker(Mutex* mu) @@ -1212,7 +1198,7 @@ class DFA::StateSaver { bool is_special_; // whether original state was special State* special_; // if is_special_, the original state - DISALLOW_EVIL_CONSTRUCTORS(StateSaver); + DISALLOW_COPY_AND_ASSIGN(StateSaver); }; DFA::StateSaver::StateSaver(DFA* dfa, State* state) { @@ -1390,9 +1376,8 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // Okay to use bytemap[] not ByteMap() here, because // c is known to be an actual byte and not kByteEndText. - MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering - State* ns = s->next_[bytemap[c]]; - ANNOTATE_HAPPENS_AFTER(ns); + State* ns; + ATOMIC_LOAD_CONSUME(ns, &s->next_[bytemap[c]]); if (ns == NULL) { ns = RunStateOnByteUnlocked(s, c); if (ns == NULL) { @@ -1405,7 +1390,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // of 10 bytes per state computation, fail so that RE2 can // fall back to the NFA. 
if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL && - (p - resetp) < 10*state_cache_.size()) { + static_cast<unsigned long>(p - resetp) < 10*state_cache_.size()) { params->failed = true; return false; } @@ -1479,9 +1464,8 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, lastbyte = params->text.begin()[-1] & 0xFF; } - MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering - State* ns = s->next_[ByteMap(lastbyte)]; - ANNOTATE_HAPPENS_AFTER(ns); + State* ns; + ATOMIC_LOAD_CONSUME(ns, &s->next_[ByteMap(lastbyte)]); if (ns == NULL) { ns = RunStateOnByteUnlocked(s, lastbyte); if (ns == NULL) { @@ -1669,13 +1653,16 @@ bool DFA::AnalyzeSearch(SearchParams* params) { } } - if (DebugDFA) + if (DebugDFA) { + int fb; + ATOMIC_LOAD_RELAXED(fb, &info->firstbyte); fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n", params->anchored, params->run_forward, flags, - DumpState(info->start).c_str(), info->firstbyte); + DumpState(info->start).c_str(), fb); + } params->start = info->start; - params->firstbyte = ANNOTATE_UNPROTECTED_READ(info->firstbyte); + ATOMIC_LOAD_ACQUIRE(params->firstbyte, &info->firstbyte); return true; } @@ -1683,17 +1670,15 @@ bool DFA::AnalyzeSearch(SearchParams* params) { // Fills in info if needed. Returns true on success, false on failure. bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags) { - // Quick check; okay because of memory barriers below. - if (ANNOTATE_UNPROTECTED_READ(info->firstbyte) != kFbUnknown) { - ANNOTATE_HAPPENS_AFTER(&info->firstbyte); + // Quick check. 
+ int fb; + ATOMIC_LOAD_ACQUIRE(fb, &info->firstbyte); + if (fb != kFbUnknown) return true; - } MutexLock l(&mutex_); - if (info->firstbyte != kFbUnknown) { - ANNOTATE_HAPPENS_AFTER(&info->firstbyte); + if (info->firstbyte != kFbUnknown) return true; - } q0_->clear(); AddToQueue(q0_, @@ -1704,16 +1689,14 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, return false; if (info->start == DeadState) { - ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); - WriteMemoryBarrier(); // Synchronize with "quick check" above. - info->firstbyte = kFbNone; + // Synchronize with "quick check" above. + ATOMIC_STORE_RELEASE(&info->firstbyte, kFbNone); return true; } if (info->start == FullMatchState) { - ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); - WriteMemoryBarrier(); // Synchronize with "quick check" above. - info->firstbyte = kFbNone; // will be ignored + // Synchronize with "quick check" above. + ATOMIC_STORE_RELEASE(&info->firstbyte, kFbNone); // will be ignored return true; } @@ -1724,9 +1707,8 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, for (int i = 0; i < 256; i++) { State* s = RunStateOnByte(info->start, i); if (s == NULL) { - ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); - WriteMemoryBarrier(); // Synchronize with "quick check" above. - info->firstbyte = firstbyte; + // Synchronize with "quick check" above. + ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte); return false; } if (s == info->start) @@ -1739,9 +1721,8 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, break; } } - ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); - WriteMemoryBarrier(); // Synchronize with "quick check" above. - info->firstbyte = firstbyte; + // Synchronize with "quick check" above. + ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte); return true; } @@ -1821,19 +1802,16 @@ DFA* Prog::GetDFA(MatchKind kind) { pdfa = &dfa_longest_; } - // Quick check; okay because of memory barrier below. 
- DFA *dfa = ANNOTATE_UNPROTECTED_READ(*pdfa); - if (dfa != NULL) { - ANNOTATE_HAPPENS_AFTER(dfa); + // Quick check. + DFA *dfa; + ATOMIC_LOAD_ACQUIRE(dfa, pdfa); + if (dfa != NULL) return dfa; - } MutexLock l(&dfa_mutex_); dfa = *pdfa; - if (dfa != NULL) { - ANNOTATE_HAPPENS_AFTER(dfa); + if (dfa != NULL) return dfa; - } // For a forward DFA, half the memory goes to each DFA. // For a reverse DFA, all the memory goes to the @@ -1850,9 +1828,7 @@ DFA* Prog::GetDFA(MatchKind kind) { delete_dfa_ = DeleteDFA; // Synchronize with "quick check" above. - ANNOTATE_HAPPENS_BEFORE(dfa); - WriteMemoryBarrier(); - *pdfa = dfa; + ATOMIC_STORE_RELEASE(pdfa, dfa); return dfa; } @@ -1925,9 +1901,9 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // as the beginning. if (match0) { if (reversed_) - *match0 = StringPiece(ep, text.end() - ep); + match0->set(ep, static_cast<int>(text.end() - ep)); else - *match0 = StringPiece(text.begin(), ep - text.begin()); + match0->set(text.begin(), static_cast<int>(ep - text.begin())); } return true; } @@ -1952,7 +1928,7 @@ int DFA::BuildAllStates() { q.push_back(params.start); // Flood to expand every state. - for (int i = 0; i < q.size(); i++) { + for (size_t i = 0; i < q.size(); i++) { State* s = q[i]; for (int c = 0; c < 257; c++) { State* ns = RunStateOnByteUnlocked(s, c); @@ -1963,7 +1939,7 @@ int DFA::BuildAllStates() { } } - return q.size(); + return static_cast<int>(q.size()); } // Build out all states in DFA for kind. Returns number of states. @@ -2035,6 +2011,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // Build minimum prefix. 
State* s = params.start; min->clear(); + MutexLock lock(&mutex_); for (int i = 0; i < maxlen; i++) { if (previously_visited_states[s] > kMaxEltRepetitions) { VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions @@ -2044,7 +2021,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { previously_visited_states[s]++; // Stop if min is a match. - State* ns = RunStateOnByteUnlocked(s, kByteEndText); + State* ns = RunStateOnByte(s, kByteEndText); if (ns == NULL) // DFA out of memory return false; if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) @@ -2053,13 +2030,13 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // Try to extend the string with low bytes. bool extended = false; for (int j = 0; j < 256; j++) { - ns = RunStateOnByteUnlocked(s, j); + ns = RunStateOnByte(s, j); if (ns == NULL) // DFA out of memory return false; if (ns == FullMatchState || (ns > SpecialStateMax && ns->ninst_ > 0)) { extended = true; - min->append(1, j); + min->append(1, static_cast<char>(j)); s = ns; break; } @@ -2083,13 +2060,13 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // Try to extend the string with high bytes. bool extended = false; for (int j = 255; j >= 0; j--) { - State* ns = RunStateOnByteUnlocked(s, j); + State* ns = RunStateOnByte(s, j); if (ns == NULL) return false; if (ns == FullMatchState || (ns > SpecialStateMax && ns->ninst_ > 0)) { extended = true; - max->append(1, j); + max->append(1, static_cast<char>(j)); s = ns; break; } @@ -2122,11 +2099,12 @@ bool Prog::PossibleMatchRange(string* min, string* max, int maxlen) { MutexLock l(&dfa_mutex_); // Have to use dfa_longest_ to get all strings for full matches. // For example, (a|aa) never matches aa in first-match mode. 
- if (dfa_longest_ == NULL) { - dfa_longest_ = new DFA(this, Prog::kLongestMatch, dfa_mem_/2); + dfa = dfa_longest_; + if (dfa == NULL) { + dfa = new DFA(this, Prog::kLongestMatch, dfa_mem_/2); + ATOMIC_STORE_RELEASE(&dfa_longest_, dfa); delete_dfa_ = DeleteDFA; } - dfa = dfa_longest_; } return dfa->PossibleMatchRange(min, max, maxlen); } diff --git a/third_party/re2/re2/filtered_re2.cc b/third_party/re2/re2/filtered_re2.cc index f576258..5dd65d5 100644 --- a/third_party/re2/re2/filtered_re2.cc +++ b/third_party/re2/re2/filtered_re2.cc @@ -16,7 +16,7 @@ FilteredRE2::FilteredRE2() } FilteredRE2::~FilteredRE2() { - for (int i = 0; i < re2_vec_.size(); i++) + for (size_t i = 0; i < re2_vec_.size(); i++) delete re2_vec_[i]; delete prefilter_tree_; } @@ -33,7 +33,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, } delete re; } else { - *id = re2_vec_.size(); + *id = static_cast<int>(re2_vec_.size()); re2_vec_.push_back(re); } @@ -46,7 +46,7 @@ void FilteredRE2::Compile(vector<string>* atoms) { return; } - for (int i = 0; i < re2_vec_.size(); i++) { + for (size_t i = 0; i < re2_vec_.size(); i++) { Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]); prefilter_tree_->Add(prefilter); } @@ -56,9 +56,9 @@ void FilteredRE2::Compile(vector<string>* atoms) { } int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { - for (int i = 0; i < re2_vec_.size(); i++) + for (size_t i = 0; i < re2_vec_.size(); i++) if (RE2::PartialMatch(text, *re2_vec_[i])) - return i; + return static_cast<int>(i); return -1; } @@ -70,7 +70,7 @@ int FilteredRE2::FirstMatch(const StringPiece& text, } vector<int> regexps; prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); - for (int i = 0; i < regexps.size(); i++) + for (size_t i = 0; i < regexps.size(); i++) if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) return regexps[i]; return -1; @@ -83,18 +83,23 @@ bool FilteredRE2::AllMatches( matching_regexps->clear(); vector<int> regexps; prefilter_tree_->RegexpsGivenStrings(atoms, 
®exps); - for (int i = 0; i < regexps.size(); i++) + for (size_t i = 0; i < regexps.size(); i++) if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) matching_regexps->push_back(regexps[i]); return !matching_regexps->empty(); } +void FilteredRE2::AllPotentials( + const vector<int>& atoms, + vector<int>* potential_regexps) const { + prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps); +} + void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms, vector<int>* passed_regexps) { prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps); } - void FilteredRE2::PrintPrefilter(int regexpid) { prefilter_tree_->PrintPrefilter(regexpid); } diff --git a/third_party/re2/re2/filtered_re2.h b/third_party/re2/re2/filtered_re2.h index 64b35be..f4b2be4 100644 --- a/third_party/re2/re2/filtered_re2.h +++ b/third_party/re2/re2/filtered_re2.h @@ -67,8 +67,16 @@ class FilteredRE2 { const vector<int>& atoms, vector<int>* matching_regexps) const; + // Returns the indices of all potentially matching regexps after first + // clearing potential_regexps. + // A regexp is potentially matching if it passes the filter. + // If a regexp passes the filter it may still not match. + // A regexp that does not pass the filter is guaranteed to not match. + void AllPotentials(const vector<int>& atoms, + vector<int>* potential_regexps) const; + // The number of regexps added. - int NumRegexps() const { return re2_vec_.size(); } + int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } private: @@ -91,7 +99,7 @@ class FilteredRE2 { // An AND-OR tree of string atoms used for filtering regexps. 
PrefilterTree* prefilter_tree_; - //DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2); + //DISALLOW_COPY_AND_ASSIGN(FilteredRE2); FilteredRE2(const FilteredRE2&); void operator=(const FilteredRE2&); }; diff --git a/third_party/re2/re2/make_perl_groups.pl b/third_party/re2/re2/make_perl_groups.pl index d5eaa59..d9fcdaf 100755 --- a/third_party/re2/re2/make_perl_groups.pl +++ b/third_party/re2/re2/make_perl_groups.pl @@ -32,14 +32,20 @@ "\\w", ); +%overrides = ( + # Prior to Perl 5.18, \s did not match vertical tab. + # RE2 preserves that original behaviour. + "\\s:11" => 0, +); + sub ComputeClass($) { + my ($cname) = @_; my @ranges; - my ($class) = @_; - my $regexp = "[$class]"; + my $regexp = qr/[$cname]/; my $start = -1; for (my $i=0; $i<=129; $i++) { if ($i == 129) { $i = 256; } - if ($i <= 128 && chr($i) =~ $regexp) { + if ($i <= 128 && ($overrides{"$cname:$i"} // chr($i) =~ $regexp)) { if ($start < 0) { $start = $i; } @@ -54,15 +60,15 @@ sub ComputeClass($) { } sub PrintClass($$@) { - my ($cname, $name, @ranges) = @_; - print "static URange16 code${cname}[] = { /* $name */\n"; + my ($cnum, $cname, @ranges) = @_; + print "static const URange16 code${cnum}[] = { /* $cname */\n"; for (my $i=0; $i<@ranges; $i++) { my @a = @{$ranges[$i]}; printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1]; } print "};\n"; my $n = @ranges; - my $escname = $name; + my $escname = $cname; $escname =~ s/\\/\\\\/g; $negname = $escname; if ($negname =~ /:/) { @@ -70,25 +76,25 @@ sub PrintClass($$@) { } else { $negname =~ y/a-z/A-Z/; } - return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }"; + return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }"; } -my $gen = 0; +my $cnum = 0; sub PrintClasses($@) { - my ($cname, @classes) = @_; + my ($pname, @classes) = @_; my @entries; - foreach my $cl (@classes) { - my @ranges = ComputeClass($cl); - push @entries, PrintClass(++$gen, $cl, @ranges); + foreach my $cname (@classes) { + my @ranges = 
ComputeClass($cname); + push @entries, PrintClass(++$cnum, $cname, @ranges); } - print "UGroup ${cname}_groups[] = {\n"; + print "const UGroup ${pname}_groups[] = {\n"; foreach my $e (@entries) { print "\t$e,\n"; } print "};\n"; my $count = @entries; - print "int num_${cname}_groups = $count;\n"; + print "const int num_${pname}_groups = $count;\n"; } print <<EOF; diff --git a/third_party/re2/re2/make_unicode_casefold.py b/third_party/re2/re2/make_unicode_casefold.py index 3375d2e..d215eb1 100755 --- a/third_party/re2/re2/make_unicode_casefold.py +++ b/third_party/re2/re2/make_unicode_casefold.py @@ -9,7 +9,8 @@ """Generate C++ table for Unicode case folding.""" -import unicode, sys +import sys +import unicode _header = """ // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. @@ -130,11 +131,11 @@ def main(): foldpairs.sort() foldranges = _MakeRanges(foldpairs) print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) - print "CaseFold unicode_%s[] = {" % (name,) + print "const CaseFold unicode_%s[] = {" % (name,) for lo, hi, delta in foldranges: print "\t{ %d, %d, %s }," % (lo, hi, delta) print "};" - print "int num_unicode_%s = %d;" % (name, len(foldranges),) + print "const int num_unicode_%s = %d;" % (name, len(foldranges),) print "" print _header diff --git a/third_party/re2/re2/make_unicode_groups.py b/third_party/re2/re2/make_unicode_groups.py index c2e25c1..8499793 100755 --- a/third_party/re2/re2/make_unicode_groups.py +++ b/third_party/re2/re2/make_unicode_groups.py @@ -41,7 +41,7 @@ def MakeRanges(codes): def PrintRanges(type, name, ranges): """Print the ranges as an array of type named name.""" - print "static %s %s[] = {" % (type, name,) + print "static const %s %s[] = {" % (type, name,) for lo, hi in ranges: print "\t{ %d, %d }," % (lo, hi) print "};" @@ -99,12 +99,12 @@ def main(): for name, codes in unicode.Scripts().iteritems(): ugroups.append(PrintGroup(name, codes)) print "// %d 16-bit ranges, %d 32-bit 
ranges" % (n16, n32) - print "UGroup unicode_groups[] = {"; + print "const UGroup unicode_groups[] = {"; ugroups.sort() for ug in ugroups: print "\t%s," % (ug,) print "};" - print "int num_unicode_groups = %d;" % (len(ugroups),) + print "const int num_unicode_groups = %d;" % (len(ugroups),) print _trailer if __name__ == '__main__': diff --git a/third_party/re2/re2/mimics_pcre.cc b/third_party/re2/re2/mimics_pcre.cc index fc6dd4a..0a55004 100644 --- a/third_party/re2/re2/mimics_pcre.cc +++ b/third_party/re2/re2/mimics_pcre.cc @@ -124,7 +124,7 @@ class EmptyStringWalker : public Regexp::Walker<bool> { } private: - DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker); + DISALLOW_COPY_AND_ASSIGN(EmptyStringWalker); }; // Called after visiting re's children. child_args contains the return diff --git a/third_party/re2/re2/nfa.cc b/third_party/re2/re2/nfa.cc index 8c4f761..bc8996c 100644 --- a/third_party/re2/re2/nfa.cc +++ b/third_party/re2/re2/nfa.cc @@ -122,7 +122,7 @@ class NFA { Thread* free_threads_; // free list - DISALLOW_EVIL_CONSTRUCTORS(NFA); + DISALLOW_COPY_AND_ASSIGN(NFA); }; NFA::NFA(Prog* prog) { @@ -468,7 +468,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, if (text.begin() > context.begin()) { c = text.begin()[-1] & 0xFF; - wasword = Prog::IsWordChar(c); + wasword = Prog::IsWordChar(static_cast<uint8>(c)); } // Loop over the text, stepping the machine. 
@@ -529,7 +529,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, break; case kInstCapture: - match_[ip->cap()] = p; + if (ip->cap() < ncapture_) + match_[ip->cap()] = p; id = ip->out(); continue; @@ -607,7 +608,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, if (matched_) { for (int i = 0; i < nsubmatch; i++) - submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]); + submatch[i].set(match_[2*i], + static_cast<int>(match_[2*i+1] - match_[2*i])); if (Debug) fprintf(stderr, "match (%d,%d)\n", static_cast<int>(match_[0] - btext_), @@ -705,5 +707,52 @@ Prog::SearchNFA(const StringPiece& text, const StringPiece& context, return true; } -} // namespace re2 +// For each instruction i in the program reachable from the start, compute the +// number of instructions reachable from i by following only empty transitions +// and record that count as fanout[i]. +// +// fanout holds the results and is also the work queue for the outer iteration. +// reachable holds the reached nodes for the inner iteration. 
+void Prog::Fanout(SparseArray<int>* fanout) { + DCHECK_EQ(fanout->max_size(), size()); + SparseSet reachable(size()); + fanout->clear(); + fanout->set_new(start(), 0); + for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) { + int* count = &i->second; + reachable.clear(); + reachable.insert(i->index()); + for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { + Prog::Inst* ip = inst(*j); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()"; + break; + + case kInstByteRange: + (*count)++; + if (!fanout->has_index(ip->out())) { + fanout->set_new(ip->out(), 0); + } + break; + + case kInstAlt: + case kInstAltMatch: + reachable.insert(ip->out1()); + // fall through + + case kInstCapture: + case kInstEmptyWidth: + case kInstNop: + reachable.insert(ip->out()); + break; + case kInstMatch: + case kInstFail: + break; + } + } + } +} + +} // namespace re2 diff --git a/third_party/re2/re2/onepass.cc b/third_party/re2/re2/onepass.cc index 1c49988..73acdc8 100644 --- a/third_party/re2/re2/onepass.cc +++ b/third_party/re2/re2/onepass.cc @@ -53,7 +53,6 @@ #include <string.h> #include <map> #include "util/util.h" -#include "util/arena.h" #include "util/sparse_set.h" #include "re2/prog.h" #include "re2/stringpiece.h" @@ -126,9 +125,6 @@ static const int Debug = 0; // whether a set of conditions required to finish a match at that // point in the input rather than process the next byte. -// A state in the one-pass NFA (aka DFA) - just an array of actions. -struct OneState; - // A state in the one-pass NFA - just an array of actions indexed // by the bytemap_[] of the next input byte. 
(The bytemap // maps next input bytes into equivalence classes, to reduce @@ -335,7 +331,8 @@ done: if (!matched) return false; for (int i = 0; i < nmatch; i++) - match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); + match[i].set(matchcap[2*i], + static_cast<int>(matchcap[2*i+1] - matchcap[2*i])); return true; } diff --git a/third_party/re2/re2/parse.cc b/third_party/re2/re2/parse.cc index 0cf4ab4..cf74f5a1 100644 --- a/third_party/re2/re2/parse.cc +++ b/third_party/re2/re2/parse.cc @@ -21,6 +21,7 @@ #include "re2/stringpiece.h" #include "re2/unicode_casefold.h" #include "re2/unicode_groups.h" +#include "re2/walker-inl.h" namespace re2 { @@ -156,7 +157,7 @@ private: int ncap_; // number of capturing parens seen int rune_max_; // maximum char value for this encoding - DISALLOW_EVIL_CONSTRUCTORS(ParseState); + DISALLOW_COPY_AND_ASSIGN(ParseState); }; // Pseudo-operators - only on parse stack. @@ -214,7 +215,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { // single characters (e.g., [.] instead of \.), and some // analysis does better with fewer character classes. // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. - if (re->op_ == kRegexpCharClass) { + if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { + re->ccb_->RemoveAbove(rune_max_); if (re->ccb_->size() == 1) { Rune r = re->ccb_->begin()->lo; re->Decref(); @@ -240,8 +242,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { // Searches the case folding tables and returns the CaseFold* that contains r. // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. // If there isn't one, returns NULL. -CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { - CaseFold* ef = f + n; +const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) { + const CaseFold* ef = f + n; // Binary search for entry containing r. 
while (n > 0) { @@ -268,7 +270,7 @@ CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { } // Returns the result of applying the fold f to the rune r. -Rune ApplyFold(CaseFold *f, Rune r) { +Rune ApplyFold(const CaseFold *f, Rune r) { switch (f->delta) { default: return r + f->delta; @@ -304,7 +306,7 @@ Rune ApplyFold(CaseFold *f, Rune r) { // // CycleFoldRune('?') = '?' Rune CycleFoldRune(Rune r) { - CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); + const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); if (f == NULL || r < f->lo) return r; return ApplyFold(f, r); @@ -327,7 +329,7 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { return; while (lo <= hi) { - CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); + const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); if (f == NULL) // lo has no fold, nor does anything above lo break; if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo @@ -377,7 +379,6 @@ bool Regexp::ParseState::PushLiteral(Rune r) { } r = CycleFoldRune(r); } while (r != r1); - re->ccb_->RemoveAbove(rune_max_); return PushRegexp(re); } @@ -463,6 +464,59 @@ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, return true; } +// RepetitionWalker reports whether the repetition regexp is valid. +// Valid means that the combination of the top-level repetition +// and any inner repetitions does not exceed n copies of the +// innermost thing. +// This rewalks the regexp tree and is called for every repetition, +// so we have to worry about inducing quadratic behavior in the parser. +// We avoid this by only using RepetitionWalker when min or max >= 2. +// In that case the depth of any >= 2 nesting can only get to 9 without +// triggering a parse error, so each subtree can only be rewalked 9 times. 
+class RepetitionWalker : public Regexp::Walker<int> { + public: + RepetitionWalker() {} + virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); + virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args); + virtual int ShortVisit(Regexp* re, int parent_arg); + + private: + DISALLOW_COPY_AND_ASSIGN(RepetitionWalker); +}; + +int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { + int arg = parent_arg; + if (re->op() == kRegexpRepeat) { + int m = re->max(); + if (m < 0) { + m = re->min(); + } + if (m > 0) { + arg /= m; + } + } + return arg; +} + +int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) { + int arg = pre_arg; + for (int i = 0; i < nchild_args; i++) { + if (child_args[i] < arg) { + arg = child_args[i]; + } + } + return arg; +} + +int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { + // This should never be called, since we use Walk and not + // WalkExponential. + LOG(DFATAL) << "RepetitionWalker::ShortVisit called"; + return 0; +} + // Pushes a repetition regexp onto the stack. // A valid argument for the operator must already be on the stack. bool Regexp::ParseState::PushRepetition(int min, int max, @@ -488,8 +542,15 @@ bool Regexp::ParseState::PushRepetition(int min, int max, re->down_ = stacktop_->down_; re->sub()[0] = FinishRegexp(stacktop_); re->simple_ = re->ComputeSimple(); - stacktop_ = re; + if (min >= 2 || max >= 2) { + RepetitionWalker w; + if (w.Walk(stacktop_, 1000) == 0) { + status_->set_code(kRegexpRepeatSize); + status_->set_error_arg(s); + return false; + } + } return true; } @@ -515,13 +576,6 @@ bool Regexp::ParseState::DoLeftParenNoCapture() { return PushRegexp(re); } -// Adds r to cc, along with r's upper case if foldascii is set. 
-static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { - cc->AddRange(r, r); - if (foldascii && 'a' <= r && r <= 'z') - cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); -} - // Processes a vertical bar in the input. bool Regexp::ParseState::DoVerticalBar() { MaybeConcatString(-1, NoParseFlags); @@ -535,46 +589,34 @@ bool Regexp::ParseState::DoVerticalBar() { Regexp* r1; Regexp* r2; if ((r1 = stacktop_) != NULL && - (r2 = stacktop_->down_) != NULL && + (r2 = r1->down_) != NULL && r2->op() == kVerticalBar) { - // If above and below vertical bar are literal or char class, - // can merge into a single char class. Regexp* r3; - if ((r1->op() == kRegexpLiteral || - r1->op() == kRegexpCharClass || - r1->op() == kRegexpAnyChar) && - (r3 = r2->down_) != NULL) { - Rune rune; - switch (r3->op()) { - case kRegexpLiteral: // convert to char class - rune = r3->rune_; - r3->op_ = kRegexpCharClass; - r3->cc_ = NULL; - r3->ccb_ = new CharClassBuilder; - AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); - // fall through - case kRegexpCharClass: - if (r1->op() == kRegexpLiteral) - AddLiteral(r3->ccb_, r1->rune_, - r1->parse_flags_ & Regexp::FoldCase); - else if (r1->op() == kRegexpCharClass) - r3->ccb_->AddCharClass(r1->ccb_); - if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { - delete r3->ccb_; - r3->ccb_ = NULL; - r3->op_ = kRegexpAnyChar; - } - // fall through - case kRegexpAnyChar: - // pop r1 - stacktop_ = r2; - r1->Decref(); - return true; - default: - break; + if ((r3 = r2->down_) != NULL && + (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) { + // AnyChar is above or below the vertical bar. Let it subsume + // the other when the other is Literal, CharClass or AnyChar. + if (r3->op() == kRegexpAnyChar && + (r1->op() == kRegexpLiteral || + r1->op() == kRegexpCharClass || + r1->op() == kRegexpAnyChar)) { + // Discard r1. 
+ stacktop_ = r2; + r1->Decref(); + return true; + } + if (r1->op() == kRegexpAnyChar && + (r3->op() == kRegexpLiteral || + r3->op() == kRegexpCharClass || + r3->op() == kRegexpAnyChar)) { + // Rearrange the stack and discard r3. + r1->down_ = r3->down_; + r2->down_ = r1; + stacktop_ = r2; + r3->Decref(); + return true; } } - // Swap r1 below vertical bar (r2). r1->down_ = r2->down_; r2->down_ = r1; @@ -1105,7 +1147,7 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { if (r >= 0) { re1->op_ = kRegexpLiteral; re1->rune_ = r; - re1->parse_flags_ = flags; + re1->parse_flags_ = static_cast<uint16>(flags); return true; } @@ -1188,6 +1230,14 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { int n; if (fullrune(sp->data(), sp->size())) { n = chartorune(r, sp->data()); + // Some copies of chartorune have a bug that accepts + // encodings of values in (10FFFF, 1FFFFF] as valid. + // Those values break the character class algorithm, + // which assumes Runemax is the largest rune. + if (*r > Runemax) { + n = 1; + *r = Runeerror; + } if (!(n == 1 && *r == Runeerror)) { // no decoding error sp->remove_prefix(n); return n; @@ -1290,6 +1340,8 @@ static bool ParseEscape(StringPiece* s, Rune* rp, } } } + if (code > rune_max) + goto BadEscape; *rp = code; return true; @@ -1375,7 +1427,8 @@ static bool ParseEscape(StringPiece* s, Rune* rp, BadEscape: // Unrecognized escape sequence. status->set_code(kRegexpBadEscape); - status->set_error_arg(StringPiece(begin, s->data() - begin)); + status->set_error_arg( + StringPiece(begin, static_cast<int>(s->data() - begin))); return false; } @@ -1403,8 +1456,8 @@ void CharClassBuilder::AddRangeFlags( } // Look for a group with the given name. -static UGroup* LookupGroup(const StringPiece& name, - UGroup *groups, int ngroups) { +static const UGroup* LookupGroup(const StringPiece& name, + const UGroup *groups, int ngroups) { // Simple name lookup. 
for (int i = 0; i < ngroups; i++) if (StringPiece(groups[i].name) == name) @@ -1418,16 +1471,16 @@ static URange32 any32[] = { { 65536, Runemax } }; static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; // Look for a POSIX group with the given name (e.g., "[:^alpha:]") -static UGroup* LookupPosixGroup(const StringPiece& name) { +static const UGroup* LookupPosixGroup(const StringPiece& name) { return LookupGroup(name, posix_groups, num_posix_groups); } -static UGroup* LookupPerlGroup(const StringPiece& name) { +static const UGroup* LookupPerlGroup(const StringPiece& name) { return LookupGroup(name, perl_groups, num_perl_groups); } // Look for a Unicode group with the given name (e.g., "Han") -static UGroup* LookupUnicodeGroup(const StringPiece& name) { +static const UGroup* LookupUnicodeGroup(const StringPiece& name) { // Special case: "Any" means any. if (name == StringPiece("Any")) return &anygroup; @@ -1435,7 +1488,7 @@ static UGroup* LookupUnicodeGroup(const StringPiece& name) { } // Add a UGroup or its negation to the character class. -static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, +static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, Regexp::ParseFlags parse_flags) { if (sign == +1) { for (int i = 0; i < g->nr16; i++) { @@ -1486,7 +1539,7 @@ static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, // On success, sets *s to span the remainder of the string // and returns the corresponding UGroup. // The StringPiece must *NOT* be edited unless the call succeeds. -UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { +const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { if (!(parse_flags & Regexp::PerlClasses)) return NULL; if (s->size() < 2 || (*s)[0] != '\\') @@ -1494,7 +1547,7 @@ UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { // Could use StringPieceToRune, but there aren't // any non-ASCII Perl group names. 
StringPiece name(s->begin(), 2); - UGroup *g = LookupPerlGroup(name); + const UGroup *g = LookupPerlGroup(name); if (g == NULL) return NULL; s->remove_prefix(name.size()); @@ -1534,10 +1587,10 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, if (c != '{') { // Name is the bit of string we just skipped over for c. const char* p = seq.begin() + 2; - name = StringPiece(p, s->begin() - p); + name = StringPiece(p, static_cast<int>(s->begin() - p)); } else { // Name is in braces. Look for closing } - int end = s->find('}', 0); + size_t end = s->find('}', 0); if (end == s->npos) { if (!IsValidUTF8(seq, status)) return kParseError; @@ -1545,21 +1598,21 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, status->set_error_arg(seq); return kParseError; } - name = StringPiece(s->begin(), end); // without '}' - s->remove_prefix(end + 1); // with '}' + name = StringPiece(s->begin(), static_cast<int>(end)); // without '}' + s->remove_prefix(static_cast<int>(end) + 1); // with '}' if (!IsValidUTF8(name, status)) return kParseError; } // Chop seq where s now begins. - seq = StringPiece(seq.begin(), s->begin() - seq.begin()); + seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin())); // Look up group if (name.size() > 0 && name[0] == '^') { sign = -sign; name.remove_prefix(1); // '^' } - UGroup *g = LookupUnicodeGroup(name); + const UGroup *g = LookupUnicodeGroup(name); if (g == NULL) { status->set_code(kRegexpBadCharRange); status->set_error_arg(seq); @@ -1593,9 +1646,9 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, // Got it. Check that it's valid. 
q += 2; - StringPiece name(p, q-p); + StringPiece name(p, static_cast<int>(q-p)); - UGroup *g = LookupPosixGroup(name); + const UGroup *g = LookupPosixGroup(name); if (g == NULL) { status->set_code(kRegexpBadCharRange); status->set_error_arg(name); @@ -1647,7 +1700,8 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, return false; if (rr->hi < rr->lo) { status->set_code(kRegexpBadCharRange); - status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); + status->set_error_arg( + StringPiece(os.data(), static_cast<int>(s->data() - os.data()))); return false; } } else { @@ -1732,7 +1786,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, } // Look for Perl character class symbols (extension). - UGroup *g = MaybeParsePerlCCEscape(s, flags_); + const UGroup *g = MaybeParsePerlCCEscape(s, flags_); if (g != NULL) { AddUGroup(re->ccb_, g, g->sign, flags_); continue; @@ -1761,7 +1815,6 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, if (negated) re->ccb_->Negate(); - re->ccb_->RemoveAbove(rune_max_); *out_re = re; return true; @@ -1820,7 +1873,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { // so that's the one we implement. One is enough. if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { // Pull out name. 
- int end = t.find('>', 2); + size_t end = t.find('>', 2); if (end == t.npos) { if (!IsValidUTF8(*s, status_)) return false; @@ -1830,8 +1883,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { } // t is "P<name>...", t[end] == '>' - StringPiece capture(t.begin()-2, end+3); // "(?P<name>" - StringPiece name(t.begin()+2, end-2); // "name" + StringPiece capture(t.begin()-2, static_cast<int>(end)+3); // "(?P<name>" + StringPiece name(t.begin()+2, static_cast<int>(end)-2); // "name" if (!IsValidUTF8(name, status_)) return false; if (!IsValidCaptureName(name)) { @@ -1845,7 +1898,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { return false; } - s->remove_prefix(capture.end() - s->begin()); + s->remove_prefix(static_cast<int>(capture.end() - s->begin())); return true; } @@ -1928,7 +1981,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { BadPerlOp: status_->set_code(kRegexpBadPerlOp); - status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); + status_->set_error_arg( + StringPiece(s->begin(), static_cast<int>(t.begin() - s->begin()))); return false; } @@ -2075,12 +2129,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, // a** is a syntax error, not a double-star. // (and a++ means something else entirely, which we don't support!) status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece(lastunary.begin(), - t.begin() - lastunary.begin())); + status->set_error_arg( + StringPiece(lastunary.begin(), + static_cast<int>(t.begin() - lastunary.begin()))); return NULL; } } - opstr.set(opstr.data(), t.data() - opstr.data()); + opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data())); if (!ps.PushRepeatOp(op, opstr, nongreedy)) return NULL; isunary = opstr; @@ -2106,12 +2161,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (lastunary.size() > 0) { // Not allowed to stack repetition operators. 
status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece(lastunary.begin(), - t.begin() - lastunary.begin())); + status->set_error_arg( + StringPiece(lastunary.begin(), + static_cast<int>(t.begin() - lastunary.begin()))); return NULL; } } - opstr.set(opstr.data(), t.data() - opstr.data()); + opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data())); if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) return NULL; isunary = opstr; @@ -2187,7 +2243,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, } } - UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); + const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); if (g != NULL) { Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); re->ccb_ = new CharClassBuilder; diff --git a/third_party/re2/re2/perl_groups.cc b/third_party/re2/re2/perl_groups.cc index 1af5b43..422b388 100644 --- a/third_party/re2/re2/perl_groups.cc +++ b/third_party/re2/re2/perl_groups.cc @@ -5,21 +5,21 @@ namespace re2 { -static URange16 code1[] = { /* \d */ +static const URange16 code1[] = { /* \d */ { 0x30, 0x39 }, }; -static URange16 code2[] = { /* \s */ +static const URange16 code2[] = { /* \s */ { 0x9, 0xa }, { 0xc, 0xd }, { 0x20, 0x20 }, }; -static URange16 code3[] = { /* \w */ +static const URange16 code3[] = { /* \w */ { 0x30, 0x39 }, { 0x41, 0x5a }, { 0x5f, 0x5f }, { 0x61, 0x7a }, }; -UGroup perl_groups[] = { +const UGroup perl_groups[] = { { "\\d", +1, code1, 1 }, { "\\D", -1, code1, 1 }, { "\\s", +1, code2, 3 }, @@ -27,64 +27,64 @@ UGroup perl_groups[] = { { "\\w", +1, code3, 4 }, { "\\W", -1, code3, 4 }, }; -int num_perl_groups = 6; -static URange16 code4[] = { /* [:alnum:] */ +const int num_perl_groups = 6; +static const URange16 code4[] = { /* [:alnum:] */ { 0x30, 0x39 }, { 0x41, 0x5a }, { 0x61, 0x7a }, }; -static URange16 code5[] = { /* [:alpha:] */ +static const URange16 code5[] = { /* [:alpha:] */ { 0x41, 0x5a }, { 0x61, 0x7a }, }; -static URange16 code6[] = { /* [:ascii:] 
*/ +static const URange16 code6[] = { /* [:ascii:] */ { 0x0, 0x7f }, }; -static URange16 code7[] = { /* [:blank:] */ +static const URange16 code7[] = { /* [:blank:] */ { 0x9, 0x9 }, { 0x20, 0x20 }, }; -static URange16 code8[] = { /* [:cntrl:] */ +static const URange16 code8[] = { /* [:cntrl:] */ { 0x0, 0x1f }, { 0x7f, 0x7f }, }; -static URange16 code9[] = { /* [:digit:] */ +static const URange16 code9[] = { /* [:digit:] */ { 0x30, 0x39 }, }; -static URange16 code10[] = { /* [:graph:] */ +static const URange16 code10[] = { /* [:graph:] */ { 0x21, 0x7e }, }; -static URange16 code11[] = { /* [:lower:] */ +static const URange16 code11[] = { /* [:lower:] */ { 0x61, 0x7a }, }; -static URange16 code12[] = { /* [:print:] */ +static const URange16 code12[] = { /* [:print:] */ { 0x20, 0x7e }, }; -static URange16 code13[] = { /* [:punct:] */ +static const URange16 code13[] = { /* [:punct:] */ { 0x21, 0x2f }, { 0x3a, 0x40 }, { 0x5b, 0x60 }, { 0x7b, 0x7e }, }; -static URange16 code14[] = { /* [:space:] */ +static const URange16 code14[] = { /* [:space:] */ { 0x9, 0xd }, { 0x20, 0x20 }, }; -static URange16 code15[] = { /* [:upper:] */ +static const URange16 code15[] = { /* [:upper:] */ { 0x41, 0x5a }, }; -static URange16 code16[] = { /* [:word:] */ +static const URange16 code16[] = { /* [:word:] */ { 0x30, 0x39 }, { 0x41, 0x5a }, { 0x5f, 0x5f }, { 0x61, 0x7a }, }; -static URange16 code17[] = { /* [:xdigit:] */ +static const URange16 code17[] = { /* [:xdigit:] */ { 0x30, 0x39 }, { 0x41, 0x46 }, { 0x61, 0x66 }, }; -UGroup posix_groups[] = { +const UGroup posix_groups[] = { { "[:alnum:]", +1, code4, 3 }, { "[:^alnum:]", -1, code4, 3 }, { "[:alpha:]", +1, code5, 2 }, @@ -114,6 +114,6 @@ UGroup posix_groups[] = { { "[:xdigit:]", +1, code17, 3 }, { "[:^xdigit:]", -1, code17, 3 }, }; -int num_posix_groups = 28; +const int num_posix_groups = 28; } // namespace re2 diff --git a/third_party/re2/re2/prefilter.cc b/third_party/re2/re2/prefilter.cc index 77e0cbd..45e43c9 100644 --- 
a/third_party/re2/re2/prefilter.cc +++ b/third_party/re2/re2/prefilter.cc @@ -15,6 +15,7 @@ static const int Trace = false; typedef set<string>::iterator SSIter; typedef set<string>::const_iterator ConstSSIter; +GLOBAL_MUTEX(alloc_id_mutex); static int alloc_id = 100000; // Used for debugging. // Initializes a Prefilter, allocating subs_ as necessary. Prefilter::Prefilter(Op op) { @@ -23,7 +24,9 @@ Prefilter::Prefilter(Op op) { if (op_ == AND || op_ == OR) subs_ = new vector<Prefilter*>; + GLOBAL_MUTEX_LOCK(alloc_id_mutex); alloc_id_ = alloc_id++; + GLOBAL_MUTEX_UNLOCK(alloc_id_mutex); VLOG(10) << "alloc_id: " << alloc_id_; } @@ -31,7 +34,7 @@ Prefilter::Prefilter(Op op) { Prefilter::~Prefilter() { VLOG(10) << "Deleted: " << alloc_id_; if (subs_) { - for (int i = 0; i < subs_->size(); i++) + for (size_t i = 0; i < subs_->size(); i++) delete (*subs_)[i]; delete subs_; subs_ = NULL; @@ -100,7 +103,7 @@ Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { // If a and b match op, merge their contents. 
if (a->op() == op && b->op() == op) { - for (int i = 0; i < b->subs()->size(); i++) { + for (size_t i = 0; i < b->subs()->size(); i++) { Prefilter* bb = (*b->subs())[i]; a->subs()->push_back(bb); } @@ -175,7 +178,7 @@ static Rune ToLowerRune(Rune r) { return r; } - CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); + const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); if (f == NULL || r < f->lo) return r; return ApplyFold(f, r); @@ -492,7 +495,7 @@ class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { bool latin1() { return latin1_; } private: bool latin1_; - DISALLOW_EVIL_CONSTRUCTORS(Walker); + DISALLOW_COPY_AND_ASSIGN(Walker); }; Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { @@ -500,7 +503,7 @@ Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); } - bool latin1 = re->parse_flags() & Regexp::Latin1; + bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0; Prefilter::Info::Walker w(latin1); Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); @@ -669,7 +672,7 @@ string Prefilter::DebugString() const { return ""; case AND: { string s = ""; - for (int i = 0; i < subs_->size(); i++) { + for (size_t i = 0; i < subs_->size(); i++) { if (i > 0) s += " "; Prefilter* sub = (*subs_)[i]; @@ -679,7 +682,7 @@ string Prefilter::DebugString() const { } case OR: { string s = "("; - for (int i = 0; i < subs_->size(); i++) { + for (size_t i = 0; i < subs_->size(); i++) { if (i > 0) s += "|"; Prefilter* sub = (*subs_)[i]; diff --git a/third_party/re2/re2/prefilter.h b/third_party/re2/re2/prefilter.h index c2f9ddd..2bc1676 100644 --- a/third_party/re2/re2/prefilter.h +++ b/third_party/re2/re2/prefilter.h @@ -97,7 +97,7 @@ class Prefilter { // Used for debugging, helps in tracking memory leaks. 
int alloc_id_; - DISALLOW_EVIL_CONSTRUCTORS(Prefilter); + DISALLOW_COPY_AND_ASSIGN(Prefilter); }; } // namespace re2 diff --git a/third_party/re2/re2/prefilter_tree.cc b/third_party/re2/re2/prefilter_tree.cc index e5c465b..be9b584 100644 --- a/third_party/re2/re2/prefilter_tree.cc +++ b/third_party/re2/re2/prefilter_tree.cc @@ -8,11 +8,6 @@ #include "re2/prefilter_tree.h" #include "re2/re2.h" -#ifdef WIN32 -#include <stdio.h> -#define snprintf _snprintf -#endif - DEFINE_int32(filtered_re2_min_atom_len, 3, "Strings less than this length are not stored as atoms"); @@ -24,10 +19,10 @@ PrefilterTree::PrefilterTree() } PrefilterTree::~PrefilterTree() { - for (int i = 0; i < prefilter_vec_.size(); i++) + for (size_t i = 0; i < prefilter_vec_.size(); i++) delete prefilter_vec_[i]; - for (int i = 0; i < entries_.size(); i++) + for (size_t i = 0; i < entries_.size(); i++) delete entries_[i].parents; } @@ -48,12 +43,12 @@ static bool KeepPart(Prefilter* prefilter, int level) { case Prefilter::ATOM: return prefilter->atom().size() >= - FLAGS_filtered_re2_min_atom_len; + static_cast<size_t>(FLAGS_filtered_re2_min_atom_len); case Prefilter::AND: { int j = 0; vector<Prefilter*>* subs = prefilter->subs(); - for (int i = 0; i < subs->size(); i++) + for (size_t i = 0; i < subs->size(); i++) if (KeepPart((*subs)[i], level + 1)) (*subs)[j++] = (*subs)[i]; else @@ -64,7 +59,7 @@ static bool KeepPart(Prefilter* prefilter, int level) { } case Prefilter::OR: - for (int i = 0; i < prefilter->subs()->size(); i++) + for (size_t i = 0; i < prefilter->subs()->size(); i++) if (!KeepPart((*prefilter->subs())[i], level + 1)) return false; return true; @@ -106,7 +101,7 @@ void PrefilterTree::Compile(vector<string>* atom_vec) { // no longer necessary for their parent to trigger; that is, we do // not miss out on any regexps triggering by getting rid of a // prefilter node. 
- for (int i = 0; i < entries_.size(); i++) { + for (size_t i = 0; i < entries_.size(); i++) { StdIntMap* parents = entries_[i].parents; if (parents->size() > 8) { // This one triggers too many things. If all the parents are AND @@ -153,7 +148,7 @@ string PrefilterTree::NodeString(Prefilter* node) const { if (node->op() == Prefilter::ATOM) { s += node->atom(); } else { - for (int i = 0; i < node->subs()->size() ; i++) { + for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) s += ','; s += Itoa((*node->subs())[i]->unique_id()); @@ -170,10 +165,10 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { vector<Prefilter*> v; // Add the top level nodes of each regexp prefilter. - for (int i = 0; i < prefilter_vec_.size(); i++) { + for (size_t i = 0; i < prefilter_vec_.size(); i++) { Prefilter* f = prefilter_vec_[i]; if (f == NULL) - unfiltered_.push_back(i); + unfiltered_.push_back(static_cast<int>(i)); // We push NULL also on to v, so that we maintain the // mapping of index==regexpid for level=0 prefilter nodes. @@ -181,20 +176,20 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { } // Now add all the descendant nodes. - for (int i = 0; i < v.size(); i++) { + for (size_t i = 0; i < v.size(); i++) { Prefilter* f = v[i]; if (f == NULL) continue; if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { const vector<Prefilter*>& subs = *f->subs(); - for (int j = 0; j < subs.size(); j++) + for (size_t j = 0; j < subs.size(); j++) v.push_back(subs[j]); } } // Identify unique nodes. int unique_id = 0; - for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { Prefilter *node = v[i]; if (node == NULL) continue; @@ -216,7 +211,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { entries_.resize(node_map_.size()); // Create parent StdIntMap for the entries. 
- for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; @@ -229,7 +224,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { } // Fill the entries. - for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; @@ -251,8 +246,8 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { case Prefilter::OR: case Prefilter::AND: { - std::set<int> uniq_child; - for (int j = 0; j < prefilter->subs()->size() ; j++) { + set<int> uniq_child; + for (size_t j = 0; j < prefilter->subs()->size(); j++) { Prefilter* child = (*prefilter->subs())[j]; Prefilter* canonical = CanonicalNode(child); if (canonical == NULL) { @@ -264,11 +259,13 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { // To the child, we want to add to parent indices. Entry* child_entry = &entries_[child_id]; if (child_entry->parents->find(prefilter->unique_id()) == - child_entry->parents->end()) + child_entry->parents->end()) { (*child_entry->parents)[prefilter->unique_id()] = 1; + } } - entry->propagate_up_at_count = - prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; + entry->propagate_up_at_count = prefilter->op() == Prefilter::AND + ? static_cast<int>(uniq_child.size()) + : 1; break; } @@ -276,13 +273,13 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { } // For top level nodes, populate regexp id. 
- for (int i = 0; i < prefilter_vec_.size(); i++) { + for (size_t i = 0; i < prefilter_vec_.size(); i++) { if (prefilter_vec_[i] == NULL) continue; int id = CanonicalNode(prefilter_vec_[i])->unique_id(); DCHECK_LE(0, id); Entry* entry = &entries_[id]; - entry->regexps.push_back(i); + entry->regexps.push_back(static_cast<int>(i)); } } @@ -293,13 +290,13 @@ void PrefilterTree::RegexpsGivenStrings( regexps->clear(); if (!compiled_) { LOG(WARNING) << "Compile() not called"; - for (int i = 0; i < prefilter_vec_.size(); ++i) - regexps->push_back(i); + for (size_t i = 0; i < prefilter_vec_.size(); ++i) + regexps->push_back(static_cast<int>(i)); } else { if (!prefilter_vec_.empty()) { - IntMap regexps_map(prefilter_vec_.size()); + IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); vector<int> matched_atom_ids; - for (int j = 0; j < matched_atoms.size(); j++) { + for (size_t j = 0; j < matched_atoms.size(); j++) { matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; } @@ -317,15 +314,15 @@ void PrefilterTree::RegexpsGivenStrings( void PrefilterTree::PropagateMatch(const vector<int>& atom_ids, IntMap* regexps) const { - IntMap count(entries_.size()); - IntMap work(entries_.size()); - for (int i = 0; i < atom_ids.size(); i++) + IntMap count(static_cast<int>(entries_.size())); + IntMap work(static_cast<int>(entries_.size())); + for (size_t i = 0; i < atom_ids.size(); i++) work.set(atom_ids[i], 1); for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { const Entry& entry = entries_[it->index()]; VLOG(10) << "Processing: " << it->index(); // Record regexps triggered. 
- for (int i = 0; i < entry.regexps.size(); i++) { + for (size_t i = 0; i < entry.regexps.size(); i++) { VLOG(10) << "Regexp triggered: " << entry.regexps[i]; regexps->set(entry.regexps[i], 1); } @@ -365,7 +362,7 @@ void PrefilterTree::PrintDebugInfo() { VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); VLOG(10) << "#Unique Nodes: " << entries_.size(); - for (int i = 0; i < entries_.size(); ++i) { + for (size_t i = 0; i < entries_.size(); ++i) { StdIntMap* parents = entries_[i].parents; const vector<int>& regexps = entries_[i].regexps; VLOG(10) << "EntryId: " << i @@ -390,7 +387,7 @@ string PrefilterTree::DebugNodeString(Prefilter* node) const { // Adding the operation disambiguates AND and OR nodes. node_string += node->op() == Prefilter::AND ? "AND" : "OR"; node_string += "("; - for (int i = 0; i < node->subs()->size() ; i++) { + for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) node_string += ','; node_string += Itoa((*node->subs())[i]->unique_id()); diff --git a/third_party/re2/re2/prefilter_tree.h b/third_party/re2/re2/prefilter_tree.h index 94eb183..abea55d 100644 --- a/third_party/re2/re2/prefilter_tree.h +++ b/third_party/re2/re2/prefilter_tree.h @@ -16,15 +16,13 @@ #ifndef RE2_PREFILTER_TREE_H_ #define RE2_PREFILTER_TREE_H_ -#include <map> - #include "util/util.h" #include "util/sparse_array.h" namespace re2 { typedef SparseArray<int> IntMap; -typedef std::map<int, int> StdIntMap; +typedef map<int, int> StdIntMap; class Prefilter; @@ -125,7 +123,7 @@ class PrefilterTree { // Has the prefilter tree been compiled. 
bool compiled_; - DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree); + DISALLOW_COPY_AND_ASSIGN(PrefilterTree); }; } // namespace diff --git a/third_party/re2/re2/prog.cc b/third_party/re2/re2/prog.cc index ef9ef23..499f560 100644 --- a/third_party/re2/re2/prog.cc +++ b/third_party/re2/re2/prog.cc @@ -25,7 +25,7 @@ void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { set_out_opcode(out, kInstByteRange); lo_ = lo & 0xFF; hi_ = hi & 0xFF; - foldcase_ = foldcase; + foldcase_ = foldcase & 0xFF; } void Prog::Inst::InitCapture(int cap, uint32 out) { @@ -295,13 +295,15 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { } void Prog::MarkByteRange(int lo, int hi) { - CHECK_GE(lo, 0); - CHECK_GE(hi, 0); - CHECK_LE(lo, 255); - CHECK_LE(hi, 255); - if (lo > 0) + DCHECK_GE(lo, 0); + DCHECK_GE(hi, 0); + DCHECK_LE(lo, 255); + DCHECK_LE(hi, 255); + DCHECK_LE(lo, hi); + if (0 < lo && lo <= 255) byterange_.Set(lo - 1); - byterange_.Set(hi); + if (0 <= hi && hi <= 255) + byterange_.Set(hi); } void Prog::ComputeByteMap() { @@ -325,12 +327,12 @@ void Prog::ComputeByteMap() { bytemap_range_ = bytemap_[255] + 1; unbytemap_ = new uint8[bytemap_range_]; for (int i = 0; i < 256; i++) - unbytemap_[bytemap_[i]] = i; + unbytemap_[bytemap_[i]] = static_cast<uint8>(i); if (0) { // For debugging: use trivial byte map. 
for (int i = 0; i < 256; i++) { - bytemap_[i] = i; - unbytemap_[i] = i; + bytemap_[i] = static_cast<uint8>(i); + unbytemap_[i] = static_cast<uint8>(i); } bytemap_range_ = 256; LOG(INFO) << "Using trivial bytemap."; diff --git a/third_party/re2/re2/prog.h b/third_party/re2/re2/prog.h index 2cf65bc..8c5b2c4 100644 --- a/third_party/re2/re2/prog.h +++ b/third_party/re2/re2/prog.h @@ -10,6 +10,7 @@ #define RE2_PROG_H__ #include "util/util.h" +#include "util/sparse_array.h" #include "re2/re2.h" namespace re2 { @@ -42,7 +43,7 @@ class Bitmap { static const int WordLog = 5; static const int Words = (Bits+31)/32; uint32 w_[Words]; - DISALLOW_EVIL_CONSTRUCTORS(Bitmap); + DISALLOW_COPY_AND_ASSIGN(Bitmap); }; @@ -95,7 +96,7 @@ class Prog { void InitFail(); // Getters - int id(Prog* p) { return this - p->inst_; } + int id(Prog* p) { return static_cast<int>(this - p->inst_); } InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } int out() { return out_opcode_>>3; } int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } @@ -167,7 +168,7 @@ class Prog { friend struct PatchList; friend class Prog; - DISALLOW_EVIL_CONSTRUCTORS(Inst); + DISALLOW_COPY_AND_ASSIGN(Inst); }; // Whether to anchor the search. @@ -200,10 +201,10 @@ class Prog { int start_unanchored() { return start_unanchored_; } void set_start(int start) { start_ = start; } void set_start_unanchored(int start) { start_unanchored_ = start; } - int64 size() { return size_; } + int size() { return size_; } bool reversed() { return reversed_; } void set_reversed(bool reversed) { reversed_ = reversed; } - int64 byte_inst_count() { return byte_inst_count_; } + int byte_inst_count() { return byte_inst_count_; } const Bitmap<256>& byterange() { return byterange_; } void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } int64 dfa_mem() { return dfa_mem_; } @@ -329,6 +330,10 @@ class Prog { // Returns true on success, false on error. 
bool PossibleMatchRange(string* min, string* max, int maxlen); + // EXPERIMENTAL! SUBJECT TO CHANGE! + // Outputs the program fanout into the given sparse array. + void Fanout(SparseArray<int>* fanout); + // Compiles a collection of regexps to Prog. Each regexp will have // its own Match instruction recording the index in the vector. static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, @@ -368,7 +373,7 @@ class Prog { uint8* onepass_nodes_; // data for OnePass nodes OneState* onepass_start_; // start node for OnePass program - DISALLOW_EVIL_CONSTRUCTORS(Prog); + DISALLOW_COPY_AND_ASSIGN(Prog); }; } // namespace re2 diff --git a/third_party/re2/re2/re2.cc b/third_party/re2/re2/re2.cc index b9e44fc..b3e582f 100644 --- a/third_party/re2/re2/re2.cc +++ b/third_party/re2/re2/re2.cc @@ -11,16 +11,10 @@ #include <stdio.h> #include <string> -#ifdef WIN32 -#define strtoll _strtoi64 -#define strtoull _strtoui64 -#define strtof strtod -#else -#include <pthread.h> -#endif #include <errno.h> #include "util/util.h" #include "util/flags.h" +#include "util/sparse_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -37,22 +31,10 @@ const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {}; const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume = {}; -#define kDefaultMaxMem (8<<20) - -RE2::Options::Options() - : encoding_(EncodingUTF8), - posix_syntax_(false), - longest_match_(false), - log_errors_(true), - max_mem_(kDefaultMaxMem), - literal_(false), - never_nl_(false), - never_capture_(false), - case_sensitive_(true), - perl_classes_(false), - word_boundary_(false), - one_line_(false) { -} +// This will trigger LNK2005 error in MSVC. 
+#ifndef _MSC_VER +const int RE2::Options::kDefaultMaxMem; // initialized in re2.h +#endif RE2::Options::Options(RE2::CannedOptions opt) : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), @@ -62,6 +44,7 @@ RE2::Options::Options(RE2::CannedOptions opt) max_mem_(kDefaultMaxMem), literal_(false), never_nl_(false), + dot_nl_(false), never_capture_(false), case_sensitive_(true), perl_classes_(false), @@ -169,6 +152,9 @@ int RE2::Options::ParseFlags() const { if (never_nl()) flags |= Regexp::NeverNL; + if (dot_nl()) + flags |= Regexp::DotNL; + if (never_capture()) flags |= Regexp::NeverCapture; @@ -285,8 +271,36 @@ int RE2::ProgramSize() const { return prog_->size(); } +int RE2::ProgramFanout(map<int, int>* histogram) const { + if (prog_ == NULL) + return -1; + SparseArray<int> fanout(prog_->size()); + prog_->Fanout(&fanout); + histogram->clear(); + for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { + // TODO(junyer): Optimise this? + int bucket = 0; + while (1 << bucket < i->second) { + bucket++; + } + (*histogram)[bucket]++; + } + return histogram->rbegin()->first; +} + +// Returns num_captures_, computing it if needed, or -1 if the +// regexp wasn't valid on construction. +int RE2::NumberOfCapturingGroups() const { + MutexLock l(mutex_); + if (suffix_regexp_ == NULL) + return -1; + if (num_captures_ == -1) + num_captures_ = suffix_regexp_->NumCaptures(); + return num_captures_; +} + // Returns named_groups_, computing it if needed. -const map<string, int>& RE2::NamedCapturingGroups() const { +const map<string, int>& RE2::NamedCapturingGroups() const { MutexLock l(mutex_); if (!ok()) return *empty_named_groups; @@ -299,7 +313,7 @@ const map<string, int>& RE2::NamedCapturingGroups() const { } // Returns group_names_, computing it if needed. 
-const map<int, string>& RE2::CapturingGroupNames() const { +const map<int, string>& RE2::CapturingGroupNames() const { MutexLock l(mutex_); if (!ok()) return *empty_group_names; @@ -371,7 +385,7 @@ bool RE2::Replace(string *str, int nvec = 1 + MaxSubmatch(rewrite); if (nvec > arraysize(vec)) return false; - if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) + if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec)) return false; string s; @@ -398,7 +412,8 @@ int RE2::GlobalReplace(string *str, string out; int count = 0; while (p <= ep) { - if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) + if (!re.Match(*str, static_cast<int>(p - str->data()), + static_cast<int>(str->size()), UNANCHORED, vec, nvec)) break; if (p < vec[0].begin()) out.append(p, vec[0].begin() - p); @@ -482,7 +497,7 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { if (prog_ == NULL) return false; - int n = prefix_.size(); + int n = static_cast<int>(prefix_.size()); if (n > maxlen) n = maxlen; @@ -554,7 +569,10 @@ bool RE2::Match(const StringPiece& text, if (startpos < 0 || startpos > endpos || endpos > text.size()) { if (options_.log_errors()) - LOG(ERROR) << "RE2: invalid startpos, endpos pair."; + LOG(ERROR) << "RE2: invalid startpos, endpos pair. 
[" + << "startpos: " << startpos << ", " + << "endpos: " << endpos << ", " + << "text size: " << text.size() << "]"; return false; } @@ -591,7 +609,7 @@ bool RE2::Match(const StringPiece& text, if (!prefix_.empty()) { if (startpos != 0) return false; - prefixlen = prefix_.size(); + prefixlen = static_cast<int>(prefix_.size()); if (prefixlen > subtext.size()) return false; if (prefix_foldcase_) { @@ -832,8 +850,8 @@ bool RE2::DoMatch(const StringPiece& text, return false; } - if(consumed != NULL) - *consumed = vec[0].end() - text.begin(); + if (consumed != NULL) + *consumed = static_cast<int>(vec[0].end() - text.begin()); if (n == 0 || args == NULL) { // We are not interested in results @@ -855,7 +873,7 @@ bool RE2::DoMatch(const StringPiece& text, if (!args[i]->Parse(s.data(), s.size())) { // TODO: Should we indicate what the error was? VLOG(1) << "Parse error on #" << i << " " << s << " " - << (void*)s.data() << "/" << s.size(); + << (void*)s.data() << "/" << s.size(); delete[] heapvec; return false; } @@ -871,48 +889,35 @@ bool RE2::Rewrite(string *out, const StringPiece &rewrite, const StringPiece *vec, int veclen) const { for (const char *s = rewrite.data(), *end = s + rewrite.size(); s < end; s++) { - int c = *s; - if (c == '\\') { - s++; - c = (s < end) ? *s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n >= veclen) { - if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); - } - return false; + if (*s != '\\') { + out->push_back(*s); + continue; + } + s++; + int c = (s < end) ? 
*s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (options_.log_errors()) { + LOG(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); } - StringPiece snip = vec[n]; - if (snip.size() > 0) - out->append(snip.data(), snip.size()); - } else if (c == '\\') { - out->push_back('\\'); - } else { - if (options_.log_errors()) - LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); return false; } + StringPiece snip = vec[n]; + if (snip.size() > 0) + out->append(snip.data(), snip.size()); + } else if (c == '\\') { + out->push_back('\\'); } else { - out->push_back(c); + if (options_.log_errors()) + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; } } return true; } -// Return the number of capturing subpatterns, or -1 if the -// regexp wasn't valid on construction. -int RE2::NumberOfCapturingGroups() const { - if (suffix_regexp_ == NULL) - return -1; - ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case" - " multiple threads end up doing the same work in parallel."); - if (num_captures_ == -1) - num_captures_ = suffix_regexp_->NumCaptures(); - return num_captures_; -} - // Checks that the rewrite string is well-formed with respect to this // regular expression. bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { @@ -987,16 +992,23 @@ bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { // Largest number spec that we are willing to parse static const int kMaxNumberLength = 32; -// REQUIRES "buf" must have length at least kMaxNumberLength+1 +// REQUIRES "buf" must have length at least nbuf. // Copies "str" into "buf" and null-terminates. // Overwrites *np with the new length. 
-static const char* TerminateNumber(char* buf, const char* str, int* np) { +static const char* TerminateNumber(char* buf, int nbuf, const char* str, int* np, + bool accept_spaces) { int n = *np; if (n <= 0) return ""; if (n > 0 && isspace(*str)) { // We are less forgiving than the strtoxxx() routines and do not - // allow leading spaces. - return ""; + // allow leading spaces. We do allow leading spaces for floats. + if (!accept_spaces) { + return ""; + } + while (n > 0 && isspace(*str)) { + n--; + str++; + } } // Although buf has a fixed maximum size, we can still handle @@ -1026,7 +1038,7 @@ static const char* TerminateNumber(char* buf, const char* str, int* np) { str--; } - if (n > kMaxNumberLength) return ""; + if (n > nbuf-1) return ""; memmove(buf, str, n); if (neg) { @@ -1043,7 +1055,7 @@ bool RE2::Arg::parse_long_radix(const char* str, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); + str = TerminateNumber(buf, sizeof buf, str, &n, false); char* end; errno = 0; long r = strtol(str, &end, radix); @@ -1060,7 +1072,7 @@ bool RE2::Arg::parse_ulong_radix(const char* str, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); + str = TerminateNumber(buf, sizeof buf, str, &n, false); if (str[0] == '-') { // strtoul() will silently accept negative numbers and parse // them. This module is more strict and treats them as errors. 
@@ -1085,7 +1097,7 @@ bool RE2::Arg::parse_short_radix(const char* str, if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse if ((short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast<short*>(dest)) = r; + *(reinterpret_cast<short*>(dest)) = (short)r; return true; } @@ -1097,7 +1109,7 @@ bool RE2::Arg::parse_ushort_radix(const char* str, if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse if ((ushort)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast<unsigned short*>(dest)) = r; + *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; return true; } @@ -1125,13 +1137,14 @@ bool RE2::Arg::parse_uint_radix(const char* str, return true; } +#if RE2_HAVE_LONGLONG bool RE2::Arg::parse_longlong_radix(const char* str, int n, void* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); + str = TerminateNumber(buf, sizeof buf, str, &n, false); char* end; errno = 0; int64 r = strtoll(str, &end, radix); @@ -1148,7 +1161,7 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); + str = TerminateNumber(buf, sizeof buf, str, &n, false); if (str[0] == '-') { // strtoull() will silently accept negative numbers and parse // them. This module is more strict and treats them as errors. 
@@ -1163,27 +1176,26 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, *(reinterpret_cast<uint64*>(dest)) = r; return true; } +#endif static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { if (n == 0) return false; static const int kMaxLength = 200; - char buf[kMaxLength]; - if (n >= kMaxLength) return false; - memcpy(buf, str, n); - buf[n] = '\0'; - errno = 0; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, true); char* end; + errno = 0; double r; if (isfloat) { - r = strtof(buf, &end); + r = strtof(str, &end); } else { - r = strtod(buf, &end); + r = strtod(str, &end); } - if (end != buf + n) return false; // Leftover junk + if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; if (isfloat) { - *(reinterpret_cast<float*>(dest)) = r; + *(reinterpret_cast<float*>(dest)) = (float)r; } else { *(reinterpret_cast<double*>(dest)) = r; } diff --git a/third_party/re2/re2/re2.h b/third_party/re2/re2/re2.h index c509853..a10d6a0 100644 --- a/third_party/re2/re2/re2.h +++ b/third_party/re2/re2/re2.h @@ -17,7 +17,7 @@ // some of the more complicated things thrown away. In particular, // backreferences and generalized assertions are not available, nor is \Z. // -// See http://code.google.com/p/re2/wiki/Syntax for the syntax +// See https://github.com/google/re2/wiki/Syntax for the syntax // supported by RE2, and a comparison with PCRE and PERL regexps. // // For those not familiar with Perl's regular expressions, @@ -179,13 +179,16 @@ // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)); // will leave 64 in a, b, c, and d. 
- #include <stdint.h> #include <map> #include <string> #include "re2/stringpiece.h" #include "re2/variadic_function.h" +#ifndef RE2_HAVE_LONGLONG +#define RE2_HAVE_LONGLONG 1 +#endif + namespace re2 { using std::string; @@ -240,7 +243,7 @@ class RE2 { ErrorBadPerlOp, // bad perl operator ErrorBadUTF8, // invalid UTF-8 in regexp ErrorBadNamedCapture, // bad named capture group - ErrorPatternTooLarge, // pattern too large (compile failed) + ErrorPatternTooLarge // pattern too large (compile failed) }; // Predefined common options. @@ -290,6 +293,11 @@ class RE2 { // Larger numbers are more expensive than smaller numbers. int ProgramSize() const; + // EXPERIMENTAL! SUBJECT TO CHANGE! + // Outputs the program fanout as a histogram bucketed by powers of 2. + // Returns the number of the largest non-empty bucket. + int ProgramFanout(map<int, int>* histogram) const; + // Returns the underlying Regexp; not for general use. // Returns entire_regexp_ so that callers don't need // to know about prefix_ and prefix_foldcase_. @@ -394,6 +402,8 @@ class RE2 { // // Returns true iff a match occurred and the extraction happened // successfully; if no match occurs, the string is left unaffected. + // + // REQUIRES: "text" must not alias any part of "*out". static bool Extract(const StringPiece &text, const RE2& pattern, const StringPiece &rewrite, @@ -429,7 +439,7 @@ class RE2 { enum Anchor { UNANCHORED, // No anchoring ANCHOR_START, // Anchor at start only - ANCHOR_BOTH, // Anchor at start and end + ANCHOR_BOTH // Anchor at start and end }; // Return the number of capturing subpatterns, or -1 if the @@ -437,7 +447,6 @@ class RE2 { // does not count: if the regexp is "(a)(b)", returns 2. int NumberOfCapturingGroups() const; - // Return a map from names to capturing indices. // The map records the index of the leftmost group // with the given name. @@ -512,6 +521,7 @@ class RE2 { // max_mem (see below) approx. 
max memory footprint of RE2 // literal (false) interpret string as literal, not regexp // never_nl (false) never match \n, even if it is in regexp + // dot_nl (false) dot matches everything including new line // never_capture (false) parse all parens as non-capturing // case_sensitive (true) match is case-sensitive (regexp can override // with (?i) unless in posix_syntax mode) @@ -552,16 +562,29 @@ class RE2 { // If this happens too often, RE2 falls back on the NFA implementation. // For now, make the default budget something close to Code Search. -#ifndef WIN32 static const int kDefaultMaxMem = 8<<20; -#endif enum Encoding { EncodingUTF8 = 1, EncodingLatin1 }; - Options(); + Options() : + encoding_(EncodingUTF8), + posix_syntax_(false), + longest_match_(false), + log_errors_(true), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + dot_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) { + } + /*implicit*/ Options(CannedOptions); Encoding encoding() const { return encoding_; } @@ -587,8 +610,8 @@ class RE2 { bool log_errors() const { return log_errors_; } void set_log_errors(bool b) { log_errors_ = b; } - int max_mem() const { return max_mem_; } - void set_max_mem(int m) { max_mem_ = m; } + int64_t max_mem() const { return max_mem_; } + void set_max_mem(int64_t m) { max_mem_ = m; } bool literal() const { return literal_; } void set_literal(bool b) { literal_ = b; } @@ -596,6 +619,9 @@ class RE2 { bool never_nl() const { return never_nl_; } void set_never_nl(bool b) { never_nl_ = b; } + bool dot_nl() const { return dot_nl_; } + void set_dot_nl(bool b) { dot_nl_ = b; } + bool never_capture() const { return never_capture_; } void set_never_capture(bool b) { never_capture_ = b; } @@ -619,6 +645,7 @@ class RE2 { max_mem_ = src.max_mem_; literal_ = src.literal_; never_nl_ = src.never_nl_; + dot_nl_ = src.dot_nl_; never_capture_ = src.never_capture_; case_sensitive_ = 
src.case_sensitive_; perl_classes_ = src.perl_classes_; @@ -636,13 +663,14 @@ class RE2 { int64_t max_mem_; bool literal_; bool never_nl_; + bool dot_nl_; bool never_capture_; bool case_sensitive_; bool perl_classes_; bool word_boundary_; bool one_line_; - //DISALLOW_EVIL_CONSTRUCTORS(Options); + //DISALLOW_COPY_AND_ASSIGN(Options); Options(const Options&); void operator=(const Options&); }; @@ -657,8 +685,10 @@ class RE2 { static inline Arg CRadix(unsigned int* x); static inline Arg CRadix(long* x); static inline Arg CRadix(unsigned long* x); + #if RE2_HAVE_LONGLONG static inline Arg CRadix(long long* x); static inline Arg CRadix(unsigned long long* x); + #endif static inline Arg Hex(short* x); static inline Arg Hex(unsigned short* x); @@ -666,8 +696,10 @@ class RE2 { static inline Arg Hex(unsigned int* x); static inline Arg Hex(long* x); static inline Arg Hex(unsigned long* x); + #if RE2_HAVE_LONGLONG static inline Arg Hex(long long* x); static inline Arg Hex(unsigned long long* x); + #endif static inline Arg Octal(short* x); static inline Arg Octal(unsigned short* x); @@ -675,8 +707,10 @@ class RE2 { static inline Arg Octal(unsigned int* x); static inline Arg Octal(long* x); static inline Arg Octal(unsigned long* x); + #if RE2_HAVE_LONGLONG static inline Arg Octal(long long* x); static inline Arg Octal(unsigned long long* x); + #endif private: void Init(const StringPiece& pattern, const Options& options); @@ -711,7 +745,7 @@ class RE2 { // Map from capture indices to names mutable const map<int, string>* group_names_; - //DISALLOW_EVIL_CONSTRUCTORS(RE2); + //DISALLOW_COPY_AND_ASSIGN(RE2); RE2(const RE2&); void operator=(const RE2&); }; @@ -756,8 +790,10 @@ class RE2::Arg { MAKE_PARSER(unsigned int, parse_uint); MAKE_PARSER(long, parse_long); MAKE_PARSER(unsigned long, parse_ulong); + #if RE2_HAVE_LONGLONG MAKE_PARSER(long long, parse_longlong); MAKE_PARSER(unsigned long long, parse_ulonglong); + #endif MAKE_PARSER(float, parse_float); MAKE_PARSER(double, 
parse_double); MAKE_PARSER(string, parse_string); @@ -765,12 +801,11 @@ class RE2::Arg { #undef MAKE_PARSER - // Generic constructor - template <class T> Arg(T*, Parser parser); - // Generic constructor template + // Generic constructor templates template <class T> Arg(T* p) - : arg_(p), parser_(_RE2_MatchObject<T>::Parse) { - } + : arg_(p), parser_(_RE2_MatchObject<T>::Parse) { } + template <class T> Arg(T* p, Parser parser) + : arg_(p), parser_(parser) { } // Parse the data bool Parse(const char* str, int n) const; @@ -803,8 +838,10 @@ class RE2::Arg { DECLARE_INTEGER_PARSER(uint); DECLARE_INTEGER_PARSER(long); DECLARE_INTEGER_PARSER(ulong); + #if RE2_HAVE_LONGLONG DECLARE_INTEGER_PARSER(longlong); DECLARE_INTEGER_PARSER(ulonglong); + #endif #undef DECLARE_INTEGER_PARSER }; @@ -825,14 +862,16 @@ inline bool RE2::Arg::Parse(const char* str, int n) const { inline RE2::Arg RE2::CRadix(type* ptr) { \ return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); } -MAKE_INTEGER_PARSER(short, short); -MAKE_INTEGER_PARSER(unsigned short, ushort); -MAKE_INTEGER_PARSER(int, int); -MAKE_INTEGER_PARSER(unsigned int, uint); -MAKE_INTEGER_PARSER(long, long); -MAKE_INTEGER_PARSER(unsigned long, ulong); -MAKE_INTEGER_PARSER(long long, longlong); -MAKE_INTEGER_PARSER(unsigned long long, ulonglong); +MAKE_INTEGER_PARSER(short, short) +MAKE_INTEGER_PARSER(unsigned short, ushort) +MAKE_INTEGER_PARSER(int, int) +MAKE_INTEGER_PARSER(unsigned int, uint) +MAKE_INTEGER_PARSER(long, long) +MAKE_INTEGER_PARSER(unsigned long, ulong) +#if RE2_HAVE_LONGLONG +MAKE_INTEGER_PARSER(long long, longlong) +MAKE_INTEGER_PARSER(unsigned long long, ulonglong) +#endif #undef MAKE_INTEGER_PARSER diff --git a/third_party/re2/re2/regexp.cc b/third_party/re2/re2/regexp.cc index ed4c3a0..99e72e5 100644 --- a/third_party/re2/re2/regexp.cc +++ b/third_party/re2/re2/regexp.cc @@ -14,7 +14,7 @@ namespace re2 { // Constructor. Allocates vectors as appropriate for operator. 
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) - : op_(op), + : op_(static_cast<uint8>(op)), simple_(false), parse_flags_(static_cast<uint16>(parse_flags)), ref_(1), @@ -43,7 +43,8 @@ Regexp::~Regexp() { delete[] runes_; break; case kRegexpCharClass: - cc_->Delete(); + if (cc_) + cc_->Delete(); delete ccb_; break; } @@ -106,7 +107,7 @@ void Regexp::Decref() { GLOBAL_MUTEX_LOCK(ref_mutex); int r = (*ref_map)[this] - 1; if (r < kMaxRef) { - ref_ = r; + ref_ = static_cast<uint16>(r); ref_map->erase(this); } else { (*ref_map)[this] = r; @@ -211,6 +212,13 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, if (nsub == 1) return sub[0]; + if (nsub == 0) { + if (op == kRegexpAlternate) + return new Regexp(kRegexpNoMatch, flags); + else + return new Regexp(kRegexpEmptyMatch, flags); + } + Regexp** subcopy = NULL; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. @@ -445,10 +453,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) { continue; } - int n = stk.size(); + size_t n = stk.size(); if (n == 0) break; + DCHECK_GE(n, 2); a = stk[n-2]; b = stk[n-1]; stk.resize(n-2); @@ -517,7 +526,7 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> { private: int ncapture_; - DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker); + DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker); }; int Regexp::NumCaptures() { @@ -561,7 +570,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> { private: map<string, int>* map_; - DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker); + DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker); }; map<string, int>* Regexp::NamedCaptures() { @@ -601,7 +610,7 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> { private: map<int, string>* map_; - DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker); + DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker); }; map<int, string>* Regexp::CaptureNames() { @@ -643,7 +652,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** 
suffix) { if (re->parse_flags() & Latin1) { prefix->resize(re->nrunes_); for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = re->runes_[j]; + (*prefix)[j] = static_cast<char>(re->runes_[j]); } else { // Convert to UTF-8 in place. // Assume worst-case space and then trim. @@ -652,7 +661,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { for (int j = 0; j < re->nrunes_; j++) { Rune r = re->runes_[j]; if (r < Runeself) - *p++ = r; + *p++ = static_cast<char>(r); else p += runetochar(p, &r); } @@ -662,14 +671,14 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { case kRegexpLiteral: if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, re->rune_); + prefix->append(1, static_cast<char>(re->rune_)); } else { char buf[UTFmax]; prefix->append(buf, runetochar(buf, &re->rune_)); } break; } - *foldcase = (sub[i]->parse_flags() & FoldCase); + *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; i++; // The rest. @@ -849,7 +858,7 @@ void CharClassBuilder::Negate() { } ranges_.clear(); - for (int i = 0; i < v.size(); i++) + for (size_t i = 0; i < v.size(); i++) ranges_.insert(v[i]); upper_ = AlphaMask & ~upper_; @@ -915,12 +924,12 @@ bool CharClass::Contains(Rune r) { } CharClass* CharClassBuilder::GetCharClass() { - CharClass* cc = CharClass::New(ranges_.size()); + CharClass* cc = CharClass::New(static_cast<int>(ranges_.size())); int n = 0; for (iterator it = begin(); it != end(); ++it) cc->ranges_[n++] = *it; cc->nranges_ = n; - DCHECK_LE(n, ranges_.size()); + DCHECK_LE(n, static_cast<int>(ranges_.size())); cc->nrunes_ = nrunes_; cc->folds_ascii_ = FoldsASCII(); return cc; diff --git a/third_party/re2/re2/regexp.h b/third_party/re2/re2/regexp.h index 331c017..5f222b7 100644 --- a/third_party/re2/re2/regexp.h +++ b/third_party/re2/re2/regexp.h @@ -208,10 +208,11 @@ class RegexpStatus { StringPiece error_arg_; // Piece of regexp containing syntax error. 
string* tmp_; // Temporary storage, possibly where error_arg_ is. - DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus); + DISALLOW_COPY_AND_ASSIGN(RegexpStatus); }; -// Walker to implement Simplify. +// Walkers to implement Simplify. +class CoalesceWalker; class SimplifyWalker; // Compiled form; see prog.h @@ -261,7 +262,7 @@ class CharClass { int nrunes_; RuneRange *ranges_; int nranges_; - DISALLOW_EVIL_CONSTRUCTORS(CharClass); + DISALLOW_COPY_AND_ASSIGN(CharClass); }; class Regexp { @@ -312,7 +313,7 @@ class Regexp { // Get. No set, Regexps are logically immutable once created. RegexpOp op() { return static_cast<RegexpOp>(op_); } int nsub() { return nsub_; } - bool simple() { return simple_; } + bool simple() { return simple_ != 0; } enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); } int Ref(); // For testing. @@ -353,6 +354,7 @@ class Regexp { // removed. The result will capture exactly the same // subexpressions the original did, unless formatted with ToString. Regexp* Simplify(); + friend class CoalesceWalker; friend class SimplifyWalker; // Parses the regexp src and then simplifies it and sets *dst to the @@ -568,7 +570,7 @@ class Regexp { void *the_union_[2]; // as big as any other element, for memset }; - DISALLOW_EVIL_CONSTRUCTORS(Regexp); + DISALLOW_COPY_AND_ASSIGN(Regexp); }; // Character class set: contains non-overlapping, non-abutting RuneRanges. @@ -602,7 +604,7 @@ class CharClassBuilder { uint32 lower_; // bitmap of a-z int nrunes_; RuneRangeSet ranges_; - DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder); + DISALLOW_COPY_AND_ASSIGN(CharClassBuilder); }; // Tell g++ that bitwise ops on ParseFlags produce ParseFlags. 
diff --git a/third_party/re2/re2/set.cc b/third_party/re2/re2/set.cc index 2bcd30a..a1a84ba 100644 --- a/third_party/re2/re2/set.cc +++ b/third_party/re2/re2/set.cc @@ -20,7 +20,7 @@ RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) { } RE2::Set::~Set() { - for (int i = 0; i < re_.size(); i++) + for (size_t i = 0; i < re_.size(); i++) re_[i]->Decref(); delete prog_; } @@ -45,7 +45,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) { } // Concatenate with match index and push on vector. - int n = re_.size(); + int n = static_cast<int>(re_.size()); re2::Regexp* m = re2::Regexp::HaveMatch(n, pf); if (re->op() == kRegexpConcat) { int nsub = re->nsub(); @@ -75,8 +75,8 @@ bool RE2::Set::Compile() { Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( options_.ParseFlags()); - re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]), - re_.size(), pf); + re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(re_.data()), + static_cast<int>(re_.size()), pf); re_.clear(); re2::Regexp* sre = re->Simplify(); re->Decref(); diff --git a/third_party/re2/re2/set.h b/third_party/re2/re2/set.h index d716425..1f55b61 100644 --- a/third_party/re2/re2/set.h +++ b/third_party/re2/re2/set.h @@ -45,7 +45,7 @@ class RE2::Set { vector<re2::Regexp*> re_; re2::Prog* prog_; bool compiled_; - //DISALLOW_EVIL_CONSTRUCTORS(Set); + //DISALLOW_COPY_AND_ASSIGN(Set); Set(const Set&); void operator=(const Set&); }; diff --git a/third_party/re2/re2/simplify.cc b/third_party/re2/re2/simplify.cc index faf3208..ecc60e7 100644 --- a/third_party/re2/re2/simplify.cc +++ b/third_party/re2/re2/simplify.cc @@ -61,7 +61,7 @@ bool Regexp::ComputeSimple() { // These are simple as long as the subpieces are simple. 
subs = sub(); for (int i = 0; i < nsub_; i++) - if (!subs[i]->simple_) + if (!subs[i]->simple()) return false; return true; case kRegexpCharClass: @@ -71,12 +71,12 @@ bool Regexp::ComputeSimple() { return !cc_->empty() && !cc_->full(); case kRegexpCapture: subs = sub(); - return subs[0]->simple_; + return subs[0]->simple(); case kRegexpStar: case kRegexpPlus: case kRegexpQuest: subs = sub(); - if (!subs[0]->simple_) + if (!subs[0]->simple()) return false; switch (subs[0]->op_) { case kRegexpStar: @@ -97,6 +97,36 @@ bool Regexp::ComputeSimple() { } // Walker subclass used by Simplify. +// Coalesces runs of star/plus/quest/repeat of the same literal along with any +// occurrences of that literal into repeats of that literal. It also works for +// char classes, any char and any byte. +// PostVisit creates the coalesced result, which should then be simplified. +class CoalesceWalker : public Regexp::Walker<Regexp*> { + public: + CoalesceWalker() {} + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, + Regexp** child_args, int nchild_args); + virtual Regexp* Copy(Regexp* re); + virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); + + private: + // These functions are declared inside CoalesceWalker so that + // they can edit the private fields of the Regexps they construct. + + // Returns true if r1 and r2 can be coalesced. In particular, ensures that + // the parse flags are consistent. (They will not be checked again later.) + static bool CanCoalesce(Regexp* r1, Regexp* r2); + + // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards + // will be empty match and the coalesced op. In other cases, where part of a + // literal string was removed to be coalesced, the array elements afterwards + // will be the coalesced op and the remainder of the literal string. + static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); + + DISALLOW_COPY_AND_ASSIGN(CoalesceWalker); +}; + +// Walker subclass used by Simplify. 
// The simplify walk is purely post-recursive: given the simplified children, // PostVisit creates the simplified result. // The child_args are simplified Regexp*s. @@ -104,9 +134,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> { public: SimplifyWalker() {} virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); - virtual Regexp* PostVisit(Regexp* re, - Regexp* parent_arg, - Regexp* pre_arg, + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args); virtual Regexp* Copy(Regexp* re); virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); @@ -130,7 +158,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> { // Caller must Decref return value when done with it. static Regexp* SimplifyCharClass(Regexp* re); - DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker); + DISALLOW_COPY_AND_ASSIGN(SimplifyWalker); }; // Simplifies a regular expression, returning a new regexp. @@ -143,14 +171,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> { // Caller must Decref() return value when done with it. Regexp* Regexp::Simplify() { - if (simple_) - return Incref(); - SimplifyWalker w; - return w.Walk(this, NULL); + CoalesceWalker cw; + Regexp* cre = cw.Walk(this, NULL); + if (cre == NULL) + return cre; + SimplifyWalker sw; + Regexp* sre = sw.Walk(cre, NULL); + cre->Decref(); + return sre; } #define Simplify DontCallSimplify // Avoid accidental recursion +// Utility function for PostVisit implementations that compares re->sub() with +// child_args to determine whether any child_args changed. In the common case, +// where nothing changed, calls Decref() for all child_args and returns false, +// so PostVisit must return re->Incref(). Otherwise, returns true. 
+static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { + for (int i = 0; i < re->nsub(); i++) { + Regexp* sub = re->sub()[i]; + Regexp* newsub = child_args[i]; + if (newsub != sub) + return true; + } + for (int i = 0; i < re->nsub(); i++) { + Regexp* newsub = child_args[i]; + newsub->Decref(); + } + return false; +} + +Regexp* CoalesceWalker::Copy(Regexp* re) { + return re->Incref(); +} + +Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { + // This should never be called, since we use Walk and not + // WalkExponential. + LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; + return re->Incref(); +} + +Regexp* CoalesceWalker::PostVisit(Regexp* re, + Regexp* parent_arg, + Regexp* pre_arg, + Regexp** child_args, + int nchild_args) { + if (re->nsub() == 0) + return re->Incref(); + + if (re->op() != kRegexpConcat) { + if (!ChildArgsChanged(re, child_args)) + return re->Incref(); + + // Something changed. Build a new op. + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub()); + Regexp** nre_subs = nre->sub(); + for (int i = 0; i < re->nsub(); i++) + nre_subs[i] = child_args[i]; + // Repeats and Captures have additional data that must be copied. + if (re->op() == kRegexpRepeat) { + nre->min_ = re->min(); + nre->max_ = re->max(); + } else if (re->op() == kRegexpCapture) { + nre->cap_ = re->cap(); + } + return nre; + } + + bool can_coalesce = false; + for (int i = 0; i < re->nsub(); i++) { + if (i+1 < re->nsub() && + CanCoalesce(child_args[i], child_args[i+1])) { + can_coalesce = true; + break; + } + } + if (!can_coalesce) { + if (!ChildArgsChanged(re, child_args)) + return re->Incref(); + + // Something changed. Build a new op. 
+ Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub()); + Regexp** nre_subs = nre->sub(); + for (int i = 0; i < re->nsub(); i++) + nre_subs[i] = child_args[i]; + return nre; + } + + for (int i = 0; i < re->nsub(); i++) { + if (i+1 < re->nsub() && + CanCoalesce(child_args[i], child_args[i+1])) + DoCoalesce(&child_args[i], &child_args[i+1]); + } + // Determine how many empty matches were left by DoCoalesce. + int n = 0; + for (int i = n; i < re->nsub(); i++) { + if (child_args[i]->op() == kRegexpEmptyMatch) + n++; + } + // Build a new op. + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub() - n); + Regexp** nre_subs = nre->sub(); + for (int i = 0, j = 0; i < re->nsub(); i++) { + if (child_args[i]->op() == kRegexpEmptyMatch) { + child_args[i]->Decref(); + continue; + } + nre_subs[j] = child_args[i]; + j++; + } + return nre; +} + +bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { + // r1 must be a star/plus/quest/repeat of a literal, char class, any char or + // any byte. + if ((r1->op() == kRegexpStar || + r1->op() == kRegexpPlus || + r1->op() == kRegexpQuest || + r1->op() == kRegexpRepeat) && + (r1->sub()[0]->op() == kRegexpLiteral || + r1->sub()[0]->op() == kRegexpCharClass || + r1->sub()[0]->op() == kRegexpAnyChar || + r1->sub()[0]->op() == kRegexpAnyByte)) { + // r2 must be a star/plus/quest/repeat of the same literal, char class, + // any char or any byte. + if ((r2->op() == kRegexpStar || + r2->op() == kRegexpPlus || + r2->op() == kRegexpQuest || + r2->op() == kRegexpRepeat) && + Regexp::Equal(r1->sub()[0], r2->sub()[0]) && + // The parse flags must be consistent. + ((r1->parse_flags() & Regexp::NonGreedy) == + (r2->parse_flags() & Regexp::NonGreedy))) { + return true; + } + // ... OR an occurrence of that literal, char class, any char or any byte + if (Regexp::Equal(r1->sub()[0], r2)) { + return true; + } + // ... OR a literal string that begins with that literal. 
+ if (r1->sub()[0]->op() == kRegexpLiteral && + r2->op() == kRegexpLiteralString && + r2->runes()[0] == r1->sub()[0]->rune() && + // The parse flags must be consistent. + ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == + (r2->parse_flags() & Regexp::FoldCase))) { + return true; + } + } + return false; +} + +void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { + Regexp* r1 = *r1ptr; + Regexp* r2 = *r2ptr; + + Regexp* nre = Regexp::Repeat( + r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); + + switch (r1->op()) { + case kRegexpStar: + nre->min_ = 0; + nre->max_ = -1; + break; + + case kRegexpPlus: + nre->min_ = 1; + nre->max_ = -1; + break; + + case kRegexpQuest: + nre->min_ = 0; + nre->max_ = 1; + break; + + case kRegexpRepeat: + nre->min_ = r1->min(); + nre->max_ = r1->max(); + break; + + default: + LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); + nre->Decref(); + return; + } + + switch (r2->op()) { + case kRegexpStar: + nre->max_ = -1; + goto LeaveEmpty; + + case kRegexpPlus: + nre->min_++; + nre->max_ = -1; + goto LeaveEmpty; + + case kRegexpQuest: + if (nre->max() != -1) + nre->max_++; + goto LeaveEmpty; + + case kRegexpRepeat: + nre->min_ += r2->min(); + if (r2->max() == -1) + nre->max_ = -1; + else if (nre->max() != -1) + nre->max_ += r2->max(); + goto LeaveEmpty; + + case kRegexpLiteral: + case kRegexpCharClass: + case kRegexpAnyChar: + case kRegexpAnyByte: + nre->min_++; + if (nre->max() != -1) + nre->max_++; + goto LeaveEmpty; + + LeaveEmpty: + *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); + *r2ptr = nre; + break; + + case kRegexpLiteralString: { + Rune r = r1->sub()[0]->rune(); + // Determine how much of the literal string is removed. + // We know that we have at least one rune. 
:) + int n = 1; + while (n < r2->nrunes() && r2->runes()[n] == r) + n++; + nre->min_ += n; + if (nre->max() != -1) + nre->max_ += n; + if (n == r2->nrunes()) + goto LeaveEmpty; + *r1ptr = nre; + *r2ptr = Regexp::LiteralString( + &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); + break; + } + + default: + LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); + nre->Decref(); + return; + } + + r1->Decref(); + r2->Decref(); +} + Regexp* SimplifyWalker::Copy(Regexp* re) { return re->Incref(); } @@ -163,7 +438,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { } Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { - if (re->simple_) { + if (re->simple()) { *stop = true; return re->Incref(); } @@ -196,29 +471,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re, case kRegexpConcat: case kRegexpAlternate: { // These are simple as long as the subpieces are simple. - // Two passes to avoid allocation in the common case. - bool changed = false; - Regexp** subs = re->sub(); - for (int i = 0; i < re->nsub_; i++) { - Regexp* sub = subs[i]; - Regexp* newsub = child_args[i]; - if (newsub != sub) { - changed = true; - break; - } - } - if (!changed) { - for (int i = 0; i < re->nsub_; i++) { - Regexp* newsub = child_args[i]; - newsub->Decref(); - } + if (!ChildArgsChanged(re, child_args)) { re->simple_ = true; return re->Incref(); } Regexp* nre = new Regexp(re->op(), re->parse_flags()); - nre->AllocSub(re->nsub_); + nre->AllocSub(re->nsub()); Regexp** nre_subs = nre->sub(); - for (int i = 0; i <re->nsub_; i++) + for (int i = 0; i < re->nsub(); i++) nre_subs[i] = child_args[i]; nre->simple_ = true; return nre; @@ -234,7 +494,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re, Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); nre->AllocSub(1); nre->sub()[0] = newsub; - nre->cap_ = re->cap_; + nre->cap_ = re->cap(); nre->simple_ = true; return nre; } @@ -325,7 +585,6 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* 
re, int min, int max, // General case: x{4,} is xxxx+ Regexp* nre = new Regexp(kRegexpConcat, f); nre->AllocSub(min); - VLOG(1) << "Simplify " << min; Regexp** nre_subs = nre->sub(); for (int i = 0; i < min-1; i++) nre_subs[i] = re->Incref(); diff --git a/third_party/re2/util/stringpiece.cc b/third_party/re2/re2/stringpiece.cc index 37895b0..00f478a 100644 --- a/third_party/re2/util/stringpiece.cc +++ b/third_party/re2/re2/stringpiece.cc @@ -33,23 +33,33 @@ void StringPiece::CopyToString(string* target) const { target->assign(ptr_, length_); } -int StringPiece::copy(char* buf, size_type n, size_type pos) const { - int ret = min(length_ - pos, n); +void StringPiece::AppendToString(string* target) const { + target->append(ptr_, length_); +} + +StringPiece::size_type StringPiece::copy(char* buf, size_type n, + size_type pos) const { + size_type ret = min(length_ - pos, n); memcpy(buf, ptr_ + pos, ret); return ret; } -int StringPiece::find(const StringPiece& s, size_type pos) const { +bool StringPiece::contains(StringPiece s) const { + return find(s, 0) != npos; +} + +StringPiece::size_type StringPiece::find(const StringPiece& s, + size_type pos) const { if (length_ < 0 || pos > static_cast<size_type>(length_)) return npos; const char* result = std::search(ptr_ + pos, ptr_ + length_, s.ptr_, s.ptr_ + s.length_); const size_type xpos = result - ptr_; - return xpos + s.length_ <= length_ ? xpos : npos; + return xpos + s.length_ <= static_cast<size_type>(length_) ? xpos : npos; } -int StringPiece::find(char c, size_type pos) const { +StringPiece::size_type StringPiece::find(char c, size_type pos) const { if (length_ <= 0 || pos >= static_cast<size_type>(length_)) { return npos; } @@ -57,9 +67,10 @@ int StringPiece::find(char c, size_type pos) const { return result != ptr_ + length_ ? 
result - ptr_ : npos; } -int StringPiece::rfind(const StringPiece& s, size_type pos) const { +StringPiece::size_type StringPiece::rfind(const StringPiece& s, + size_type pos) const { if (length_ < s.length_) return npos; - const size_t ulen = length_; + const size_type ulen = length_; if (s.length_ == 0) return min(ulen, pos); const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_; @@ -67,9 +78,9 @@ int StringPiece::rfind(const StringPiece& s, size_type pos) const { return result != last ? result - ptr_ : npos; } -int StringPiece::rfind(char c, size_type pos) const { +StringPiece::size_type StringPiece::rfind(char c, size_type pos) const { if (length_ <= 0) return npos; - for (int i = min(pos, static_cast<size_type>(length_ - 1)); + for (int i = static_cast<int>(min(pos, static_cast<size_type>(length_ - 1))); i >= 0; --i) { if (ptr_[i] == c) { return i; @@ -79,9 +90,9 @@ int StringPiece::rfind(char c, size_type pos) const { } StringPiece StringPiece::substr(size_type pos, size_type n) const { - if (pos > length_) pos = length_; + if (pos > static_cast<size_type>(length_)) pos = static_cast<size_type>(length_); if (n > length_ - pos) n = length_ - pos; - return StringPiece(ptr_ + pos, n); + return StringPiece(ptr_ + pos, static_cast<int>(n)); } const StringPiece::size_type StringPiece::npos = size_type(-1); diff --git a/third_party/re2/re2/stringpiece.h b/third_party/re2/re2/stringpiece.h index 38a5150..1479d1a 100644 --- a/third_party/re2/re2/stringpiece.h +++ b/third_party/re2/re2/stringpiece.h @@ -20,12 +20,10 @@ #define STRINGS_STRINGPIECE_H__ #include <string.h> +#include <algorithm> #include <cstddef> #include <iosfwd> #include <string> -#ifdef WIN32 -#include <algorithm> -#endif namespace re2 { @@ -139,15 +137,17 @@ class StringPiece { int max_size() const { return length_; } int capacity() const { return length_; } - int copy(char* buf, size_type n, size_type pos = 0) const; + size_type copy(char* buf, size_type n, size_type pos = 0) const; - int 
find(const StringPiece& s, size_type pos = 0) const; - int find(char c, size_type pos = 0) const; - int rfind(const StringPiece& s, size_type pos = npos) const; - int rfind(char c, size_type pos = npos) const; + bool contains(StringPiece s) const; + + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; StringPiece substr(size_type pos, size_type n = npos) const; - + static bool _equal(const StringPiece&, const StringPiece&); }; diff --git a/third_party/re2/re2/testing/backtrack.cc b/third_party/re2/re2/testing/backtrack.cc index b2dd6db..a872840 100644 --- a/third_party/re2/re2/testing/backtrack.cc +++ b/third_party/re2/re2/testing/backtrack.cc @@ -72,7 +72,7 @@ class Backtracker { // Search state const char* cap_[64]; // capture registers uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked - int nvisited_; // # of words in bitmap + size_t nvisited_; // # of words in bitmap }; Backtracker::Backtracker(Prog* prog) @@ -150,7 +150,7 @@ bool Backtracker::Visit(int id, const char* p) { // either it didn't match or it did but we're hoping for a better match. // Either way, don't go down that road again. CHECK(p <= text_.end()); - int n = id*(text_.size()+1) + (p - text_.begin()); + size_t n = id*(text_.size()+1) + (p - text_.begin()); CHECK_LT(n/32, nvisited_); if (visited_[n/32] & (1 << (n&31))) return false; @@ -212,7 +212,8 @@ bool Backtracker::Visit(int id, const char* p) { if (submatch_[0].data() == NULL || // First match so far ... (longest_ && p > submatch_[0].end())) { // ... 
or better match for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); + submatch_[i].set(cap_[2*i], + static_cast<int>(cap_[2*i+1] - cap_[2*i])); } return true; diff --git a/third_party/re2/re2/testing/compile_test.cc b/third_party/re2/re2/testing/compile_test.cc index 8d92105..d438b19 100644 --- a/third_party/re2/re2/testing/compile_test.cc +++ b/third_party/re2/re2/testing/compile_test.cc @@ -99,6 +99,10 @@ static Test tests[] = { { "[Aa]", "1. byte/i [61-61] -> 2\n" "2. match! 0\n" }, + // Issue 20992936 + { "[[-`]", + "1. byte [5b-60] -> 2\n" + "2. match! 0\n" }, }; TEST(TestRegexpCompileToProg, Simple) { diff --git a/third_party/re2/re2/testing/dfa_test.cc b/third_party/re2/re2/testing/dfa_test.cc index 8e95ae4..e9c7bef 100644 --- a/third_party/re2/re2/testing/dfa_test.cc +++ b/third_party/re2/re2/testing/dfa_test.cc @@ -2,14 +2,16 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include "util/test.h" #include "util/thread.h" +#include "util/test.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" #include "re2/testing/regexp_generator.h" #include "re2/testing/string_generator.h" +static const bool UsingMallocCounter = false; + DECLARE_bool(re2_dfa_bail_when_slow); DEFINE_int32(size, 8, "log2(number of DFA nodes)"); @@ -42,7 +44,7 @@ TEST(Multithreaded, BuildEntireDFA) { // Check that single-threaded code works. { //LOG(INFO) << s; - Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); @@ -57,7 +59,7 @@ TEST(Multithreaded, BuildEntireDFA) { // Build the DFA simultaneously in a bunch of threads. 
for (int i = 0; i < FLAGS_repeat; i++) { - Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); @@ -92,14 +94,13 @@ TEST(SingleThreaded, BuildEntireDFA) { s += "[ab]"; s += "b"; - //LOG(INFO) << s; - Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); CHECK(re); int max = 24; for (int i = 17; i < max; i++) { - int limit = 1<<i; - int usage; - //int progusage, dfamem; + int64 limit = 1<<i; + int64 usage; + //int64 progusage, dfamem; { testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); Prog* prog = re->CompileToProg(limit); @@ -113,10 +114,13 @@ TEST(SingleThreaded, BuildEntireDFA) { } if (!UsingMallocCounter) continue; - //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n", - // limit, progusage, dfamem, usage); + //LOG(INFO) << "limit " << limit << ", " + // << "prog usage " << progusage << ", " + // << "DFA budget " << dfamem << ", " + // << "total " << usage; + // Tolerate +/- 10%. CHECK_GT(usage, limit*9/10); - CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay + CHECK_LT(usage, limit*11/10); } re->Decref(); } @@ -132,7 +136,7 @@ TEST(SingleThreaded, BuildEntireDFA) { // position in the input, never reusing any states until it gets to the // end of the string. This is the worst possible case for DFA execution. 
static string DeBruijnString(int n) { - CHECK_LT(n, 8*sizeof(int)); + CHECK_LT(n, static_cast<int>(8*sizeof(int))); CHECK_GT(n, 0); vector<bool> did(1<<n); @@ -221,13 +225,13 @@ TEST(SingleThreaded, SearchDFA) { peak_usage = m.PeakHeapGrowth(); delete prog; } - re->Decref(); - if (!UsingMallocCounter) return; - //LOG(INFO) << "usage " << usage << " " << peak_usage; + //LOG(INFO) << "usage " << usage << ", " + // << "peak usage " << peak_usage; CHECK_LT(usage, 1<<n); CHECK_LT(peak_usage, 1<<n); + re->Decref(); } // Helper thread: searches for match, which should match, diff --git a/third_party/re2/re2/testing/dump.cc b/third_party/re2/re2/testing/dump.cc index 4bdf714..9703039 100644 --- a/third_party/re2/re2/testing/dump.cc +++ b/third_party/re2/re2/testing/dump.cc @@ -120,6 +120,8 @@ static void DumpRegexpAppending(Regexp* re, string* s) { DumpRegexpAppending(re->sub()[0], s); break; case kRegexpCapture: + if (re->cap() == 0) + LOG(DFATAL) << "kRegexpCapture cap() == 0"; if (re->name()) { s->append(*re->name()); s->append(":"); diff --git a/third_party/re2/re2/testing/exhaustive2_test.cc b/third_party/re2/re2/testing/exhaustive2_test.cc index c5fec5b..6dc5016 100644 --- a/third_party/re2/re2/testing/exhaustive2_test.cc +++ b/third_party/re2/re2/testing/exhaustive2_test.cc @@ -23,7 +23,7 @@ TEST(EmptyString, Exhaustive) { TEST(Punctuation, Literals) { vector<string> alphabet = Explode("()*+?{}[]\\^$."); vector<string> escaped = alphabet; - for (int i = 0; i < escaped.size(); i++) + for (size_t i = 0; i < escaped.size(); i++) escaped[i] = "\\" + escaped[i]; ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), 2, alphabet, "", ""); diff --git a/third_party/re2/re2/testing/exhaustive3_test.cc b/third_party/re2/re2/testing/exhaustive3_test.cc index 5613fcb..6e46bb4 100644 --- a/third_party/re2/re2/testing/exhaustive3_test.cc +++ b/third_party/re2/re2/testing/exhaustive3_test.cc @@ -84,7 +84,7 @@ TEST(InterestingUTF8, AB) { "[[:upper:]] [[:xdigit:]] [\\s\\S] 
[\\d\\D] [^\\w\\W] [^\\d\\D]"); vector<string> ops; // no ops vector<string> alpha = InterestingUTF8(); - for (int i = 0; i < alpha.size(); i++) + for (size_t i = 0; i < alpha.size(); i++) alpha[i] = "a" + alpha[i] + "b"; ExhaustiveTest(1, 0, atoms, ops, 1, alpha, "a%sb", ""); diff --git a/third_party/re2/re2/testing/exhaustive_tester.cc b/third_party/re2/re2/testing/exhaustive_tester.cc index 54de857..0e90f33 100644 --- a/third_party/re2/re2/testing/exhaustive_tester.cc +++ b/third_party/re2/re2/testing/exhaustive_tester.cc @@ -148,7 +148,7 @@ void ExhaustiveTest(int maxatoms, int maxops, int maxstrlen, const vector<string>& stralphabet, const string& wrapper, const string& topwrapper) { - if (DEBUG_MODE && FLAGS_quick_debug_mode) { + if (RE2_DEBUG_MODE && FLAGS_quick_debug_mode) { if (maxatoms > 1) maxatoms--; if (maxops > 1) diff --git a/third_party/re2/re2/testing/exhaustive_tester.h b/third_party/re2/re2/testing/exhaustive_tester.h index 38a139f..1facb97 100644 --- a/third_party/re2/re2/testing/exhaustive_tester.h +++ b/third_party/re2/re2/testing/exhaustive_tester.h @@ -13,6 +13,16 @@ namespace re2 { +#if !defined(NDEBUG) +// We are in a debug build. +const bool RE2_DEBUG_MODE = true; +#elif ADDRESS_SANITIZER || MEMORY_SANITIZER || THREAD_SANITIZER +// Not a debug build, but still under sanitizers. +const bool RE2_DEBUG_MODE = true; +#else +const bool RE2_DEBUG_MODE = false; +#endif + // Exhaustive regular expression test: generate all regexps within parameters, // then generate all strings of a given length over a given alphabet, // then check that NFA, DFA, and PCRE agree about whether each regexp matches @@ -63,7 +73,7 @@ class ExhaustiveTester : public RegexpGenerator { bool randomstrings_; // Whether to use random strings int32 stringseed_; // If so, the seed. int stringcount_; // If so, how many to generate. 
- DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester); + DISALLOW_COPY_AND_ASSIGN(ExhaustiveTester); }; // Runs an exhaustive test on the given parameters. diff --git a/third_party/re2/re2/testing/filtered_re2_test.cc b/third_party/re2/re2/testing/filtered_re2_test.cc index e3a0dd1..76c1284 100644 --- a/third_party/re2/re2/testing/filtered_re2_test.cc +++ b/third_party/re2/re2/testing/filtered_re2_test.cc @@ -44,7 +44,7 @@ TEST(FilteredRE2Test, SmallLatinTest) { FilterTestVars v; int id; - v.opts.set_utf8(false); + v.opts.set_encoding(RE2::Options::EncodingLatin1); v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); v.f.Compile(&v.atoms); EXPECT_EQ(1, v.atoms.size()); @@ -158,10 +158,10 @@ bool CheckExpectedAtoms(const char* atoms[], if (!pass) { LOG(WARNING) << "Failed " << testname; LOG(WARNING) << "Expected #atoms = " << expected.size(); - for (int i = 0; i < expected.size(); i++) + for (size_t i = 0; i < expected.size(); i++) LOG(WARNING) << expected[i]; LOG(WARNING) << "Found #atoms = " << v->atoms.size(); - for (int i = 0; i < v->atoms.size(); i++) + for (size_t i = 0; i < v->atoms.size(); i++) LOG(WARNING) << v->atoms[i]; } @@ -189,18 +189,16 @@ TEST(FilteredRE2Test, AtomTests) { EXPECT_EQ(0, nfail); } -void FindAtomIndices(const vector<string> atoms, - const vector<string> matched_atoms, +void FindAtomIndices(const vector<string>& atoms, + const vector<string>& matched_atoms, vector<int>* atom_indices) { atom_indices->clear(); - for (int i = 0; i < matched_atoms.size(); i++) { - int j = 0; - for (; j < atoms.size(); j++) { + for (size_t i = 0; i < matched_atoms.size(); i++) { + for (size_t j = 0; j < atoms.size(); j++) { if (matched_atoms[i] == atoms[j]) { - atom_indices->push_back(j); + atom_indices->push_back(static_cast<int>(j)); break; } - EXPECT_LT(j, atoms.size()); } } } @@ -266,7 +264,7 @@ TEST(FilteredRE2Test, MatchTests) { atoms.push_back("yyyzzz"); FindAtomIndices(v.atoms, atoms, &atom_ids); LOG(INFO) << "S: " << atom_ids.size(); - for (int i = 0; i < 
atom_ids.size(); i++) + for (size_t i = 0; i < atom_ids.size(); i++) LOG(INFO) << "i: " << i << " : " << atom_ids[i]; v.f.AllMatches(text, atom_ids, &matching_regexps); EXPECT_EQ(2, matching_regexps.size()); diff --git a/third_party/re2/re2/testing/null_walker.cc b/third_party/re2/re2/testing/null_walker.cc index 09b53cb..bc943f4 100644 --- a/third_party/re2/re2/testing/null_walker.cc +++ b/third_party/re2/re2/testing/null_walker.cc @@ -23,7 +23,7 @@ class NullWalker : public Regexp::Walker<bool> { } private: - DISALLOW_EVIL_CONSTRUCTORS(NullWalker); + DISALLOW_COPY_AND_ASSIGN(NullWalker); }; // Called after visiting re's children. child_args contains the return diff --git a/third_party/re2/re2/testing/parse_test.cc b/third_party/re2/re2/testing/parse_test.cc index f67b477..75c0296 100644 --- a/third_party/re2/re2/testing/parse_test.cc +++ b/third_party/re2/re2/testing/parse_test.cc @@ -118,14 +118,24 @@ static Test tests[] = { { "(?:a)", "lit{a}" }, { "(?:ab)(?:cd)", "str{abcd}" }, { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, + { "a|c", "cc{0x61 0x63}" }, + { "a|[cd]", "cc{0x61 0x63-0x64}" }, { "a|.", "dot{}" }, - { ".|a", "dot{}" }, + { "[ab]|c", "cc{0x61-0x63}" }, + { "[ab]|[cd]", "cc{0x61-0x64}" }, + { "[ab]|.", "dot{}" }, + { ".|c", "dot{}" }, + { ".|[cd]", "dot{}" }, + { ".|.", "dot{}" }, // Test Perl quoted literals { "\\Q+|*?{[\\E", "str{+|*?{[}" }, { "\\Q+\\E+", "plus{lit{+}}" }, { "\\Q\\\\E", "lit{\\}" }, { "\\Q\\\\\\E", "str{\\\\}" }, + { "\\Qa\\E*", "star{lit{a}}" }, + { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" }, + { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" }, // Test Perl \A and \z { "(?m)^", "bol{}" }, @@ -212,12 +222,12 @@ void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, << status.Text(); string s = re[i]->Dump(); EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp - << "\nparse: " << tests[i].parse << " s: " << s << " flag=" << f; + << "\nparse: " << string(tests[i].parse) << " s: " << s << " flag=" << f; } for 
(int i = 0; i < ntests; i++) { for (int j = 0; j < ntests; j++) { - EXPECT_EQ(string(tests[i].parse) == tests[j].parse, + EXPECT_EQ(string(tests[i].parse) == string(tests[j].parse), RegexpEqualTestingOnly(re[i], re[j])) << "Regexp: " << tests[i].regexp << " " << tests[j].regexp; } @@ -299,6 +309,14 @@ Test prefix_tests[] = { "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, { "x{2}y|x{2}[0-9]y", "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, + { "n|r|rs", + "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" }, + { "n|rs|r", + "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" }, + { "r|rs|n", + "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" }, + { "rs|r|n", + "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" }, }; // Test that prefix factoring works. @@ -306,6 +324,22 @@ TEST(TestParse, Prefix) { TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix"); } +Test nested_tests[] = { + { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))", + "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" }, + { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" }, + { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" }, + { "((((((x{2}){2}){2}){5}){5}){5})", + "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" }, +}; + +// Test that nested repetition works. 
+TEST(TestParse, Nested) { + TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested"); +} + // Invalid regular expressions const char* badtests[] = { "(", @@ -329,6 +363,9 @@ const char* badtests[] = { "(?i)[a-Z]", "a{100000}", "a{100000,}", + "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + "(((x{7}){11}){13})", + "\\Q\\E*", }; // Valid in Perl, bad in POSIX diff --git a/third_party/re2/re2/testing/possible_match_test.cc b/third_party/re2/re2/testing/possible_match_test.cc index 7c2400e..4687165 100644 --- a/third_party/re2/re2/testing/possible_match_test.cc +++ b/third_party/re2/re2/testing/possible_match_test.cc @@ -7,6 +7,7 @@ #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" +#include "re2/testing/exhaustive_tester.h" #include "re2/testing/regexp_generator.h" #include "re2/testing/string_generator.h" @@ -136,26 +137,26 @@ TEST(PossibleMatchRange, Failures) { // are no valid UTF-8 strings beginning with byte 0xFF. EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2(".+hello", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2(".*hello", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2(".*", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2("\\C*"). 
PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << CEscape(min) << ", max=" << CEscape(max); // Fails because it's a malformed regexp. EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << CEscape(min) << ", max=" << CEscape(max); } // Exhaustive test: generate all regexps within parameters, @@ -186,7 +187,7 @@ class PossibleMatchTester : public RegexpGenerator { int regexps_; // Number of HandleRegexp calls int tests_; // Number of regexp tests. - DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester); + DISALLOW_COPY_AND_ASSIGN(PossibleMatchTester); }; // Processes a single generated regexp. @@ -224,7 +225,7 @@ TEST(PossibleMatchRange, Exhaustive) { int natom = 3; int noperator = 3; int stringlen = 5; - if (DEBUG_MODE) { + if (RE2_DEBUG_MODE) { natom = 2; noperator = 3; stringlen = 3; diff --git a/third_party/re2/re2/testing/random_test.cc b/third_party/re2/re2/testing/random_test.cc index 91d2b32..d67ae64 100644 --- a/third_party/re2/re2/testing/random_test.cc +++ b/third_party/re2/re2/testing/random_test.cc @@ -25,7 +25,7 @@ static void RandomTest(int maxatoms, int maxops, const string& wrapper) { // Limit to smaller test cases in debug mode, // because everything is so much slower. 
- if (DEBUG_MODE) { + if (RE2_DEBUG_MODE) { maxatoms--; maxops--; maxstrlen /= 2; diff --git a/third_party/re2/re2/testing/re2_arg_test.cc b/third_party/re2/re2/testing/re2_arg_test.cc index ae7a7b0..d843ffa 100644 --- a/third_party/re2/re2/testing/re2_arg_test.cc +++ b/third_party/re2/re2/testing/re2_arg_test.cc @@ -84,24 +84,24 @@ const SuccessTable kSuccessTable[] = { { "18446744073709551616", 0, { false, false, false, false, false, false }}, }; -const int kNumStrings = ARRAYSIZE(kSuccessTable); +const int kNumStrings = arraysize(kSuccessTable); -// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M +// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ // macro outside of a TEST block and this seems to be the only way to // avoid code duplication. I can also pull off a couple nice tricks // using concatenation for the type I'm checking against. #define PARSE_FOR_TYPE(type, column) { \ type r; \ - for ( int i = 0; i < kNumStrings; ++i ) { \ + for (int i = 0; i < kNumStrings; ++i) { \ RE2::Arg arg(&r); \ const char* const p = kSuccessTable[i].value_string; \ - bool retval = arg.Parse(p, strlen(p)); \ + bool retval = arg.Parse(p, static_cast<int>(strlen(p))); \ bool success = kSuccessTable[i].success[column]; \ - ASSERT_TRUE_M(retval == success, \ - StringPrintf("Parsing '%s' for type " #type " should return %d", \ - p, success).c_str()); \ - if ( success ) { \ - ASSERT_EQUALS(r, kSuccessTable[i].value); \ + EXPECT_EQ(retval, success) \ + << "Parsing '" << p << "' for type " #type " should return " \ + << success; \ + if (success) { \ + EXPECT_EQ(r, (type)kSuccessTable[i].value); \ } \ } \ } diff --git a/third_party/re2/re2/testing/re2_test.cc b/third_party/re2/re2/testing/re2_test.cc index 911e868..a1d9c57 100644 --- a/third_party/re2/re2/testing/re2_test.cc +++ b/third_party/re2/re2/testing/re2_test.cc @@ -5,22 +5,18 @@ // TODO: Test extractions for PartialMatch/Consume -#include <sys/types.h> -#ifndef WIN32 +#include 
<errno.h> +#ifndef _MSC_VER +#include <unistd.h> /* for sysconf */ #include <sys/mman.h> #endif #include <sys/stat.h> -#include <errno.h> +#include <sys/types.h> #include <vector> #include "util/test.h" #include "re2/re2.h" #include "re2/regexp.h" -#ifdef WIN32 -#include <stdio.h> -#define snprintf _snprintf -#endif - DECLARE_bool(logtostderr); namespace re2 { @@ -180,7 +176,7 @@ TEST(RE2, Replace) { { "", NULL, NULL, NULL, NULL, 0 } }; - for (const ReplaceTest *t = tests; t->original != NULL; ++t) { + for (const ReplaceTest* t = tests; t->original != NULL; t++) { VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite); string one(t->original); CHECK(RE2::Replace(&one, t->regexp, t->rewrite)); @@ -373,12 +369,12 @@ TEST(RE2, Match) { CHECK_EQ(port, 9000); } -static void TestRecursion(int size, const char *pattern) { +static void TestRecursion(int size, const char* pattern) { // Fill up a string repeating the pattern given string domain; domain.resize(size); - int patlen = strlen(pattern); - for (int i = 0; i < size; ++i) { + size_t patlen = strlen(pattern); + for (int i = 0; i < size; i++) { domain[i] = pattern[i % patlen]; } // Just make sure it doesn't crash due to too much recursion. 
@@ -392,8 +388,8 @@ static void TestQuoteMeta(string unquoted, const RE2::Options& options = RE2::DefaultOptions) { string quoted = RE2::QuoteMeta(unquoted); RE2 re(quoted, options); - EXPECT_TRUE_M(RE2::FullMatch(unquoted, re), - "Unquoted='" + unquoted + "', quoted='" + quoted + "'."); + EXPECT_TRUE(RE2::FullMatch(unquoted, re)) + << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; } // A meta-quoted string, interpreted as a pattern, should always match @@ -402,8 +398,8 @@ static void NegativeTestQuoteMeta(string unquoted, string should_not_match, const RE2::Options& options = RE2::DefaultOptions) { string quoted = RE2::QuoteMeta(unquoted); RE2 re(quoted, options); - EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re), - "Unquoted='" + unquoted + "', quoted='" + quoted + "'."); + EXPECT_FALSE(RE2::FullMatch(should_not_match, re)) + << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; } // Tests that quoted meta characters match their original strings, @@ -469,13 +465,38 @@ TEST(QuoteMeta, HasNull) { TEST(ProgramSize, BigProgram) { RE2 re_simple("simple regexp"); RE2 re_medium("medium.*regexp"); - RE2 re_complex("hard.{1,128}regexp"); + RE2 re_complex("complex.{1,128}regexp"); CHECK_GT(re_simple.ProgramSize(), 0); CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); } +TEST(ProgramFanout, BigProgram) { + RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)"); + RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)"); + RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)"); + RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)"); + + map<int, int> histogram; + + // 3 is the largest non-empty bucket and has 1 element. + CHECK_EQ(3, re1.ProgramFanout(&histogram)); + CHECK_EQ(1, histogram[3]); + + // 7 is the largest non-empty bucket and has 10 elements. + CHECK_EQ(7, re10.ProgramFanout(&histogram)); + CHECK_EQ(10, histogram[7]); + + // 10 is the largest non-empty bucket and has 100 elements. 
+ CHECK_EQ(10, re100.ProgramFanout(&histogram)); + CHECK_EQ(100, histogram[10]); + + // 13 is the largest non-empty bucket and has 1000 elements. + CHECK_EQ(13, re1000.ProgramFanout(&histogram)); + CHECK_EQ(1000, histogram[13]); +} + // Issue 956519: handling empty character sets was // causing NULL dereference. This tests a few empty character sets. // (The way to get an empty character set is to negate a full one.) @@ -490,6 +511,21 @@ TEST(EmptyCharset, Fuzz) { CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0)); } +// Bitstate assumes that kInstFail instructions in +// alternations or capture groups have been "compiled away". +TEST(EmptyCharset, BitstateAssumptions) { + // Captures trigger use of Bitstate. + static const char *nop_empties[] = { + "((((()))))" "[^\\S\\s]?", + "((((()))))" "([^\\S\\s])?", + "((((()))))" "([^\\S\\s]|[^\\S\\s])?", + "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)" + }; + StringPiece group[6]; + for (int i = 0; i < arraysize(nop_empties); i++) + CHECK(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6)); +} + // Test that named groups work correctly. TEST(Capture, NamedGroups) { { @@ -511,6 +547,34 @@ TEST(Capture, NamedGroups) { } } +TEST(RE2, CapturedGroupTest) { + RE2 re("directions from (?P<S>.*) to (?P<D>.*)"); + int num_groups = re.NumberOfCapturingGroups(); + EXPECT_EQ(2, num_groups); + string args[4]; + RE2::Arg arg0(&args[0]); + RE2::Arg arg1(&args[1]); + RE2::Arg arg2(&args[2]); + RE2::Arg arg3(&args[3]); + + const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3}; + EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose", + re, matches, num_groups)); + const map<string, int>& named_groups = re.NamedCapturingGroups(); + EXPECT_TRUE(named_groups.find("S") != named_groups.end()); + EXPECT_TRUE(named_groups.find("D") != named_groups.end()); + + // The named group index is 1-based. 
+ int source_group_index = named_groups.find("S")->second; + int destination_group_index = named_groups.find("D")->second; + EXPECT_EQ(1, source_group_index); + EXPECT_EQ(2, destination_group_index); + + // The args is zero-based. + EXPECT_EQ("mountain view", args[source_group_index - 1]); + EXPECT_EQ("san jose", args[destination_group_index - 1]); +} + TEST(RE2, FullMatchWithNoArgs) { CHECK(RE2::FullMatch("h", "h")); CHECK(RE2::FullMatch("hello", "hello")); @@ -664,10 +728,12 @@ TEST(RE2, FullMatchTypedNullArg) { CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL)); } -#ifndef WIN32 // Check that numeric parsing code does not read past the end of // the number being parsed. +// This implementation requires mmap(2) et al. and thus cannot +// be used unless they are available. TEST(RE2, NULTerminated) { +#if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0 char *v; int x; long pagesize = sysconf(_SC_PAGE_SIZE); @@ -685,12 +751,12 @@ TEST(RE2, NULTerminated) { x = 0; CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); CHECK_EQ(x, 1); -} #endif +} TEST(RE2, FullMatchTypeTests) { // Type tests - string zeros(100, '0'); + string zeros(1000, '0'); { char c; CHECK(RE2::FullMatch("Hello", "(H)ello", &c)); @@ -792,12 +858,13 @@ TEST(RE2, FullMatchTypeTests) { } TEST(RE2, FloatingPointFullMatchTypes) { - string zeros(100, '0'); + string zeros(1000, '0'); { float v; CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100); CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100); CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, float(1e23)); + CHECK(RE2::FullMatch(" 100", "(.*)", &v)); CHECK_EQ(v, 100); CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); CHECK_EQ(v, float(1e23)); @@ -997,14 +1064,14 @@ TEST(RE2, UTF8) { // Check UTF-8 handling // Three Japanese characters (nihongo) const char utf8_string[] = { - 0xe6, 0x97, 0xa5, // 65e5 - 0xe6, 0x9c, 0xac, // 627c - 0xe8, 0xaa, 0x9e, // 8a9e + (char)0xe6, (char)0x97, (char)0xa5, // 65e5 + 
(char)0xe6, (char)0x9c, (char)0xac, // 627c + (char)0xe8, (char)0xaa, (char)0x9e, // 8a9e 0 }; const char utf8_pattern[] = { '.', - 0xe6, 0x9c, 0xac, // 627c + (char)0xe6, (char)0x9c, (char)0xac, // 627c '.', 0 }; @@ -1253,6 +1320,16 @@ TEST(RE2, NeverNewline) { } } +// Check that dot_nl option works. +TEST(RE2, DotNL) { + RE2::Options opt; + opt.set_dot_nl(true); + EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt))); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt))); + opt.set_never_nl(true); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt))); +} + // Check that there are no capturing groups in "never capture" mode. TEST(RE2, NeverCapture) { RE2::Options opt; @@ -1377,4 +1454,79 @@ TEST(RE2, RegexpToStringLossOfAnchor) { EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)"); } +// Issue 10131674 +TEST(RE2, Bug10131674) { + // Some of these escapes describe values that do not fit in a byte. + RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(RE2::FullMatch("hello world", re)); +} + +TEST(RE2, Bug18391750) { + // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer. 
+ const char t[] = { + (char)0x28, (char)0x28, (char)0xfc, (char)0xfc, (char)0x08, (char)0x08, + (char)0x26, (char)0x26, (char)0x28, (char)0xc2, (char)0x9b, (char)0xc5, + (char)0xc5, (char)0xd4, (char)0x8f, (char)0x8f, (char)0x69, (char)0x69, + (char)0xe7, (char)0x29, (char)0x7b, (char)0x37, (char)0x31, (char)0x31, + (char)0x7d, (char)0xae, (char)0x7c, (char)0x7c, (char)0xf3, (char)0x29, + (char)0xae, (char)0xae, (char)0x2e, (char)0x2a, (char)0x29, (char)0x00, + }; + RE2::Options opt; + opt.set_encoding(RE2::Options::EncodingLatin1); + opt.set_longest_match(true); + opt.set_dot_nl(true); + opt.set_case_sensitive(false); + RE2 re(t, opt); + CHECK(re.ok()); + RE2::PartialMatch(t, re); +} + +TEST(RE2, Bug18458852) { + // Bug in parser accepting invalid (too large) rune, + // causing compiler to fail in DCHECK in UTF-8 + // character class code. + const char b[] = { + (char)0x28, (char)0x05, (char)0x05, (char)0x41, (char)0x41, (char)0x28, + (char)0x24, (char)0x5b, (char)0x5e, (char)0xf5, (char)0x87, (char)0x87, + (char)0x90, (char)0x29, (char)0x5d, (char)0x29, (char)0x29, (char)0x00, + }; + RE2 re(b); + CHECK(!re.ok()); +} + +TEST(RE2, Bug18523943) { + // Bug in bitstate: case kFailInst was merged into the default with LOG(DFATAL). + + RE2::Options opt; + const char a[] = { + (char)0x29, (char)0x29, (char)0x24, (char)0x00, + }; + const char b[] = { + (char)0x28, (char)0x0a, (char)0x2a, (char)0x2a, (char)0x29, (char)0x00, + }; + opt.set_log_errors(false); + opt.set_encoding(RE2::Options::EncodingLatin1); + opt.set_posix_syntax(true); + opt.set_longest_match(true); + opt.set_literal(false); + opt.set_never_nl(true); + + RE2 re((const char*)b, opt); + CHECK(re.ok()); + string s1; + CHECK(!RE2::PartialMatch((const char*)a, re, &s1)); +} + +TEST(RE2, Bug21371806) { + // Bug in parser accepting Unicode groups in Latin-1 mode, + // causing compiler to fail in DCHECK in prog.cc. 
+ + RE2::Options opt; + opt.set_encoding(RE2::Options::EncodingLatin1); + + RE2 re("g\\p{Zl}]", opt); + CHECK(re.ok()); +} + } // namespace re2 diff --git a/third_party/re2/re2/testing/regexp_benchmark.cc b/third_party/re2/re2/testing/regexp_benchmark.cc index ca7627f..6c19858 100644 --- a/third_party/re2/re2/testing/regexp_benchmark.cc +++ b/third_party/re2/re2/testing/regexp_benchmark.cc @@ -135,13 +135,15 @@ ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2; // Generate random text that won't contain the search string, // to test worst-case search behavior. void MakeText(string* text, int nbytes) { + srand(1); text->resize(nbytes); - srand(0); for (int i = 0; i < nbytes; i++) { - if (!rand()%30) - (*text)[i] = '\n'; - else - (*text)[i] = rand()%(0x7E + 1 - 0x20)+0x20; + // Generate a one-byte rune that isn't a control character (e.g. '\n'). + // Clipping to 0x20 introduces some bias, but we don't need uniformity. + int byte = rand() & 0x7F; + if (byte < 0x20) + byte = 0x20; + (*text)[i] = byte; } } @@ -263,6 +265,7 @@ BENCHMARK_RANGE(Search_BigFixed_CachedPCRE, 8, 32<<10)->ThreadRange(1, NumCPU BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs()); // Benchmark: FindAndConsume + void FindAndConsume(int iters, int nbytes) { StopBenchmarkTiming(); string s; @@ -284,9 +287,11 @@ BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs()); // Benchmark: successful anchored search. void SearchSuccess(int iters, int nbytes, const char* regexp, SearchImpl* search) { + StopBenchmarkTiming(); string s; MakeText(&s, nbytes); BenchmarkMemoryUsage(); + StartBenchmarkTiming(); search(iters, regexp, s, Prog::kAnchored, true); SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes); } @@ -344,11 +349,9 @@ BENCHMARK_RANGE(Search_Success1_Cached_RE2, 8, 16<<20)->ThreadRange(1, NumCP // Benchmark: use regexp to find phone number. 
void SearchDigits(int iters, SearchImpl* search) { - const char *text = "650-253-0001"; - int len = strlen(text); + StringPiece s("650-253-0001"); BenchmarkMemoryUsage(); - search(iters, "([0-9]+)-([0-9]+)-([0-9]+)", - StringPiece(text, len), Prog::kAnchored, true); + search(iters, "([0-9]+)-([0-9]+)-([0-9]+)", s, Prog::kAnchored, true); SetBenchmarkItemsProcessed(iters); } @@ -686,7 +689,6 @@ BENCHMARK(BM_Regexp_SimplifyCompile)->ThreadRange(1, NumCPUs()); BENCHMARK(BM_Regexp_NullWalk)->ThreadRange(1, NumCPUs()); BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs()); - // Makes text of size nbytes, then calls run to search // the text for regexp iters times. void SearchPhone(int iters, int nbytes, ParseImpl* search) { diff --git a/third_party/re2/re2/testing/regexp_generator.cc b/third_party/re2/re2/testing/regexp_generator.cc index cf2db11..fd085db 100644 --- a/third_party/re2/re2/testing/regexp_generator.cc +++ b/third_party/re2/re2/testing/regexp_generator.cc @@ -111,7 +111,7 @@ void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk, // Add atoms if there is room. if (atoms < maxatoms_) { - for (int i = 0; i < atoms_.size(); i++) { + for (size_t i = 0; i < atoms_.size(); i++) { post->push_back(atoms_[i]); GeneratePostfix(post, nstk + 1, ops, atoms + 1); post->pop_back(); @@ -120,7 +120,7 @@ void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk, // Add operators if there are enough arguments. if (ops < maxops_) { - for (int i = 0; i < ops_.size(); i++) { + for (size_t i = 0; i < ops_.size(); i++) { const string& fmt = ops_[i]; int nargs = CountArgs(fmt); if (nargs <= nstk) { @@ -134,7 +134,7 @@ void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk, // Generates a random postfix command sequence. // Stops and returns true once a single sequence has been generated. 
-bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, +bool RegexpGenerator::GenerateRandomPostfix(vector<string>* post, int nstk, int ops, int atoms) { for (;;) { // Stop if we get to a single element, but only sometimes. @@ -151,7 +151,7 @@ bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, // Add operators if there are enough arguments. if (ops < maxops_ && acm_->Uniform(2) == 0) { - const string& fmt = ops_[acm_->Uniform(ops_.size())]; + const string& fmt = ops_[acm_->Uniform(static_cast<int32>(ops_.size()))]; int nargs = CountArgs(fmt); if (nargs <= nstk) { post->push_back(fmt); @@ -165,7 +165,7 @@ bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, // Add atoms if there is room. if (atoms < maxatoms_ && acm_->Uniform(2) == 0) { - post->push_back(atoms_[acm_->Uniform(atoms_.size())]); + post->push_back(atoms_[acm_->Uniform(static_cast<int32>(atoms_.size()))]); bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); post->pop_back(); if (ret) @@ -179,7 +179,7 @@ bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, // in (?: ) to avoid needing to maintain a precedence table. void RegexpGenerator::RunPostfix(const vector<string>& post) { stack<string> regexps; - for (int i = 0; i < post.size(); i++) { + for (size_t i = 0; i < post.size(); i++) { switch (CountArgs(post[i])) { default: LOG(FATAL) << "Bad operator: " << post[i]; @@ -208,7 +208,7 @@ void RegexpGenerator::RunPostfix(const vector<string>& post) { if (regexps.size() != 1) { // Internal error - should never happen. 
printf("Bad regexp program:\n"); - for (int i = 0; i < post.size(); i++) { + for (size_t i = 0; i < post.size(); i++) { printf(" %s\n", CEscape(post[i]).c_str()); } printf("Stack after running program:\n"); diff --git a/third_party/re2/re2/testing/regexp_generator.h b/third_party/re2/re2/testing/regexp_generator.h index b4506f2..3ba0d70 100644 --- a/third_party/re2/re2/testing/regexp_generator.h +++ b/third_party/re2/re2/testing/regexp_generator.h @@ -53,7 +53,7 @@ class RegexpGenerator { vector<string> atoms_; // Possible atoms. vector<string> ops_; // Possible ops. ACMRandom* acm_; // Random generator. - DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator); + DISALLOW_COPY_AND_ASSIGN(RegexpGenerator); }; // Helpers for preparing arguments to RegexpGenerator constructor. diff --git a/third_party/re2/re2/testing/regexp_test.cc b/third_party/re2/re2/testing/regexp_test.cc index f317cbc..31c76a3 100644 --- a/third_party/re2/re2/testing/regexp_test.cc +++ b/third_party/re2/re2/testing/regexp_test.cc @@ -29,10 +29,11 @@ TEST(Regexp, BigConcat) { Regexp* x; x = Regexp::Parse("x", Regexp::NoParseFlags, NULL); vector<Regexp*> v(90000, x); // ToString bails out at 100000 - for (int i = 0; i < v.size(); i++) + for (size_t i = 0; i < v.size(); i++) x->Incref(); - CHECK_EQ(x->Ref(), 1 + v.size()) << x->Ref(); - Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags); + CHECK_EQ(x->Ref(), 1 + static_cast<int>(v.size())) << x->Ref(); + Regexp* re = Regexp::Concat(v.data(), static_cast<int>(v.size()), + Regexp::NoParseFlags); CHECK_EQ(re->ToString(), string(v.size(), 'x')); re->Decref(); CHECK_EQ(x->Ref(), 1) << x->Ref(); diff --git a/third_party/re2/re2/testing/required_prefix_test.cc b/third_party/re2/re2/testing/required_prefix_test.cc index 1f0b216..aed41f7 100644 --- a/third_party/re2/re2/testing/required_prefix_test.cc +++ b/third_party/re2/re2/testing/required_prefix_test.cc @@ -28,7 +28,7 @@ static PrefixTest tests[] = { // Otherwise, it should work. 
{ "^abc$", true, "abc", false, "(?-m:$)" }, - { "^abc", "true", "abc", false, "" }, + { "^abc", true, "abc", false, "" }, { "^(?i)abc", true, "abc", true, "" }, { "^abcd*", true, "abc", false, "d*" }, { "^[Aa][Bb]cd*", true, "ab", true, "cd*" }, diff --git a/third_party/re2/re2/testing/set_test.cc b/third_party/re2/re2/testing/set_test.cc index 74058a4..4e267ae 100644 --- a/third_party/re2/re2/testing/set_test.cc +++ b/third_party/re2/re2/testing/set_test.cc @@ -71,10 +71,10 @@ TEST(Set, UnanchoredFactored) { TEST(Set, UnanchoredDollar) { RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); - + CHECK_EQ(s.Add("foo$", NULL), 0); CHECK_EQ(s.Compile(), true); - + vector<int> v; CHECK_EQ(s.Match("foo", &v), true); CHECK_EQ(v.size(), 1); @@ -107,8 +107,34 @@ TEST(Set, Anchored) { CHECK_EQ(s.Match("bar", &v), true); CHECK_EQ(v.size(), 1); CHECK_EQ(v[0], 1); +} + +TEST(Set, EmptyUnanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Compile(), true); + + vector<int> v; + CHECK_EQ(s.Match("", &v), false); + CHECK_EQ(v.size(), 0); + + v.clear(); + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); +} + +TEST(Set, EmptyAnchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + CHECK_EQ(s.Compile(), true); + + vector<int> v; + CHECK_EQ(s.Match("", &v), false); + CHECK_EQ(v.size(), 0); + + v.clear(); + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); } } // namespace re2 - diff --git a/third_party/re2/re2/testing/simplify_test.cc b/third_party/re2/re2/testing/simplify_test.cc index d54837c..9db41ee 100644 --- a/third_party/re2/re2/testing/simplify_test.cc +++ b/third_party/re2/re2/testing/simplify_test.cc @@ -136,6 +136,99 @@ static Test tests[] = { { "(){1}", "()" }, { "(){1,}", "()+" }, { "(){0,2}", "(?:()()?)?" }, + + // Test that coalescing occurs and that the resulting repeats are simplified. 
+ // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal: + { "a*a*", "a*" }, + { "a*a+", "a+" }, + { "a*a?", "a*" }, + { "a*a{2}", "aa+" }, + { "a*a{2,}", "aa+" }, + { "a*a{2,3}", "aa+" }, + { "a+a*", "a+" }, + { "a+a+", "aa+" }, + { "a+a?", "a+" }, + { "a+a{2}", "aaa+" }, + { "a+a{2,}", "aaa+" }, + { "a+a{2,3}", "aaa+" }, + { "a?a*", "a*" }, + { "a?a+", "a+" }, + { "a?a?", "(?:aa?)?" }, + { "a?a{2}", "aaa?" }, + { "a?a{2,}", "aa+" }, + { "a?a{2,3}", "aa(?:aa?)?" }, + { "a{2}a*", "aa+" }, + { "a{2}a+", "aaa+" }, + { "a{2}a?", "aaa?" }, + { "a{2}a{2}", "aaaa" }, + { "a{2}a{2,}", "aaaa+" }, + { "a{2}a{2,3}", "aaaaa?" }, + { "a{2,}a*", "aa+" }, + { "a{2,}a+", "aaa+" }, + { "a{2,}a?", "aa+" }, + { "a{2,}a{2}", "aaaa+" }, + { "a{2,}a{2,}", "aaaa+" }, + { "a{2,}a{2,3}", "aaaa+" }, + { "a{2,3}a*", "aa+" }, + { "a{2,3}a+", "aaa+" }, + { "a{2,3}a?", "aa(?:aa?)?" }, + { "a{2,3}a{2}", "aaaaa?" }, + { "a{2,3}a{2,}", "aaaa+" }, + { "a{2,3}a{2,3}", "aaaa(?:aa?)?" }, + // With a char class, any char and any byte: + { "\\d*\\d*", "[0-9]*" }, + { ".*.*", ".*" }, + { "\\C*\\C*", "\\C*" }, + // FoldCase works, but must be consistent: + { "(?i)A*a*", "[Aa]*" }, + { "(?i)a+A+", "[Aa][Aa]+" }, + { "(?i)A*(?-i)a*", "[Aa]*a*" }, + { "(?i)a+(?-i)A+", "[Aa]+A+" }, + // NonGreedy works, but must be consistent: + { "a*?a*?", "a*?" }, + { "a+?a+?", "aa+?" }, + { "a*?a*", "a*?a*" }, + { "a+a+?", "a+a+?" 
}, + // The second element is the literal, char class, any char or any byte: + { "a*a", "a+" }, + { "\\d*\\d", "[0-9]+" }, + { ".*.", ".+" }, + { "\\C*\\C", "\\C+" }, + // FoldCase works, but must be consistent: + { "(?i)A*a", "[Aa]+" }, + { "(?i)a+A", "[Aa][Aa]+" }, + { "(?i)A*(?-i)a", "[Aa]*a" }, + { "(?i)a+(?-i)A", "[Aa]+A" }, + // The second element is a literal string that begins with the literal: + { "a*aa", "aa+" }, + { "a*aab", "aa+b" }, + // FoldCase works, but must be consistent: + { "(?i)a*aa", "[Aa][Aa]+" }, + { "(?i)a*aab", "[Aa][Aa]+[Bb]" }, + { "(?i)a*(?-i)aa", "[Aa]*aa" }, + { "(?i)a*(?-i)aab", "[Aa]*aab" }, + // Negative tests with mismatching ops: + { "a*b*", "a*b*" }, + { "\\d*\\D*", "[0-9]*[^0-9]*" }, + { "a+b", "a+b" }, + { "\\d+\\D", "[0-9]+[^0-9]" }, + { "a?bb", "a?bb" }, + // Negative tests with capturing groups: + { "(a*)a*", "(a*)a*" }, + { "a+(a)", "a+(a)" }, + { "(a?)(aa)", "(a?)(aa)" }, + // Just for fun: + { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" }, + + // During coalescing, the child of the repeat changes, so we build a new + // repeat. The new repeat must have the min and max of the old repeat. + // Failure to copy them results in min=0 and max=0 -> empty match. + { "(?:a*aab){2}", "aa+baa+b" }, + + // During coalescing, the child of the capture changes, so we build a new + // capture. The new capture must have the cap of the old capture. + // Failure to copy it results in cap=0 -> ToString() logs a fatal error. + { "(a*aab)", "(aa+b)" }, }; TEST(TestSimplify, SimpleRegexps) { diff --git a/third_party/re2/re2/testing/string_generator.cc b/third_party/re2/re2/testing/string_generator.cc index 5be6d3e..f96ff20 100644 --- a/third_party/re2/re2/testing/string_generator.cc +++ b/third_party/re2/re2/testing/string_generator.cc @@ -43,14 +43,14 @@ void StringGenerator::Reset() { // Returns false if all the numbers have been used. bool StringGenerator::IncrementDigits() { // First try to increment the current number. 
- for (int i = digits_.size() - 1; i >= 0; i--) { - if (++digits_[i] < alphabet_.size()) + for (int i = static_cast<int>(digits_.size()) - 1; i >= 0; i--) { + if (++digits_[i] < static_cast<int>(alphabet_.size())) return true; digits_[i] = 0; } // If that failed, make a longer number. - if (digits_.size() < maxlen_) { + if (static_cast<int>(digits_.size()) < maxlen_) { digits_.push_back(0); return true; } @@ -68,7 +68,7 @@ bool StringGenerator::RandomDigits() { int len = acm_->Uniform(maxlen_+1); digits_.resize(len); for (int i = 0; i < len; i++) - digits_[i] = acm_->Uniform(alphabet_.size()); + digits_[i] = acm_->Uniform(static_cast<int32>(alphabet_.size())); return true; } @@ -84,7 +84,7 @@ const StringPiece& StringGenerator::Next() { return sp_; } s_.clear(); - for (int i = 0; i < digits_.size(); i++) { + for (size_t i = 0; i < digits_.size(); i++) { s_ += alphabet_[digits_[i]]; } hasnext_ = random_ ? RandomDigits() : IncrementDigits(); @@ -110,4 +110,3 @@ void StringGenerator::GenerateNULL() { } } // namespace re2 - diff --git a/third_party/re2/re2/testing/string_generator.h b/third_party/re2/re2/testing/string_generator.h index 6a9ef42..52e5e22 100644 --- a/third_party/re2/re2/testing/string_generator.h +++ b/third_party/re2/re2/testing/string_generator.h @@ -50,7 +50,7 @@ class StringGenerator { bool random_; // Whether generated strings are random. int nrandom_; // Number of random strings left to generate. ACMRandom* acm_; // Random number generator - DISALLOW_EVIL_CONSTRUCTORS(StringGenerator); + DISALLOW_COPY_AND_ASSIGN(StringGenerator); }; } // namespace re2 diff --git a/third_party/re2/re2/testing/tester.cc b/third_party/re2/re2/testing/tester.cc index 003dc5a..cb12bad 100644 --- a/third_party/re2/re2/testing/tester.cc +++ b/third_party/re2/re2/testing/tester.cc @@ -246,6 +246,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, // 2. 
It treats $ as this weird thing meaning end of string // or before the \n at the end of the string. // 3. It doesn't implement POSIX leftmost-longest matching. + // 4. It lets \s match vertical tab. // MimicsPCRE() detects 1 and 2. if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && kind_ != Prog::kLongestMatch) { @@ -343,7 +344,8 @@ void TestInstance::RunSearch(Engine type, Prog::kAnchored, Prog::kLongestMatch, result->submatch, &result->skipped, NULL)) { - LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_) + LOG(ERROR) << "Reverse DFA inconsistency: " + << CEscape(regexp_str_) << " on " << CEscape(text); result->matched = false; } @@ -390,10 +392,13 @@ void TestInstance::RunSearch(Engine type, if (kind_ == Prog::kFullMatch) re_anchor = RE2::ANCHOR_BOTH; - result->matched = re2_->Match(context, - text.begin() - context.begin(), - text.end() - context.begin(), - re_anchor, result->submatch, nsubmatch); + result->matched = re2_->Match( + context, + static_cast<int>(text.begin() - context.begin()), + static_cast<int>(text.end() - context.begin()), + re_anchor, + result->submatch, + nsubmatch); result->have_submatch = nsubmatch > 0; break; } @@ -405,6 +410,14 @@ void TestInstance::RunSearch(Engine type, break; } + // PCRE 8.34 or so started allowing vertical tab to match \s, + // following a change made in Perl 5.18. RE2 does not. + if ((regexp_str_.contains("\\s") || regexp_str_.contains("\\S")) && + text.contains("\v")) { + result->skipped = true; + break; + } + const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; PCRE::Arg *a = new PCRE::Arg[nsubmatch]; for (int i = 0; i < nsubmatch; i++) { @@ -505,7 +518,7 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, } // We disagree with PCRE on the meaning of some Unicode matches. - // In particular, we treat all non-ASCII UTF-8 as word characters. + // In particular, we treat non-ASCII UTF-8 as non-word characters. 
// We also treat "empty" character sets like [^\w\W] as being // impossible to match, while PCRE apparently excludes some code // points (e.g., 0x0080) from both \w and \W. @@ -592,14 +605,14 @@ Tester::Tester(const StringPiece& regexp) { } Tester::~Tester() { - for (int i = 0; i < v_.size(); i++) + for (size_t i = 0; i < v_.size(); i++) delete v_[i]; } bool Tester::TestCase(const StringPiece& text, const StringPiece& context, Prog::Anchor anchor) { bool okay = true; - for (int i = 0; i < v_.size(); i++) + for (size_t i = 0; i < v_.size(); i++) okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); return okay; } diff --git a/third_party/re2/re2/testing/tester.h b/third_party/re2/re2/testing/tester.h index 6e16e77..d1e1b22 100644 --- a/third_party/re2/re2/testing/tester.h +++ b/third_party/re2/re2/testing/tester.h @@ -84,7 +84,7 @@ class TestInstance { PCRE* re_; // PCRE implementation RE2* re2_; // RE2 implementation - DISALLOW_EVIL_CONSTRUCTORS(TestInstance); + DISALLOW_COPY_AND_ASSIGN(TestInstance); }; // A group of TestInstances for all possible configurations. @@ -110,7 +110,7 @@ class Tester { bool error_; vector<TestInstance*> v_; - DISALLOW_EVIL_CONSTRUCTORS(Tester); + DISALLOW_COPY_AND_ASSIGN(Tester); }; // Run all possible tests using regexp and text. diff --git a/third_party/re2/re2/testing/unicode_test.py b/third_party/re2/re2/testing/unicode_test.py deleted file mode 100755 index a88a3ad..0000000 --- a/third_party/re2/re2/testing/unicode_test.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/python2.4 -# -# Copyright 2008 The RE2 Authors. All Rights Reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. 
- -"""Unittest for the util/regexp/re2/unicode.py module.""" - -import os -import StringIO -from google3.pyglib import flags -from google3.testing.pybase import googletest -from google3.util.regexp.re2 import unicode - -_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party", - "unicode", "ucd-5.1.0") - - -class ConvertTest(googletest.TestCase): - """Test the conversion functions.""" - - def testUInt(self): - self.assertEquals(0x0000, unicode._UInt("0000")) - self.assertEquals(0x263A, unicode._UInt("263A")) - self.assertEquals(0x10FFFF, unicode._UInt("10FFFF")) - self.assertRaises(unicode.InputError, unicode._UInt, "263") - self.assertRaises(unicode.InputError, unicode._UInt, "263AAAA") - self.assertRaises(unicode.InputError, unicode._UInt, "110000") - - def testURange(self): - self.assertEquals([1, 2, 3], unicode._URange("0001..0003")) - self.assertEquals([1], unicode._URange("0001")) - self.assertRaises(unicode.InputError, unicode._URange, "0001..0003..0005") - self.assertRaises(unicode.InputError, unicode._URange, "0003..0001") - self.assertRaises(unicode.InputError, unicode._URange, "0001..0001") - - def testUStr(self): - self.assertEquals("0x263A", unicode._UStr(0x263a)) - self.assertEquals("0x10FFFF", unicode._UStr(0x10FFFF)) - self.assertRaises(unicode.InputError, unicode._UStr, 0x110000) - self.assertRaises(unicode.InputError, unicode._UStr, -1) - - -_UNICODE_TABLE = """# Commented line, should be ignored. -# The next line is blank and should be ignored. 
- -0041;Capital A;Line 1 -0061..007A;Lowercase;Line 2 -1F00;<Greek, First>;Ignored -1FFE;<Greek, Last>;Line 3 -10FFFF;Runemax;Line 4 -0000;Zero;Line 5 -""" - -_BAD_TABLE1 = """ -111111;Not a code point; -""" - -_BAD_TABLE2 = """ -0000;<Zero, First>;Missing <Zero, Last> -""" - -_BAD_TABLE3 = """ -0010..0001;Bad range; -""" - - -class AbortError(Exception): - """Function should not have been called.""" - - -def Abort(): - raise AbortError("Abort") - - -def StringTable(s, n, f): - unicode.ReadUnicodeTable(StringIO.StringIO(s), n, f) - - -class ReadUnicodeTableTest(googletest.TestCase): - """Test the ReadUnicodeTable function.""" - - def testSimpleTable(self): - - ncall = [0] # can't assign to ordinary int in DoLine - - def DoLine(codes, fields): - self.assertEquals(3, len(fields)) - ncall[0] += 1 - self.assertEquals("Line %d" % (ncall[0],), fields[2]) - if ncall[0] == 1: - self.assertEquals([0x0041], codes) - self.assertEquals("0041", fields[0]) - self.assertEquals("Capital A", fields[1]) - elif ncall[0] == 2: - self.assertEquals(range(0x0061, 0x007A + 1), codes) - self.assertEquals("0061..007A", fields[0]) - self.assertEquals("Lowercase", fields[1]) - elif ncall[0] == 3: - self.assertEquals(range(0x1F00, 0x1FFE + 1), codes) - self.assertEquals("1F00..1FFE", fields[0]) - self.assertEquals("Greek", fields[1]) - elif ncall[0] == 4: - self.assertEquals([0x10FFFF], codes) - self.assertEquals("10FFFF", fields[0]) - self.assertEquals("Runemax", fields[1]) - elif ncall[0] == 5: - self.assertEquals([0x0000], codes) - self.assertEquals("0000", fields[0]) - self.assertEquals("Zero", fields[1]) - - StringTable(_UNICODE_TABLE, 3, DoLine) - self.assertEquals(5, ncall[0]) - - def testErrorTables(self): - self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 4, Abort) - self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 2, Abort) - self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE1, 3, Abort) - self.assertRaises(unicode.InputError, 
StringTable, _BAD_TABLE2, 3, Abort) - self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE3, 3, Abort) - - -class ParseContinueTest(googletest.TestCase): - """Test the ParseContinue function.""" - - def testParseContinue(self): - self.assertEquals(("Private Use", "First"), - unicode._ParseContinue("<Private Use, First>")) - self.assertEquals(("Private Use", "Last"), - unicode._ParseContinue("<Private Use, Last>")) - self.assertEquals(("<Private Use, Blah>", None), - unicode._ParseContinue("<Private Use, Blah>")) - - -class CaseGroupsTest(googletest.TestCase): - """Test the CaseGroups function (and the CaseFoldingReader).""" - - def FindGroup(self, c): - if type(c) == str: - c = ord(c) - for g in self.groups: - if c in g: - return g - return None - - def testCaseGroups(self): - self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR) - self.assertEquals([ord("A"), ord("a")], self.FindGroup("a")) - self.assertEquals(None, self.FindGroup("0")) - - -class ScriptsTest(googletest.TestCase): - """Test the Scripts function (and the ScriptsReader).""" - - def FindScript(self, c): - if type(c) == str: - c = ord(c) - for script, codes in self.scripts.items(): - for code in codes: - if c == code: - return script - return None - - def testScripts(self): - self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR) - self.assertEquals("Latin", self.FindScript("a")) - self.assertEquals("Common", self.FindScript("0")) - self.assertEquals(None, self.FindScript(0xFFFE)) - - -class CategoriesTest(googletest.TestCase): - """Test the Categories function (and the UnicodeDataReader).""" - - def FindCategory(self, c): - if type(c) == str: - c = ord(c) - short = None - for category, codes in self.categories.items(): - for code in codes: - if code == c: - # prefer category Nd over N - if len(category) > 1: - return category - if short == None: - short = category - return short - - def testCategories(self): - self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR) - 
self.assertEquals("Ll", self.FindCategory("a")) - self.assertEquals("Nd", self.FindCategory("0")) - self.assertEquals("Lo", self.FindCategory(0xAD00)) # in First, Last range - self.assertEquals(None, self.FindCategory(0xFFFE)) - self.assertEquals("Lo", self.FindCategory(0x8B5A)) - self.assertEquals("Lo", self.FindCategory(0x6C38)) - self.assertEquals("Lo", self.FindCategory(0x92D2)) - self.assertTrue(ord("a") in self.categories["L"]) - self.assertTrue(ord("0") in self.categories["N"]) - self.assertTrue(0x8B5A in self.categories["L"]) - self.assertTrue(0x6C38 in self.categories["L"]) - self.assertTrue(0x92D2 in self.categories["L"]) - -def main(): - googletest.main() - -if __name__ == "__main__": - main() diff --git a/third_party/re2/re2/tostring.cc b/third_party/re2/re2/tostring.cc index 555524f..0230c8c 100644 --- a/third_party/re2/re2/tostring.cc +++ b/third_party/re2/re2/tostring.cc @@ -42,7 +42,7 @@ class ToStringWalker : public Regexp::Walker<int> { private: string* t_; // The string the walker appends to. 
- DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker); + DISALLOW_COPY_AND_ASSIGN(ToStringWalker); }; string Regexp::ToString() { @@ -94,6 +94,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { case kRegexpCapture: t_->append("("); + if (re->cap() == 0) + LOG(DFATAL) << "kRegexpCapture cap() == 0"; if (re->name()) { t_->append("?P<"); t_->append(*re->name()); @@ -120,13 +122,13 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { static void AppendLiteral(string *t, Rune r, bool foldcase) { if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { t->append(1, '\\'); - t->append(1, r); + t->append(1, static_cast<char>(r)); } else if (foldcase && 'a' <= r && r <= 'z') { if ('a' <= r && r <= 'z') r += 'A' - 'a'; t->append(1, '['); - t->append(1, r); - t->append(1, r + 'a' - 'A'); + t->append(1, static_cast<char>(r)); + t->append(1, static_cast<char>(r) + 'a' - 'A'); t->append(1, ']'); } else { AppendCCRange(t, r, r); @@ -154,12 +156,14 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, break; case kRegexpLiteral: - AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase); + AppendLiteral(t_, re->rune(), + (re->parse_flags() & Regexp::FoldCase) != 0); break; case kRegexpLiteralString: for (int i = 0; i < re->nrunes(); i++) - AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase); + AppendLiteral(t_, re->runes()[i], + (re->parse_flags() & Regexp::FoldCase) != 0); if (prec < PrecConcat) t_->append(")"); break; @@ -297,7 +301,7 @@ static void AppendCCChar(string* t, Rune r) { if (0x20 <= r && r <= 0x7E) { if (strchr("[]^-\\", r)) t->append("\\"); - t->append(1, r); + t->append(1, static_cast<char>(r)); return; } switch (r) { diff --git a/third_party/re2/re2/unicode.py b/third_party/re2/re2/unicode.py index 8d78312..6dfe87b 100644 --- a/third_party/re2/re2/unicode.py +++ b/third_party/re2/re2/unicode.py @@ -9,7 +9,7 @@ import re import urllib2 # Directory or URL where Unicode tables 
reside. -_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd" +_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/third_party/re2/re2/unicode_casefold.cc b/third_party/re2/re2/unicode_casefold.cc index 6d4e878..2293cc7 100644 --- a/third_party/re2/re2/unicode_casefold.cc +++ b/third_party/re2/re2/unicode_casefold.cc @@ -7,8 +7,8 @@ namespace re2 { -// 1029 groups, 2079 pairs, 282 ranges -CaseFold unicode_casefold[] = { +// 1034 groups, 2089 pairs, 289 ranges +const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, { 107, 107, 8383 }, @@ -108,6 +108,7 @@ CaseFold unicode_casefold[] = { { 608, 608, -205 }, { 611, 611, -207 }, { 613, 613, 42280 }, + { 614, 614, 42308 }, { 616, 616, -209 }, { 617, 617, -211 }, { 619, 619, 10743 }, @@ -186,6 +187,8 @@ CaseFold unicode_casefold[] = { { 1329, 1366, 48 }, { 1377, 1414, -48 }, { 4256, 4293, 7264 }, + { 4295, 4295, 7264 }, + { 4301, 4301, 7264 }, { 7545, 7545, 35332 }, { 7549, 7549, 3814 }, { 7680, 7776, EvenOdd }, @@ -275,7 +278,10 @@ CaseFold unicode_casefold[] = { { 11390, 11391, -10815 }, { 11392, 11491, EvenOdd }, { 11499, 11502, OddEven }, + { 11506, 11507, EvenOdd }, { 11520, 11557, -7264 }, + { 11559, 11559, -7264 }, + { 11565, 11565, -7264 }, { 42560, 42605, EvenOdd }, { 42624, 42647, EvenOdd }, { 42786, 42799, EvenOdd }, @@ -285,17 +291,18 @@ CaseFold unicode_casefold[] = { { 42878, 42887, EvenOdd }, { 42891, 42892, OddEven }, { 42893, 42893, -42280 }, - { 42896, 42897, EvenOdd }, + { 42896, 42899, EvenOdd }, { 42912, 42921, EvenOdd }, + { 42922, 42922, -42308 }, { 65313, 65338, 32 }, { 65345, 65370, -32 }, { 66560, 66599, 40 }, { 66600, 66639, -40 }, }; -int num_unicode_casefold = 282; +const int num_unicode_casefold = 289; -// 1029 groups, 1050 pairs, 163 ranges -CaseFold unicode_tolower[] = { +// 1034 groups, 1055 pairs, 167 ranges +const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, { 192, 214, 32 
}, @@ -393,6 +400,8 @@ CaseFold unicode_tolower[] = { { 1232, 1318, EvenOddSkip }, { 1329, 1366, 48 }, { 4256, 4293, 7264 }, + { 4295, 4295, 7264 }, + { 4301, 4301, 7264 }, { 7680, 7828, EvenOddSkip }, { 7835, 7835, -58 }, { 7838, 7838, -7615 }, @@ -446,6 +455,7 @@ CaseFold unicode_tolower[] = { { 11390, 11391, -10815 }, { 11392, 11490, EvenOddSkip }, { 11499, 11501, OddEvenSkip }, + { 11506, 11506, EvenOdd }, { 42560, 42604, EvenOddSkip }, { 42624, 42646, EvenOddSkip }, { 42786, 42798, EvenOddSkip }, @@ -455,12 +465,13 @@ CaseFold unicode_tolower[] = { { 42878, 42886, EvenOddSkip }, { 42891, 42891, OddEven }, { 42893, 42893, -42280 }, - { 42896, 42896, EvenOdd }, + { 42896, 42898, EvenOddSkip }, { 42912, 42920, EvenOddSkip }, + { 42922, 42922, -42308 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, }; -int num_unicode_tolower = 163; +const int num_unicode_tolower = 167; diff --git a/third_party/re2/re2/unicode_casefold.h b/third_party/re2/re2/unicode_casefold.h index 160b07e..1671140 100644 --- a/third_party/re2/re2/unicode_casefold.h +++ b/third_party/re2/re2/unicode_casefold.h @@ -51,24 +51,24 @@ enum { }; struct CaseFold { - uint32 lo; - uint32 hi; + Rune lo; + Rune hi; int32 delta; }; -extern CaseFold unicode_casefold[]; -extern int num_unicode_casefold; +extern const CaseFold unicode_casefold[]; +extern const int num_unicode_casefold; -extern CaseFold unicode_tolower[]; -extern int num_unicode_tolower; +extern const CaseFold unicode_tolower[]; +extern const int num_unicode_tolower; // Returns the CaseFold* in the tables that contains rune. // If rune is not in the tables, returns the first CaseFold* after rune. // If rune is larger than any value in the tables, returns NULL. -extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune); +extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune); // Returns the result of applying the fold f to the rune r. 
-extern Rune ApplyFold(CaseFold *f, Rune r); +extern Rune ApplyFold(const CaseFold *f, Rune r); } // namespace re2 diff --git a/third_party/re2/re2/unicode_groups.cc b/third_party/re2/re2/unicode_groups.cc index b57a327..0df585e 100644 --- a/third_party/re2/re2/unicode_groups.cc +++ b/third_party/re2/re2/unicode_groups.cc @@ -7,7 +7,7 @@ namespace re2 { -static URange16 Ps_range16[] = { +static const URange16 Ps_range16[] = { { 40, 40 }, { 91, 91 }, { 123, 123 }, @@ -19,6 +19,8 @@ static URange16 Ps_range16[] = { { 8261, 8261 }, { 8317, 8317 }, { 8333, 8333 }, + { 8968, 8968 }, + { 8970, 8970 }, { 9001, 9001 }, { 10088, 10088 }, { 10090, 10090 }, @@ -81,7 +83,7 @@ static URange16 Ps_range16[] = { { 65375, 65375 }, { 65378, 65378 }, }; -static URange16 Nl_range16[] = { +static const URange16 Nl_range16[] = { { 5870, 5872 }, { 8544, 8578 }, { 8581, 8584 }, @@ -90,14 +92,14 @@ static URange16 Nl_range16[] = { { 12344, 12346 }, { 42726, 42735 }, }; -static URange32 Nl_range32[] = { +static const URange32 Nl_range32[] = { { 65856, 65908 }, { 66369, 66369 }, { 66378, 66378 }, { 66513, 66517 }, { 74752, 74850 }, }; -static URange16 No_range16[] = { +static const URange16 No_range16[] = { { 178, 179 }, { 185, 185 }, { 188, 190 }, @@ -121,12 +123,13 @@ static URange16 No_range16[] = { { 11517, 11517 }, { 12690, 12693 }, { 12832, 12841 }, + { 12872, 12879 }, { 12881, 12895 }, { 12928, 12937 }, { 12977, 12991 }, { 43056, 43061 }, }; -static URange32 No_range32[] = { +static const URange32 No_range32[] = { { 65799, 65843 }, { 65909, 65912 }, { 65930, 65930 }, @@ -142,7 +145,9 @@ static URange32 No_range32[] = { { 119648, 119665 }, { 127232, 127242 }, }; -static URange16 Lo_range16[] = { +static const URange16 Lo_range16[] = { + { 170, 170 }, + { 186, 186 }, { 443, 443 }, { 448, 451 }, { 660, 660 }, @@ -163,6 +168,8 @@ static URange16 Lo_range16[] = { { 1994, 2026 }, { 2048, 2069 }, { 2112, 2136 }, + { 2208, 2208 }, + { 2210, 2220 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 
}, @@ -267,7 +274,7 @@ static URange16 Lo_range16[] = { { 3762, 3763 }, { 3773, 3773 }, { 3776, 3780 }, - { 3804, 3805 }, + { 3804, 3807 }, { 3840, 3840 }, { 3904, 3911 }, { 3913, 3948 }, @@ -282,7 +289,7 @@ static URange16 Lo_range16[] = { { 4213, 4225 }, { 4238, 4238 }, { 4304, 4346 }, - { 4352, 4680 }, + { 4349, 4680 }, { 4682, 4685 }, { 4688, 4694 }, { 4696, 4696 }, @@ -328,14 +335,15 @@ static URange16 Lo_range16[] = { { 6981, 6987 }, { 7043, 7072 }, { 7086, 7087 }, - { 7104, 7141 }, + { 7098, 7141 }, { 7168, 7203 }, { 7245, 7247 }, { 7258, 7287 }, { 7401, 7404 }, { 7406, 7409 }, + { 7413, 7414 }, { 8501, 8504 }, - { 11568, 11621 }, + { 11568, 11623 }, { 11648, 11670 }, { 11680, 11686 }, { 11688, 11694 }, @@ -356,7 +364,7 @@ static URange16 Lo_range16[] = { { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40907 }, + { 19968, 40908 }, { 40960, 40980 }, { 40982, 42124 }, { 42192, 42231 }, @@ -390,6 +398,8 @@ static URange16 Lo_range16[] = { { 43712, 43712 }, { 43714, 43714 }, { 43739, 43740 }, + { 43744, 43754 }, + { 43762, 43762 }, { 43777, 43782 }, { 43785, 43790 }, { 43793, 43798 }, @@ -399,8 +409,7 @@ static URange16 Lo_range16[] = { { 44032, 55203 }, { 55216, 55238 }, { 55243, 55291 }, - { 63744, 64045 }, - { 64048, 64109 }, + { 63744, 64109 }, { 64112, 64217 }, { 64285, 64285 }, { 64287, 64296 }, @@ -424,7 +433,7 @@ static URange16 Lo_range16[] = { { 65490, 65495 }, { 65498, 65500 }, }; -static URange32 Lo_range32[] = { +static const URange32 Lo_range32[] = { { 65536, 65547 }, { 65549, 65574 }, { 65576, 65594 }, @@ -449,6 +458,8 @@ static URange32 Lo_range32[] = { { 67647, 67669 }, { 67840, 67861 }, { 67872, 67897 }, + { 67968, 68023 }, + { 68030, 68031 }, { 68096, 68096 }, { 68112, 68115 }, { 68117, 68119 }, @@ -460,20 +471,58 @@ static URange32 Lo_range32[] = { { 68608, 68680 }, { 69635, 69687 }, { 69763, 69807 }, + { 69840, 69864 }, + { 69891, 69926 }, + { 70019, 70066 }, + { 70081, 70084 }, + { 71296, 71338 }, { 73728, 74606 }, { 77824, 
78894 }, { 92160, 92728 }, + { 93952, 94020 }, + { 94032, 94032 }, { 110592, 110593 }, + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 }, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, { 194560, 195101 }, }; -static URange16 Ll_range16[] = { +static const URange16 Ll_range16[] = { { 97, 122 }, - { 170, 170 }, { 181, 181 }, - { 186, 186 }, { 223, 246 }, { 248, 255 }, { 257, 257 }, @@ -739,7 +788,7 @@ static URange16 Ll_range16[] = { { 1319, 1319 }, { 1377, 1415 }, { 7424, 7467 }, - { 7522, 7543 }, + { 7531, 7543 }, { 7545, 7578 }, { 7681, 7681 }, { 7683, 7683 }, @@ -903,7 +952,7 @@ static URange16 Ll_range16[] = { { 11372, 11372 }, { 11377, 11377 }, { 11379, 11380 }, - { 11382, 11388 }, + { 11382, 11387 }, { 11393, 11393 }, { 11395, 11395 }, { 11397, 11397 }, @@ -956,7 +1005,10 @@ static URange16 Ll_range16[] = { { 11491, 11492 }, { 11500, 11500 }, { 11502, 11502 }, + { 11507, 11507 }, { 11520, 11557 }, + { 11559, 11559 }, + { 11565, 11565 }, { 42561, 42561 }, { 42563, 42563 }, { 42565, 42565 }, @@ -1041,6 +1093,7 @@ static URange16 Ll_range16[] = { { 42892, 42892 }, { 42894, 42894 }, { 42897, 42897 }, + { 42899, 42899 }, { 42913, 42913 }, { 42915, 42915 }, { 42917, 42917 }, @@ -1051,7 +1104,7 @@ static URange16 Ll_range16[] = { { 64275, 64279 }, { 65345, 65370 }, }; -static URange32 
Ll_range32[] = { +static const URange32 Ll_range32[] = { { 66600, 66639 }, { 119834, 119859 }, { 119886, 119892 }, @@ -1082,7 +1135,7 @@ static URange32 Ll_range32[] = { { 120772, 120777 }, { 120779, 120779 }, }; -static URange16 Lm_range16[] = { +static const URange16 Lm_range16[] = { { 688, 705 }, { 710, 721 }, { 736, 740 }, @@ -1106,13 +1159,13 @@ static URange16 Lm_range16[] = { { 6211, 6211 }, { 6823, 6823 }, { 7288, 7293 }, - { 7468, 7521 }, + { 7468, 7530 }, { 7544, 7544 }, { 7579, 7615 }, { 8305, 8305 }, { 8319, 8319 }, { 8336, 8348 }, - { 11389, 11389 }, + { 11388, 11389 }, { 11631, 11631 }, { 11823, 11823 }, { 12293, 12293 }, @@ -1127,13 +1180,18 @@ static URange16 Lm_range16[] = { { 42775, 42783 }, { 42864, 42864 }, { 42888, 42888 }, + { 43000, 43001 }, { 43471, 43471 }, { 43632, 43632 }, { 43741, 43741 }, + { 43763, 43764 }, { 65392, 65392 }, { 65438, 65439 }, }; -static URange16 Nd_range16[] = { +static const URange32 Lm_range32[] = { + { 94099, 94111 }, +}; +static const URange16 Nd_range16[] = { { 48, 57 }, { 1632, 1641 }, { 1776, 1785 }, @@ -1170,12 +1228,16 @@ static URange16 Nd_range16[] = { { 44016, 44025 }, { 65296, 65305 }, }; -static URange32 Nd_range32[] = { +static const URange32 Nd_range32[] = { { 66720, 66729 }, { 69734, 69743 }, + { 69872, 69881 }, + { 69942, 69951 }, + { 70096, 70105 }, + { 71360, 71369 }, { 120782, 120831 }, }; -static URange16 Pc_range16[] = { +static const URange16 Pc_range16[] = { { 95, 95 }, { 8255, 8256 }, { 8276, 8276 }, @@ -1183,7 +1245,7 @@ static URange16 Pc_range16[] = { { 65101, 65103 }, { 65343, 65343 }, }; -static URange16 Lt_range16[] = { +static const URange16 Lt_range16[] = { { 453, 453 }, { 456, 456 }, { 459, 459 }, @@ -1195,7 +1257,7 @@ static URange16 Lt_range16[] = { { 8140, 8140 }, { 8188, 8188 }, }; -static URange16 Lu_range16[] = { +static const URange16 Lu_range16[] = { { 65, 90 }, { 192, 214 }, { 216, 222 }, @@ -1464,6 +1526,8 @@ static URange16 Lu_range16[] = { { 1318, 1318 }, { 1329, 1366 }, { 
4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, { 7680, 7680 }, { 7682, 7682 }, { 7684, 7684 }, @@ -1678,6 +1742,7 @@ static URange16 Lu_range16[] = { { 11490, 11490 }, { 11499, 11499 }, { 11501, 11501 }, + { 11506, 11506 }, { 42560, 42560 }, { 42562, 42562 }, { 42564, 42564 }, @@ -1761,14 +1826,16 @@ static URange16 Lu_range16[] = { { 42891, 42891 }, { 42893, 42893 }, { 42896, 42896 }, + { 42898, 42898 }, { 42912, 42912 }, { 42914, 42914 }, { 42916, 42916 }, { 42918, 42918 }, { 42920, 42920 }, + { 42922, 42922 }, { 65313, 65338 }, }; -static URange32 Lu_range32[] = { +static const URange32 Lu_range32[] = { { 66560, 66599 }, { 119808, 119833 }, { 119860, 119885 }, @@ -1802,7 +1869,7 @@ static URange32 Lu_range32[] = { { 120720, 120744 }, { 120778, 120778 }, }; -static URange16 Pf_range16[] = { +static const URange16 Pf_range16[] = { { 187, 187 }, { 8217, 8217 }, { 8221, 8221 }, @@ -1814,7 +1881,7 @@ static URange16 Pf_range16[] = { { 11805, 11805 }, { 11809, 11809 }, }; -static URange16 Pd_range16[] = { +static const URange16 Pd_range16[] = { { 45, 45 }, { 1418, 1418 }, { 1470, 1470 }, @@ -1823,6 +1890,7 @@ static URange16 Pd_range16[] = { { 8208, 8213 }, { 11799, 11799 }, { 11802, 11802 }, + { 11834, 11835 }, { 12316, 12316 }, { 12336, 12336 }, { 12448, 12448 }, @@ -1831,7 +1899,7 @@ static URange16 Pd_range16[] = { { 65123, 65123 }, { 65293, 65293 }, }; -static URange16 Pe_range16[] = { +static const URange16 Pe_range16[] = { { 41, 41 }, { 93, 93 }, { 125, 125 }, @@ -1841,6 +1909,8 @@ static URange16 Pe_range16[] = { { 8262, 8262 }, { 8318, 8318 }, { 8334, 8334 }, + { 8969, 8969 }, + { 8971, 8971 }, { 9002, 9002 }, { 10089, 10089 }, { 10091, 10091 }, @@ -1903,7 +1973,7 @@ static URange16 Pe_range16[] = { { 65376, 65376 }, { 65379, 65379 }, }; -static URange16 Pi_range16[] = { +static const URange16 Pi_range16[] = { { 171, 171 }, { 8216, 8216 }, { 8219, 8220 }, @@ -1916,7 +1986,7 @@ static URange16 Pi_range16[] = { { 11804, 11804 }, { 11808, 11808 }, }; -static 
URange16 Po_range16[] = { +static const URange16 Po_range16[] = { { 33, 35 }, { 37, 39 }, { 42, 42 }, @@ -1926,7 +1996,8 @@ static URange16 Po_range16[] = { { 63, 64 }, { 92, 92 }, { 161, 161 }, - { 183, 183 }, + { 167, 167 }, + { 182, 183 }, { 191, 191 }, { 894, 894 }, { 903, 903 }, @@ -1948,16 +2019,18 @@ static URange16 Po_range16[] = { { 2142, 2142 }, { 2404, 2405 }, { 2416, 2416 }, + { 2800, 2800 }, { 3572, 3572 }, { 3663, 3663 }, { 3674, 3675 }, { 3844, 3858 }, + { 3860, 3860 }, { 3973, 3973 }, { 4048, 4052 }, { 4057, 4058 }, { 4170, 4175 }, { 4347, 4347 }, - { 4961, 4968 }, + { 4960, 4968 }, { 5741, 5742 }, { 5867, 5869 }, { 5941, 5942 }, @@ -1973,6 +2046,7 @@ static URange16 Po_range16[] = { { 7164, 7167 }, { 7227, 7231 }, { 7294, 7295 }, + { 7360, 7367 }, { 7379, 7379 }, { 8214, 8215 }, { 8224, 8231 }, @@ -1993,7 +2067,7 @@ static URange16 Po_range16[] = { { 11803, 11803 }, { 11806, 11807 }, { 11818, 11822 }, - { 11824, 11825 }, + { 11824, 11833 }, { 12289, 12291 }, { 12349, 12349 }, { 12539, 12539 }, @@ -2011,6 +2085,7 @@ static URange16 Po_range16[] = { { 43486, 43487 }, { 43612, 43615 }, { 43742, 43743 }, + { 43760, 43761 }, { 44011, 44011 }, { 65040, 65046 }, { 65049, 65049 }, @@ -2033,8 +2108,8 @@ static URange16 Po_range16[] = { { 65377, 65377 }, { 65380, 65381 }, }; -static URange32 Po_range32[] = { - { 65792, 65793 }, +static const URange32 Po_range32[] = { + { 65792, 65794 }, { 66463, 66463 }, { 66512, 66512 }, { 67671, 67671 }, @@ -2046,31 +2121,34 @@ static URange32 Po_range32[] = { { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, + { 69952, 69955 }, + { 70085, 70088 }, { 74864, 74867 }, }; -static URange16 Me_range16[] = { +static const URange16 Me_range16[] = { { 1160, 1161 }, { 8413, 8416 }, { 8418, 8420 }, { 42608, 42610 }, }; -static URange16 C_range16[] = { +static const URange16 C_range16[] = { { 0, 31 }, { 127, 159 }, { 173, 173 }, - { 1536, 1539 }, + { 1536, 1540 }, + { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, - { 6068, 6069 
}, + { 6158, 6158 }, { 8203, 8207 }, { 8234, 8238 }, { 8288, 8292 }, - { 8298, 8303 }, + { 8294, 8303 }, { 55296, 63743 }, { 65279, 65279 }, { 65529, 65531 }, }; -static URange32 C_range32[] = { +static const URange32 C_range32[] = { { 69821, 69821 }, { 119155, 119162 }, { 917505, 917505 }, @@ -2078,7 +2156,7 @@ static URange32 C_range32[] = { { 983040, 1048573 }, { 1048576, 1114109 }, }; -static URange16 Mc_range16[] = { +static const URange16 Mc_range16[] = { { 2307, 2307 }, { 2363, 2363 }, { 2366, 2368 }, @@ -2145,7 +2223,7 @@ static URange16 Mc_range16[] = { { 6451, 6456 }, { 6576, 6592 }, { 6600, 6601 }, - { 6681, 6683 }, + { 6681, 6682 }, { 6741, 6741 }, { 6743, 6743 }, { 6753, 6753 }, @@ -2160,6 +2238,7 @@ static URange16 Mc_range16[] = { { 7073, 7073 }, { 7078, 7079 }, { 7082, 7082 }, + { 7084, 7085 }, { 7143, 7143 }, { 7146, 7148 }, { 7150, 7150 }, @@ -2167,7 +2246,8 @@ static URange16 Mc_range16[] = { { 7204, 7211 }, { 7220, 7221 }, { 7393, 7393 }, - { 7410, 7410 }, + { 7410, 7411 }, + { 12334, 12335 }, { 43043, 43044 }, { 43047, 43047 }, { 43136, 43137 }, @@ -2181,21 +2261,32 @@ static URange16 Mc_range16[] = { { 43571, 43572 }, { 43597, 43597 }, { 43643, 43643 }, + { 43755, 43755 }, + { 43758, 43759 }, + { 43765, 43765 }, { 44003, 44004 }, { 44006, 44007 }, { 44009, 44010 }, { 44012, 44012 }, }; -static URange32 Mc_range32[] = { +static const URange32 Mc_range32[] = { { 69632, 69632 }, { 69634, 69634 }, { 69762, 69762 }, { 69808, 69810 }, { 69815, 69816 }, + { 69932, 69932 }, + { 70018, 70018 }, + { 70067, 70069 }, + { 70079, 70080 }, + { 71340, 71340 }, + { 71342, 71343 }, + { 71350, 71350 }, + { 94033, 94078 }, { 119141, 119142 }, { 119149, 119154 }, }; -static URange16 Mn_range16[] = { +static const URange16 Mn_range16[] = { { 768, 879 }, { 1155, 1159 }, { 1425, 1469 }, @@ -2219,6 +2310,7 @@ static URange16 Mn_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, + { 2276, 2302 }, { 2304, 2306 }, { 2362, 2362 }, { 2364, 2364 }, @@ -2304,6 
+2396,7 @@ static URange16 Mn_range16[] = { { 5938, 5940 }, { 5970, 5971 }, { 6002, 6003 }, + { 6068, 6069 }, { 6071, 6077 }, { 6086, 6086 }, { 6089, 6099 }, @@ -2315,6 +2408,7 @@ static URange16 Mn_range16[] = { { 6450, 6450 }, { 6457, 6459 }, { 6679, 6680 }, + { 6683, 6683 }, { 6742, 6742 }, { 6744, 6750 }, { 6752, 6752 }, @@ -2331,6 +2425,7 @@ static URange16 Mn_range16[] = { { 7040, 7041 }, { 7074, 7077 }, { 7080, 7081 }, + { 7083, 7083 }, { 7142, 7142 }, { 7144, 7145 }, { 7149, 7149 }, @@ -2341,6 +2436,7 @@ static URange16 Mn_range16[] = { { 7380, 7392 }, { 7394, 7400 }, { 7405, 7405 }, + { 7412, 7412 }, { 7616, 7654 }, { 7676, 7679 }, { 8400, 8412 }, @@ -2349,10 +2445,11 @@ static URange16 Mn_range16[] = { { 11503, 11505 }, { 11647, 11647 }, { 11744, 11775 }, - { 12330, 12335 }, + { 12330, 12333 }, { 12441, 12442 }, { 42607, 42607 }, - { 42620, 42621 }, + { 42612, 42621 }, + { 42655, 42655 }, { 42736, 42737 }, { 43010, 43010 }, { 43014, 43014 }, @@ -2376,6 +2473,8 @@ static URange16 Mn_range16[] = { { 43703, 43704 }, { 43710, 43711 }, { 43713, 43713 }, + { 43756, 43757 }, + { 43766, 43766 }, { 44005, 44005 }, { 44008, 44008 }, { 44013, 44013 }, @@ -2383,7 +2482,7 @@ static URange16 Mn_range16[] = { { 65024, 65039 }, { 65056, 65062 }, }; -static URange32 Mn_range32[] = { +static const URange32 Mn_range32[] = { { 66045, 66045 }, { 68097, 68099 }, { 68101, 68102 }, @@ -2395,6 +2494,16 @@ static URange32 Mn_range32[] = { { 69760, 69761 }, { 69811, 69814 }, { 69817, 69818 }, + { 69888, 69890 }, + { 69927, 69931 }, + { 69933, 69940 }, + { 70016, 70017 }, + { 70070, 70078 }, + { 71339, 71339 }, + { 71341, 71341 }, + { 71344, 71349 }, + { 71351, 71351 }, + { 94095, 94098 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, @@ -2402,7 +2511,7 @@ static URange32 Mn_range32[] = { { 119362, 119364 }, { 917760, 917999 }, }; -static URange16 M_range16[] = { +static const URange16 M_range16[] = { { 768, 879 }, { 1155, 1161 }, { 1425, 1469 }, @@ -2426,6 +2535,7 @@ 
static URange16 M_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, + { 2276, 2302 }, { 2304, 2307 }, { 2362, 2364 }, { 2366, 2383 }, @@ -2520,7 +2630,7 @@ static URange16 M_range16[] = { { 5938, 5940 }, { 5970, 5971 }, { 6002, 6003 }, - { 6070, 6099 }, + { 6068, 6099 }, { 6109, 6109 }, { 6155, 6157 }, { 6313, 6313 }, @@ -2536,13 +2646,13 @@ static URange16 M_range16[] = { { 6964, 6980 }, { 7019, 7027 }, { 7040, 7042 }, - { 7073, 7082 }, + { 7073, 7085 }, { 7142, 7155 }, { 7204, 7223 }, { 7376, 7378 }, { 7380, 7400 }, { 7405, 7405 }, - { 7410, 7410 }, + { 7410, 7412 }, { 7616, 7654 }, { 7676, 7679 }, { 8400, 8432 }, @@ -2552,7 +2662,8 @@ static URange16 M_range16[] = { { 12330, 12335 }, { 12441, 12442 }, { 42607, 42610 }, - { 42620, 42621 }, + { 42612, 42621 }, + { 42655, 42655 }, { 42736, 42737 }, { 43010, 43010 }, { 43014, 43014 }, @@ -2574,13 +2685,15 @@ static URange16 M_range16[] = { { 43703, 43704 }, { 43710, 43711 }, { 43713, 43713 }, + { 43755, 43759 }, + { 43765, 43766 }, { 44003, 44010 }, { 44012, 44013 }, { 64286, 64286 }, { 65024, 65039 }, { 65056, 65062 }, }; -static URange32 M_range32[] = { +static const URange32 M_range32[] = { { 66045, 66045 }, { 68097, 68099 }, { 68101, 68102 }, @@ -2591,6 +2704,13 @@ static URange32 M_range32[] = { { 69688, 69702 }, { 69760, 69762 }, { 69808, 69818 }, + { 69888, 69890 }, + { 69927, 69940 }, + { 70016, 70018 }, + { 70067, 70080 }, + { 71339, 71351 }, + { 94033, 94078 }, + { 94095, 94098 }, { 119141, 119145 }, { 119149, 119154 }, { 119163, 119170 }, @@ -2599,7 +2719,7 @@ static URange32 M_range32[] = { { 119362, 119364 }, { 917760, 917999 }, }; -static URange16 L_range16[] = { +static const URange16 L_range16[] = { { 65, 90 }, { 97, 122 }, { 170, 170 }, @@ -2647,6 +2767,8 @@ static URange16 L_range16[] = { { 2084, 2084 }, { 2088, 2088 }, { 2112, 2136 }, + { 2208, 2208 }, + { 2210, 2220 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -2752,7 +2874,7 @@ static URange16 L_range16[] = { { 3773, 3773 }, 
{ 3776, 3780 }, { 3782, 3782 }, - { 3804, 3805 }, + { 3804, 3807 }, { 3840, 3840 }, { 3904, 3911 }, { 3913, 3948 }, @@ -2767,9 +2889,10 @@ static URange16 L_range16[] = { { 4213, 4225 }, { 4238, 4238 }, { 4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, { 4304, 4346 }, - { 4348, 4348 }, - { 4352, 4680 }, + { 4348, 4680 }, { 4682, 4685 }, { 4688, 4694 }, { 4696, 4696 }, @@ -2816,12 +2939,13 @@ static URange16 L_range16[] = { { 6981, 6987 }, { 7043, 7072 }, { 7086, 7087 }, - { 7104, 7141 }, + { 7098, 7141 }, { 7168, 7203 }, { 7245, 7247 }, { 7258, 7293 }, { 7401, 7404 }, { 7406, 7409 }, + { 7413, 7414 }, { 7424, 7615 }, { 7680, 7957 }, { 7960, 7965 }, @@ -2863,8 +2987,11 @@ static URange16 L_range16[] = { { 11312, 11358 }, { 11360, 11492 }, { 11499, 11502 }, + { 11506, 11507 }, { 11520, 11557 }, - { 11568, 11621 }, + { 11559, 11559 }, + { 11565, 11565 }, + { 11568, 11623 }, { 11631, 11631 }, { 11648, 11670 }, { 11680, 11686 }, @@ -2888,7 +3015,7 @@ static URange16 L_range16[] = { { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40907 }, + { 19968, 40908 }, { 40960, 42124 }, { 42192, 42237 }, { 42240, 42508 }, @@ -2900,9 +3027,9 @@ static URange16 L_range16[] = { { 42775, 42783 }, { 42786, 42888 }, { 42891, 42894 }, - { 42896, 42897 }, - { 42912, 42921 }, - { 43002, 43009 }, + { 42896, 42899 }, + { 42912, 42922 }, + { 43000, 43009 }, { 43011, 43013 }, { 43015, 43018 }, { 43020, 43042 }, @@ -2927,6 +3054,8 @@ static URange16 L_range16[] = { { 43712, 43712 }, { 43714, 43714 }, { 43739, 43741 }, + { 43744, 43754 }, + { 43762, 43764 }, { 43777, 43782 }, { 43785, 43790 }, { 43793, 43798 }, @@ -2936,8 +3065,7 @@ static URange16 L_range16[] = { { 44032, 55203 }, { 55216, 55238 }, { 55243, 55291 }, - { 63744, 64045 }, - { 64048, 64109 }, + { 63744, 64109 }, { 64112, 64217 }, { 64256, 64262 }, { 64275, 64279 }, @@ -2963,7 +3091,7 @@ static URange16 L_range16[] = { { 65490, 65495 }, { 65498, 65500 }, }; -static URange32 L_range32[] = { +static const URange32 
L_range32[] = { { 65536, 65547 }, { 65549, 65574 }, { 65576, 65594 }, @@ -2988,6 +3116,8 @@ static URange32 L_range32[] = { { 67647, 67669 }, { 67840, 67861 }, { 67872, 67897 }, + { 67968, 68023 }, + { 68030, 68031 }, { 68096, 68096 }, { 68112, 68115 }, { 68117, 68119 }, @@ -2999,9 +3129,17 @@ static URange32 L_range32[] = { { 68608, 68680 }, { 69635, 69687 }, { 69763, 69807 }, + { 69840, 69864 }, + { 69891, 69926 }, + { 70019, 70066 }, + { 70081, 70084 }, + { 71296, 71338 }, { 73728, 74606 }, { 77824, 78894 }, { 92160, 92728 }, + { 93952, 94020 }, + { 94032, 94032 }, + { 94099, 94111 }, { 110592, 110593 }, { 119808, 119892 }, { 119894, 119964 }, @@ -3033,12 +3171,45 @@ static URange32 L_range32[] = { { 120714, 120744 }, { 120746, 120770 }, { 120772, 120779 }, + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 }, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, { 194560, 195101 }, }; -static URange16 N_range16[] = { +static const URange16 N_range16[] = { { 48, 57 }, { 178, 179 }, { 185, 185 }, @@ -3090,6 +3261,7 @@ static URange16 N_range16[] = { { 12344, 12346 }, { 12690, 12693 }, { 12832, 12841 }, + { 12872, 12879 }, { 12881, 12895 }, { 12928, 12937 }, { 12977, 12991 }, @@ -3103,7 +3275,7 @@ static URange16 N_range16[] = { { 44016, 44025 }, { 65296, 65305 }, }; -static URange32 N_range32[] = { 
+static const URange32 N_range32[] = { { 65799, 65843 }, { 65856, 65912 }, { 65930, 65930 }, @@ -3120,12 +3292,16 @@ static URange32 N_range32[] = { { 68472, 68479 }, { 69216, 69246 }, { 69714, 69743 }, + { 69872, 69881 }, + { 69942, 69951 }, + { 70096, 70105 }, + { 71360, 71369 }, { 74752, 74850 }, { 119648, 119665 }, { 120782, 120831 }, { 127232, 127242 }, }; -static URange16 Sk_range16[] = { +static const URange16 Sk_range16[] = { { 94, 94 }, { 96, 96 }, { 168, 168 }, @@ -3154,7 +3330,7 @@ static URange16 Sk_range16[] = { { 65344, 65344 }, { 65507, 65507 }, }; -static URange16 P_range16[] = { +static const URange16 P_range16[] = { { 33, 35 }, { 37, 42 }, { 44, 47 }, @@ -3165,8 +3341,9 @@ static URange16 P_range16[] = { { 123, 123 }, { 125, 125 }, { 161, 161 }, + { 167, 167 }, { 171, 171 }, - { 183, 183 }, + { 182, 183 }, { 187, 187 }, { 191, 191 }, { 894, 894 }, @@ -3190,17 +3367,19 @@ static URange16 P_range16[] = { { 2142, 2142 }, { 2404, 2405 }, { 2416, 2416 }, + { 2800, 2800 }, { 3572, 3572 }, { 3663, 3663 }, { 3674, 3675 }, { 3844, 3858 }, + { 3860, 3860 }, { 3898, 3901 }, { 3973, 3973 }, { 4048, 4052 }, { 4057, 4058 }, { 4170, 4175 }, { 4347, 4347 }, - { 4961, 4968 }, + { 4960, 4968 }, { 5120, 5120 }, { 5741, 5742 }, { 5787, 5788 }, @@ -3217,6 +3396,7 @@ static URange16 P_range16[] = { { 7164, 7167 }, { 7227, 7231 }, { 7294, 7295 }, + { 7360, 7367 }, { 7379, 7379 }, { 8208, 8231 }, { 8240, 8259 }, @@ -3224,6 +3404,7 @@ static URange16 P_range16[] = { { 8275, 8286 }, { 8317, 8318 }, { 8333, 8334 }, + { 8968, 8971 }, { 9001, 9002 }, { 10088, 10101 }, { 10181, 10182 }, @@ -3235,7 +3416,7 @@ static URange16 P_range16[] = { { 11518, 11519 }, { 11632, 11632 }, { 11776, 11822 }, - { 11824, 11825 }, + { 11824, 11835 }, { 12289, 12291 }, { 12296, 12305 }, { 12308, 12319 }, @@ -3257,6 +3438,7 @@ static URange16 P_range16[] = { { 43486, 43487 }, { 43612, 43615 }, { 43742, 43743 }, + { 43760, 43761 }, { 44011, 44011 }, { 64830, 64831 }, { 65040, 65049 }, @@ -3276,8 
+3458,8 @@ static URange16 P_range16[] = { { 65373, 65373 }, { 65375, 65381 }, }; -static URange32 P_range32[] = { - { 65792, 65793 }, +static const URange32 P_range32[] = { + { 65792, 65794 }, { 66463, 66463 }, { 66512, 66512 }, { 67671, 67671 }, @@ -3289,9 +3471,11 @@ static URange32 P_range32[] = { { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, + { 69952, 69955 }, + { 70085, 70088 }, { 74864, 74867 }, }; -static URange16 S_range16[] = { +static const URange16 S_range16[] = { { 36, 36 }, { 43, 43 }, { 60, 62 }, @@ -3299,11 +3483,11 @@ static URange16 S_range16[] = { { 96, 96 }, { 124, 124 }, { 126, 126 }, - { 162, 169 }, + { 162, 166 }, + { 168, 169 }, { 172, 172 }, { 174, 177 }, { 180, 180 }, - { 182, 182 }, { 184, 184 }, { 215, 215 }, { 247, 247 }, @@ -3316,6 +3500,7 @@ static URange16 S_range16[] = { { 900, 901 }, { 1014, 1014 }, { 1154, 1154 }, + { 1423, 1423 }, { 1542, 1544 }, { 1547, 1547 }, { 1550, 1551 }, @@ -3332,7 +3517,8 @@ static URange16 S_range16[] = { { 3449, 3449 }, { 3647, 3647 }, { 3841, 3843 }, - { 3859, 3863 }, + { 3859, 3859 }, + { 3861, 3863 }, { 3866, 3871 }, { 3892, 3892 }, { 3894, 3894 }, @@ -3342,7 +3528,6 @@ static URange16 S_range16[] = { { 4046, 4047 }, { 4053, 4056 }, { 4254, 4255 }, - { 4960, 4960 }, { 5008, 5017 }, { 6107, 6107 }, { 6464, 6464 }, @@ -3359,7 +3544,7 @@ static URange16 S_range16[] = { { 8274, 8274 }, { 8314, 8316 }, { 8330, 8332 }, - { 8352, 8377 }, + { 8352, 8378 }, { 8448, 8449 }, { 8451, 8454 }, { 8456, 8457 }, @@ -3374,7 +3559,8 @@ static URange16 S_range16[] = { { 8512, 8516 }, { 8522, 8525 }, { 8527, 8527 }, - { 8592, 9000 }, + { 8592, 8967 }, + { 8972, 9000 }, { 9003, 9203 }, { 9216, 9254 }, { 9280, 9290 }, @@ -3382,9 +3568,7 @@ static URange16 S_range16[] = { { 9472, 9983 }, { 9985, 10087 }, { 10132, 10180 }, - { 10183, 10186 }, - { 10188, 10188 }, - { 10190, 10213 }, + { 10183, 10213 }, { 10224, 10626 }, { 10649, 10711 }, { 10716, 10747 }, @@ -3405,7 +3589,8 @@ static URange16 S_range16[] = { { 12694, 
12703 }, { 12736, 12771 }, { 12800, 12830 }, - { 12842, 12880 }, + { 12842, 12871 }, + { 12880, 12880 }, { 12896, 12927 }, { 12938, 12976 }, { 12992, 13054 }, @@ -3435,8 +3620,7 @@ static URange16 S_range16[] = { { 65512, 65518 }, { 65532, 65533 }, }; -static URange32 S_range32[] = { - { 65794, 65794 }, +static const URange32 S_range32[] = { { 65847, 65855 }, { 65913, 65929 }, { 65936, 65947 }, @@ -3461,6 +3645,7 @@ static URange32 S_range32[] = { { 120713, 120713 }, { 120745, 120745 }, { 120771, 120771 }, + { 126704, 126705 }, { 126976, 127019 }, { 127024, 127123 }, { 127136, 127150 }, @@ -3468,7 +3653,7 @@ static URange32 S_range32[] = { { 127169, 127183 }, { 127185, 127199 }, { 127248, 127278 }, - { 127280, 127337 }, + { 127280, 127339 }, { 127344, 127386 }, { 127462, 127490 }, { 127504, 127546 }, @@ -3486,29 +3671,18 @@ static URange32 S_range32[] = { { 128066, 128247 }, { 128249, 128252 }, { 128256, 128317 }, + { 128320, 128323 }, { 128336, 128359 }, - { 128507, 128511 }, - { 128513, 128528 }, - { 128530, 128532 }, - { 128534, 128534 }, - { 128536, 128536 }, - { 128538, 128538 }, - { 128540, 128542 }, - { 128544, 128549 }, - { 128552, 128555 }, - { 128557, 128557 }, - { 128560, 128563 }, - { 128565, 128576 }, + { 128507, 128576 }, { 128581, 128591 }, { 128640, 128709 }, { 128768, 128883 }, }; -static URange16 So_range16[] = { - { 166, 167 }, +static const URange16 So_range16[] = { + { 166, 166 }, { 169, 169 }, { 174, 174 }, { 176, 176 }, - { 182, 182 }, { 1154, 1154 }, { 1550, 1551 }, { 1758, 1758 }, @@ -3522,7 +3696,8 @@ static URange16 So_range16[] = { { 3199, 3199 }, { 3449, 3449 }, { 3841, 3843 }, - { 3859, 3863 }, + { 3859, 3859 }, + { 3861, 3863 }, { 3866, 3871 }, { 3892, 3892 }, { 3894, 3894 }, @@ -3532,7 +3707,6 @@ static URange16 So_range16[] = { { 4046, 4047 }, { 4053, 4056 }, { 4254, 4255 }, - { 4960, 4960 }, { 5008, 5017 }, { 6464, 6464 }, { 6622, 6655 }, @@ -3596,7 +3770,8 @@ static URange16 So_range16[] = { { 12694, 12703 }, { 12736, 12771 }, { 
12800, 12830 }, - { 12842, 12880 }, + { 12842, 12871 }, + { 12880, 12880 }, { 12896, 12927 }, { 12938, 12976 }, { 12992, 13054 }, @@ -3613,8 +3788,7 @@ static URange16 So_range16[] = { { 65517, 65518 }, { 65532, 65533 }, }; -static URange32 So_range32[] = { - { 65794, 65794 }, +static const URange32 So_range32[] = { { 65847, 65855 }, { 65913, 65929 }, { 65936, 65947 }, @@ -3636,7 +3810,7 @@ static URange32 So_range32[] = { { 127169, 127183 }, { 127185, 127199 }, { 127248, 127278 }, - { 127280, 127337 }, + { 127280, 127339 }, { 127344, 127386 }, { 127462, 127490 }, { 127504, 127546 }, @@ -3654,24 +3828,14 @@ static URange32 So_range32[] = { { 128066, 128247 }, { 128249, 128252 }, { 128256, 128317 }, + { 128320, 128323 }, { 128336, 128359 }, - { 128507, 128511 }, - { 128513, 128528 }, - { 128530, 128532 }, - { 128534, 128534 }, - { 128536, 128536 }, - { 128538, 128538 }, - { 128540, 128542 }, - { 128544, 128549 }, - { 128552, 128555 }, - { 128557, 128557 }, - { 128560, 128563 }, - { 128565, 128576 }, + { 128507, 128576 }, { 128581, 128591 }, { 128640, 128709 }, { 128768, 128883 }, }; -static URange16 Sm_range16[] = { +static const URange16 Sm_range16[] = { { 43, 43 }, { 60, 62 }, { 124, 124 }, @@ -3699,7 +3863,6 @@ static URange16 Sm_range16[] = { { 8658, 8658 }, { 8660, 8660 }, { 8692, 8959 }, - { 8968, 8971 }, { 8992, 8993 }, { 9084, 9084 }, { 9115, 9139 }, @@ -3709,9 +3872,7 @@ static URange16 Sm_range16[] = { { 9720, 9727 }, { 9839, 9839 }, { 10176, 10180 }, - { 10183, 10186 }, - { 10188, 10188 }, - { 10190, 10213 }, + { 10183, 10213 }, { 10224, 10239 }, { 10496, 10626 }, { 10649, 10711 }, @@ -3729,7 +3890,7 @@ static URange16 Sm_range16[] = { { 65506, 65506 }, { 65513, 65516 }, }; -static URange32 Sm_range32[] = { +static const URange32 Sm_range32[] = { { 120513, 120513 }, { 120539, 120539 }, { 120571, 120571 }, @@ -3740,10 +3901,12 @@ static URange32 Sm_range32[] = { { 120713, 120713 }, { 120745, 120745 }, { 120771, 120771 }, + { 126704, 126705 }, }; -static 
URange16 Sc_range16[] = { +static const URange16 Sc_range16[] = { { 36, 36 }, { 162, 165 }, + { 1423, 1423 }, { 1547, 1547 }, { 2546, 2547 }, { 2555, 2555 }, @@ -3751,7 +3914,7 @@ static URange16 Sc_range16[] = { { 3065, 3065 }, { 3647, 3647 }, { 6107, 6107 }, - { 8352, 8377 }, + { 8352, 8378 }, { 43064, 43064 }, { 65020, 65020 }, { 65129, 65129 }, @@ -3759,70 +3922,69 @@ static URange16 Sc_range16[] = { { 65504, 65505 }, { 65509, 65510 }, }; -static URange16 Z_range16[] = { +static const URange16 Z_range16[] = { { 32, 32 }, { 160, 160 }, { 5760, 5760 }, - { 6158, 6158 }, { 8192, 8202 }, { 8232, 8233 }, { 8239, 8239 }, { 8287, 8287 }, { 12288, 12288 }, }; -static URange16 Zl_range16[] = { +static const URange16 Zl_range16[] = { { 8232, 8232 }, }; -static URange16 Co_range16[] = { +static const URange16 Co_range16[] = { { 57344, 63743 }, }; -static URange32 Co_range32[] = { +static const URange32 Co_range32[] = { { 983040, 1048573 }, { 1048576, 1114109 }, }; -static URange16 Cc_range16[] = { +static const URange16 Cc_range16[] = { { 0, 31 }, { 127, 159 }, }; -static URange16 Cf_range16[] = { +static const URange16 Cf_range16[] = { { 173, 173 }, - { 1536, 1539 }, + { 1536, 1540 }, + { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, - { 6068, 6069 }, + { 6158, 6158 }, { 8203, 8207 }, { 8234, 8238 }, { 8288, 8292 }, - { 8298, 8303 }, + { 8294, 8303 }, { 65279, 65279 }, { 65529, 65531 }, }; -static URange32 Cf_range32[] = { +static const URange32 Cf_range32[] = { { 69821, 69821 }, { 119155, 119162 }, { 917505, 917505 }, { 917536, 917631 }, }; -static URange16 Cs_range16[] = { +static const URange16 Cs_range16[] = { { 55296, 57343 }, }; -static URange16 Zp_range16[] = { +static const URange16 Zp_range16[] = { { 8233, 8233 }, }; -static URange16 Zs_range16[] = { +static const URange16 Zs_range16[] = { { 32, 32 }, { 160, 160 }, { 5760, 5760 }, - { 6158, 6158 }, { 8192, 8202 }, { 8239, 8239 }, { 8287, 8287 }, { 12288, 12288 }, }; -static URange16 Thaana_range16[] = { +static 
const URange16 Thaana_range16[] = { { 1920, 1969 }, }; -static URange16 Telugu_range16[] = { +static const URange16 Telugu_range16[] = { { 3073, 3075 }, { 3077, 3084 }, { 3086, 3088 }, @@ -3838,16 +4000,16 @@ static URange16 Telugu_range16[] = { { 3174, 3183 }, { 3192, 3199 }, }; -static URange16 Cyrillic_range16[] = { +static const URange16 Cyrillic_range16[] = { { 1024, 1156 }, { 1159, 1319 }, { 7467, 7467 }, { 7544, 7544 }, { 11744, 11775 }, - { 42560, 42611 }, - { 42620, 42647 }, + { 42560, 42647 }, + { 42655, 42655 }, }; -static URange16 Hangul_range16[] = { +static const URange16 Hangul_range16[] = { { 4352, 4607 }, { 12334, 12335 }, { 12593, 12686 }, @@ -3863,10 +4025,10 @@ static URange16 Hangul_range16[] = { { 65490, 65495 }, { 65498, 65500 }, }; -static URange32 Old_South_Arabian_range32[] = { +static const URange32 Old_South_Arabian_range32[] = { { 68192, 68223 }, }; -static URange16 Ethiopic_range16[] = { +static const URange16 Ethiopic_range16[] = { { 4608, 4680 }, { 4682, 4685 }, { 4688, 4694 }, @@ -3900,17 +4062,17 @@ static URange16 Ethiopic_range16[] = { { 43808, 43814 }, { 43816, 43822 }, }; -static URange16 Inherited_range16[] = { +static const URange16 Inherited_range16[] = { { 768, 879 }, { 1157, 1158 }, { 1611, 1621 }, - { 1631, 1631 }, { 1648, 1648 }, { 2385, 2386 }, { 7376, 7378 }, { 7380, 7392 }, { 7394, 7400 }, { 7405, 7405 }, + { 7412, 7412 }, { 7616, 7654 }, { 7676, 7679 }, { 8204, 8205 }, @@ -3920,7 +4082,7 @@ static URange16 Inherited_range16[] = { { 65024, 65039 }, { 65056, 65062 }, }; -static URange32 Inherited_range32[] = { +static const URange32 Inherited_range32[] = { { 66045, 66045 }, { 119143, 119145 }, { 119163, 119170 }, @@ -3928,7 +4090,11 @@ static URange32 Inherited_range32[] = { { 119210, 119213 }, { 917760, 917999 }, }; -static URange16 Han_range16[] = { +static const URange32 Meroitic_Cursive_range32[] = { + { 68000, 68023 }, + { 68030, 68031 }, +}; +static const URange16 Han_range16[] = { { 11904, 11929 }, { 11931, 
12019 }, { 12032, 12245 }, @@ -3937,25 +4103,25 @@ static URange16 Han_range16[] = { { 12321, 12329 }, { 12344, 12347 }, { 13312, 19893 }, - { 19968, 40907 }, - { 63744, 64045 }, - { 64048, 64109 }, + { 19968, 40908 }, + { 63744, 64109 }, { 64112, 64217 }, }; -static URange32 Han_range32[] = { +static const URange32 Han_range32[] = { { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, { 194560, 195101 }, }; -static URange16 Armenian_range16[] = { +static const URange16 Armenian_range16[] = { { 1329, 1366 }, { 1369, 1375 }, { 1377, 1415 }, { 1418, 1418 }, + { 1423, 1423 }, { 64275, 64279 }, }; -static URange16 Tamil_range16[] = { +static const URange16 Tamil_range16[] = { { 2946, 2947 }, { 2949, 2954 }, { 2958, 2960 }, @@ -3973,20 +4139,20 @@ static URange16 Tamil_range16[] = { { 3031, 3031 }, { 3046, 3066 }, }; -static URange16 Bopomofo_range16[] = { +static const URange16 Bopomofo_range16[] = { { 746, 747 }, { 12549, 12589 }, { 12704, 12730 }, }; -static URange16 Sundanese_range16[] = { - { 7040, 7082 }, - { 7086, 7097 }, +static const URange16 Sundanese_range16[] = { + { 7040, 7103 }, + { 7360, 7367 }, }; -static URange16 Tagalog_range16[] = { +static const URange16 Tagalog_range16[] = { { 5888, 5900 }, { 5902, 5908 }, }; -static URange16 Malayalam_range16[] = { +static const URange16 Malayalam_range16[] = { { 3330, 3331 }, { 3333, 3340 }, { 3342, 3344 }, @@ -3999,38 +4165,39 @@ static URange16 Malayalam_range16[] = { { 3430, 3445 }, { 3449, 3455 }, }; -static URange32 Carian_range32[] = { +static const URange32 Carian_range32[] = { { 66208, 66256 }, }; -static URange16 Hiragana_range16[] = { +static const URange16 Hiragana_range16[] = { { 12353, 12438 }, { 12445, 12447 }, }; -static URange32 Hiragana_range32[] = { +static const URange32 Hiragana_range32[] = { { 110593, 110593 }, { 127488, 127488 }, }; -static URange16 Tagbanwa_range16[] = { +static const URange16 Tagbanwa_range16[] = { { 5984, 5996 }, { 5998, 6000 }, { 6002, 6003 }, }; -static URange16 
Meetei_Mayek_range16[] = { +static const URange16 Meetei_Mayek_range16[] = { + { 43744, 43766 }, { 43968, 44013 }, { 44016, 44025 }, }; -static URange16 Tai_Le_range16[] = { +static const URange16 Tai_Le_range16[] = { { 6480, 6509 }, { 6512, 6516 }, }; -static URange16 Kayah_Li_range16[] = { +static const URange16 Kayah_Li_range16[] = { { 43264, 43311 }, }; -static URange16 Buginese_range16[] = { +static const URange16 Buginese_range16[] = { { 6656, 6683 }, { 6686, 6687 }, }; -static URange32 Kharoshthi_range32[] = { +static const URange32 Kharoshthi_range32[] = { { 68096, 68099 }, { 68101, 68102 }, { 68108, 68115 }, @@ -4040,22 +4207,22 @@ static URange32 Kharoshthi_range32[] = { { 68159, 68167 }, { 68176, 68184 }, }; -static URange16 Tai_Tham_range16[] = { +static const URange16 Tai_Tham_range16[] = { { 6688, 6750 }, { 6752, 6780 }, { 6783, 6793 }, { 6800, 6809 }, { 6816, 6829 }, }; -static URange32 Old_Italic_range32[] = { +static const URange32 Old_Italic_range32[] = { { 66304, 66334 }, { 66336, 66339 }, }; -static URange32 Old_Persian_range32[] = { +static const URange32 Old_Persian_range32[] = { { 66464, 66499 }, { 66504, 66517 }, }; -static URange16 Latin_range16[] = { +static const URange16 Latin_range16[] = { { 65, 90 }, { 97, 122 }, { 170, 170 }, @@ -4080,43 +4247,46 @@ static URange16 Latin_range16[] = { { 11360, 11391 }, { 42786, 42887 }, { 42891, 42894 }, - { 42896, 42897 }, - { 42912, 42921 }, - { 43002, 43007 }, + { 42896, 42899 }, + { 42912, 42922 }, + { 43000, 43007 }, { 64256, 64262 }, { 65313, 65338 }, { 65345, 65370 }, }; -static URange16 Saurashtra_range16[] = { +static const URange16 Saurashtra_range16[] = { { 43136, 43204 }, { 43214, 43225 }, }; -static URange32 Shavian_range32[] = { +static const URange32 Shavian_range32[] = { { 66640, 66687 }, }; -static URange16 Georgian_range16[] = { +static const URange16 Georgian_range16[] = { { 4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, { 4304, 4346 }, - { 4348, 4348 }, + { 4348, 4351 }, { 
11520, 11557 }, + { 11559, 11559 }, + { 11565, 11565 }, }; -static URange16 Batak_range16[] = { +static const URange16 Batak_range16[] = { { 7104, 7155 }, { 7164, 7167 }, }; -static URange16 Devanagari_range16[] = { +static const URange16 Devanagari_range16[] = { { 2304, 2384 }, { 2387, 2403 }, - { 2406, 2415 }, - { 2417, 2423 }, + { 2406, 2423 }, { 2425, 2431 }, { 43232, 43259 }, }; -static URange16 Thai_range16[] = { +static const URange16 Thai_range16[] = { { 3585, 3642 }, { 3648, 3675 }, }; -static URange16 Tibetan_range16[] = { +static const URange16 Tibetan_range16[] = { { 3840, 3911 }, { 3913, 3948 }, { 3953, 3991 }, @@ -4125,19 +4295,19 @@ static URange16 Tibetan_range16[] = { { 4046, 4052 }, { 4057, 4058 }, }; -static URange16 Tifinagh_range16[] = { - { 11568, 11621 }, +static const URange16 Tifinagh_range16[] = { + { 11568, 11623 }, { 11631, 11632 }, { 11647, 11647 }, }; -static URange32 Ugaritic_range32[] = { +static const URange32 Ugaritic_range32[] = { { 66432, 66461 }, { 66463, 66463 }, }; -static URange16 Braille_range16[] = { +static const URange16 Braille_range16[] = { { 10240, 10495 }, }; -static URange16 Greek_range16[] = { +static const URange16 Greek_range16[] = { { 880, 883 }, { 885, 887 }, { 890, 893 }, @@ -4170,46 +4340,46 @@ static URange16 Greek_range16[] = { { 8182, 8190 }, { 8486, 8486 }, }; -static URange32 Greek_range32[] = { +static const URange32 Greek_range32[] = { { 65856, 65930 }, { 119296, 119365 }, }; -static URange32 Lycian_range32[] = { +static const URange32 Lycian_range32[] = { { 66176, 66204 }, }; -static URange16 Tai_Viet_range16[] = { +static const URange16 Tai_Viet_range16[] = { { 43648, 43714 }, { 43739, 43743 }, }; -static URange16 Vai_range16[] = { +static const URange16 Vai_range16[] = { { 42240, 42539 }, }; -static URange16 Ogham_range16[] = { +static const URange16 Ogham_range16[] = { { 5760, 5788 }, }; -static URange32 Inscriptional_Parthian_range32[] = { +static const URange32 Inscriptional_Parthian_range32[] = { 
{ 68416, 68437 }, { 68440, 68447 }, }; -static URange16 Cham_range16[] = { +static const URange16 Cham_range16[] = { { 43520, 43574 }, { 43584, 43597 }, { 43600, 43609 }, { 43612, 43615 }, }; -static URange16 Syriac_range16[] = { +static const URange16 Syriac_range16[] = { { 1792, 1805 }, { 1807, 1866 }, { 1869, 1871 }, }; -static URange16 Runic_range16[] = { +static const URange16 Runic_range16[] = { { 5792, 5866 }, { 5870, 5872 }, }; -static URange32 Gothic_range32[] = { +static const URange32 Gothic_range32[] = { { 66352, 66378 }, }; -static URange16 Katakana_range16[] = { +static const URange16 Katakana_range16[] = { { 12449, 12538 }, { 12541, 12543 }, { 12784, 12799 }, @@ -4218,33 +4388,38 @@ static URange16 Katakana_range16[] = { { 65382, 65391 }, { 65393, 65437 }, }; -static URange32 Katakana_range32[] = { +static const URange32 Katakana_range32[] = { { 110592, 110592 }, }; -static URange32 Osmanya_range32[] = { +static const URange32 Osmanya_range32[] = { { 66688, 66717 }, { 66720, 66729 }, }; -static URange16 New_Tai_Lue_range16[] = { +static const URange16 New_Tai_Lue_range16[] = { { 6528, 6571 }, { 6576, 6601 }, { 6608, 6618 }, { 6622, 6623 }, }; -static URange16 Ol_Chiki_range16[] = { +static const URange16 Ol_Chiki_range16[] = { { 7248, 7295 }, }; -static URange16 Limbu_range16[] = { +static const URange16 Limbu_range16[] = { { 6400, 6428 }, { 6432, 6443 }, { 6448, 6459 }, { 6464, 6464 }, { 6468, 6479 }, }; -static URange16 Cherokee_range16[] = { +static const URange16 Cherokee_range16[] = { { 5024, 5108 }, }; -static URange16 Oriya_range16[] = { +static const URange32 Miao_range32[] = { + { 93952, 94020 }, + { 94032, 94078 }, + { 94095, 94111 }, +}; +static const URange16 Oriya_range16[] = { { 2817, 2819 }, { 2821, 2828 }, { 2831, 2832 }, @@ -4260,7 +4435,11 @@ static URange16 Oriya_range16[] = { { 2911, 2915 }, { 2918, 2935 }, }; -static URange16 Gujarati_range16[] = { +static const URange32 Sharada_range32[] = { + { 70016, 70088 }, + { 70096, 70105 
}, +}; +static const URange16 Gujarati_range16[] = { { 2689, 2691 }, { 2693, 2701 }, { 2703, 2705 }, @@ -4273,43 +4452,45 @@ static URange16 Gujarati_range16[] = { { 2763, 2765 }, { 2768, 2768 }, { 2784, 2787 }, - { 2790, 2799 }, - { 2801, 2801 }, + { 2790, 2801 }, }; -static URange32 Inscriptional_Pahlavi_range32[] = { +static const URange32 Inscriptional_Pahlavi_range32[] = { { 68448, 68466 }, { 68472, 68479 }, }; -static URange16 Khmer_range16[] = { +static const URange16 Khmer_range16[] = { { 6016, 6109 }, { 6112, 6121 }, { 6128, 6137 }, { 6624, 6655 }, }; -static URange32 Cuneiform_range32[] = { +static const URange32 Cuneiform_range32[] = { { 73728, 74606 }, { 74752, 74850 }, { 74864, 74867 }, }; -static URange16 Mandaic_range16[] = { +static const URange16 Mandaic_range16[] = { { 2112, 2139 }, { 2142, 2142 }, }; -static URange16 Syloti_Nagri_range16[] = { +static const URange16 Syloti_Nagri_range16[] = { { 43008, 43051 }, }; -static URange16 Nko_range16[] = { +static const URange16 Nko_range16[] = { { 1984, 2042 }, }; -static URange16 Canadian_Aboriginal_range16[] = { +static const URange16 Canadian_Aboriginal_range16[] = { { 5120, 5759 }, { 6320, 6389 }, }; -static URange32 Phoenician_range32[] = { +static const URange32 Meroitic_Hieroglyphs_range32[] = { + { 67968, 67999 }, +}; +static const URange32 Phoenician_range32[] = { { 67840, 67867 }, { 67871, 67871 }, }; -static URange16 Bengali_range16[] = { +static const URange16 Bengali_range16[] = { { 2433, 2435 }, { 2437, 2444 }, { 2447, 2448 }, @@ -4325,18 +4506,22 @@ static URange16 Bengali_range16[] = { { 2527, 2531 }, { 2534, 2555 }, }; -static URange32 Kaithi_range32[] = { +static const URange32 Kaithi_range32[] = { { 69760, 69825 }, }; -static URange16 Glagolitic_range16[] = { +static const URange16 Glagolitic_range16[] = { { 11264, 11310 }, { 11312, 11358 }, }; -static URange32 Imperial_Aramaic_range32[] = { +static const URange32 Imperial_Aramaic_range32[] = { { 67648, 67669 }, { 67671, 67679 }, }; 
-static URange16 Gurmukhi_range16[] = { +static const URange32 Sora_Sompeng_range32[] = { + { 69840, 69864 }, + { 69872, 69881 }, +}; +static const URange16 Gurmukhi_range16[] = { { 2561, 2563 }, { 2565, 2570 }, { 2575, 2576 }, @@ -4354,15 +4539,15 @@ static URange16 Gurmukhi_range16[] = { { 2654, 2654 }, { 2662, 2677 }, }; -static URange16 Javanese_range16[] = { +static const URange16 Javanese_range16[] = { { 43392, 43469 }, - { 43471, 43481 }, + { 43472, 43481 }, { 43486, 43487 }, }; -static URange16 Phags_Pa_range16[] = { +static const URange16 Phags_Pa_range16[] = { { 43072, 43127 }, }; -static URange32 Cypriot_range32[] = { +static const URange32 Cypriot_range32[] = { { 67584, 67589 }, { 67592, 67592 }, { 67594, 67637 }, @@ -4370,7 +4555,7 @@ static URange32 Cypriot_range32[] = { { 67644, 67644 }, { 67647, 67647 }, }; -static URange16 Kannada_range16[] = { +static const URange16 Kannada_range16[] = { { 3202, 3203 }, { 3205, 3212 }, { 3214, 3216 }, @@ -4386,7 +4571,7 @@ static URange16 Kannada_range16[] = { { 3302, 3311 }, { 3313, 3314 }, }; -static URange16 Mongolian_range16[] = { +static const URange16 Mongolian_range16[] = { { 6144, 6145 }, { 6148, 6148 }, { 6150, 6158 }, @@ -4394,7 +4579,7 @@ static URange16 Mongolian_range16[] = { { 6176, 6263 }, { 6272, 6314 }, }; -static URange16 Sinhala_range16[] = { +static const URange16 Sinhala_range16[] = { { 3458, 3459 }, { 3461, 3478 }, { 3482, 3505 }, @@ -4407,26 +4592,26 @@ static URange16 Sinhala_range16[] = { { 3544, 3551 }, { 3570, 3572 }, }; -static URange32 Brahmi_range32[] = { +static const URange32 Brahmi_range32[] = { { 69632, 69709 }, { 69714, 69743 }, }; -static URange32 Deseret_range32[] = { +static const URange32 Deseret_range32[] = { { 66560, 66639 }, }; -static URange16 Rejang_range16[] = { +static const URange16 Rejang_range16[] = { { 43312, 43347 }, { 43359, 43359 }, }; -static URange16 Yi_range16[] = { +static const URange16 Yi_range16[] = { { 40960, 42124 }, { 42128, 42182 }, }; -static 
URange16 Balinese_range16[] = { +static const URange16 Balinese_range16[] = { { 6912, 6987 }, { 6992, 7036 }, }; -static URange16 Lao_range16[] = { +static const URange16 Lao_range16[] = { { 3713, 3714 }, { 3716, 3716 }, { 3719, 3720 }, @@ -4444,12 +4629,12 @@ static URange16 Lao_range16[] = { { 3782, 3782 }, { 3784, 3789 }, { 3792, 3801 }, - { 3804, 3805 }, + { 3804, 3807 }, }; -static URange16 Hanunoo_range16[] = { +static const URange16 Hanunoo_range16[] = { { 5920, 5940 }, }; -static URange32 Linear_B_range32[] = { +static const URange32 Linear_B_range32[] = { { 65536, 65547 }, { 65549, 65574 }, { 65576, 65594 }, @@ -4458,32 +4643,32 @@ static URange32 Linear_B_range32[] = { { 65616, 65629 }, { 65664, 65786 }, }; -static URange32 Old_Turkic_range32[] = { +static const URange32 Old_Turkic_range32[] = { { 68608, 68680 }, }; -static URange16 Lepcha_range16[] = { +static const URange16 Lepcha_range16[] = { { 7168, 7223 }, { 7227, 7241 }, { 7245, 7247 }, }; -static URange32 Lydian_range32[] = { +static const URange32 Lydian_range32[] = { { 67872, 67897 }, { 67903, 67903 }, }; -static URange32 Egyptian_Hieroglyphs_range32[] = { +static const URange32 Egyptian_Hieroglyphs_range32[] = { { 77824, 78894 }, }; -static URange16 Samaritan_range16[] = { +static const URange16 Samaritan_range16[] = { { 2048, 2093 }, { 2096, 2110 }, }; -static URange16 Lisu_range16[] = { +static const URange16 Lisu_range16[] = { { 42192, 42239 }, }; -static URange16 Buhid_range16[] = { +static const URange16 Buhid_range16[] = { { 5952, 5971 }, }; -static URange16 Common_range16[] = { +static const URange16 Common_range16[] = { { 0, 64 }, { 91, 96 }, { 123, 169 }, @@ -4506,7 +4691,6 @@ static URange16 Common_range16[] = { { 1632, 1641 }, { 1757, 1757 }, { 2404, 2405 }, - { 2416, 2416 }, { 3647, 3647 }, { 4053, 4056 }, { 4347, 4347 }, @@ -4517,13 +4701,14 @@ static URange16 Common_range16[] = { { 7379, 7379 }, { 7393, 7393 }, { 7401, 7404 }, - { 7406, 7410 }, + { 7406, 7411 }, + { 7413, 7414 }, 
{ 8192, 8203 }, { 8206, 8292 }, - { 8298, 8304 }, + { 8294, 8304 }, { 8308, 8318 }, { 8320, 8334 }, - { 8352, 8377 }, + { 8352, 8378 }, { 8448, 8485 }, { 8487, 8489 }, { 8492, 8497 }, @@ -4534,12 +4719,10 @@ static URange16 Common_range16[] = { { 9216, 9254 }, { 9280, 9290 }, { 9312, 9983 }, - { 9985, 10186 }, - { 10188, 10188 }, - { 10190, 10239 }, + { 9985, 10239 }, { 10496, 11084 }, { 11088, 11097 }, - { 11776, 11825 }, + { 11776, 11835 }, { 12272, 12283 }, { 12288, 12292 }, { 12294, 12294 }, @@ -4558,6 +4741,7 @@ static URange16 Common_range16[] = { { 42752, 42785 }, { 42888, 42890 }, { 43056, 43065 }, + { 43471, 43471 }, { 64830, 64831 }, { 65021, 65021 }, { 65040, 65049 }, @@ -4574,7 +4758,7 @@ static URange16 Common_range16[] = { { 65512, 65518 }, { 65529, 65533 }, }; -static URange32 Common_range32[] = { +static const URange32 Common_range32[] = { { 65792, 65794 }, { 65799, 65843 }, { 65847, 65855 }, @@ -4618,7 +4802,7 @@ static URange32 Common_range32[] = { { 127185, 127199 }, { 127232, 127242 }, { 127248, 127278 }, - { 127280, 127337 }, + { 127280, 127339 }, { 127344, 127386 }, { 127462, 127487 }, { 127489, 127490 }, @@ -4637,42 +4821,40 @@ static URange32 Common_range32[] = { { 128066, 128247 }, { 128249, 128252 }, { 128256, 128317 }, + { 128320, 128323 }, { 128336, 128359 }, - { 128507, 128511 }, - { 128513, 128528 }, - { 128530, 128532 }, - { 128534, 128534 }, - { 128536, 128536 }, - { 128538, 128538 }, - { 128540, 128542 }, - { 128544, 128549 }, - { 128552, 128555 }, - { 128557, 128557 }, - { 128560, 128563 }, - { 128565, 128576 }, + { 128507, 128576 }, { 128581, 128591 }, { 128640, 128709 }, { 128768, 128883 }, { 917505, 917505 }, { 917536, 917631 }, }; -static URange16 Coptic_range16[] = { +static const URange16 Coptic_range16[] = { { 994, 1007 }, - { 11392, 11505 }, + { 11392, 11507 }, { 11513, 11519 }, }; -static URange16 Arabic_range16[] = { - { 1536, 1539 }, +static const URange32 Chakma_range32[] = { + { 69888, 69940 }, + { 69942, 69955 }, +}; 
+static const URange16 Arabic_range16[] = { + { 1536, 1540 }, { 1542, 1547 }, { 1549, 1562 }, + { 1564, 1564 }, { 1566, 1566 }, { 1568, 1599 }, { 1601, 1610 }, - { 1622, 1630 }, + { 1622, 1631 }, { 1642, 1647 }, { 1649, 1756 }, { 1758, 1791 }, { 1872, 1919 }, + { 2208, 2208 }, + { 2210, 2220 }, + { 2276, 2302 }, { 64336, 64449 }, { 64467, 64829 }, { 64848, 64911 }, @@ -4681,24 +4863,58 @@ static URange16 Arabic_range16[] = { { 65136, 65140 }, { 65142, 65276 }, }; -static URange32 Arabic_range32[] = { +static const URange32 Arabic_range32[] = { { 69216, 69246 }, -}; -static URange16 Bamum_range16[] = { + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 }, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, + { 126704, 126705 }, +}; +static const URange16 Bamum_range16[] = { { 42656, 42743 }, }; -static URange32 Bamum_range32[] = { +static const URange32 Bamum_range32[] = { { 92160, 92728 }, }; -static URange16 Myanmar_range16[] = { +static const URange16 Myanmar_range16[] = { { 4096, 4255 }, { 43616, 43643 }, }; -static URange32 Avestan_range32[] = { +static const URange32 Avestan_range32[] = { { 68352, 68405 }, { 68409, 68415 }, }; -static URange16 Hebrew_range16[] = { +static const URange16 Hebrew_range16[] = { { 1425, 1479 }, { 1488, 1514 }, { 1520, 1524 }, @@ -4709,10 +4925,14 @@ static URange16 Hebrew_range16[] = { { 64323, 64324 }, { 64326, 64335 }, }; 
-// 3804 16-bit ranges, 582 32-bit ranges -UGroup unicode_groups[] = { - { "Arabic", +1, Arabic_range16, 18, Arabic_range32, 1 }, - { "Armenian", +1, Armenian_range16, 5, 0, 0 }, +static const URange32 Takri_range32[] = { + { 71296, 71351 }, + { 71360, 71369 }, +}; +// 3867 16-bit ranges, 723 32-bit ranges +const UGroup unicode_groups[] = { + { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, + { "Armenian", +1, Armenian_range16, 6, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 }, @@ -4723,31 +4943,32 @@ UGroup unicode_groups[] = { { "Braille", +1, Braille_range16, 1, 0, 0 }, { "Buginese", +1, Buginese_range16, 2, 0, 0 }, { "Buhid", +1, Buhid_range16, 1, 0, 0 }, - { "C", +1, C_range16, 14, C_range32, 6 }, + { "C", +1, C_range16, 15, C_range32, 6 }, { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, 0, 0 }, { "Carian", +1, 0, 0, Carian_range32, 1 }, { "Cc", +1, Cc_range16, 2, 0, 0 }, - { "Cf", +1, Cf_range16, 11, Cf_range32, 4 }, + { "Cf", +1, Cf_range16, 12, Cf_range32, 4 }, + { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 4, 0, 0 }, { "Cherokee", +1, Cherokee_range16, 1, 0, 0 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 89, Common_range32, 80 }, + { "Common", +1, Common_range16, 88, Common_range32, 70 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, { "Cuneiform", +1, 0, 0, Cuneiform_range32, 3 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, { "Cyrillic", +1, Cyrillic_range16, 7, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, - { "Devanagari", +1, Devanagari_range16, 6, 0, 0 }, + { "Devanagari", +1, Devanagari_range16, 5, 0, 0 }, { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 }, { "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 }, - { "Georgian", +1, Georgian_range16, 4, 0, 0 }, + { "Georgian", +1, Georgian_range16, 8, 0, 0 }, { 
"Glagolitic", +1, Glagolitic_range16, 2, 0, 0 }, { "Gothic", +1, 0, 0, Gothic_range32, 1 }, { "Greek", +1, Greek_range16, 31, Greek_range32, 2 }, - { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, + { "Gujarati", +1, Gujarati_range16, 13, 0, 0 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 12, Han_range32, 4 }, + { "Han", +1, Han_range16, 11, Han_range32, 4 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, @@ -4763,35 +4984,38 @@ UGroup unicode_groups[] = { { "Kayah_Li", +1, Kayah_Li_range16, 1, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, - { "L", +1, L_range16, 362, L_range32, 73 }, + { "L", +1, L_range16, 370, L_range32, 116 }, { "Lao", +1, Lao_range16, 18, 0, 0 }, { "Latin", +1, Latin_range16, 30, 0, 0 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, { "Lisu", +1, Lisu_range16, 1, 0, 0 }, - { "Ll", +1, Ll_range16, 580, Ll_range32, 29 }, - { "Lm", +1, Lm_range16, 49, 0, 0 }, - { "Lo", +1, Lo_range16, 280, Lo_range32, 43 }, + { "Ll", +1, Ll_range16, 582, Ll_range32, 29 }, + { "Lm", +1, Lm_range16, 51, Lm_range32, 1 }, + { "Lo", +1, Lo_range16, 286, Lo_range32, 85 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, - { "Lu", +1, Lu_range16, 571, Lu_range32, 32 }, + { "Lu", +1, Lu_range16, 576, Lu_range32, 32 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 176, M_range32, 17 }, + { "M", +1, M_range16, 180, M_range32, 24 }, { "Malayalam", +1, Malayalam_range16, 11, 0, 0 }, { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, - { "Mc", +1, Mc_range16, 106, Mc_range32, 7 }, + { "Mc", +1, Mc_range16, 111, Mc_range32, 15 }, { "Me", +1, Me_range16, 4, 0, 0 }, - { "Meetei_Mayek", +1, Meetei_Mayek_range16, 2, 0, 0 }, - { "Mn", +1, Mn_range16, 186, Mn_range32, 17 }, + { 
"Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, + { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 2 }, + { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, + { "Miao", +1, 0, 0, Miao_range32, 3 }, + { "Mn", +1, Mn_range16, 194, Mn_range32, 27 }, { "Mongolian", +1, Mongolian_range16, 6, 0, 0 }, { "Myanmar", +1, Myanmar_range16, 2, 0, 0 }, - { "N", +1, N_range16, 63, N_range32, 20 }, - { "Nd", +1, Nd_range16, 35, Nd_range32, 3 }, + { "N", +1, N_range16, 64, N_range32, 24 }, + { "Nd", +1, Nd_range16, 35, Nd_range32, 7 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, { "Nko", +1, Nko_range16, 1, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 27, No_range32, 14 }, + { "No", +1, No_range16, 28, No_range32, 14 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, { "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 }, { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 }, @@ -4800,27 +5024,29 @@ UGroup unicode_groups[] = { { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, { "Oriya", +1, Oriya_range16, 14, 0, 0 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 120, P_range32, 13 }, + { "P", +1, P_range16, 126, P_range32, 15 }, { "Pc", +1, Pc_range16, 6, 0, 0 }, - { "Pd", +1, Pd_range16, 15, 0, 0 }, - { "Pe", +1, Pe_range16, 70, 0, 0 }, + { "Pd", +1, Pd_range16, 16, 0, 0 }, + { "Pe", +1, Pe_range16, 72, 0, 0 }, { "Pf", +1, Pf_range16, 10, 0, 0 }, { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 115, Po_range32, 13 }, - { "Ps", +1, Ps_range16, 72, 0, 0 }, + { "Po", +1, Po_range16, 120, Po_range32, 15 }, + { "Ps", +1, Ps_range16, 74, 0, 0 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 142, S_range32, 66 }, + { "S", +1, S_range16, 143, S_range32, 56 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, 
Saurashtra_range16, 2, 0, 0 }, - { "Sc", +1, Sc_range16, 16, 0, 0 }, + { "Sc", +1, Sc_range16, 17, 0, 0 }, + { "Sharada", +1, 0, 0, Sharada_range32, 2 }, { "Shavian", +1, 0, 0, Shavian_range32, 1 }, { "Sinhala", +1, Sinhala_range16, 11, 0, 0 }, { "Sk", +1, Sk_range16, 27, 0, 0 }, - { "Sm", +1, Sm_range16, 56, Sm_range32, 10 }, - { "So", +1, So_range16, 108, So_range32, 56 }, + { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, + { "So", +1, So_range16, 108, So_range32, 45 }, + { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, { "Sundanese", +1, Sundanese_range16, 2, 0, 0 }, { "Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 }, { "Syriac", +1, Syriac_range16, 3, 0, 0 }, @@ -4829,6 +5055,7 @@ UGroup unicode_groups[] = { { "Tai_Le", +1, Tai_Le_range16, 2, 0, 0 }, { "Tai_Tham", +1, Tai_Tham_range16, 5, 0, 0 }, { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, + { "Takri", +1, 0, 0, Takri_range32, 2 }, { "Tamil", +1, Tamil_range16, 16, 0, 0 }, { "Telugu", +1, Telugu_range16, 14, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, @@ -4838,12 +5065,12 @@ UGroup unicode_groups[] = { { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 }, { "Vai", +1, Vai_range16, 1, 0, 0 }, { "Yi", +1, Yi_range16, 2, 0, 0 }, - { "Z", +1, Z_range16, 9, 0, 0 }, + { "Z", +1, Z_range16, 8, 0, 0 }, { "Zl", +1, Zl_range16, 1, 0, 0 }, { "Zp", +1, Zp_range16, 1, 0, 0 }, - { "Zs", +1, Zs_range16, 8, 0, 0 }, + { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -int num_unicode_groups = 131; +const int num_unicode_groups = 138; } // namespace re2 diff --git a/third_party/re2/re2/unicode_groups.h b/third_party/re2/re2/unicode_groups.h index f91c51f..fc1c253 100644 --- a/third_party/re2/re2/unicode_groups.h +++ b/third_party/re2/re2/unicode_groups.h @@ -30,34 +30,34 @@ struct URange16 struct URange32 { - uint32 lo; - uint32 hi; + Rune lo; + Rune hi; }; struct UGroup { const char *name; int sign; // +1 for [abc], -1 for [^abc] - URange16 *r16; + const URange16 *r16; int nr16; - URange32 *r32; + const URange32 *r32; int nr32; 
}; // Named by property or script name (e.g., "Nd", "N", "Han"). // Negated groups are not included. -extern UGroup unicode_groups[]; -extern int num_unicode_groups; +extern const UGroup unicode_groups[]; +extern const int num_unicode_groups; // Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). // Negated groups are included. -extern UGroup posix_groups[]; -extern int num_posix_groups; +extern const UGroup posix_groups[]; +extern const int num_posix_groups; // Named by Perl name (e.g., "\\d", "\\D"). // Negated groups are included. -extern UGroup perl_groups[]; -extern int num_perl_groups; +extern const UGroup perl_groups[]; +extern const int num_perl_groups; } // namespace re2 diff --git a/third_party/re2/re2/walker-inl.h b/third_party/re2/re2/walker-inl.h index 4d2045f..bdcf7f5 100644 --- a/third_party/re2/re2/walker-inl.h +++ b/third_party/re2/re2/walker-inl.h @@ -92,7 +92,7 @@ template<typename T> class Regexp::Walker { T WalkInternal(Regexp* re, T top_arg, bool use_copy); - DISALLOW_EVIL_CONSTRUCTORS(Walker); + DISALLOW_COPY_AND_ASSIGN(Walker); }; template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re, diff --git a/third_party/re2/re2_test.bzl b/third_party/re2/re2_test.bzl new file mode 100644 index 0000000..a52cd9f --- /dev/null +++ b/third_party/re2/re2_test.bzl @@ -0,0 +1,14 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Define a bazel macro that creates cc_test for re2. 
+def re2_test(name, deps=[]): + native.cc_test( + name=name, + srcs=["re2/testing/%s.cc" % (name)], + deps=[ + ":re2", + ":test", + ] + deps + ) diff --git a/third_party/re2/testinstall.cc b/third_party/re2/testinstall.cc index 17edfb4..97990c2 100644 --- a/third_party/re2/testinstall.cc +++ b/third_party/re2/testinstall.cc @@ -14,6 +14,8 @@ int main(void) { f.Add("a.*b.*c", RE2::DefaultOptions, &id); vector<string> v; f.Compile(&v); + vector<int> ids; + f.FirstMatch("abbccc", ids); if(RE2::FullMatch("axbyc", "a.*b.*c")) { printf("PASS\n"); diff --git a/third_party/re2/util/arena.cc b/third_party/re2/util/arena.cc deleted file mode 100644 index 25753c5..0000000 --- a/third_party/re2/util/arena.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2000 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "util/util.h" - -namespace re2 { - -// ---------------------------------------------------------------------- -// UnsafeArena::UnsafeArena() -// UnsafeArena::~UnsafeArena() -// Destroying the arena automatically calls Reset() -// ---------------------------------------------------------------------- - - -UnsafeArena::UnsafeArena(const size_t block_size) - : block_size_(block_size), - freestart_(NULL), // set for real in Reset() - last_alloc_(NULL), - remaining_(0), - blocks_alloced_(1), - overflow_blocks_(NULL) { - assert(block_size > kDefaultAlignment); - - first_blocks_[0].mem = reinterpret_cast<char*>(malloc(block_size_)); - first_blocks_[0].size = block_size_; - - Reset(); -} - -UnsafeArena::~UnsafeArena() { - FreeBlocks(); - assert(overflow_blocks_ == NULL); // FreeBlocks() should do that - // The first X blocks stay allocated always by default. Delete them now. 
- for (int i = 0; i < blocks_alloced_; i++) - free(first_blocks_[i].mem); -} - -// ---------------------------------------------------------------------- -// UnsafeArena::Reset() -// Clears all the memory an arena is using. -// ---------------------------------------------------------------------- - -void UnsafeArena::Reset() { - FreeBlocks(); - freestart_ = first_blocks_[0].mem; - remaining_ = first_blocks_[0].size; - last_alloc_ = NULL; - - // We do not know for sure whether or not the first block is aligned, - // so we fix that right now. - const int overage = reinterpret_cast<uintptr_t>(freestart_) & - (kDefaultAlignment-1); - if (overage > 0) { - const int waste = kDefaultAlignment - overage; - freestart_ += waste; - remaining_ -= waste; - } - freestart_when_empty_ = freestart_; - assert(!(reinterpret_cast<uintptr_t>(freestart_)&(kDefaultAlignment-1))); -} - -// ------------------------------------------------------------- -// UnsafeArena::AllocNewBlock() -// Adds and returns an AllocatedBlock. -// The returned AllocatedBlock* is valid until the next call -// to AllocNewBlock or Reset. (i.e. anything that might -// affect overflow_blocks_). -// ------------------------------------------------------------- - -UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) { - AllocatedBlock *block; - // Find the next block. - if ( blocks_alloced_ < arraysize(first_blocks_) ) { - // Use one of the pre-allocated blocks - block = &first_blocks_[blocks_alloced_++]; - } else { // oops, out of space, move to the vector - if (overflow_blocks_ == NULL) overflow_blocks_ = new vector<AllocatedBlock>; - // Adds another block to the vector. - overflow_blocks_->resize(overflow_blocks_->size()+1); - // block points to the last block of the vector. 
- block = &overflow_blocks_->back(); - } - - block->mem = reinterpret_cast<char*>(malloc(block_size)); - block->size = block_size; - - return block; -} - -// ---------------------------------------------------------------------- -// UnsafeArena::GetMemoryFallback() -// We take memory out of our pool, aligned on the byte boundary -// requested. If we don't have space in our current pool, we -// allocate a new block (wasting the remaining space in the -// current block) and give you that. If your memory needs are -// too big for a single block, we make a special your-memory-only -// allocation -- this is equivalent to not using the arena at all. -// ---------------------------------------------------------------------- - -void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) { - if (size == 0) - return NULL; // stl/stl_alloc.h says this is okay - - assert(align > 0 && 0 == (align & (align - 1))); // must be power of 2 - - // If the object is more than a quarter of the block size, allocate - // it separately to avoid wasting too much space in leftover bytes - if (block_size_ == 0 || size > block_size_/4) { - // then it gets its own block in the arena - assert(align <= kDefaultAlignment); // because that's what new gives us - // This block stays separate from the rest of the world; in particular - // we don't update last_alloc_ so you can't reclaim space on this block. 
- return AllocNewBlock(size)->mem; - } - - const int overage = - (reinterpret_cast<uintptr_t>(freestart_) & (align-1)); - if (overage) { - const int waste = align - overage; - freestart_ += waste; - if (waste < remaining_) { - remaining_ -= waste; - } else { - remaining_ = 0; - } - } - if (size > remaining_) { - AllocatedBlock *block = AllocNewBlock(block_size_); - freestart_ = block->mem; - remaining_ = block->size; - } - remaining_ -= size; - last_alloc_ = freestart_; - freestart_ += size; - assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0); - return reinterpret_cast<void*>(last_alloc_); -} - -// ---------------------------------------------------------------------- -// UnsafeArena::FreeBlocks() -// Unlike GetMemory(), which does actual work, ReturnMemory() is a -// no-op: we don't "free" memory until Reset() is called. We do -// update some stats, though. Note we do no checking that the -// pointer you pass in was actually allocated by us, or that it -// was allocated for the size you say, so be careful here! -// FreeBlocks() does the work for Reset(), actually freeing all -// memory allocated in one fell swoop. -// ---------------------------------------------------------------------- - -void UnsafeArena::FreeBlocks() { - for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced - free(first_blocks_[i].mem); - first_blocks_[i].mem = NULL; - first_blocks_[i].size = 0; - } - blocks_alloced_ = 1; - if (overflow_blocks_ != NULL) { - vector<AllocatedBlock>::iterator it; - for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) { - free(it->mem); - } - delete overflow_blocks_; // These should be used very rarely - overflow_blocks_ = NULL; - } -} - -} // namespace re2 diff --git a/third_party/re2/util/arena.h b/third_party/re2/util/arena.h deleted file mode 100644 index 7eb385b..0000000 --- a/third_party/re2/util/arena.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2000 The RE2 Authors. All Rights Reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Sometimes it is necessary to allocate a large number of small -// objects. Doing this the usual way (malloc, new) is slow, -// especially for multithreaded programs. An UnsafeArena provides a -// mark/release method of memory management: it asks for a large chunk -// from the operating system and doles it out bit by bit as required. -// Then you free all the memory at once by calling UnsafeArena::Reset(). -// The "Unsafe" refers to the fact that UnsafeArena is not safe to -// call from multiple threads. -// -// The global operator new that can be used as follows: -// -// #include "lib/arena-inl.h" -// -// UnsafeArena arena(1000); -// Foo* foo = new (AllocateInArena, &arena) Foo; -// - -#ifndef RE2_UTIL_ARENA_H_ -#define RE2_UTIL_ARENA_H_ - -namespace re2 { - -// This class is thread-compatible. -class UnsafeArena { - public: - UnsafeArena(const size_t block_size); - virtual ~UnsafeArena(); - - void Reset(); - - // This should be the worst-case alignment for any type. This is - // good for IA-32, SPARC version 7 (the last one I know), and - // supposedly Alpha. i386 would be more time-efficient with a - // default alignment of 8, but ::operator new() uses alignment of 4, - // and an assertion will fail below after the call to MakeNewBlock() - // if you try to use a larger alignment. 
-#ifdef __i386__ - static const int kDefaultAlignment = 4; -#else - static const int kDefaultAlignment = 8; -#endif - - private: - void* GetMemoryFallback(const size_t size, const int align); - - public: - void* GetMemory(const size_t size, const int align) { - if ( size > 0 && size < remaining_ && align == 1 ) { // common case - last_alloc_ = freestart_; - freestart_ += size; - remaining_ -= size; - return reinterpret_cast<void*>(last_alloc_); - } - return GetMemoryFallback(size, align); - } - - private: - struct AllocatedBlock { - char *mem; - size_t size; - }; - - // The returned AllocatedBlock* is valid until the next call to AllocNewBlock - // or Reset (i.e. anything that might affect overflow_blocks_). - AllocatedBlock *AllocNewBlock(const size_t block_size); - - const AllocatedBlock *IndexToBlock(int index) const; - - const size_t block_size_; - char* freestart_; // beginning of the free space in most recent block - char* freestart_when_empty_; // beginning of the free space when we're empty - char* last_alloc_; // used to make sure ReturnBytes() is safe - size_t remaining_; - // STL vector isn't as efficient as it could be, so we use an array at first - int blocks_alloced_; // how many of the first_blocks_ have been alloced - AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary - // if the first_blocks_ aren't enough, expand into overflow_blocks_. - vector<AllocatedBlock>* overflow_blocks_; - - void FreeBlocks(); // Frees all except first block - - DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena); -}; - -// Operators for allocation on the arena -// Syntax: new (AllocateInArena, arena) MyClass; -// STL containers, etc. 
-enum AllocateInArenaType { AllocateInArena }; - -} // namespace re2 - -inline void* operator new(size_t size, - re2::AllocateInArenaType /* unused */, - re2::UnsafeArena *arena) { - return reinterpret_cast<char*>(arena->GetMemory(size, 1)); -} - -#endif // RE2_UTIL_ARENA_H_ - diff --git a/third_party/re2/util/atomicops.h b/third_party/re2/util/atomicops.h index 11c1196..dc944e7 100644 --- a/third_party/re2/util/atomicops.h +++ b/third_party/re2/util/atomicops.h @@ -5,6 +5,35 @@ #ifndef RE2_UTIL_ATOMICOPS_H__ #define RE2_UTIL_ATOMICOPS_H__ +// The memory ordering constraints resemble the ones in C11. +// RELAXED - no memory ordering, just an atomic operation. +// CONSUME - data-dependent ordering. +// ACQUIRE - prevents memory accesses from hoisting above the operation. +// RELEASE - prevents memory accesses from sinking below the operation. + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if !defined(OS_NACL) && (__has_builtin(__atomic_load_n) || (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__ >= 40801)) + +#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0) +#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0) +#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0) +#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED) +#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE) + +#else // old compiler + +#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0) +#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0) +#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0) +#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0) +#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0) + +// WriteMemoryBarrier(), ReadMemoryBarrier() and 
MaybeReadMemoryBarrier() +// are an implementation detail and must not be used in the rest of the code. + #if defined(__i386__) static inline void WriteMemoryBarrier() { @@ -21,10 +50,16 @@ static inline void WriteMemoryBarrier() { __asm__ __volatile__("sfence" : : : "memory"); } -#elif defined(__ppc__) +#elif defined(__ppc__) || defined(__powerpc64__) static inline void WriteMemoryBarrier() { - __asm__ __volatile__("eieio" : : : "memory"); + __asm__ __volatile__("lwsync" : : : "memory"); +} + +#elif defined(__aarch64__) + +static inline void WriteMemoryBarrier() { + __asm__ __volatile__("dmb st" : : : "memory"); } #elif defined(__alpha__) @@ -33,6 +68,43 @@ static inline void WriteMemoryBarrier() { __asm__ __volatile__("wmb" : : : "memory"); } +#elif defined(__arm__) && defined(__linux__) + +// Linux on ARM puts a suitable memory barrier at a magic address for us to call. +static inline void WriteMemoryBarrier() { + ((void(*)(void))0xffff0fa0)(); +} + +#elif defined(__windows__) || defined(_WIN32) + +#include <intrin.h> +#include <windows.h> + +static inline void WriteMemoryBarrier() { +#if defined(_M_IX86) || defined(_M_X64) + // x86 and x64 CPUs have a strong memory model that prohibits most types of + // reordering, so a non-instruction intrinsic to suppress compiler reordering + // is sufficient. _WriteBarrier is deprecated, but is still appropriate for + // the "old compiler" path (pre C++11). + _WriteBarrier(); +#else + LONG x; + ::InterlockedExchange(&x, 0); +#endif +} + +#elif defined(OS_NACL) + +static inline void WriteMemoryBarrier() { + __sync_synchronize(); +} + +#elif defined(__mips__) + +static inline void WriteMemoryBarrier() { + __asm__ __volatile__("sync" : : : "memory"); +} + #else #include "util/mutex.h" @@ -50,19 +122,9 @@ static inline void WriteMemoryBarrier() { re2::MutexLock l(&mu); } -/* -#error Need WriteMemoryBarrier for architecture. 
- -// Windows -inline void WriteMemoryBarrier() { - LONG x; - ::InterlockedExchange(&x, 0); -} -*/ - #endif -// Alpha has very weak memory ordering. If relying on WriteBarriers, must one +// Alpha has very weak memory ordering. If relying on WriteBarriers, one must // use read barriers for the readers too. #if defined(__alpha__) @@ -74,6 +136,44 @@ static inline void MaybeReadMemoryBarrier() { static inline void MaybeReadMemoryBarrier() {} -#endif // __alpha__ +#endif // __alpha__ + +// Read barrier for various targets. + +#if defined(__ppc__) || defined(__powerpc64__) + +static inline void ReadMemoryBarrier() { + __asm__ __volatile__("lwsync" : : : "memory"); +} + +#elif defined(__aarch64__) + +static inline void ReadMemoryBarrier() { + __asm__ __volatile__("dmb ld" : : : "memory"); +} + +#elif defined(__alpha__) + +static inline void ReadMemoryBarrier() { + __asm__ __volatile__("mb" : : : "memory"); +} + +#elif defined(__mips__) + +static inline void ReadMemoryBarrier() { + __asm__ __volatile__("sync" : : : "memory"); +} + +#else + +static inline void ReadMemoryBarrier() {} + +#endif + +#endif // old compiler + +#ifndef NO_THREAD_SAFETY_ANALYSIS +#define NO_THREAD_SAFETY_ANALYSIS +#endif #endif // RE2_UTIL_ATOMICOPS_H__ diff --git a/third_party/re2/util/benchmark.cc b/third_party/re2/util/benchmark.cc index c3aad7e..b77e22d 100644 --- a/third_party/re2/util/benchmark.cc +++ b/third_party/re2/util/benchmark.cc @@ -25,10 +25,29 @@ void Benchmark::Register() { } static int64 nsec() { +#if defined(__APPLE__) struct timeval tv; if(gettimeofday(&tv, 0) < 0) return -1; return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000; +#elif defined(_WIN32) + // https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408.aspx + // describes how to query ticks and convert to microseconds. Of course, + // what we want in this case are nanoseconds. Also, note that .QuadPart + // is a signed 64-bit integer, so casting to int64 shouldn't be needed. 
+ LARGE_INTEGER freq; + QueryPerformanceFrequency(&freq); + LARGE_INTEGER ticks; + QueryPerformanceCounter(&ticks); + ticks.QuadPart *= 1000*1000*1000; + ticks.QuadPart /= freq.QuadPart; + return ticks.QuadPart; +#else + struct timespec tp; + if(clock_gettime(CLOCK_REALTIME, &tp) < 0) + return -1; + return (int64)tp.tv_sec*1000*1000*1000 + tp.tv_nsec; +#endif } static int64 bytes; @@ -105,9 +124,9 @@ void RunBench(Benchmark* b, int nthread, int siz) { while(ns < (int)1e9 && n < (int)1e9) { last = n; if(ns/n == 0) - n = 1e9; + n = (int)1e9; else - n = 1e9 / (ns/n); + n = (int)1e9 / static_cast<int>(ns/n); n = max(last+1, min(n+n/2, 100*last)); n = round(n); diff --git a/third_party/re2/util/flags.h b/third_party/re2/util/flags.h index 77a06a22..98d5c06 100644 --- a/third_party/re2/util/flags.h +++ b/third_party/re2/util/flags.h @@ -5,7 +5,7 @@ // Simplified version of Google's command line flags. // Does not support parsing the command line. // If you want to do that, see -// http://code.google.com/p/google-gflags +// https://gflags.github.io/gflags/ #ifndef RE2_UTIL_FLAGS_H__ #define RE2_UTIL_FLAGS_H__ diff --git a/third_party/re2/util/logging.cc b/third_party/re2/util/logging.cc new file mode 100644 index 0000000..8a59862 --- /dev/null +++ b/third_party/re2/util/logging.cc @@ -0,0 +1,9 @@ +// Copyright 2015 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "util/logging.h" + +DEFINE_int32(minloglevel, 0, // INFO + "Messages logged at a lower level than this don't actually get " + "logged anywhere"); diff --git a/third_party/re2/util/logging.h b/third_party/re2/util/logging.h index d0a2d87..feac199 100644 --- a/third_party/re2/util/logging.h +++ b/third_party/re2/util/logging.h @@ -7,13 +7,13 @@ #ifndef RE2_UTIL_LOGGING_H__ #define RE2_UTIL_LOGGING_H__ -#ifndef WIN32 -#include <unistd.h> /* for write */ -#endif +#include <stdio.h> /* for fwrite */ #include <sstream> -#ifdef WIN32 -#include <io.h> -#endif + +#include "util/util.h" +#include "util/flags.h" + +DECLARE_int32(minloglevel); // Debug-only checking. #define DCHECK(condition) assert(condition) @@ -33,13 +33,16 @@ #define CHECK_EQ(x, y) CHECK((x) == (y)) #define CHECK_NE(x, y) CHECK((x) != (y)) -#define LOG_INFO LogMessage(__FILE__, __LINE__) -#define LOG_ERROR LOG_INFO -#define LOG_WARNING LOG_INFO +#define LOG_INFO LogMessage(__FILE__, __LINE__, 0) +#define LOG_WARNING LogMessage(__FILE__, __LINE__, 1) +#define LOG_ERROR LogMessage(__FILE__, __LINE__, 2) #define LOG_FATAL LogMessageFatal(__FILE__, __LINE__) #define LOG_QFATAL LOG_FATAL -#define VLOG(x) if((x)>0){}else LOG_INFO.stream() +// It seems that one of the Windows header files defines ERROR as 0. 
+#ifdef _WIN32 +#define LOG_0 LOG_INFO +#endif #ifdef NDEBUG #define DEBUG_MODE 0 @@ -51,16 +54,21 @@ #define LOG(severity) LOG_ ## severity.stream() +#define VLOG(x) if((x)>0){}else LOG_INFO.stream() + class LogMessage { public: - LogMessage(const char* file, int line) : flushed_(false) { + LogMessage(const char* file, int line, int severity) + : severity_(severity), flushed_(false) { stream() << file << ":" << line << ": "; } void Flush() { stream() << "\n"; - string s = str_.str(); - int n = (int)s.size(); // shut up msvc - if(write(2, s.data(), n) < 0) {} // shut up gcc + if (severity_ >= re2::FLAGS_minloglevel) { + string s = str_.str(); + size_t n = s.size(); + if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc + } flushed_ = true; } ~LogMessage() { @@ -69,23 +77,33 @@ class LogMessage { } } ostream& stream() { return str_; } - + private: + const int severity_; bool flushed_; std::ostringstream str_; - DISALLOW_EVIL_CONSTRUCTORS(LogMessage); + DISALLOW_COPY_AND_ASSIGN(LogMessage); }; +#ifdef _WIN32 +#pragma warning(push) +#pragma warning(disable: 4722) // destructor never returns +#endif + class LogMessageFatal : public LogMessage { public: LogMessageFatal(const char* file, int line) - : LogMessage(file, line) { } + : LogMessage(file, line, 3) {} ~LogMessageFatal() { Flush(); abort(); } private: - DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal); + DISALLOW_COPY_AND_ASSIGN(LogMessageFatal); }; +#ifdef _WIN32 +#pragma warning(pop) +#endif + #endif // RE2_UTIL_LOGGING_H__ diff --git a/third_party/re2/util/mutex.h b/third_party/re2/util/mutex.h index e321fae..b479e48 100644 --- a/third_party/re2/util/mutex.h +++ b/third_party/re2/util/mutex.h @@ -10,19 +10,40 @@ #ifndef RE2_UTIL_MUTEX_H_ #define RE2_UTIL_MUTEX_H_ +#include <stdlib.h> + +#if !defined(_WIN32) +#include <unistd.h> // For POSIX options +#endif + namespace re2 { -#ifndef WIN32 -#define HAVE_PTHREAD 1 -#define HAVE_RWLOCK 1 +#if !defined(_WIN32) + // Possible values of POSIX options: + // -1 means 
not supported, + // 0 means maybe supported (query at runtime), + // >0 means supported. +# if defined(_POSIX_THREADS) && _POSIX_THREADS > 0 +# define HAVE_PTHREAD 1 +# else +# define HAVE_PTHREAD 0 +# endif +# if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0 +# define HAVE_RWLOCK 1 +# else +# define HAVE_RWLOCK 0 +# endif +#else +# define HAVE_PTHREAD 0 +# define HAVE_RWLOCK 0 #endif #if defined(NO_THREADS) typedef int MutexType; // to keep a lock-count -#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK) +#elif HAVE_PTHREAD && HAVE_RWLOCK // Needed for pthread_rwlock_*. If it causes problems, you could take it - // out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it - // *does* cause problems for FreeBSD, or MacOSX, but isn't needed + // out, but then you'd have to set HAVE_RWLOCK to 0 (at least on linux -- + // it *does* cause problems for FreeBSD, or MacOSX, but isn't needed // for locking there.) # ifdef __linux__ # undef _XOPEN_SOURCE @@ -30,12 +51,12 @@ namespace re2 { # endif # include <pthread.h> typedef pthread_rwlock_t MutexType; -#elif defined(HAVE_PTHREAD) +#elif HAVE_PTHREAD # include <pthread.h> typedef pthread_mutex_t MutexType; -#elif defined(WIN32) +#elif defined(_WIN32) # ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN // We only need minimal includes +# define WIN32_LEAN_AND_MEAN // We only need minimal includes # endif # ifdef GMUTEX_TRYLOCK // We need Windows NT or later for TryEnterCriticalSection(). 
If you @@ -104,9 +125,8 @@ bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; } void Mutex::ReaderLock() { assert(++mutex_ > 0); } void Mutex::ReaderUnlock() { assert(mutex_-- > 0); } -#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK) +#elif HAVE_PTHREAD && HAVE_RWLOCK -#include <stdlib.h> // for abort() #define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); } @@ -119,9 +139,8 @@ void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } #undef SAFE_PTHREAD -#elif defined(HAVE_PTHREAD) +#elif HAVE_PTHREAD -#include <stdlib.h> // for abort() #define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) Mutex::Mutex() { SAFE_PTHREAD(pthread_mutex_init(&mutex_, NULL)); } @@ -133,7 +152,7 @@ void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks void Mutex::ReaderUnlock() { Unlock(); } #undef SAFE_PTHREAD -#elif defined(WIN32) +#elif defined(_WIN32) Mutex::Mutex() { InitializeCriticalSection(&mutex_); } Mutex::~Mutex() { DeleteCriticalSection(&mutex_); } @@ -190,7 +209,7 @@ class WriterMutexLock { #define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name) // Provide safe way to declare and use global, linker-initialized mutex. Sigh. -#ifdef HAVE_PTHREAD +#if HAVE_PTHREAD #define GLOBAL_MUTEX(name) \ static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER diff --git a/third_party/re2/util/pcre.cc b/third_party/re2/util/pcre.cc index 1602133..9a3f32d 100644 --- a/third_party/re2/util/pcre.cc +++ b/third_party/re2/util/pcre.cc @@ -7,15 +7,11 @@ // compilation as PCRE in namespace re2. #include <errno.h> +#include <limits> #include "util/util.h" #include "util/flags.h" #include "util/pcre.h" -#ifdef WIN32 -#define strtoll _strtoi64 -#define strtoull _strtoui64 -#endif - #define PCREPORT(level) LOG(level) // Default PCRE limits. 
@@ -27,6 +23,42 @@ DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); DEFINE_int32(regexp_match_limit, 1000000, "default PCRE match limit (function calls)"); +#ifndef USEPCRE + +// Fake just enough of the PCRE API to allow this file to build. :) + +struct pcre_extra { + int flags; + int match_limit; + int match_limit_recursion; +}; + +#define PCRE_EXTRA_MATCH_LIMIT 0 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#define PCRE_ANCHORED 0 +#define PCRE_NOTEMPTY 0 +#define PCRE_ERROR_NOMATCH 1 +#define PCRE_ERROR_MATCHLIMIT 2 +#define PCRE_ERROR_RECURSIONLIMIT 3 +#define PCRE_INFO_CAPTURECOUNT 0 + +void pcre_free(void*) { +} + +pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { + return NULL; +} + +int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { + return 0; +} + +int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { + return 0; +} + +#endif + namespace re2 { // Maximum number of args we can set @@ -118,7 +150,7 @@ pcre* PCRE::Compile(Anchor anchor) { // ANCHOR_BOTH Tack a "\z" to the end of the original pattern // and use a pcre anchored match. 
- const char* error; + const char* error = ""; int eoffset; pcre* re; if (anchor != ANCHOR_BOTH) { @@ -183,7 +215,7 @@ bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, done: int consumed; - int vec[kVecSize]; + int vec[kVecSize] = {}; return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); } @@ -226,7 +258,7 @@ bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, done: int consumed; - int vec[kVecSize]; + int vec[kVecSize] = {}; return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); } @@ -269,7 +301,7 @@ bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, done: int consumed; - int vec[kVecSize]; + int vec[kVecSize] = {}; if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, args, n, vec, kVecSize)) { input->remove_prefix(consumed); @@ -318,7 +350,7 @@ bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, done: int consumed; - int vec[kVecSize]; + int vec[kVecSize] = {}; if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, args, n, vec, kVecSize)) { input->remove_prefix(consumed); @@ -331,7 +363,7 @@ done: bool PCRE::Replace(string *str, const PCRE& pattern, const StringPiece& rewrite) { - int vec[kVecSize]; + int vec[kVecSize] = {}; int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) return false; @@ -350,12 +382,12 @@ int PCRE::GlobalReplace(string *str, const PCRE& pattern, const StringPiece& rewrite) { int count = 0; - int vec[kVecSize]; + int vec[kVecSize] = {}; string out; int start = 0; bool last_match_was_empty_string = false; - for (; start <= str->length();) { + while (start <= static_cast<int>(str->size())) { // If the previous match was for the empty string, we shouldn't // just match again: we'll match in the same way and get an // infinite loop. 
Instead, we do the match in a special way: @@ -371,14 +403,15 @@ int PCRE::GlobalReplace(string *str, matches = pattern.TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize); if (matches <= 0) { - if (start < str->length()) + if (start < static_cast<int>(str->size())) out.push_back((*str)[start]); start++; last_match_was_empty_string = false; continue; } } else { - matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); + matches = pattern.TryMatch(*str, start, UNANCHORED, true, + vec, kVecSize); if (matches <= 0) break; } @@ -396,8 +429,8 @@ int PCRE::GlobalReplace(string *str, if (count == 0) return 0; - if (start < str->length()) - out.append(*str, start, str->length() - start); + if (start < static_cast<int>(str->size())) + out.append(*str, start, static_cast<int>(str->size()) - start); swap(out, *str); return count; } @@ -406,7 +439,7 @@ bool PCRE::Extract(const StringPiece &text, const PCRE& pattern, const StringPiece &rewrite, string *out) { - int vec[kVecSize]; + int vec[kVecSize] = {}; int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) return false; @@ -452,7 +485,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) { /***** Actual matching and rewriting code *****/ bool PCRE::HitLimit() { - return hit_limit_; + return hit_limit_ != 0; } void PCRE::ClearHitLimit() { @@ -600,9 +633,9 @@ bool PCRE::DoMatch(const StringPiece& text, const Arg* const args[], int n) const { assert(n >= 0); - size_t const vecsize = (1 + n) * 3; // results + PCRE workspace - // (as for kVecSize) - int *vec = new int[vecsize]; + const int vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int* vec = new int[vecsize]; bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); delete[] vec; return b; @@ -808,7 +841,7 @@ bool PCRE::Arg::parse_short_radix(const char* str, if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse if ((short)r != r) return false; // Out 
of range if (dest == NULL) return true; - *(reinterpret_cast<short*>(dest)) = r; + *(reinterpret_cast<short*>(dest)) = (short)r; return true; } @@ -820,7 +853,7 @@ bool PCRE::Arg::parse_ushort_radix(const char* str, if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse if ((ushort)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast<unsigned short*>(dest)) = r; + *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; return true; } @@ -898,7 +931,7 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { char* end; double r = strtod(buf, &end); if (end != buf + n) { -#ifdef COMPILER_MSVC +#ifdef _WIN32 // Microsoft's strtod() doesn't handle inf and nan, so we have to // handle it explicitly. Speed is not important here because this // code is only called in unit tests. @@ -911,11 +944,11 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { ++i; } if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) { - r = numeric_limits<double>::infinity(); + r = std::numeric_limits<double>::infinity(); if (!pos) r = -r; } else if (0 == stricmp(i, "nan")) { - r = numeric_limits<double>::quiet_NaN(); + r = std::numeric_limits<double>::quiet_NaN(); } else { return false; } diff --git a/third_party/re2/util/pcre.h b/third_party/re2/util/pcre.h index 771ac91..20b10c0 100644 --- a/third_party/re2/util/pcre.h +++ b/third_party/re2/util/pcre.h @@ -167,28 +167,9 @@ namespace re2 { const bool UsingPCRE = true; } // namespace re2 #else +struct pcre; // opaque namespace re2 { const bool UsingPCRE = false; -struct pcre; -struct pcre_extra { int flags, match_limit, match_limit_recursion; }; -#define pcre_free(x) {} -#define PCRE_EXTRA_MATCH_LIMIT 0 -#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 -#define PCRE_ANCHORED 0 -#define PCRE_NOTEMPTY 0 -#define PCRE_ERROR_NOMATCH 1 -#define PCRE_ERROR_MATCHLIMIT 2 -#define PCRE_ERROR_RECURSIONLIMIT 3 -#define PCRE_INFO_CAPTURECOUNT 0 -#ifndef WIN32 -#define 
pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); }) -#define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; }) -#define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; }) -#else -#define pcre_compile(a,b,c,d,e) NULL -#define pcre_exec(a, b, c, d, e, f, g, h) NULL -#define pcre_fullinfo(a, b, c, d) NULL -#endif } // namespace re2 #endif @@ -516,7 +497,7 @@ class PCRE { int match_limit_; // Limit on execution resources int stack_limit_; // Limit on stack resources (bytes) mutable int32_t hit_limit_; // Hit limit during execution (bool)? - DISALLOW_EVIL_CONSTRUCTORS(PCRE); + DISALLOW_COPY_AND_ASSIGN(PCRE); }; // PCRE_Options allow you to set the PCRE::Options, plus any pcre diff --git a/third_party/re2/util/rune.cc b/third_party/re2/util/rune.cc index 26442b0..e6231ce 100644 --- a/third_party/re2/util/rune.cc +++ b/third_party/re2/util/rune.cc @@ -133,7 +133,7 @@ runetochar(char *str, const Rune *rune) */ c = *rune; if(c <= Rune1) { - str[0] = c; + str[0] = static_cast<char>(c); return 1; } @@ -142,7 +142,7 @@ runetochar(char *str, const Rune *rune) * 0080-07FF => T2 Tx */ if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); + str[0] = T2 | static_cast<char>(c >> 1*Bitx); str[1] = Tx | (c & Maskx); return 2; } @@ -161,9 +161,9 @@ runetochar(char *str, const Rune *rune) * 0800-FFFF => T3 Tx Tx */ if (c <= Rune3) { - str[0] = T3 | (c >> 2*Bitx); + str[0] = T3 | static_cast<char>(c >> 2*Bitx); str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); + str[2] = Tx | (c & Maskx); return 3; } @@ -171,7 +171,7 @@ runetochar(char *str, const Rune *rune) * four character sequence (21-bit value) * 10000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T4 | (c >> 3*Bitx); + str[0] = T4 | static_cast<char>(c >> 3*Bitx); str[1] = Tx | ((c >> 2*Bitx) & Maskx); str[2] = Tx | ((c >> 1*Bitx) & Maskx); str[3] = Tx | (c & Maskx); diff --git 
a/third_party/re2/util/sparse_array.h b/third_party/re2/util/sparse_array.h index 7bc3a86..8f71fa0 100644 --- a/third_party/re2/util/sparse_array.h +++ b/third_party/re2/util/sparse_array.h @@ -220,19 +220,25 @@ class SparseArray { // and at the beginning and end of all public non-const member functions. inline void DebugCheckInvariants() const; + static bool InitMemory() { +#ifdef MEMORY_SANITIZER + return true; +#else + return RunningOnValgrind(); +#endif + } + int size_; int max_size_; int* sparse_to_dense_; vector<IndexValue> dense_; - bool valgrind_; - DISALLOW_EVIL_CONSTRUCTORS(SparseArray); + DISALLOW_COPY_AND_ASSIGN(SparseArray); }; template<typename Value> SparseArray<Value>::SparseArray() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), - valgrind_(RunningOnValgrindOrMemorySanitizer()) {} + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_() {} // IndexValue pairs: exposed in SparseArray::iterator. template<typename Value> @@ -275,14 +281,20 @@ void SparseArray<Value>::resize(int new_max_size) { memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); delete[] sparse_to_dense_; } - // Don't need to zero the memory but appease Valgrind. - if (valgrind_) { - for (int i = max_size_; i < new_max_size; i++) - a[i] = 0xababababU; - } sparse_to_dense_ = a; dense_.resize(new_max_size); + + // These don't need to be initialized for correctness, + // but Valgrind will warn about use of uninitialized memory, + // so initialize the new memory when compiling debug binaries. + // Initialize it to garbage to detect bugs in the future. 
+ if (InitMemory()) { + for (int i = max_size_; i < new_max_size; i++) { + sparse_to_dense_[i] = 0xababababU; + dense_[i].index_ = 0xababababU; + } + } } max_size_ = new_max_size; if (size_ > max_size_) @@ -295,7 +307,7 @@ template<typename Value> bool SparseArray<Value>::has_index(int i) const { DCHECK_GE(i, 0); DCHECK_LT(i, max_size_); - if (static_cast<uint>(i) >= max_size_) { + if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) { return false; } // Unsigned comparison avoids checking sparse_to_dense_[i] < 0. @@ -307,7 +319,7 @@ bool SparseArray<Value>::has_index(int i) const { template<typename Value> typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) { DebugCheckInvariants(); - if (static_cast<uint>(i) >= max_size_) { + if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) { // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from // dereferencing an invalid pointer. @@ -369,7 +381,7 @@ template<typename Value> typename SparseArray<Value>::iterator SparseArray<Value>::set_new(int i, Value v) { DebugCheckInvariants(); - if (static_cast<uint>(i) >= max_size_) { + if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) { // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from // dereferencing an invalid pointer. @@ -419,10 +431,9 @@ void SparseArray<Value>::create_index(int i) { template<typename Value> SparseArray<Value>::SparseArray(int max_size) { max_size_ = max_size; sparse_to_dense_ = new int[max_size]; - valgrind_ = RunningOnValgrindOrMemorySanitizer(); dense_.resize(max_size); // Don't need to zero the new memory, but appease Valgrind. 
- if (valgrind_) { + if (InitMemory()) { for (int i = 0; i < max_size; i++) { sparse_to_dense_[i] = 0xababababU; dense_[i].index_ = 0xababababU; diff --git a/third_party/re2/util/sparse_array_test.cc b/third_party/re2/util/sparse_array_test.cc deleted file mode 100644 index bc7a19f8..0000000 --- a/third_party/re2/util/sparse_array_test.cc +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Simple tests that SparseArray behaves. - -#include "util/util.h" -#include "utest/utest.h" - -namespace re2 { - -static const string kNotFound = "NOT FOUND"; - -TEST(SparseArray, BasicOperations) { - static const int n = 50; - SparseArray<int> set(n); - - int order[n]; - int value[n]; - for (int i = 0; i < n; i++) - order[i] = i; - for (int i = 0; i < n; i++) - value[i] = rand()%1000 + 1; - for (int i = 1; i < n; i++) { - int j = rand()%i; - int t = order[i]; - order[i] = order[j]; - order[j] = t; - } - - for (int i = 0;; i++) { - for (int j = 0; j < i; j++) { - ASSERT_TRUE(set.has_index(order[j])); - ASSERT_EQ(value[order[j]], set.get(order[j], -1)); - } - if (i >= n) - break; - for (int j = i; j < n; j++) - ASSERT_FALSE(set.has_index(order[j])); - set.set(order[i], value[order[i]]); - } - - int nn = 0; - for (SparseArray<int>::iterator i = set.begin(); i != set.end(); ++i) { - ASSERT_EQ(order[nn++], i->index()); - ASSERT_EQ(value[i->index()], i->value()); - } - ASSERT_EQ(nn, n); - - set.clear(); - for (int i = 0; i < n; i++) - ASSERT_FALSE(set.has_index(i)); - - ASSERT_EQ(0, set.size()); - ASSERT_EQ(0, distance(set.begin(), set.end())); -} - -class SparseArrayStringTest : public testing::Test { - protected: - SparseArrayStringTest() - : str_map_(10) { - InsertOrUpdate(&str_map_, 1, "a"); - InsertOrUpdate(&str_map_, 5, "b"); - InsertOrUpdate(&str_map_, 2, "c"); - InsertOrUpdate(&str_map_, 7, "d"); - } - - SparseArray<string> 
str_map_; - typedef SparseArray<string>::iterator iterator; -}; - -TEST_F(SparseArrayStringTest, FindGetsPresentElement) { - iterator it = str_map_.find(2); - ASSERT_TRUE(str_map_.end() != it); - EXPECT_EQ("c", it->second); -} - -TEST_F(SparseArrayStringTest, FindDoesNotFindAbsentElement) { - iterator it = str_map_.find(3); - ASSERT_TRUE(str_map_.end() == it); -} - -TEST_F(SparseArrayStringTest, ContainsKey) { - EXPECT_TRUE(ContainsKey(str_map_, 1)); - EXPECT_TRUE(ContainsKey(str_map_, 2)); - EXPECT_FALSE(ContainsKey(str_map_, 3)); -} - -TEST_F(SparseArrayStringTest, InsertIfNotPresent) { - EXPECT_FALSE(ContainsKey(str_map_, 3)); - EXPECT_TRUE(InsertIfNotPresent(&str_map_, 3, "r")); - EXPECT_EQ("r", FindWithDefault(str_map_, 3, kNotFound)); - EXPECT_FALSE(InsertIfNotPresent(&str_map_, 3, "other value")); - EXPECT_EQ("r", FindWithDefault(str_map_, 3, kNotFound)); -} - -TEST(SparseArrayTest, Erase) { - SparseArray<string> str_map(5); - str_map.set(1, "a"); - str_map.set(2, "b"); - EXPECT_EQ("a", FindWithDefault(str_map, 1, kNotFound)); - EXPECT_EQ("b", FindWithDefault(str_map, 2, kNotFound)); - str_map.erase(1); - EXPECT_EQ("NOT FOUND", FindWithDefault(str_map, 1, kNotFound)); - EXPECT_EQ("b", FindWithDefault(str_map, 2, kNotFound)); -} - -typedef SparseArrayStringTest SparseArrayStringSurvivesInvalidIndexTest; -// TODO(jyasskin): Cover invalid arguments to every method. 
- -TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNegative) { - EXPECT_DEBUG_DEATH(str_map_.set(-123456789, "hi"), - "\\(jyasskin\\) Illegal index -123456789 passed to" - " SparseArray\\(10\\).set\\(\\)."); - EXPECT_EQ(4, str_map_.size()); -} - -TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetTooBig) { - EXPECT_DEBUG_DEATH(str_map_.set(12345678, "hi"), - "\\(jyasskin\\) Illegal index 12345678 passed to" - " SparseArray\\(10\\).set\\(\\)."); - EXPECT_EQ(4, str_map_.size()); -} - -TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNew_Negative) { - EXPECT_DEBUG_DEATH(str_map_.set_new(-123456789, "hi"), - "\\(jyasskin\\) Illegal index -123456789 passed to" - " SparseArray\\(10\\).set_new\\(\\)."); - EXPECT_EQ(4, str_map_.size()); -} - -TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNew_Existing) { - EXPECT_DEBUG_DEATH({ - str_map_.set_new(2, "hi"); - EXPECT_EQ("hi", FindWithDefault(str_map_, 2, kNotFound)); - - // The old value for 2 is still present, but can never be removed. - // This risks crashing later, if the map fills up. 
- EXPECT_EQ(5, str_map_.size()); - }, "Check failed: !has_index\\(i\\)"); -} - -TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNew_TooBig) { - EXPECT_DEBUG_DEATH(str_map_.set_new(12345678, "hi"), - "\\(jyasskin\\) Illegal index 12345678 passed to" - " SparseArray\\(10\\).set_new\\(\\)."); - EXPECT_EQ(4, str_map_.size()); -} - -} // namespace re2 diff --git a/third_party/re2/util/sparse_set.h b/third_party/re2/util/sparse_set.h index 4a324d7..9dd41ee 100644 --- a/third_party/re2/util/sparse_set.h +++ b/third_party/re2/util/sparse_set.h @@ -54,17 +54,15 @@ namespace re2 { class SparseSet { public: SparseSet() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), - valgrind_(RunningOnValgrindOrMemorySanitizer()) {} + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {} SparseSet(int max_size) { max_size_ = max_size; sparse_to_dense_ = new int[max_size]; dense_ = new int[max_size]; - valgrind_ = RunningOnValgrindOrMemorySanitizer(); // Don't need to zero the memory, but do so anyway // to appease Valgrind. - if (valgrind_) { + if (InitMemory()) { for (int i = 0; i < max_size; i++) { dense_[i] = 0xababababU; sparse_to_dense_[i] = 0xababababU; @@ -96,7 +94,7 @@ class SparseSet { int* a = new int[new_max_size]; if (sparse_to_dense_) { memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); - if (valgrind_) { + if (InitMemory()) { for (int i = max_size_; i < new_max_size; i++) a[i] = 0xababababU; } @@ -107,7 +105,7 @@ class SparseSet { a = new int[new_max_size]; if (dense_) { memmove(a, dense_, size_*sizeof a[0]); - if (valgrind_) { + if (InitMemory()) { for (int i = size_; i < new_max_size; i++) a[i] = 0xababababU; } @@ -129,7 +127,7 @@ class SparseSet { bool contains(int i) const { DCHECK_GE(i, 0); DCHECK_LT(i, max_size_); - if (static_cast<uint>(i) >= max_size_) { + if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) { return false; } // Unsigned comparison avoids checking sparse_to_dense_[i] < 0. 
@@ -146,7 +144,7 @@ class SparseSet { // Set the value at the new index i to v. // Fast but unsafe: only use if contains(i) is false. void insert_new(int i) { - if (static_cast<uint>(i) >= max_size_) { + if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) { // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from // dereferencing an invalid pointer. @@ -166,13 +164,20 @@ class SparseSet { static bool less(int a, int b) { return a < b; } private: + static bool InitMemory() { +#ifdef MEMORY_SANITIZER + return true; +#else + return RunningOnValgrind(); +#endif + } + int size_; int max_size_; int* sparse_to_dense_; int* dense_; - bool valgrind_; - DISALLOW_EVIL_CONSTRUCTORS(SparseSet); + DISALLOW_COPY_AND_ASSIGN(SparseSet); }; } // namespace re2 diff --git a/third_party/re2/util/stringprintf.cc b/third_party/re2/util/stringprintf.cc index c908181..e71d9938 100644 --- a/third_party/re2/util/stringprintf.cc +++ b/third_party/re2/util/stringprintf.cc @@ -4,7 +4,7 @@ #include "util/util.h" -namespace re2 { +namespace re2 { static void StringAppendV(string* dst, const char* format, va_list ap) { // First try with a small fixed size buffer @@ -18,7 +18,7 @@ static void StringAppendV(string* dst, const char* format, va_list ap) { int result = vsnprintf(space, sizeof(space), format, backup_ap); va_end(backup_ap); - if ((result >= 0) && (result < sizeof(space))) { + if ((result >= 0) && (static_cast<unsigned long>(result) < sizeof(space))) { // It fit dst->append(space, result); return; @@ -38,7 +38,14 @@ static void StringAppendV(string* dst, const char* format, va_list ap) { // Restore the va_list before we use it again va_copy(backup_ap, ap); +#if !defined(_WIN32) result = vsnprintf(buf, length, format, backup_ap); +#else + // On Windows, the function takes five arguments, not four. With an array, + // the buffer size will be inferred, but not with a pointer. C'est la vie. 
+ // (See https://github.com/google/re2/issues/40 for more details.) + result = vsnprintf(buf, length, _TRUNCATE, format, backup_ap); +#endif va_end(backup_ap); if ((result >= 0) && (result < length)) { diff --git a/third_party/re2/util/strutil.cc b/third_party/re2/util/strutil.cc index 6ab79b3..d3a0249 100644 --- a/third_party/re2/util/strutil.cc +++ b/third_party/re2/util/strutil.cc @@ -20,7 +20,7 @@ int CEscapeString(const char* src, int src_len, char* dest, int used = 0; for (; src < src_end; src++) { - if (dest_len - used < 2) // Need space for two letter escape + if (dest_len - used < 2) // space for two-character escape return -1; unsigned char c = *src; @@ -36,9 +36,15 @@ int CEscapeString(const char* src, int src_len, char* dest, // digit then that digit must be escaped too to prevent it being // interpreted as part of the character code by C. if (c < ' ' || c > '~') { - if (dest_len - used < 4) // need space for 4 letter escape + if (dest_len - used < 5) // space for four-character escape + \0 return -1; - sprintf(dest + used, "\\%03o", c); +#if !defined(_WIN32) + snprintf(dest + used, 5, "\\%03o", c); +#else + // On Windows, the function takes 4+VA arguments, not 3+VA. With an + // array, the buffer size will be inferred, but not with a pointer. + snprintf(dest + used, 5, _TRUNCATE, "\\%03o", c); +#endif used += 4; } else { dest[used++] = c; break; @@ -57,7 +63,7 @@ int CEscapeString(const char* src, int src_len, char* dest, // ---------------------------------------------------------------------- // CEscape() // Copies 'src' to result, escaping dangerous characters using -// C-style escape sequences. 'src' and 'dest' should not overlap. +// C-style escape sequences. 'src' and 'dest' should not overlap. 
// ---------------------------------------------------------------------- string CEscape(const StringPiece& src) { const int dest_length = src.size() * 4 + 1; // Maximum possible expansion @@ -77,7 +83,7 @@ string PrefixSuccessor(const StringPiece& prefix) { // 255's, we just return the empty string. bool done = false; string limit(prefix.data(), prefix.size()); - int index = limit.length() - 1; + int index = static_cast<int>(limit.size()) - 1; while (!done && index >= 0) { if ((limit[index]&255) == 255) { limit.erase(index); diff --git a/third_party/re2/util/test.cc b/third_party/re2/util/test.cc index 2fe1bfa..b0167e7 100644 --- a/third_party/re2/util/test.cc +++ b/third_party/re2/util/test.cc @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. #include <stdio.h> -#ifndef WIN32 +#ifndef _WIN32 #include <sys/resource.h> #endif #include "util/test.h" @@ -23,18 +23,6 @@ void RegisterTest(void (*fn)(void), const char *name) { tests[ntests++].name = name; } -namespace re2 { -int64 VirtualProcessSize() { -#ifndef WIN32 - struct rusage ru; - getrusage(RUSAGE_SELF, &ru); - return (int64)ru.ru_maxrss*1024; -#else - return 0; -#endif -} -} // namespace re2 - int main(int argc, char **argv) { for (int i = 0; i < ntests; i++) { printf("%s\n", tests[i].name); diff --git a/third_party/re2/util/test.h b/third_party/re2/util/test.h index 0f93865..3701eab 100644 --- a/third_party/re2/util/test.h +++ b/third_party/re2/util/test.h @@ -31,27 +31,15 @@ class TestRegisterer { #define EXPECT_GE CHECK_GE #define EXPECT_FALSE(x) CHECK(!(x)) -#define ARRAYSIZE arraysize - -#define EXPECT_TRUE_M(x, y) CHECK(x) << (y) -#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y) -#define ASSERT_TRUE_M(x, y) CHECK(x) << (y) -#define ASSERT_EQUALS(x, y) CHECK_EQ(x, y) - -const bool UsingMallocCounter = false; namespace testing { class MallocCounter { public: - MallocCounter(int x) { } + MallocCounter(int x) {} static const int THIS_THREAD_ONLY = 0; long long HeapGrowth() { return 0; } long 
long PeakHeapGrowth() { return 0; } - void Reset() { } + void Reset() {} }; } // namespace testing -namespace re2 { -int64 VirtualProcessSize(); -} // namespace re2 - #endif // RE2_UTIL_TEST_H__ diff --git a/third_party/re2/util/thread.cc b/third_party/re2/util/thread.cc index 7349991..d97f14b 100644 --- a/third_party/re2/util/thread.cc +++ b/third_party/re2/util/thread.cc @@ -2,10 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include <pthread.h> - -#include "util/util.h" #include "util/thread.h" +#include "util/util.h" Thread::Thread() { pid_ = 0; diff --git a/third_party/re2/util/thread.h b/third_party/re2/util/thread.h index b9610e0..fb67bdc 100644 --- a/third_party/re2/util/thread.h +++ b/third_party/re2/util/thread.h @@ -5,7 +5,11 @@ #ifndef RE2_UTIL_THREAD_H__ #define RE2_UTIL_THREAD_H__ +#ifdef _WIN32 +#include <windows.h> +#else #include <pthread.h> +#endif class Thread { public: @@ -15,12 +19,15 @@ class Thread { void Join(); void SetJoinable(bool); virtual void Run() = 0; - + private: +#ifdef _WIN32 + HANDLE pid_; +#else pthread_t pid_; +#endif bool running_; bool joinable_; }; #endif // RE2_UTIL_THREAD_H__ - diff --git a/third_party/re2/util/threadwin.cc b/third_party/re2/util/threadwin.cc new file mode 100644 index 0000000..d68f2c5 --- /dev/null +++ b/third_party/re2/util/threadwin.cc @@ -0,0 +1,44 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "util/thread.h" +#include "util/util.h" + +Thread::Thread() { + pid_ = 0; + running_ = 0; + joinable_ = 0; +} + +Thread::~Thread() { +} + +DWORD WINAPI startThread(void *v) { + Thread* t = (Thread*)v; + t->Run(); + return 0; +} + +void Thread::Start() { + CHECK(!running_); + pid_ = CreateThread(NULL, 0, startThread, this, 0, NULL); + running_ = true; + if (!joinable_) { + CloseHandle(pid_); + pid_ = 0; + } +} + +void Thread::Join() { + CHECK(running_); + CHECK(joinable_); + if (pid_ != 0) + WaitForSingleObject(pid_, INFINITE); + running_ = 0; +} + +void Thread::SetJoinable(bool j) { + CHECK(!running_); + joinable_ = j; +} diff --git a/third_party/re2/util/util.h b/third_party/re2/util/util.h index a4fdfcc..c59d91f2 100644 --- a/third_party/re2/util/util.h +++ b/third_party/re2/util/util.h @@ -9,16 +9,18 @@ #include <stdio.h> #include <string.h> #include <stdint.h> -#include <stddef.h> // For size_t +#include <stddef.h> // For size_t #include <assert.h> #include <stdarg.h> -#ifndef WIN32 -#include <sys/time.h> +#include <time.h> // For clock_gettime, CLOCK_REALTIME +#include <ctype.h> // For isdigit, isalpha + +#if !defined(_WIN32) +#include <sys/time.h> // For gettimeofday #endif -#include <time.h> -#include <ctype.h> // For isdigit, isalpha. // C++ +#include <ctime> #include <vector> #include <string> #include <algorithm> @@ -29,9 +31,6 @@ #include <utility> #include <set> -#include "build/build_config.h" -#include "base/third_party/dynamic_annotations/dynamic_annotations.h" - // Use std names. 
using std::set; using std::pair; @@ -46,7 +45,7 @@ using std::sort; using std::swap; using std::make_pair; -#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION) && !defined(OS_ANDROID) +#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION) #include <tr1/unordered_set> using std::tr1::unordered_set; @@ -54,7 +53,7 @@ using std::tr1::unordered_set; #else #include <unordered_set> -#if defined(WIN32) || (defined(OS_ANDROID) && !defined(_LIBCPP_ABI_VERSION)) +#if defined(_WIN32) using std::tr1::unordered_set; #else using std::unordered_set; @@ -62,6 +61,17 @@ using std::unordered_set; #endif +#ifdef _WIN32 + +#define snprintf _snprintf_s +#define stricmp _stricmp +#define strtof strtod /* not really correct but best we can do */ +#define strtoll _strtoi64 +#define strtoull _strtoui64 +#define vsnprintf vsnprintf_s + +#endif + namespace re2 { typedef int8_t int8; @@ -77,35 +87,31 @@ typedef unsigned long ulong; typedef unsigned int uint; typedef unsigned short ushort; +// Prevent the compiler from complaining about or optimizing away variables +// that appear unused. +#undef ATTRIBUTE_UNUSED +#if defined(__GNUC__) +#define ATTRIBUTE_UNUSED __attribute__ ((unused)) +#else +#define ATTRIBUTE_UNUSED +#endif + // COMPILE_ASSERT causes a compile error about msg if expr is not true. #if __cplusplus >= 201103L #define COMPILE_ASSERT(expr, msg) static_assert(expr, #msg) #else template<bool> struct CompileAssert {}; #define COMPILE_ASSERT(expr, msg) \ - typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] + typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] ATTRIBUTE_UNUSED #endif -// DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions. +// DISALLOW_COPY_AND_ASSIGN disallows the copy and operator= functions. // It goes in the private: declarations in a class. 
-#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ TypeName(const TypeName&); \ void operator=(const TypeName&) -#define arraysize(array) (sizeof(array)/sizeof((array)[0])) - -// Fake lock annotations. For real ones, see -// http://code.google.com/p/data-race-test/ -#ifndef ANNOTATE_PUBLISH_MEMORY_RANGE -#define ANNOTATE_PUBLISH_MEMORY_RANGE(a, b) -#define ANNOTATE_IGNORE_WRITES_BEGIN() -#define ANNOTATE_IGNORE_WRITES_END() -#define ANNOTATE_BENIGN_RACE(a, b) -#define NO_THREAD_SAFETY_ANALYSIS -#define ANNOTATE_HAPPENS_BEFORE(x) -#define ANNOTATE_HAPPENS_AFTER(x) -#define ANNOTATE_UNPROTECTED_READ(x) (x) -#endif +#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0])) class StringPiece; @@ -132,17 +138,10 @@ static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) { return ((uint64)x << 32) | y; } -inline bool RunningOnValgrindOrMemorySanitizer() { -#if defined(MEMORY_SANITIZER) - return true; -#else - return RunningOnValgrind(); -#endif -} +bool RunningOnValgrind(); } // namespace re2 -#include "util/arena.h" #include "util/logging.h" #include "util/mutex.h" #include "util/utf.h" diff --git a/third_party/re2/util/valgrind.cc b/third_party/re2/util/valgrind.cc index 46f804b..19ec22e 100644 --- a/third_party/re2/util/valgrind.cc +++ b/third_party/re2/util/valgrind.cc @@ -3,15 +3,17 @@ // license that can be found in the LICENSE file. 
#include "util/util.h" +#ifndef _WIN32 #include "util/valgrind.h" +#endif namespace re2 { -int RunningOnValgrind() { +bool RunningOnValgrind() { #ifdef RUNNING_ON_VALGRIND - return RUNNING_ON_VALGRIND; + return RUNNING_ON_VALGRIND != 0; #else - return 0; + return false; #endif } diff --git a/third_party/re2/util/valgrind.h b/third_party/re2/util/valgrind.h index d097b0c..ca10b1a 100644 --- a/third_party/re2/util/valgrind.h +++ b/third_party/re2/util/valgrind.h @@ -4064,7 +4064,6 @@ typedef #endif /* PLAT_ppc64_aix5 */ -#ifndef WIN32 /* ------------------------------------------------------------------ */ /* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS. */ /* */ @@ -4171,7 +4170,7 @@ typedef VG_USERREQ__DISCARD_TRANSLATIONS, \ _qzz_addr, _qzz_len, 0, 0, 0); \ } -#endif + /* These requests are for getting Valgrind itself to print something. Possibly with a backtrace. This is a really ugly hack. The return value |