diff options
author | esprehn@chromium.org <esprehn@chromium.org@bbb929c8-8fbe-4397-9dbb-9b2b20218538> | 2013-04-24 19:14:53 +0000 |
---|---|---|
committer | esprehn@chromium.org <esprehn@chromium.org@bbb929c8-8fbe-4397-9dbb-9b2b20218538> | 2013-04-24 19:14:53 +0000 |
commit | ba5b5026c3b0abfd64e5e2c06c113c18275d4c65 (patch) | |
tree | 50d61b75eee4ebede021331bbf562d8a5c1c9364 /third_party | |
parent | f16ae9634ffbe5294c5fb558b6ab056ed38f624c (diff) | |
download | chromium_src-ba5b5026c3b0abfd64e5e2c06c113c18275d4c65.zip chromium_src-ba5b5026c3b0abfd64e5e2c06c113c18275d4c65.tar.gz chromium_src-ba5b5026c3b0abfd64e5e2c06c113c18275d4c65.tar.bz2 |
Remove YARR, we don't use it anymore
Now that RegularExpression uses V8 (r148958) nothing uses YARR
and we can remove it.
Review URL: https://codereview.chromium.org/14051006
git-svn-id: svn://svn.chromium.org/blink/trunk@149026 bbb929c8-8fbe-4397-9dbb-9b2b20218538
Diffstat (limited to 'third_party')
-rw-r--r-- | third_party/WebKit/Source/bindings/bindings.gyp | 1 | ||||
-rw-r--r-- | third_party/WebKit/Source/core/core.gyp/core.gyp | 3 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/Yarr.h | 69 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.cpp | 463 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.h | 138 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.js | 219 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrInterpreter.cpp | 1959 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrInterpreter.h | 380 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrParser.h | 880 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrPattern.cpp | 880 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrPattern.h | 410 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrSyntaxChecker.cpp | 59 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/YarrSyntaxChecker.h | 38 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/create_regex_tables | 121 | ||||
-rw-r--r-- | third_party/WebKit/Source/yarr/yarr.gyp | 100 |
15 files changed, 0 insertions, 5720 deletions
diff --git a/third_party/WebKit/Source/bindings/bindings.gyp b/third_party/WebKit/Source/bindings/bindings.gyp index 1f56f42..da85411 100644 --- a/third_party/WebKit/Source/bindings/bindings.gyp +++ b/third_party/WebKit/Source/bindings/bindings.gyp @@ -45,7 +45,6 @@ 'dependencies': [ '../config.gyp:config', '../wtf/wtf.gyp:wtf', - '../yarr/yarr.gyp:yarr', '../core/core.gyp/core.gyp:webcore', '<(DEPTH)/build/temp_gyp/googleurl.gyp:googleurl', '<(DEPTH)/skia/skia.gyp:skia', diff --git a/third_party/WebKit/Source/core/core.gyp/core.gyp b/third_party/WebKit/Source/core/core.gyp/core.gyp index 41129d6..0e3c318 100644 --- a/third_party/WebKit/Source/core/core.gyp/core.gyp +++ b/third_party/WebKit/Source/core/core.gyp/core.gyp @@ -352,7 +352,6 @@ 'injected_canvas_script_source', 'injected_script_source', 'debugger_script_source', - '../../yarr/yarr.gyp:yarr', '../../wtf/wtf.gyp:wtf', '<(DEPTH)/build/temp_gyp/googleurl.gyp:googleurl', '<(DEPTH)/skia/skia.gyp:skia', @@ -484,7 +483,6 @@ 'core_derived_sources.gyp:make_derived_sources', '../../bindings/derived_sources.gyp:bindings_derived_sources', '../../Platform/Platform.gyp/Platform.gyp:webkit_platform', - '../../yarr/yarr.gyp:yarr', '../../wtf/wtf.gyp:wtf', '../../config.gyp:config', '<(DEPTH)/build/temp_gyp/googleurl.gyp:googleurl', @@ -507,7 +505,6 @@ ], 'export_dependent_settings': [ '../../Platform/Platform.gyp/Platform.gyp:webkit_platform', - '../../yarr/yarr.gyp:yarr', '../../wtf/wtf.gyp:wtf', '../../config.gyp:config', '<(DEPTH)/build/temp_gyp/googleurl.gyp:googleurl', diff --git a/third_party/WebKit/Source/yarr/Yarr.h b/third_party/WebKit/Source/yarr/Yarr.h deleted file mode 100644 index ffbccb1..0000000 --- a/third_party/WebKit/Source/yarr/Yarr.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef Yarr_h -#define Yarr_h - -#include "yarr/YarrInterpreter.h" -#include "yarr/YarrPattern.h" - -namespace JSC { namespace Yarr { - -#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. -#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. -#define YarrStackSpaceForBackTrackInfoBackReference 2 -#define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative. -#define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1 -#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. -#define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1 -#define YarrStackSpaceForBackTrackInfoParentheses 2 - -static const unsigned quantifyInfinite = UINT_MAX; -static const unsigned offsetNoMatch = (unsigned)-1; - -// The below limit restricts the number of "recursive" match calls in order to -// avoid spending exponential time on complex regular expressions. -static const unsigned matchLimit = 1000000; - -enum JSRegExpResult { - JSRegExpMatch = 1, - JSRegExpNoMatch = 0, - JSRegExpErrorNoMatch = -1, - JSRegExpErrorHitLimit = -2, - JSRegExpErrorNoMemory = -3, - JSRegExpErrorInternal = -4 -}; - -enum YarrCharSize { - Char8, - Char16 -}; - -} } // namespace JSC::Yarr - -#endif // Yarr_h - diff --git a/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.cpp b/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.cpp deleted file mode 100644 index 7bb3d08..0000000 --- a/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.cpp +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright (C) 2012 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js - -#include "config.h" -#include "YarrCanonicalizeUCS2.h" - -namespace JSC { namespace Yarr { - -#include <stdint.h> - -uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 }; -uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 }; -uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 }; -uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 }; -uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 }; -uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 }; -uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 }; -uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 }; -uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 }; -uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 }; -uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 }; -uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 }; -uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 }; -uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 }; -uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 }; - -static const size_t UCS2_CANONICALIZATION_SETS = 15; -uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = { - ucs2CharacterSet0, - ucs2CharacterSet1, - ucs2CharacterSet2, - ucs2CharacterSet3, - ucs2CharacterSet4, - ucs2CharacterSet5, - ucs2CharacterSet6, - ucs2CharacterSet7, - ucs2CharacterSet8, - ucs2CharacterSet9, - ucs2CharacterSet10, - ucs2CharacterSet11, - ucs2CharacterSet12, - ucs2CharacterSet13, - ucs2CharacterSet14, -}; - -const size_t UCS2_CANONICALIZATION_RANGES = 364; -UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = { - { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique }, - { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo }, - { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique }, - { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi }, - { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique }, - { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet }, - { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique }, - { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo }, - { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique }, - { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo }, - { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique }, - { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi }, - { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique }, - { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi }, - { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo }, - { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique }, - { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique }, - { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique }, - { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi }, - { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique }, - { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo }, - { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo }, - { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo }, - { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo }, - { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique }, - { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo }, - { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo }, - { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo }, - { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo }, - { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo }, - { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo }, - { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo }, - { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo }, - { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo }, - { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique }, - { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo }, - { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo }, - { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo }, - { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo }, - { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo }, - { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo }, - { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique }, - { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo }, - { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo }, - { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo }, - { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique }, - { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique }, - { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo }, - { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique }, - { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet }, - { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet }, - { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet }, - { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi }, - { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique }, - { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet }, - { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi }, - { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi }, - { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi }, - { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique }, - { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique }, - { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo }, - { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi }, - { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo }, - { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo }, - { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi }, - { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo }, - { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo }, - { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo }, - { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo }, - { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo }, - { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi }, - { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi }, - { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique }, - { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi }, - { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique }, - { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi }, - { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique }, - { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi }, - { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique }, - { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi }, - { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique }, - { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi }, - { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique }, - { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo }, - { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique }, - { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi }, - { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi }, - { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique }, - { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo }, - { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique }, - { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi }, - { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique }, - { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo }, - { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi }, - { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique }, - { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi }, - { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique }, - { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo }, - { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique }, - { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi }, - { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique }, - { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi }, - { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique }, - { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi }, - { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi }, - { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi }, - { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi }, - { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique }, - { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi }, - { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique }, - { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet }, - { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique }, - { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique }, - { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique }, - { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo }, - { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique }, - { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo }, - { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique }, - { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo }, - { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique }, - { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo }, - { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique }, - { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo }, - { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique }, - { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo }, - { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet }, - { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo }, - { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet }, - { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo }, - { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet }, - { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet }, - { 0x039au, 0x039au, 0x0008u, CanonicalizeSet }, - { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo }, - { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet }, - { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo }, - { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet }, - { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet }, - { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique }, - { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet }, - { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo }, - { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet }, - { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo }, - { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi }, - { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi }, - { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique }, - { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet }, - { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet }, - { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet }, - { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet }, - { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet }, - { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi }, - { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet }, - { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi }, - { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet }, - { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet }, - { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet }, - { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi }, - { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet }, - { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi }, - { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi }, - { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi }, - { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo }, - { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet }, - { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet }, - { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique }, - { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet }, - { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet }, - { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi }, - { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet }, - { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet }, - { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo }, - { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique }, - { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet }, - { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique }, - { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi }, - { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique }, - { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi }, - { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo }, - { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo }, - { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi }, - { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi }, - { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique }, - { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo }, - { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi }, - { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique }, - { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo }, - { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique }, - { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi }, - { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique }, - { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo }, - { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique }, - { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo }, - { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique }, - { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo }, - { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique }, - { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet }, - { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique }, - { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet }, - { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique }, - { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique }, - { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique }, - { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique }, - { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique }, - { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique }, - { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique }, - { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique }, - { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique }, - { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique }, - { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique }, - { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique }, - { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo }, - { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo }, - { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo }, - { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo }, - { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo }, - { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo }, - { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique }, - { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique }, - { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi }, - { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique }, - { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet }, - { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique }, - { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi }, - { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique }, - { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique }, - { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi }, - { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique }, - { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique }, - { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo }, - { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique }, - { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi }, - { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi }, - { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique }, - { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi }, - { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi }, - { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique }, - { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo }, - { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique }, - { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi }, - { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique }, - { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo }, - { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi }, - { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique }, - { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique }, - { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo }, - { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi }, - { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique }, - { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo }, - { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique }, - { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi }, - { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique }, - { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi }, - { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi }, - { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi }, - { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi }, - { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi }, - { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi }, - { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi }, - { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi }, - { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi }, - { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique }, - { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique }, - { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique }, - { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi }, - { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique }, - { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique }, - { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi }, - { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique }, - { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique }, - { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique }, - { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique }, - { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique }, - { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi }, - { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique }, - { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi }, - { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique }, - { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique }, - { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique }, - { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo }, - { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique }, - { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi }, - { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique }, -}; - -const size_t LATIN_CANONICALIZATION_RANGES = 20; -LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = { - { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf }, - { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf }, - { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf }, - { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf }, - { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf }, - { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf }, - { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf }, - { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid }, - { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther }, - { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid }, - { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther }, - { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid }, - { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther }, - { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid }, -}; - -} } // JSC::Yarr - diff --git a/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.h b/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.h deleted file mode 100644 index 9dce782..0000000 --- a/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.h +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (C) 2012 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef YarrCanonicalizeUCS2_H -#define YarrCanonicalizeUCS2_H - -#include <stdint.h> -#include <wtf/unicode/Unicode.h> - -namespace JSC { namespace Yarr { - -// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp) -// provides information for each UCS2 code point as to the set of code points that it should -// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8. -enum UCS2CanonicalizationType { - CanonicalizeUnique, // No canonically equal values, e.g. 0x0. - CanonicalizeSet, // Value indicates a set in characterSetInfo. - CanonicalizeRangeLo, // Value is positive delta to pair, E.g. 0x41 has value 0x20, -> 0x61. - CanonicalizeRangeHi, // Value is positive delta to pair, E.g. 0x61 has value 0x20, -> 0x41. - CanonicalizeAlternatingAligned, // Aligned consequtive pair, e.g. 0x1f4,0x1f5. - CanonicalizeAlternatingUnaligned, // Unaligned consequtive pair, e.g. 0x241,0x242. -}; -struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; }; -extern const size_t UCS2_CANONICALIZATION_RANGES; -extern uint16_t* characterSetInfo[]; -extern UCS2CanonicalizationRange rangeInfo[]; - -// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to -// the set of Latin1 codepoints that could match. -enum LatinCanonicalizationType { - CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range. - CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20. - CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is. - CanonicalizeLatinInvalid, // Cannot match against Latin1 input. -}; -struct LatinCanonicalizationRange { uint16_t begin, end, value, type; }; -extern const size_t LATIN_CANONICALIZATION_RANGES; -extern LatinCanonicalizationRange latinRangeInfo[]; - -// This searches in log2 time over ~364 entries, so should typically result in 8 compares. -inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch) -{ - UCS2CanonicalizationRange* info = rangeInfo; - size_t entries = UCS2_CANONICALIZATION_RANGES; - - while (true) { - size_t candidate = entries >> 1; - UCS2CanonicalizationRange* candidateInfo = info + candidate; - if (ch < candidateInfo->begin) - entries = candidate; - else if (ch <= candidateInfo->end) - return candidateInfo; - else { - info = candidateInfo + 1; - entries -= (candidate + 1); - } - } -} - -// Should only be called for characters that have one canonically matching value. -inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch) -{ - ASSERT(ch >= info->begin && ch <= info->end); - switch (info->type) { - case CanonicalizeRangeLo: - return ch + info->value; - case CanonicalizeRangeHi: - return ch - info->value; - case CanonicalizeAlternatingAligned: - return ch ^ 1; - case CanonicalizeAlternatingUnaligned: - return ((ch - 1) ^ 1) + 1; - default: - RELEASE_ASSERT_NOT_REACHED(); - } - RELEASE_ASSERT_NOT_REACHED(); - return 0; -} - -// Returns true if no other UCS2 codepoint can match this value. -inline bool isCanonicallyUnique(UChar ch) -{ - return rangeInfoFor(ch)->type == CanonicalizeUnique; -} - -// Returns true if values are equal, under the canonicalization rules. -inline bool areCanonicallyEquivalent(UChar a, UChar b) -{ - UCS2CanonicalizationRange* info = rangeInfoFor(a); - switch (info->type) { - case CanonicalizeUnique: - return a == b; - case CanonicalizeSet: { - for (uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) { - if (a == b) - return true; - } - return false; - } - case CanonicalizeRangeLo: - return (a == b) || (a + info->value == b); - case CanonicalizeRangeHi: - return (a == b) || (a - info->value == b); - case CanonicalizeAlternatingAligned: - return (a | 1) == (b | 1); - case CanonicalizeAlternatingUnaligned: - return ((a - 1) | 1) == ((b - 1) | 1); - } - - RELEASE_ASSERT_NOT_REACHED(); - return false; -} - -} } // JSC::Yarr - -#endif diff --git a/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.js b/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.js deleted file mode 100644 index 98846cb..0000000 --- a/third_party/WebKit/Source/yarr/YarrCanonicalizeUCS2.js +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (C) 2012 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -// See ES 5.1, 15.10.2.8 -function canonicalize(ch) -{ - var u = String.fromCharCode(ch).toUpperCase(); - if (u.length > 1) - return ch; - var cu = u.charCodeAt(0); - if (ch >= 128 && cu < 128) - return ch; - return cu; -} - -var MAX_UCS2 = 0xFFFF; -var MAX_LATIN = 0xFF; - -var groupedCanonically = []; -// Pass 1: populate groupedCanonically - this is mapping from canonicalized -// values back to the set of character code that canonicalize to them. -for (var i = 0; i <= MAX_UCS2; ++i) { - var ch = canonicalize(i); - if (!groupedCanonically[ch]) - groupedCanonically[ch] = []; - groupedCanonically[ch].push(i); -} - -var typeInfo = []; -var latinTypeInfo = []; -var characterSetInfo = []; -// Pass 2: populate typeInfo & characterSetInfo. For every character calculate -// a typeInfo value, described by the types above, and a value payload. -for (cu in groupedCanonically) { - // The set of characters that canonicalize to cu - var characters = groupedCanonically[cu]; - - // If there is only one, it is unique. - if (characters.length == 1) { - typeInfo[characters[0]] = "CanonicalizeUnique:0"; - latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0"; - continue; - } - - // Sort the array. - characters.sort(function(x,y){return x-y;}); - - // If there are more than two characters, create an entry in characterSetInfo. - if (characters.length > 2) { - for (i in characters) - typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; - characterSetInfo.push(characters); - - if (characters[1] <= MAX_LATIN) - throw new Error("sets with more than one latin character not supported!"); - if (characters[0] <= MAX_LATIN) { - for (i in characters) - latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0]; - latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0"; - } else { - for (i in characters) - latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0"; - } - - continue; - } - - // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner. - var lo = characters[0]; - var hi = characters[1]; - var delta = hi - lo; - if (delta == 1) { - var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; - typeInfo[lo] = type; - typeInfo[hi] = type; - } else { - typeInfo[lo] = "CanonicalizeRangeLo:" + delta; - typeInfo[hi] = "CanonicalizeRangeHi:" + delta; - } - - if (lo > MAX_LATIN) { - latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0"; - latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0"; - } else if (hi > MAX_LATIN) { - latinTypeInfo[lo] = "CanonicalizeLatinSelf:0"; - latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo; - } else { - if (delta != 0x20 || lo & 0x20) - throw new Error("pairs of latin characters that don't mask with 0x20 not supported!"); - latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0"; - latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0"; - } -} - -var rangeInfo = []; -// Pass 3: coallesce types into ranges. -for (var end = 0; end <= MAX_UCS2; ++end) { - var begin = end; - var type = typeInfo[end]; - while (end < MAX_UCS2 && typeInfo[end + 1] == type) - ++end; - rangeInfo.push({begin:begin, end:end, type:type}); -} - -var latinRangeInfo = []; -// Pass 4: coallesce latin-1 types into ranges. -for (var end = 0; end <= MAX_UCS2; ++end) { - var begin = end; - var type = latinTypeInfo[end]; - while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type) - ++end; - latinRangeInfo.push({begin:begin, end:end, type:type}); -} - - -// Helper function to convert a number to a fixed width hex representation of a C uint16_t. -function hex(x) -{ - var s = Number(x).toString(16); - while (s.length < 4) - s = 0 + s; - return "0x" + s + "u"; -} - -var copyright = ( - "/*" + "\n" + - " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" + - " *" + "\n" + - " * Redistribution and use in source and binary forms, with or without" + "\n" + - " * modification, are permitted provided that the following conditions" + "\n" + - " * are met:" + "\n" + - " * 1. Redistributions of source code must retain the above copyright" + "\n" + - " * notice, this list of conditions and the following disclaimer." + "\n" + - " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" + - " * notice, this list of conditions and the following disclaimer in the" + "\n" + - " * documentation and/or other materials provided with the distribution." + "\n" + - " *" + "\n" + - " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" + - " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" + - " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" + - " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" + - " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" + - " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" + - " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" + - " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" + - " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" + - " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" + - " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" + - " */"); - -print(copyright); -print(); -print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js"); -print(); -print('#include "config.h"'); -print('#include "yarr/YarrCanonicalizeUCS2.h"'); -print(); -print("namespace JSC { namespace Yarr {"); -print(); -print("#include <stdint.h>"); -print(); - -for (i in characterSetInfo) { - var characters = "" - var set = characterSetInfo[i]; - for (var j in set) - characters += hex(set[j]) + ", "; - print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };"); -} -print(); -print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";"); -print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {"); -for (i in characterSetInfo) -print(" ucs2CharacterSet" + i + ","); -print("};"); -print(); -print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";"); -print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {"); -for (i in rangeInfo) { - var info = rangeInfo[i]; - var typeAndValue = info.type.split(':'); - print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); -} -print("};"); -print(); -print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";"); -print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {"); -for (i in latinRangeInfo) { - var info = latinRangeInfo[i]; - var typeAndValue = info.type.split(':'); - print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); -} -print("};"); -print(); -print("} } // JSC::Yarr"); -print(); - diff --git a/third_party/WebKit/Source/yarr/YarrInterpreter.cpp b/third_party/WebKit/Source/yarr/YarrInterpreter.cpp deleted file mode 100644 index b9d5910..0000000 --- a/third_party/WebKit/Source/yarr/YarrInterpreter.cpp +++ /dev/null @@ -1,1959 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "yarr/YarrInterpreter.h" - -#include "yarr/Yarr.h" -#include "yarr/YarrCanonicalizeUCS2.h" -#include <wtf/BumpPointerAllocator.h> -#include <wtf/DataLog.h> -#include <wtf/text/CString.h> -#include <wtf/text/WTFString.h> - -#ifndef NDEBUG -#include <stdio.h> -#endif - -using namespace WTF; - -namespace JSC { namespace Yarr { - -template<typename CharType> -class Interpreter { -public: - struct ParenthesesDisjunctionContext; - - struct BackTrackInfoPatternCharacter { - uintptr_t matchAmount; - }; - struct BackTrackInfoCharacterClass { - uintptr_t matchAmount; - }; - struct BackTrackInfoBackReference { - uintptr_t begin; // Not really needed for greedy quantifiers. - uintptr_t matchAmount; // Not really needed for fixed quantifiers. - }; - struct BackTrackInfoAlternative { - uintptr_t offset; - }; - struct BackTrackInfoParentheticalAssertion { - uintptr_t begin; - }; - struct BackTrackInfoParenthesesOnce { - uintptr_t begin; - }; - struct BackTrackInfoParenthesesTerminal { - uintptr_t begin; - }; - struct BackTrackInfoParentheses { - uintptr_t matchAmount; - ParenthesesDisjunctionContext* lastContext; - }; - - static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) - { - context->next = backTrack->lastContext; - backTrack->lastContext = context; - ++backTrack->matchAmount; - } - - static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) - { - RELEASE_ASSERT(backTrack->matchAmount); - RELEASE_ASSERT(backTrack->lastContext); - backTrack->lastContext = backTrack->lastContext->next; - --backTrack->matchAmount; - } - - struct DisjunctionContext - { - DisjunctionContext() - : term(0) - { - } - - void* operator new(size_t, void* where) - { - return where; - } - - int term; - unsigned matchBegin; - unsigned matchEnd; - uintptr_t frame[1]; - }; - - DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) - { - size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); - allocatorPool = allocatorPool->ensureCapacity(size); - RELEASE_ASSERT(allocatorPool); - return new (allocatorPool->alloc(size)) DisjunctionContext(); - } - - void freeDisjunctionContext(DisjunctionContext* context) - { - allocatorPool = allocatorPool->dealloc(context); - } - - struct ParenthesesDisjunctionContext - { - ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term) - : next(0) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; - - for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { - subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; - output[(firstSubpatternId << 1) + i] = offsetNoMatch; - } - - new (getDisjunctionContext(term)) DisjunctionContext(); - } - - void* operator new(size_t, void* where) - { - return where; - } - - void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) - { - for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) - output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; - } - - DisjunctionContext* getDisjunctionContext(ByteTerm& term) - { - return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); - } - - ParenthesesDisjunctionContext* next; - unsigned subpatternBackup[1]; - }; - - ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) - { - size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); - allocatorPool = allocatorPool->ensureCapacity(size); - RELEASE_ASSERT(allocatorPool); - return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); - } - - void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) - { - allocatorPool = allocatorPool->dealloc(context); - } - - class InputStream { - public: - InputStream(const CharType* input, unsigned start, unsigned length) - : input(input) - , pos(start) - , length(length) - { - } - - void next() - { - ++pos; - } - - void rewind(unsigned amount) - { - ASSERT(pos >= amount); - pos -= amount; - } - - int read() - { - ASSERT(pos < length); - if (pos < length) - return input[pos]; - return -1; - } - - int readPair() - { - ASSERT(pos + 1 < length); - return input[pos] | input[pos + 1] << 16; - } - - int readChecked(unsigned negativePositionOffest) - { - RELEASE_ASSERT(pos >= negativePositionOffest); - unsigned p = pos - negativePositionOffest; - ASSERT(p < length); - return input[p]; - } - - int reread(unsigned from) - { - ASSERT(from < length); - return input[from]; - } - - int prev() - { - ASSERT(!(pos > length)); - if (pos && length) - return input[pos - 1]; - return -1; - } - - unsigned getPos() - { - return pos; - } - - void setPos(unsigned p) - { - pos = p; - } - - bool atStart() - { - return pos == 0; - } - - bool atEnd() - { - return pos == length; - } - - unsigned end() - { - return length; - } - - bool checkInput(unsigned count) - { - if (((pos + count) <= length) && ((pos + count) >= pos)) { - pos += count; - return true; - } - return false; - } - - void uncheckInput(unsigned count) - { - RELEASE_ASSERT(pos >= count); - pos -= count; - } - - bool atStart(unsigned negativePositionOffest) - { - return pos == negativePositionOffest; - } - - bool atEnd(unsigned negativePositionOffest) - { - RELEASE_ASSERT(pos >= negativePositionOffest); - return (pos - negativePositionOffest) == length; - } - - bool isAvailableInput(unsigned offset) - { - return (((pos + offset) <= length) && ((pos + offset) >= pos)); - } - - private: - const CharType* input; - unsigned pos; - unsigned length; - }; - - bool testCharacterClass(CharacterClass* characterClass, int ch) - { - if (ch & 0xFF80) { - for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) - if (ch == characterClass->m_matchesUnicode[i]) - return true; - for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) - if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) - return true; - } else { - for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) - if (ch == characterClass->m_matches[i]) - return true; - for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) - if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) - return true; - } - - return false; - } - - bool checkCharacter(int testChar, unsigned negativeInputOffset) - { - return testChar == input.readChecked(negativeInputOffset); - } - - bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) - { - int ch = input.readChecked(negativeInputOffset); - return (loChar == ch) || (hiChar == ch); - } - - bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset) - { - bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset)); - return invert ? !match : match; - } - - bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset) - { - unsigned matchSize = (unsigned)(matchEnd - matchBegin); - - if (!input.checkInput(matchSize)) - return false; - - if (pattern->m_ignoreCase) { - for (unsigned i = 0; i < matchSize; ++i) { - int oldCh = input.reread(matchBegin + i); - int ch = input.readChecked(negativeInputOffset + matchSize - i); - - if (oldCh == ch) - continue; - - // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that - // unicode values are never allowed to match against ascii ones. - if (isASCII(oldCh) || isASCII(ch)) { - if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) - continue; - } else if (areCanonicallyEquivalent(oldCh, ch)) - continue; - - input.uncheckInput(matchSize); - return false; - } - } else { - for (unsigned i = 0; i < matchSize; ++i) { - if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) { - input.uncheckInput(matchSize); - return false; - } - } - } - - return true; - } - - bool matchAssertionBOL(ByteTerm& term) - { - return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); - } - - bool matchAssertionEOL(ByteTerm& term) - { - if (term.inputPosition) - return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); - - return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); - } - - bool matchAssertionWordBoundary(ByteTerm& term) - { - bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1)); - bool readIsWordchar; - if (term.inputPosition) - readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); - else - readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); - - bool wordBoundary = prevIsWordchar != readIsWordchar; - return term.invert() ? !wordBoundary : wordBoundary; - } - - bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) - { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) - { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) - return false; - } - return true; - } - - case QuantifierGreedy: { - unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - return true; - } - - case QuantifierNonGreedy: - backTrack->matchAmount = 0; - return true; - } - - RELEASE_ASSERT_NOT_REACHED(); - return false; - } - - bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool matchBackReference(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeBackReference); - BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); - - unsigned matchBegin = output[(term.atom.subpatternId << 1)]; - unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; - - // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. - // In this case the result of match is empty string like when it references to a parentheses with zero-width match. - // Eg.: /(a\1)/ - if (matchEnd == offsetNoMatch) - return true; - - if (matchBegin == offsetNoMatch) - return true; - - ASSERT(matchBegin <= matchEnd); - - if (matchBegin == matchEnd) - return true; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - backTrack->begin = input.getPos(); - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { - if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { - input.setPos(backTrack->begin); - return false; - } - } - return true; - } - - case QuantifierGreedy: { - unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) - ++matchAmount; - backTrack->matchAmount = matchAmount; - return true; - } - - case QuantifierNonGreedy: - backTrack->begin = input.getPos(); - backTrack->matchAmount = 0; - return true; - } - - RELEASE_ASSERT_NOT_REACHED(); - return false; - } - - bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeBackReference); - BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); - - unsigned matchBegin = output[(term.atom.subpatternId << 1)]; - unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; - - if (matchBegin == offsetNoMatch) - return false; - - ASSERT(matchBegin <= matchEnd); - - if (matchBegin == matchEnd) - return false; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - // for quantityCount == 1, could rewind. - input.setPos(backTrack->begin); - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.rewind(matchEnd - matchBegin); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { - ++backTrack->matchAmount; - return true; - } - input.setPos(backTrack->begin); - break; - } - - return false; - } - - void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) - { - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; - output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; - } - } - void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; - context->restoreOutput(output, firstSubpatternId, count); - } - JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) - { - while (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - - JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); - if (result == JSRegExpMatch) - return JSRegExpMatch; - - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - } - - return JSRegExpNoMatch; - } - - bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierGreedy: { - // set this speculatively; if we get to the parens end this will be true. - backTrack->begin = input.getPos(); - break; - } - case QuantifierNonGreedy: { - backTrack->begin = notFound; - context->term += term.atom.parenthesesWidth; - return true; - } - case QuantifierFixedCount: - break; - } - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = input.getPos() - term.inputPosition; - } - - return true; - } - - bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; - } - - if (term.atom.quantityType == QuantifierFixedCount) - return true; - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); - return backTrack->begin != input.getPos(); - } - - bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = offsetNoMatch; - output[(subpatternId << 1) + 1] = offsetNoMatch; - } - - switch (term.atom.quantityType) { - case QuantifierGreedy: - // if we backtrack to this point, there is another chance - try matching nothing. - ASSERT(backTrack->begin != notFound); - backTrack->begin = notFound; - context->term += term.atom.parenthesesWidth; - return true; - case QuantifierNonGreedy: - ASSERT(backTrack->begin != notFound); - case QuantifierFixedCount: - break; - } - - return false; - } - - bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierGreedy: - if (backTrack->begin == notFound) { - context->term -= term.atom.parenthesesWidth; - return false; - } - case QuantifierNonGreedy: - if (backTrack->begin == notFound) { - backTrack->begin = input.getPos(); - if (term.capture()) { - // Technically this access to inputPosition should be accessing the begin term's - // inputPosition, but for repeats other than fixed these values should be - // the same anyway! (We don't pre-check for greedy or non-greedy matches.) - ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); - unsigned subpatternId = term.atom.subpatternId; - output[subpatternId << 1] = input.getPos() + term.inputPosition; - } - context->term -= term.atom.parenthesesWidth; - return true; - } - case QuantifierFixedCount: - break; - } - - return false; - } - - bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); - ASSERT(!term.capture()); - - BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); - backTrack->begin = input.getPos(); - return true; - } - - bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); - - BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); - // Empty match is a failed match. - if (backTrack->begin == input.getPos()) - return false; - - // Successful match! Okay, what's next? - loop around and try to match moar! - context->term -= (term.atom.parenthesesWidth + 1); - return true; - } - - bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); - ASSERT(!term.capture()); - - // If we backtrack to this point, we have failed to match this iteration of the parens. - // Since this is greedy / zero minimum a failed is also accepted as a match! - context->term += term.atom.parenthesesWidth; - return true; - } - - bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) - { - // 'Terminal' parentheses are at the end of the regex, and as such a match past end - // should always be returned as a successful match - we should never backtrack to here. - RELEASE_ASSERT_NOT_REACHED(); - return false; - } - - bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); - - backTrack->begin = input.getPos(); - return true; - } - - bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); - - input.setPos(backTrack->begin); - - // We've reached the end of the parens; if they are inverted, this is failure. - if (term.invert()) { - context->term -= term.atom.parenthesesWidth; - return false; - } - - return true; - } - - bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); - - // We've failed to match parens; if they are inverted, this is win! - if (term.invert()) { - context->term += term.atom.parenthesesWidth; - return true; - } - - return false; - } - - bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); - - input.setPos(backTrack->begin); - - context->term -= term.atom.parenthesesWidth; - return false; - } - - JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); - - BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); - ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; - - backTrack->matchAmount = 0; - backTrack->lastContext = 0; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { - // Try to do a match, and it it succeeds, add it to the list. - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - // The match failed; try to find an alternate point to carry on from. - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); - if (backtrackResult != JSRegExpMatch) - return backtrackResult; - } - } - - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - return JSRegExpMatch; - } - - case QuantifierGreedy: { - while (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - - break; - } - } - - if (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return JSRegExpMatch; - } - - case QuantifierNonGreedy: - return JSRegExpMatch; - } - - RELEASE_ASSERT_NOT_REACHED(); - return JSRegExpErrorNoMatch; - } - - // Rules for backtracking differ depending on whether this is greedy or non-greedy. - // - // Greedy matches never should try just adding more - you should already have done - // the 'more' cases. Always backtrack, at least a leetle bit. However cases where - // you backtrack an item off the list needs checking, since we'll never have matched - // the one less case. Tracking forwards, still add as much as possible. - // - // Non-greedy, we've already done the one less case, so don't match on popping. - // We haven't done the one more case, so always try to add that. - // - JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); - - BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); - ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - - ParenthesesDisjunctionContext* context = 0; - JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); - - if (result != JSRegExpMatch) - return result; - - // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { - // Try to do a match, and it it succeeds, add it to the list. - context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - - if (result == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - // The match failed; try to find an alternate point to carry on from. - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); - if (backtrackResult != JSRegExpMatch) - return backtrackResult; - } - } - - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - context = backTrack->lastContext; - recordParenthesesMatch(term, context); - return JSRegExpMatch; - } - - case QuantifierGreedy: { - if (!backTrack->matchAmount) - return JSRegExpNoMatch; - - ParenthesesDisjunctionContext* context = backTrack->lastContext; - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); - if (result == JSRegExpMatch) { - while (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (parenthesesResult == JSRegExpMatch) - appendParenthesesDisjunctionContext(backTrack, context); - else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (parenthesesResult != JSRegExpNoMatch) - return parenthesesResult; - - break; - } - } - } else { - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - } - - if (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return JSRegExpMatch; - } - - case QuantifierNonGreedy: { - // If we've not reached the limit, try to add one more match. - if (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) { - appendParenthesesDisjunctionContext(backTrack, context); - recordParenthesesMatch(term, context); - return JSRegExpMatch; - } - - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - } - - // Nope - okay backtrack looking for an alternative. - while (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); - if (result == JSRegExpMatch) { - // successful backtrack! we're back in the game! - if (backTrack->matchAmount) { - context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return JSRegExpMatch; - } - - // pop a match off the stack - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; - } - - return JSRegExpNoMatch; - } - } - - RELEASE_ASSERT_NOT_REACHED(); - return JSRegExpErrorNoMatch; - } - - bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context) - { - UNUSED_PARAM(term); - unsigned matchBegin = context->matchBegin; - - if (matchBegin) { - for (matchBegin--; true; matchBegin--) { - if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) { - ++matchBegin; - break; - } - - if (!matchBegin) - break; - } - } - - unsigned matchEnd = input.getPos(); - - for (; (matchEnd != input.end()) - && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { } - - if (((matchBegin && term.anchors.m_bol) - || ((matchEnd != input.end()) && term.anchors.m_eol)) - && !pattern->m_multiline) - return false; - - context->matchBegin = matchBegin; - context->matchEnd = matchEnd; - return true; - } - -#define MATCH_NEXT() { ++context->term; goto matchAgain; } -#define BACKTRACK() { --context->term; goto backtrack; } -#define currentTerm() (disjunction->terms[context->term]) - JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) - { - if (!--remainingMatchCount) - return JSRegExpErrorHitLimit; - - if (btrack) - BACKTRACK(); - - context->matchBegin = input.getPos(); - context->term = 0; - - matchAgain: - ASSERT(context->term < static_cast<int>(disjunction->terms.size())); - - switch (currentTerm().type) { - case ByteTerm::TypeSubpatternBegin: - MATCH_NEXT(); - case ByteTerm::TypeSubpatternEnd: - context->matchEnd = input.getPos(); - return JSRegExpMatch; - - case ByteTerm::TypeBodyAlternativeBegin: - MATCH_NEXT(); - case ByteTerm::TypeBodyAlternativeDisjunction: - case ByteTerm::TypeBodyAlternativeEnd: - context->matchEnd = input.getPos(); - return JSRegExpMatch; - - case ByteTerm::TypeAlternativeBegin: - MATCH_NEXT(); - case ByteTerm::TypeAlternativeDisjunction: - case ByteTerm::TypeAlternativeEnd: { - int offset = currentTerm().alternative.end; - BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); - backTrack->offset = offset; - context->term += offset; - MATCH_NEXT(); - } - - case ByteTerm::TypeAssertionBOL: - if (matchAssertionBOL(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeAssertionEOL: - if (matchAssertionEOL(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeAssertionWordBoundary: - if (matchAssertionWordBoundary(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypePatternCharacterOnce: - case ByteTerm::TypePatternCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) - BACKTRACK(); - } - MATCH_NEXT(); - } - case ByteTerm::TypePatternCharacterGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); - unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - MATCH_NEXT(); - } - case ByteTerm::TypePatternCharacterNonGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); - backTrack->matchAmount = 0; - MATCH_NEXT(); - } - - case ByteTerm::TypePatternCasedCharacterOnce: - case ByteTerm::TypePatternCasedCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) - BACKTRACK(); - } - MATCH_NEXT(); - } - case ByteTerm::TypePatternCasedCharacterGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); - unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - MATCH_NEXT(); - } - case ByteTerm::TypePatternCasedCharacterNonGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); - backTrack->matchAmount = 0; - MATCH_NEXT(); - } - - case ByteTerm::TypeCharacterClass: - if (matchCharacterClass(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeBackReference: - if (matchBackReference(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: { - JSRegExpResult result = matchParentheses(currentTerm(), context); - - if (result == JSRegExpMatch) { - MATCH_NEXT(); - } else if (result != JSRegExpNoMatch) - return result; - - BACKTRACK(); - } - case ByteTerm::TypeParenthesesSubpatternOnceBegin: - if (matchParenthesesOnceBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceEnd: - if (matchParenthesesOnceEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalBegin: - if (matchParenthesesTerminalBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalEnd: - if (matchParenthesesTerminalEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionBegin: - if (matchParentheticalAssertionBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionEnd: - if (matchParentheticalAssertionEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypeCheckInput: - if (input.checkInput(currentTerm().checkInputCount)) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypeUncheckInput: - input.uncheckInput(currentTerm().checkInputCount); - MATCH_NEXT(); - - case ByteTerm::TypeDotStarEnclosure: - if (matchDotStarEnclosure(currentTerm(), context)) - return JSRegExpMatch; - BACKTRACK(); - } - - // We should never fall-through to here. - RELEASE_ASSERT_NOT_REACHED(); - - backtrack: - ASSERT(context->term < static_cast<int>(disjunction->terms.size())); - - switch (currentTerm().type) { - case ByteTerm::TypeSubpatternBegin: - return JSRegExpNoMatch; - case ByteTerm::TypeSubpatternEnd: - RELEASE_ASSERT_NOT_REACHED(); - - case ByteTerm::TypeBodyAlternativeBegin: - case ByteTerm::TypeBodyAlternativeDisjunction: { - int offset = currentTerm().alternative.next; - context->term += offset; - if (offset > 0) - MATCH_NEXT(); - - if (input.atEnd()) - return JSRegExpNoMatch; - - input.next(); - - context->matchBegin = input.getPos(); - - if (currentTerm().alternative.onceThrough) - context->term += currentTerm().alternative.next; - - MATCH_NEXT(); - } - case ByteTerm::TypeBodyAlternativeEnd: - RELEASE_ASSERT_NOT_REACHED(); - - case ByteTerm::TypeAlternativeBegin: - case ByteTerm::TypeAlternativeDisjunction: { - int offset = currentTerm().alternative.next; - context->term += offset; - if (offset > 0) - MATCH_NEXT(); - BACKTRACK(); - } - case ByteTerm::TypeAlternativeEnd: { - // We should never backtrack back into an alternative of the main body of the regex. - BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); - unsigned offset = backTrack->offset; - context->term -= offset; - BACKTRACK(); - } - - case ByteTerm::TypeAssertionBOL: - case ByteTerm::TypeAssertionEOL: - case ByteTerm::TypeAssertionWordBoundary: - BACKTRACK(); - - case ByteTerm::TypePatternCharacterOnce: - case ByteTerm::TypePatternCharacterFixed: - case ByteTerm::TypePatternCharacterGreedy: - case ByteTerm::TypePatternCharacterNonGreedy: - if (backtrackPatternCharacter(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypePatternCasedCharacterOnce: - case ByteTerm::TypePatternCasedCharacterFixed: - case ByteTerm::TypePatternCasedCharacterGreedy: - case ByteTerm::TypePatternCasedCharacterNonGreedy: - if (backtrackPatternCasedCharacter(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeCharacterClass: - if (backtrackCharacterClass(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeBackReference: - if (backtrackBackReference(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: { - JSRegExpResult result = backtrackParentheses(currentTerm(), context); - - if (result == JSRegExpMatch) { - MATCH_NEXT(); - } else if (result != JSRegExpNoMatch) - return result; - - BACKTRACK(); - } - case ByteTerm::TypeParenthesesSubpatternOnceBegin: - if (backtrackParenthesesOnceBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceEnd: - if (backtrackParenthesesOnceEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalBegin: - if (backtrackParenthesesTerminalBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternTerminalEnd: - if (backtrackParenthesesTerminalEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionBegin: - if (backtrackParentheticalAssertionBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionEnd: - if (backtrackParentheticalAssertionEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypeCheckInput: - input.uncheckInput(currentTerm().checkInputCount); - BACKTRACK(); - - case ByteTerm::TypeUncheckInput: - input.checkInput(currentTerm().checkInputCount); - BACKTRACK(); - - case ByteTerm::TypeDotStarEnclosure: - RELEASE_ASSERT_NOT_REACHED(); - } - - RELEASE_ASSERT_NOT_REACHED(); - return JSRegExpErrorNoMatch; - } - - JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) - { - JSRegExpResult result = matchDisjunction(disjunction, context, btrack); - - if (result == JSRegExpMatch) { - while (context->matchBegin == context->matchEnd) { - result = matchDisjunction(disjunction, context, true); - if (result != JSRegExpMatch) - return result; - } - return JSRegExpMatch; - } - - return result; - } - - unsigned interpret() - { - if (!input.isAvailableInput(0)) - return offsetNoMatch; - - for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) - output[i << 1] = offsetNoMatch; - - allocatorPool = pattern->m_allocator->startAllocator(); - RELEASE_ASSERT(allocatorPool); - - DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); - - JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false); - if (result == JSRegExpMatch) { - output[0] = context->matchBegin; - output[1] = context->matchEnd; - } - - freeDisjunctionContext(context); - - pattern->m_allocator->stopAllocator(); - - ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch)); - return output[0]; - } - - Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) - : pattern(pattern) - , output(output) - , input(input, start, length) - , allocatorPool(0) - , remainingMatchCount(matchLimit) - { - } - -private: - BytecodePattern* pattern; - unsigned* output; - InputStream input; - BumpPointerPool* allocatorPool; - unsigned remainingMatchCount; -}; - -class ByteCompiler { - struct ParenthesesStackEntry { - unsigned beginTerm; - unsigned savedAlternativeIndex; - ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) - : beginTerm(beginTerm) - , savedAlternativeIndex(savedAlternativeIndex) - { - } - }; - -public: - ByteCompiler(YarrPattern& pattern) - : m_pattern(pattern) - { - m_currentAlternativeIndex = 0; - } - - PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) - { - regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); - emitDisjunction(m_pattern.m_body); - regexEnd(); - - return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); - } - - void checkInput(unsigned count) - { - m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); - } - - void uncheckInput(unsigned count) - { - m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); - } - - void assertionBOL(unsigned inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); - } - - void assertionEOL(unsigned inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); - } - - void assertionWordBoundary(bool invert, unsigned inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); - } - - void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - { - if (m_pattern.m_ignoreCase) { - UChar lo = Unicode::toLower(ch); - UChar hi = Unicode::toUpper(ch); - - if (lo != hi) { - m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); - return; - } - } - - m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); - } - - void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - { - m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); - - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - } - - void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - { - ASSERT(subpatternId); - - m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); - - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - } - - void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) - { - // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, - // then fix this up at the end! - simplifying this should make it much clearer. - // https://bugs.webkit.org/show_bug.cgi?id=50136 - - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); - - bool invert = m_bodyDisjunction->terms[beginTerm].invert(); - unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); - m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } - - void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored) - { - m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored)); - } - - unsigned popParenthesesStack() - { - ASSERT(m_parenthesesStack.size()); - int stackEnd = m_parenthesesStack.size() - 1; - unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; - m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; - m_parenthesesStack.shrink(stackEnd); - - ASSERT(beginTerm < m_bodyDisjunction->terms.size()); - ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); - - return beginTerm; - } - -#ifndef NDEBUG - void dumpDisjunction(ByteDisjunction* disjunction) - { - dataLogF("ByteDisjunction(%p):\n\t", disjunction); - for (unsigned i = 0; i < disjunction->terms.size(); ++i) - dataLogF("{ %d } ", disjunction->terms[i].type); - dataLogF("\n"); - } -#endif - - void closeAlternative(int beginTerm) - { - int origBeginTerm = beginTerm; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); - int endIndex = m_bodyDisjunction->terms.size(); - - unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; - - if (!m_bodyDisjunction->terms[beginTerm].alternative.next) - m_bodyDisjunction->terms.remove(beginTerm); - else { - while (m_bodyDisjunction->terms[beginTerm].alternative.next) { - beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); - m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; - - m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); - m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; - } - } - - void closeBodyAlternative() - { - int beginTerm = 0; - int origBeginTerm = 0; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); - int endIndex = m_bodyDisjunction->terms.size(); - - unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; - - while (m_bodyDisjunction->terms[beginTerm].alternative.next) { - beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); - m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; - - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); - m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; - } - - void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - - ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; - - bool capture = parenthesesBegin.capture(); - unsigned subpatternId = parenthesesBegin.atom.subpatternId; - - unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - OwnPtr<ByteDisjunction> parenthesesDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); - - unsigned firstTermInParentheses = beginTerm + 1; - parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2); - - parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); - for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses) - parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); - parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); - - m_bodyDisjunction->terms.shrink(beginTerm); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition)); - m_allParenthesesInfo.append(parenthesesDisjunction.release()); - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - - bool capture = m_bodyDisjunction->terms[beginTerm].capture(); - unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } - - void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); - - bool capture = m_bodyDisjunction->terms[beginTerm].capture(); - unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); - m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } - - void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) - { - m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); - m_bodyDisjunction->terms[0].frameLocation = 0; - m_currentAlternativeIndex = 0; - } - - void regexEnd() - { - closeBodyAlternative(); - } - - void alternativeBodyDisjunction(bool onceThrough) - { - int newAlternativeIndex = m_bodyDisjunction->terms.size(); - m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); - - m_currentAlternativeIndex = newAlternativeIndex; - } - - void alternativeDisjunction() - { - int newAlternativeIndex = m_bodyDisjunction->terms.size(); - m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); - - m_currentAlternativeIndex = newAlternativeIndex; - } - - void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) - { - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; - - PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); - - if (alt) { - if (disjunction == m_pattern.m_body) - alternativeBodyDisjunction(alternative->onceThrough()); - else - alternativeDisjunction(); - } - - unsigned minimumSize = alternative->m_minimumSize; - ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); - unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; - - if (countToCheck) { - checkInput(countToCheck); - currentCountAlreadyChecked += countToCheck; - } - - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { - PatternTerm& term = alternative->m_terms[i]; - - switch (term.type) { - case PatternTerm::TypeAssertionBOL: - assertionBOL(currentCountAlreadyChecked - term.inputPosition); - break; - - case PatternTerm::TypeAssertionEOL: - assertionEOL(currentCountAlreadyChecked - term.inputPosition); - break; - - case PatternTerm::TypeAssertionWordBoundary: - assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition); - break; - - case PatternTerm::TypePatternCharacter: - atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeCharacterClass: - atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeBackReference: - atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeForwardReference: - break; - - case PatternTerm::TypeParenthesesSubpattern: { - unsigned disjunctionAlreadyCheckedCount = 0; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { - unsigned alternativeFrameLocation = term.frameLocation; - // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. - if (term.quantityType == QuantifierFixedCount) - disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; - else - alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); - } else if (term.parentheses.isTerminal) { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); - } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } - break; - } - - case PatternTerm::TypeParentheticalAssertion: { - unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; - - ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition)); - unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition); - unsigned uncheckAmount = 0; - if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { - uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; - uncheckInput(uncheckAmount); - currentCountAlreadyChecked -= uncheckAmount; - } - - atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount); - atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); - if (uncheckAmount) { - checkInput(uncheckAmount); - currentCountAlreadyChecked += uncheckAmount; - } - break; - } - - case PatternTerm::TypeDotStarEnclosure: - assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor); - break; - } - } - } - } - -private: - YarrPattern& m_pattern; - OwnPtr<ByteDisjunction> m_bodyDisjunction; - unsigned m_currentAlternativeIndex; - Vector<ParenthesesStackEntry> m_parenthesesStack; - Vector<OwnPtr<ByteDisjunction> > m_allParenthesesInfo; -}; - -PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) -{ - return ByteCompiler(pattern).compile(allocator); -} - -unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output) -{ - if (input.is8Bit()) - return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); - return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); -} - -unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) -{ - return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); -} - -unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) -{ - return Interpreter<UChar>(bytecode, output, input, length, start).interpret(); -} - -// These should be the same for both UChar & LChar. -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); - - -} } diff --git a/third_party/WebKit/Source/yarr/YarrInterpreter.h b/third_party/WebKit/Source/yarr/YarrInterpreter.h deleted file mode 100644 index b3ceb7f..0000000 --- a/third_party/WebKit/Source/yarr/YarrInterpreter.h +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef YarrInterpreter_h -#define YarrInterpreter_h - -#include "yarr/YarrPattern.h" -#include <wtf/PassOwnPtr.h> -#include <wtf/unicode/Unicode.h> - -namespace WTF { -class BumpPointerAllocator; -} -using WTF::BumpPointerAllocator; - -namespace JSC { namespace Yarr { - -class ByteDisjunction; - -struct ByteTerm { - enum Type { - TypeBodyAlternativeBegin, - TypeBodyAlternativeDisjunction, - TypeBodyAlternativeEnd, - TypeAlternativeBegin, - TypeAlternativeDisjunction, - TypeAlternativeEnd, - TypeSubpatternBegin, - TypeSubpatternEnd, - TypeAssertionBOL, - TypeAssertionEOL, - TypeAssertionWordBoundary, - TypePatternCharacterOnce, - TypePatternCharacterFixed, - TypePatternCharacterGreedy, - TypePatternCharacterNonGreedy, - TypePatternCasedCharacterOnce, - TypePatternCasedCharacterFixed, - TypePatternCasedCharacterGreedy, - TypePatternCasedCharacterNonGreedy, - TypeCharacterClass, - TypeBackReference, - TypeParenthesesSubpattern, - TypeParenthesesSubpatternOnceBegin, - TypeParenthesesSubpatternOnceEnd, - TypeParenthesesSubpatternTerminalBegin, - TypeParenthesesSubpatternTerminalEnd, - TypeParentheticalAssertionBegin, - TypeParentheticalAssertionEnd, - TypeCheckInput, - TypeUncheckInput, - TypeDotStarEnclosure, - } type; - union { - struct { - union { - UChar patternCharacter; - struct { - UChar lo; - UChar hi; - } casedCharacter; - CharacterClass* characterClass; - unsigned subpatternId; - }; - union { - ByteDisjunction* parenthesesDisjunction; - unsigned parenthesesWidth; - }; - QuantifierType quantityType; - unsigned quantityCount; - } atom; - struct { - int next; - int end; - bool onceThrough; - } alternative; - struct { - bool m_bol : 1; - bool m_eol : 1; - } anchors; - unsigned checkInputCount; - }; - unsigned frameLocation; - bool m_capture : 1; - bool m_invert : 1; - unsigned inputPosition; - - ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - : frameLocation(frameLocation) - , m_capture(false) - , m_invert(false) - { - switch (quantityType) { - case QuantifierFixedCount: - type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; - break; - case QuantifierGreedy: - type = ByteTerm::TypePatternCharacterGreedy; - break; - case QuantifierNonGreedy: - type = ByteTerm::TypePatternCharacterNonGreedy; - break; - } - - atom.patternCharacter = ch; - atom.quantityType = quantityType; - atom.quantityCount = quantityCount.unsafeGet(); - inputPosition = inputPos; - } - - ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) - : frameLocation(frameLocation) - , m_capture(false) - , m_invert(false) - { - switch (quantityType) { - case QuantifierFixedCount: - type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; - break; - case QuantifierGreedy: - type = ByteTerm::TypePatternCasedCharacterGreedy; - break; - case QuantifierNonGreedy: - type = ByteTerm::TypePatternCasedCharacterNonGreedy; - break; - } - - atom.casedCharacter.lo = lo; - atom.casedCharacter.hi = hi; - atom.quantityType = quantityType; - atom.quantityCount = quantityCount.unsafeGet(); - inputPosition = inputPos; - } - - ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) - : type(ByteTerm::TypeCharacterClass) - , m_capture(false) - , m_invert(invert) - { - atom.characterClass = characterClass; - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - inputPosition = inputPos; - } - - ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) - : type(type) - , m_capture(capture) - , m_invert(false) - { - atom.subpatternId = subpatternId; - atom.parenthesesDisjunction = parenthesesInfo; - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - inputPosition = inputPos; - } - - ByteTerm(Type type, bool invert = false) - : type(type) - , m_capture(false) - , m_invert(invert) - { - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - } - - ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) - : type(type) - , m_capture(capture) - , m_invert(invert) - { - atom.subpatternId = subpatternId; - atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; - inputPosition = inputPos; - } - - static ByteTerm BOL(int inputPos) - { - ByteTerm term(TypeAssertionBOL); - term.inputPosition = inputPos; - return term; - } - - static ByteTerm CheckInput(Checked<unsigned> count) - { - ByteTerm term(TypeCheckInput); - term.checkInputCount = count.unsafeGet(); - return term; - } - - static ByteTerm UncheckInput(Checked<unsigned> count) - { - ByteTerm term(TypeUncheckInput); - term.checkInputCount = count.unsafeGet(); - return term; - } - - static ByteTerm EOL(int inputPos) - { - ByteTerm term(TypeAssertionEOL); - term.inputPosition = inputPos; - return term; - } - - static ByteTerm WordBoundary(bool invert, int inputPos) - { - ByteTerm term(TypeAssertionWordBoundary, invert); - term.inputPosition = inputPos; - return term; - } - - static ByteTerm BackReference(unsigned subpatternId, int inputPos) - { - return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); - } - - static ByteTerm BodyAlternativeBegin(bool onceThrough) - { - ByteTerm term(TypeBodyAlternativeBegin); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = onceThrough; - return term; - } - - static ByteTerm BodyAlternativeDisjunction(bool onceThrough) - { - ByteTerm term(TypeBodyAlternativeDisjunction); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = onceThrough; - return term; - } - - static ByteTerm BodyAlternativeEnd() - { - ByteTerm term(TypeBodyAlternativeEnd); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm AlternativeBegin() - { - ByteTerm term(TypeAlternativeBegin); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm AlternativeDisjunction() - { - ByteTerm term(TypeAlternativeDisjunction); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm AlternativeEnd() - { - ByteTerm term(TypeAlternativeEnd); - term.alternative.next = 0; - term.alternative.end = 0; - term.alternative.onceThrough = false; - return term; - } - - static ByteTerm SubpatternBegin() - { - return ByteTerm(TypeSubpatternBegin); - } - - static ByteTerm SubpatternEnd() - { - return ByteTerm(TypeSubpatternEnd); - } - - static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) - { - ByteTerm term(TypeDotStarEnclosure); - term.anchors.m_bol = bolAnchor; - term.anchors.m_eol = eolAnchor; - return term; - } - - bool invert() - { - return m_invert; - } - - bool capture() - { - return m_capture; - } -}; - -class ByteDisjunction { - WTF_MAKE_FAST_ALLOCATED; -public: - ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) - : m_numSubpatterns(numSubpatterns) - , m_frameSize(frameSize) - { - } - - Vector<ByteTerm> terms; - unsigned m_numSubpatterns; - unsigned m_frameSize; -}; - -struct BytecodePattern { - WTF_MAKE_FAST_ALLOCATED; -public: - BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<OwnPtr<ByteDisjunction> >& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator) - : m_body(body) - , m_ignoreCase(pattern.m_ignoreCase) - , m_multiline(pattern.m_multiline) - , m_allocator(allocator) - { - m_body->terms.shrinkToFit(); - - newlineCharacterClass = pattern.newlineCharacterClass(); - wordcharCharacterClass = pattern.wordcharCharacterClass(); - - m_allParenthesesInfo.swap(parenthesesInfoToAdopt); - m_allParenthesesInfo.shrinkToFit(); - - m_userCharacterClasses.swap(pattern.m_userCharacterClasses); - m_userCharacterClasses.shrinkToFit(); - } - - OwnPtr<ByteDisjunction> m_body; - bool m_ignoreCase; - bool m_multiline; - // Each BytecodePattern is associated with a RegExp, each RegExp is associated - // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator. - BumpPointerAllocator* m_allocator; - - CharacterClass* newlineCharacterClass; - CharacterClass* wordcharCharacterClass; - -private: - Vector<OwnPtr<ByteDisjunction> > m_allParenthesesInfo; - Vector<OwnPtr<CharacterClass> > m_userCharacterClasses; -}; - -PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); -unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); -unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); -unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); - -} } // namespace JSC::Yarr - -#endif // YarrInterpreter_h diff --git a/third_party/WebKit/Source/yarr/YarrParser.h b/third_party/WebKit/Source/yarr/YarrParser.h deleted file mode 100644 index 61b016a..0000000 --- a/third_party/WebKit/Source/yarr/YarrParser.h +++ /dev/null @@ -1,880 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef YarrParser_h -#define YarrParser_h - -#include "yarr/Yarr.h" -#include <wtf/ASCIICType.h> -#include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> - -namespace JSC { namespace Yarr { - -#define REGEXP_ERROR_PREFIX "Invalid regular expression: " - -enum BuiltInCharacterClassID { - DigitClassID, - SpaceClassID, - WordClassID, - NewlineClassID, -}; - -// The Parser class should not be used directly - only via the Yarr::parse() method. -template<class Delegate, typename CharType> -class Parser { -private: - template<class FriendDelegate> - friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); - - enum ErrorCode { - NoError, - PatternTooLarge, - QuantifierOutOfOrder, - QuantifierWithoutAtom, - QuantifierTooLarge, - MissingParentheses, - ParenthesesUnmatched, - ParenthesesTypeInvalid, - CharacterClassUnmatched, - CharacterClassOutOfOrder, - EscapeUnterminated, - NumberOfErrorCodes - }; - - /* - * CharacterClassParserDelegate: - * - * The class CharacterClassParserDelegate is used in the parsing of character - * classes. This class handles detection of character ranges. This class - * implements enough of the delegate interface such that it can be passed to - * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused - * to perform the parsing of escape characters in character sets. - */ - class CharacterClassParserDelegate { - public: - CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) - : m_delegate(delegate) - , m_err(err) - , m_state(Empty) - , m_character(0) - { - } - - /* - * begin(): - * - * Called at beginning of construction. - */ - void begin(bool invert) - { - m_delegate.atomCharacterClassBegin(invert); - } - - /* - * atomPatternCharacter(): - * - * This method is called either from parseCharacterClass() (for an unescaped - * character in a character class), or from parseEscape(). In the former case - * the value true will be passed for the argument 'hyphenIsRange', and in this - * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ - * is different to /[a\-z]/). - */ - void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) - { - switch (m_state) { - case AfterCharacterClass: - // Following a builtin character class we need look out for a hyphen. - // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. - // If we see a hyphen following a charater class then unlike usual - // we'll report it to the delegate immediately, and put ourself into - // a poisoned state. Any following calls to add another character or - // character class will result in an error. (A hypen following a - // character-class is itself valid, but only at the end of a regex). - if (hyphenIsRange && ch == '-') { - m_delegate.atomCharacterClassAtom('-'); - m_state = AfterCharacterClassHyphen; - return; - } - // Otherwise just fall through - cached character so treat this as Empty. - - case Empty: - m_character = ch; - m_state = CachedCharacter; - return; - - case CachedCharacter: - if (hyphenIsRange && ch == '-') - m_state = CachedCharacterHyphen; - else { - m_delegate.atomCharacterClassAtom(m_character); - m_character = ch; - } - return; - - case CachedCharacterHyphen: - if (ch < m_character) { - m_err = CharacterClassOutOfOrder; - return; - } - m_delegate.atomCharacterClassRange(m_character, ch); - m_state = Empty; - return; - - // See coment in atomBuiltInCharacterClass below. - // This too is technically an error, per ECMA-262, and again we - // we chose to allow this. Note a subtlely here that while we - // diverge from the spec's definition of CharacterRange we do - // remain in compliance with the grammar. For example, consider - // the expression /[\d-a-z]/. We comply with the grammar in - // this case by not allowing a-z to be matched as a range. - case AfterCharacterClassHyphen: - m_delegate.atomCharacterClassAtom(ch); - m_state = Empty; - return; - } - } - - /* - * atomBuiltInCharacterClass(): - * - * Adds a built-in character class, called by parseEscape(). - */ - void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) - { - switch (m_state) { - case CachedCharacter: - // Flush the currently cached character, then fall through. - m_delegate.atomCharacterClassAtom(m_character); - - case Empty: - case AfterCharacterClass: - m_state = AfterCharacterClass; - m_delegate.atomCharacterClassBuiltIn(classID, invert); - return; - - // If we hit either of these cases, we have an invalid range that - // looks something like /[x-\d]/ or /[\d-\d]/. - // According to ECMA-262 this should be a syntax error, but - // empirical testing shows this to break teh webz. Instead we - // comply with to the ECMA-262 grammar, and assume the grammar to - // have matched the range correctly, but tweak our interpretation - // of CharacterRange. Effectively we implicitly handle the hyphen - // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. - case CachedCharacterHyphen: - m_delegate.atomCharacterClassAtom(m_character); - m_delegate.atomCharacterClassAtom('-'); - // fall through - case AfterCharacterClassHyphen: - m_delegate.atomCharacterClassBuiltIn(classID, invert); - m_state = Empty; - return; - } - } - - /* - * end(): - * - * Called at end of construction. - */ - void end() - { - if (m_state == CachedCharacter) - m_delegate.atomCharacterClassAtom(m_character); - else if (m_state == CachedCharacterHyphen) { - m_delegate.atomCharacterClassAtom(m_character); - m_delegate.atomCharacterClassAtom('-'); - } - m_delegate.atomCharacterClassEnd(); - } - - // parseEscape() should never call these delegate methods when - // invoked with inCharacterClass set. - NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } - NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } - - private: - Delegate& m_delegate; - ErrorCode& m_err; - enum CharacterClassConstructionState { - Empty, - CachedCharacter, - CachedCharacterHyphen, - AfterCharacterClass, - AfterCharacterClassHyphen, - } m_state; - UChar m_character; - }; - - Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) - : m_delegate(delegate) - , m_backReferenceLimit(backReferenceLimit) - , m_err(NoError) - , m_data(pattern.getCharacters<CharType>()) - , m_size(pattern.length()) - , m_index(0) - , m_parenthesesNestingDepth(0) - { - } - - /* - * parseEscape(): - * - * Helper for parseTokens() AND parseCharacterClass(). - * Unlike the other parser methods, this function does not report tokens - * directly to the member delegate (m_delegate), instead tokens are - * emitted to the delegate provided as an argument. In the case of atom - * escapes, parseTokens() will call parseEscape() passing m_delegate as - * an argument, and as such the escape will be reported to the delegate. - * - * However this method may also be used by parseCharacterClass(), in which - * case a CharacterClassParserDelegate will be passed as the delegate that - * tokens should be added to. A boolean flag is also provided to indicate - * whether that an escape in a CharacterClass is being parsed (some parsing - * rules change in this context). - * - * The boolean value returned by this method indicates whether the token - * parsed was an atom (outside of a characted class \b and \B will be - * interpreted as assertions). - */ - template<bool inCharacterClass, class EscapeDelegate> - bool parseEscape(EscapeDelegate& delegate) - { - ASSERT(!m_err); - ASSERT(peek() == '\\'); - consume(); - - if (atEndOfPattern()) { - m_err = EscapeUnterminated; - return false; - } - - switch (peek()) { - // Assertions - case 'b': - consume(); - if (inCharacterClass) - delegate.atomPatternCharacter('\b'); - else { - delegate.assertionWordBoundary(false); - return false; - } - break; - case 'B': - consume(); - if (inCharacterClass) - delegate.atomPatternCharacter('B'); - else { - delegate.assertionWordBoundary(true); - return false; - } - break; - - // CharacterClassEscape - case 'd': - consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, false); - break; - case 's': - consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, false); - break; - case 'w': - consume(); - delegate.atomBuiltInCharacterClass(WordClassID, false); - break; - case 'D': - consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, true); - break; - case 'S': - consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, true); - break; - case 'W': - consume(); - delegate.atomBuiltInCharacterClass(WordClassID, true); - break; - - // DecimalEscape - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. - // First, try to parse this as backreference. - if (!inCharacterClass) { - ParseState state = saveState(); - - unsigned backReference = consumeNumber(); - if (backReference <= m_backReferenceLimit) { - delegate.atomBackReference(backReference); - break; - } - - restoreState(state); - } - - // Not a backreference, and not octal. - if (peek() >= '8') { - delegate.atomPatternCharacter('\\'); - break; - } - - // Fall-through to handle this as an octal escape. - } - - // Octal escape - case '0': - delegate.atomPatternCharacter(consumeOctal()); - break; - - // ControlEscape - case 'f': - consume(); - delegate.atomPatternCharacter('\f'); - break; - case 'n': - consume(); - delegate.atomPatternCharacter('\n'); - break; - case 'r': - consume(); - delegate.atomPatternCharacter('\r'); - break; - case 't': - consume(); - delegate.atomPatternCharacter('\t'); - break; - case 'v': - consume(); - delegate.atomPatternCharacter('\v'); - break; - - // ControlLetter - case 'c': { - ParseState state = saveState(); - consume(); - if (!atEndOfPattern()) { - int control = consume(); - - // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. - if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { - delegate.atomPatternCharacter(control & 0x1f); - break; - } - } - restoreState(state); - delegate.atomPatternCharacter('\\'); - break; - } - - // HexEscape - case 'x': { - consume(); - int x = tryConsumeHex(2); - if (x == -1) - delegate.atomPatternCharacter('x'); - else - delegate.atomPatternCharacter(x); - break; - } - - // UnicodeEscape - case 'u': { - consume(); - int u = tryConsumeHex(4); - if (u == -1) - delegate.atomPatternCharacter('u'); - else - delegate.atomPatternCharacter(u); - break; - } - - // IdentityEscape - default: - delegate.atomPatternCharacter(consume()); - } - - return true; - } - - /* - * parseAtomEscape(), parseCharacterClassEscape(): - * - * These methods alias to parseEscape(). - */ - bool parseAtomEscape() - { - return parseEscape<false>(m_delegate); - } - void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) - { - parseEscape<true>(delegate); - } - - /* - * parseCharacterClass(): - * - * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) - * to an instance of CharacterClassParserDelegate, to describe the character class to the - * delegate. - */ - void parseCharacterClass() - { - ASSERT(!m_err); - ASSERT(peek() == '['); - consume(); - - CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); - - characterClassConstructor.begin(tryConsume('^')); - - while (!atEndOfPattern()) { - switch (peek()) { - case ']': - consume(); - characterClassConstructor.end(); - return; - - case '\\': - parseCharacterClassEscape(characterClassConstructor); - break; - - default: - characterClassConstructor.atomPatternCharacter(consume(), true); - } - - if (m_err) - return; - } - - m_err = CharacterClassUnmatched; - } - - /* - * parseParenthesesBegin(): - * - * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. - */ - void parseParenthesesBegin() - { - ASSERT(!m_err); - ASSERT(peek() == '('); - consume(); - - if (tryConsume('?')) { - if (atEndOfPattern()) { - m_err = ParenthesesTypeInvalid; - return; - } - - switch (consume()) { - case ':': - m_delegate.atomParenthesesSubpatternBegin(false); - break; - - case '=': - m_delegate.atomParentheticalAssertionBegin(); - break; - - case '!': - m_delegate.atomParentheticalAssertionBegin(true); - break; - - default: - m_err = ParenthesesTypeInvalid; - } - } else - m_delegate.atomParenthesesSubpatternBegin(); - - ++m_parenthesesNestingDepth; - } - - /* - * parseParenthesesEnd(): - * - * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). - */ - void parseParenthesesEnd() - { - ASSERT(!m_err); - ASSERT(peek() == ')'); - consume(); - - if (m_parenthesesNestingDepth > 0) - m_delegate.atomParenthesesEnd(); - else - m_err = ParenthesesUnmatched; - - --m_parenthesesNestingDepth; - } - - /* - * parseQuantifier(): - * - * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. - */ - void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) - { - ASSERT(!m_err); - ASSERT(min <= max); - - if (min == UINT_MAX) { - m_err = QuantifierTooLarge; - return; - } - - if (lastTokenWasAnAtom) - m_delegate.quantifyAtom(min, max, !tryConsume('?')); - else - m_err = QuantifierWithoutAtom; - } - - /* - * parseTokens(): - * - * This method loops over the input pattern reporting tokens to the delegate. - * The method returns when a parse error is detected, or the end of the pattern - * is reached. One piece of state is tracked around the loop, which is whether - * the last token passed to the delegate was an atom (this is necessary to detect - * a parse error when a quantifier provided without an atom to quantify). - */ - void parseTokens() - { - bool lastTokenWasAnAtom = false; - - while (!atEndOfPattern()) { - switch (peek()) { - case '|': - consume(); - m_delegate.disjunction(); - lastTokenWasAnAtom = false; - break; - - case '(': - parseParenthesesBegin(); - lastTokenWasAnAtom = false; - break; - - case ')': - parseParenthesesEnd(); - lastTokenWasAnAtom = true; - break; - - case '^': - consume(); - m_delegate.assertionBOL(); - lastTokenWasAnAtom = false; - break; - - case '$': - consume(); - m_delegate.assertionEOL(); - lastTokenWasAnAtom = false; - break; - - case '.': - consume(); - m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); - lastTokenWasAnAtom = true; - break; - - case '[': - parseCharacterClass(); - lastTokenWasAnAtom = true; - break; - - case '\\': - lastTokenWasAnAtom = parseAtomEscape(); - break; - - case '*': - consume(); - parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); - lastTokenWasAnAtom = false; - break; - - case '+': - consume(); - parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); - lastTokenWasAnAtom = false; - break; - - case '?': - consume(); - parseQuantifier(lastTokenWasAnAtom, 0, 1); - lastTokenWasAnAtom = false; - break; - - case '{': { - ParseState state = saveState(); - - consume(); - if (peekIsDigit()) { - unsigned min = consumeNumber(); - unsigned max = min; - - if (tryConsume(',')) - max = peekIsDigit() ? consumeNumber() : quantifyInfinite; - - if (tryConsume('}')) { - if (min <= max) - parseQuantifier(lastTokenWasAnAtom, min, max); - else - m_err = QuantifierOutOfOrder; - lastTokenWasAnAtom = false; - break; - } - } - - restoreState(state); - } // if we did not find a complete quantifer, fall through to the default case. - - default: - m_delegate.atomPatternCharacter(consume()); - lastTokenWasAnAtom = true; - } - - if (m_err) - return; - } - - if (m_parenthesesNestingDepth > 0) - m_err = MissingParentheses; - } - - /* - * parse(): - * - * This method calls parseTokens() to parse over the input and converts any - * error code to a const char* for a result. - */ - const char* parse() - { - if (m_size > MAX_PATTERN_SIZE) - m_err = PatternTooLarge; - else - parseTokens(); - ASSERT(atEndOfPattern() || m_err); - - // The order of this array must match the ErrorCode enum. - static const char* errorMessages[NumberOfErrorCodes] = { - 0, // NoError - REGEXP_ERROR_PREFIX "regular expression too large", - REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", - REGEXP_ERROR_PREFIX "nothing to repeat", - REGEXP_ERROR_PREFIX "number too large in {} quantifier", - REGEXP_ERROR_PREFIX "missing )", - REGEXP_ERROR_PREFIX "unmatched parentheses", - REGEXP_ERROR_PREFIX "unrecognized character after (?", - REGEXP_ERROR_PREFIX "missing terminating ] for character class", - REGEXP_ERROR_PREFIX "range out of order in character class", - REGEXP_ERROR_PREFIX "\\ at end of pattern" - }; - - return errorMessages[m_err]; - } - - // Misc helper functions: - - typedef unsigned ParseState; - - ParseState saveState() - { - return m_index; - } - - void restoreState(ParseState state) - { - m_index = state; - } - - bool atEndOfPattern() - { - ASSERT(m_index <= m_size); - return m_index == m_size; - } - - int peek() - { - ASSERT(m_index < m_size); - return m_data[m_index]; - } - - bool peekIsDigit() - { - return !atEndOfPattern() && WTF::isASCIIDigit(peek()); - } - - unsigned peekDigit() - { - ASSERT(peekIsDigit()); - return peek() - '0'; - } - - int consume() - { - ASSERT(m_index < m_size); - return m_data[m_index++]; - } - - unsigned consumeDigit() - { - ASSERT(peekIsDigit()); - return consume() - '0'; - } - - unsigned consumeNumber() - { - unsigned n = consumeDigit(); - // check for overflow. - for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { - n = newValue; - consume(); - } - return n; - } - - unsigned consumeOctal() - { - ASSERT(WTF::isASCIIOctalDigit(peek())); - - unsigned n = consumeDigit(); - while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) - n = n * 8 + consumeDigit(); - return n; - } - - bool tryConsume(UChar ch) - { - if (atEndOfPattern() || (m_data[m_index] != ch)) - return false; - ++m_index; - return true; - } - - int tryConsumeHex(int count) - { - ParseState state = saveState(); - - int n = 0; - while (count--) { - if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { - restoreState(state); - return -1; - } - n = (n << 4) | WTF::toASCIIHexValue(consume()); - } - return n; - } - - Delegate& m_delegate; - unsigned m_backReferenceLimit; - ErrorCode m_err; - const CharType* m_data; - unsigned m_size; - unsigned m_index; - unsigned m_parenthesesNestingDepth; - - // Derived by empirical testing of compile time in PCRE and WREC. - static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; -}; - -/* - * Yarr::parse(): - * - * The parse method is passed a pattern to be parsed and a delegate upon which - * callbacks will be made to record the parsed tokens forming the regex. - * Yarr::parse() returns null on success, or a const C string providing an error - * message where a parse error occurs. - * - * The Delegate must implement the following interface: - * - * void assertionBOL(); - * void assertionEOL(); - * void assertionWordBoundary(bool invert); - * - * void atomPatternCharacter(UChar ch); - * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); - * void atomCharacterClassBegin(bool invert) - * void atomCharacterClassAtom(UChar ch) - * void atomCharacterClassRange(UChar begin, UChar end) - * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) - * void atomCharacterClassEnd() - * void atomParenthesesSubpatternBegin(bool capture = true); - * void atomParentheticalAssertionBegin(bool invert = false); - * void atomParenthesesEnd(); - * void atomBackReference(unsigned subpatternId); - * - * void quantifyAtom(unsigned min, unsigned max, bool greedy); - * - * void disjunction(); - * - * The regular expression is described by a sequence of assertion*() and atom*() - * callbacks to the delegate, describing the terms in the regular expression. - * Following an atom a quantifyAtom() call may occur to indicate that the previous - * atom should be quantified. In the case of atoms described across multiple - * calls (parentheses and character classes) the call to quantifyAtom() will come - * after the call to the atom*End() method, never after atom*Begin(). - * - * Character classes may either be described by a single call to - * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. - * In the latter case, ...Begin() will be called, followed by a sequence of - * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). - * - * Sequences of atoms and assertions are broken into alternatives via calls to - * disjunction(). Assertions, atoms, and disjunctions emitted between calls to - * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. - * atomParenthesesBegin() is passed a subpatternId. In the case of a regular - * capturing subpattern, this will be the subpatternId associated with these - * parentheses, and will also by definition be the lowest subpatternId of these - * parentheses and of any nested paretheses. The atomParenthesesEnd() method - * is passed the subpatternId of the last capturing subexpression nested within - * these paretheses. In the case of a capturing subpattern with no nested - * capturing subpatterns, the same subpatternId will be passed to the begin and - * end functions. In the case of non-capturing subpatterns the subpatternId - * passed to the begin method is also the first possible subpatternId that might - * be nested within these paretheses. If a set of non-capturing parentheses does - * not contain any capturing subpatterns, then the subpatternId passed to begin - * will be greater than the subpatternId passed to end. - */ - -template<class Delegate> -const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) -{ - if (pattern.is8Bit()) - return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); - return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); -} - -} } // namespace JSC::Yarr - -#endif // YarrParser_h diff --git a/third_party/WebKit/Source/yarr/YarrPattern.cpp b/third_party/WebKit/Source/yarr/YarrPattern.cpp deleted file mode 100644 index 978c074..0000000 --- a/third_party/WebKit/Source/yarr/YarrPattern.cpp +++ /dev/null @@ -1,880 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "yarr/YarrPattern.h" - -#include "yarr/Yarr.h" -#include "yarr/YarrCanonicalizeUCS2.h" -#include "yarr/YarrParser.h" -#include <wtf/Vector.h> - -using namespace WTF; - -namespace JSC { namespace Yarr { - -#include "RegExpJitTables.h" - -class CharacterClassConstructor { -public: - CharacterClassConstructor(bool isCaseInsensitive = false) - : m_isCaseInsensitive(isCaseInsensitive) - { - } - - void reset() - { - m_matches.clear(); - m_ranges.clear(); - m_matchesUnicode.clear(); - m_rangesUnicode.clear(); - } - - void append(const CharacterClass* other) - { - for (size_t i = 0; i < other->m_matches.size(); ++i) - addSorted(m_matches, other->m_matches[i]); - for (size_t i = 0; i < other->m_ranges.size(); ++i) - addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); - for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) - addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); - for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) - addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); - } - - void putChar(UChar ch) - { - // Handle ascii cases. - if (ch <= 0x7f) { - if (m_isCaseInsensitive && isASCIIAlpha(ch)) { - addSorted(m_matches, toASCIIUpper(ch)); - addSorted(m_matches, toASCIILower(ch)); - } else - addSorted(m_matches, ch); - return; - } - - // Simple case, not a case-insensitive match. - if (!m_isCaseInsensitive) { - addSorted(m_matchesUnicode, ch); - return; - } - - // Add multiple matches, if necessary. - UCS2CanonicalizationRange* info = rangeInfoFor(ch); - if (info->type == CanonicalizeUnique) - addSorted(m_matchesUnicode, ch); - else - putUnicodeIgnoreCase(ch, info); - } - - void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) - { - ASSERT(m_isCaseInsensitive); - ASSERT(ch > 0x7f); - ASSERT(ch >= info->begin && ch <= info->end); - ASSERT(info->type != CanonicalizeUnique); - if (info->type == CanonicalizeSet) { - for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) - addSorted(m_matchesUnicode, ch); - } else { - addSorted(m_matchesUnicode, ch); - addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); - } - } - - void putRange(UChar lo, UChar hi) - { - if (lo <= 0x7f) { - char asciiLo = lo; - char asciiHi = std::min(hi, (UChar)0x7f); - addSortedRange(m_ranges, lo, asciiHi); - - if (m_isCaseInsensitive) { - if ((asciiLo <= 'Z') && (asciiHi >= 'A')) - addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); - if ((asciiLo <= 'z') && (asciiHi >= 'a')) - addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); - } - } - if (hi <= 0x7f) - return; - - lo = std::max(lo, (UChar)0x80); - addSortedRange(m_rangesUnicode, lo, hi); - - if (!m_isCaseInsensitive) - return; - - UCS2CanonicalizationRange* info = rangeInfoFor(lo); - while (true) { - // Handle the range [lo .. end] - UChar end = std::min<UChar>(info->end, hi); - - switch (info->type) { - case CanonicalizeUnique: - // Nothing to do - no canonical equivalents. - break; - case CanonicalizeSet: { - UChar ch; - for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) - addSorted(m_matchesUnicode, ch); - break; - } - case CanonicalizeRangeLo: - addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); - break; - case CanonicalizeRangeHi: - addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); - break; - case CanonicalizeAlternatingAligned: - // Use addSortedRange since there is likely an abutting range to combine with. - if (lo & 1) - addSortedRange(m_rangesUnicode, lo - 1, lo - 1); - if (!(end & 1)) - addSortedRange(m_rangesUnicode, end + 1, end + 1); - break; - case CanonicalizeAlternatingUnaligned: - // Use addSortedRange since there is likely an abutting range to combine with. - if (!(lo & 1)) - addSortedRange(m_rangesUnicode, lo - 1, lo - 1); - if (end & 1) - addSortedRange(m_rangesUnicode, end + 1, end + 1); - break; - } - - if (hi == end) - return; - - ++info; - lo = info->begin; - }; - - } - - PassOwnPtr<CharacterClass> charClass() - { - OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass(0)); - - characterClass->m_matches.swap(m_matches); - characterClass->m_ranges.swap(m_ranges); - characterClass->m_matchesUnicode.swap(m_matchesUnicode); - characterClass->m_rangesUnicode.swap(m_rangesUnicode); - - return characterClass.release(); - } - -private: - void addSorted(Vector<UChar>& matches, UChar ch) - { - unsigned pos = 0; - unsigned range = matches.size(); - - // binary chop, find position to insert char. - while (range) { - unsigned index = range >> 1; - - int val = matches[pos+index] - ch; - if (!val) - return; - else if (val > 0) - range = index; - else { - pos += (index+1); - range -= (index+1); - } - } - - if (pos == matches.size()) - matches.append(ch); - else - matches.insert(pos, ch); - } - - void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) - { - unsigned end = ranges.size(); - - // Simple linear scan - I doubt there are that many ranges anyway... - // feel free to fix this with something faster (eg binary chop). - for (unsigned i = 0; i < end; ++i) { - // does the new range fall before the current position in the array - if (hi < ranges[i].begin) { - // optional optimization: concatenate appending ranges? - may not be worthwhile. - if (hi == (ranges[i].begin - 1)) { - ranges[i].begin = lo; - return; - } - ranges.insert(i, CharacterRange(lo, hi)); - return; - } - // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining - // If the new range start at or before the end of the last range, then the overlap (if it starts one after the - // end of the last range they concatenate, which is just as good. - if (lo <= (ranges[i].end + 1)) { - // found an intersect! we'll replace this entry in the array. - ranges[i].begin = std::min(ranges[i].begin, lo); - ranges[i].end = std::max(ranges[i].end, hi); - - // now check if the new range can subsume any subsequent ranges. - unsigned next = i+1; - // each iteration of the loop we will either remove something from the list, or break the loop. - while (next < ranges.size()) { - if (ranges[next].begin <= (ranges[i].end + 1)) { - // the next entry now overlaps / concatenates this one. - ranges[i].end = std::max(ranges[i].end, ranges[next].end); - ranges.remove(next); - } else - break; - } - - return; - } - } - - // CharacterRange comes after all existing ranges. - ranges.append(CharacterRange(lo, hi)); - } - - bool m_isCaseInsensitive; - - Vector<UChar> m_matches; - Vector<CharacterRange> m_ranges; - Vector<UChar> m_matchesUnicode; - Vector<CharacterRange> m_rangesUnicode; -}; - -class YarrPatternConstructor { -public: - YarrPatternConstructor(YarrPattern& pattern) - : m_pattern(pattern) - , m_characterClassConstructor(pattern.m_ignoreCase) - , m_invertParentheticalAssertion(false) - { - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); - m_pattern.m_body = body.get(); - m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); - } - - ~YarrPatternConstructor() - { - } - - void reset() - { - m_pattern.reset(); - m_characterClassConstructor.reset(); - - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); - m_pattern.m_body = body.get(); - m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); - } - - void assertionBOL() - { - if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { - m_alternative->m_startsWithBOL = true; - m_alternative->m_containsBOL = true; - m_pattern.m_containsBOL = true; - } - m_alternative->m_terms.append(PatternTerm::BOL()); - } - void assertionEOL() - { - m_alternative->m_terms.append(PatternTerm::EOL()); - } - void assertionWordBoundary(bool invert) - { - m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); - } - - void atomPatternCharacter(UChar ch) - { - // We handle case-insensitive checking of unicode characters which do have both - // cases by handling them as if they were defined using a CharacterClass. - if (!m_pattern.m_ignoreCase || isASCII(ch)) { - m_alternative->m_terms.append(PatternTerm(ch)); - return; - } - - UCS2CanonicalizationRange* info = rangeInfoFor(ch); - if (info->type == CanonicalizeUnique) { - m_alternative->m_terms.append(PatternTerm(ch)); - return; - } - - m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); - m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); - } - - void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) - { - switch (classID) { - case DigitClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); - break; - case SpaceClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); - break; - case WordClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); - break; - case NewlineClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); - break; - } - } - - void atomCharacterClassBegin(bool invert = false) - { - m_invertCharacterClass = invert; - } - - void atomCharacterClassAtom(UChar ch) - { - m_characterClassConstructor.putChar(ch); - } - - void atomCharacterClassRange(UChar begin, UChar end) - { - m_characterClassConstructor.putRange(begin, end); - } - - void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) - { - ASSERT(classID != NewlineClassID); - - switch (classID) { - case DigitClassID: - m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); - break; - - case SpaceClassID: - m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); - break; - - case WordClassID: - m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); - break; - - default: - RELEASE_ASSERT_NOT_REACHED(); - } - } - - void atomCharacterClassEnd() - { - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); - m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); - } - - void atomParenthesesSubpatternBegin(bool capture = true) - { - unsigned subpatternId = m_pattern.m_numSubpatterns + 1; - if (capture) - m_pattern.m_numSubpatterns++; - - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); - m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false)); - m_alternative = parenthesesDisjunction->addNewAlternative(); - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); - } - - void atomParentheticalAssertionBegin(bool invert = false) - { - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); - m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert)); - m_alternative = parenthesesDisjunction->addNewAlternative(); - m_invertParentheticalAssertion = invert; - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); - } - - void atomParenthesesEnd() - { - ASSERT(m_alternative->m_parent); - ASSERT(m_alternative->m_parent->m_parent); - - PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; - m_alternative = m_alternative->m_parent->m_parent; - - PatternTerm& lastTerm = m_alternative->lastTerm(); - - unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); - unsigned numBOLAnchoredAlts = 0; - - for (unsigned i = 0; i < numParenAlternatives; i++) { - // Bubble up BOL flags - if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) - numBOLAnchoredAlts++; - } - - if (numBOLAnchoredAlts) { - m_alternative->m_containsBOL = true; - // If all the alternatives in parens start with BOL, then so does this one - if (numBOLAnchoredAlts == numParenAlternatives) - m_alternative->m_startsWithBOL = true; - } - - lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; - m_invertParentheticalAssertion = false; - } - - void atomBackReference(unsigned subpatternId) - { - ASSERT(subpatternId); - m_pattern.m_containsBackreferences = true; - m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); - - if (subpatternId > m_pattern.m_numSubpatterns) { - m_alternative->m_terms.append(PatternTerm::ForwardReference()); - return; - } - - PatternAlternative* currentAlternative = m_alternative; - ASSERT(currentAlternative); - - // Note to self: if we waited until the AST was baked, we could also remove forwards refs - while ((currentAlternative = currentAlternative->m_parent->m_parent)) { - PatternTerm& term = currentAlternative->lastTerm(); - ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); - - if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { - m_alternative->m_terms.append(PatternTerm::ForwardReference()); - return; - } - } - - m_alternative->m_terms.append(PatternTerm(subpatternId)); - } - - // deep copy the argument disjunction. If filterStartsWithBOL is true, - // skip alternatives with m_startsWithBOL set true. - PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) - { - OwnPtr<PatternDisjunction> newDisjunction; - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); - if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { - if (!newDisjunction) { - newDisjunction = adoptPtr(new PatternDisjunction()); - newDisjunction->m_parent = disjunction->m_parent; - } - PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); - newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size()); - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) - newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); - } - } - - if (!newDisjunction) - return 0; - - PatternDisjunction* copiedDisjunction = newDisjunction.get(); - m_pattern.m_disjunctions.append(newDisjunction.release()); - return copiedDisjunction; - } - - PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) - { - if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) - return PatternTerm(term); - - PatternTerm termCopy = term; - termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); - return termCopy; - } - - void quantifyAtom(unsigned min, unsigned max, bool greedy) - { - ASSERT(min <= max); - ASSERT(m_alternative->m_terms.size()); - - if (!max) { - m_alternative->removeLastTerm(); - return; - } - - PatternTerm& term = m_alternative->lastTerm(); - ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); - ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); - - if (term.type == PatternTerm::TypeParentheticalAssertion) { - // If an assertion is quantified with a minimum count of zero, it can simply be removed. - // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never - // results in any input being consumed, however the continuation passed to the assertion - // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will - // reject all zero length matches (see step 2.1). A match from the continuation of the - // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all - // this is that matches from the assertion are not required, and won't be accepted anyway, - // so no need to ever run it. - if (!min) - m_alternative->removeLastTerm(); - // We never need to run an assertion more than once. Subsequent interations will be run - // with the same start index (since assertions are non-capturing) and the same captures - // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the - // same result and captures. If the first match succeeds then the subsequent (min - 1) - // matches will too. Any additional optional matches will fail (on the same basis as the - // minimum zero quantified assertions, above), but this will still result in a match. - return; - } - - if (min == 0) - term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); - else if (min == max) - term.quantify(min, QuantifierFixedCount); - else { - term.quantify(min, QuantifierFixedCount); - m_alternative->m_terms.append(copyTerm(term)); - // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... - m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); - if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) - m_alternative->lastTerm().parentheses.isCopy = true; - } - } - - void disjunction() - { - m_alternative = m_alternative->m_parent->addNewAlternative(); - } - - unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) - { - alternative->m_hasFixedSize = true; - Checked<unsigned> currentInputPosition = initialInputPosition; - - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { - PatternTerm& term = alternative->m_terms[i]; - - switch (term.type) { - case PatternTerm::TypeAssertionBOL: - case PatternTerm::TypeAssertionEOL: - case PatternTerm::TypeAssertionWordBoundary: - term.inputPosition = currentInputPosition.unsafeGet(); - break; - - case PatternTerm::TypeBackReference: - term.inputPosition = currentInputPosition.unsafeGet(); - term.frameLocation = currentCallFrameSize; - currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; - alternative->m_hasFixedSize = false; - break; - - case PatternTerm::TypeForwardReference: - break; - - case PatternTerm::TypePatternCharacter: - term.inputPosition = currentInputPosition.unsafeGet(); - if (term.quantityType != QuantifierFixedCount) { - term.frameLocation = currentCallFrameSize; - currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; - alternative->m_hasFixedSize = false; - } else - currentInputPosition += term.quantityCount; - break; - - case PatternTerm::TypeCharacterClass: - term.inputPosition = currentInputPosition.unsafeGet(); - if (term.quantityType != QuantifierFixedCount) { - term.frameLocation = currentCallFrameSize; - currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; - alternative->m_hasFixedSize = false; - } else - currentInputPosition += term.quantityCount; - break; - - case PatternTerm::TypeParenthesesSubpattern: - // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. - term.frameLocation = currentCallFrameSize; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { - if (term.quantityType != QuantifierFixedCount) - currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); - // If quantity is fixed, then pre-check its minimum size. - if (term.quantityType == QuantifierFixedCount) - currentInputPosition += term.parentheses.disjunction->m_minimumSize; - term.inputPosition = currentInputPosition.unsafeGet(); - } else if (term.parentheses.isTerminal) { - currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); - term.inputPosition = currentInputPosition.unsafeGet(); - } else { - term.inputPosition = currentInputPosition.unsafeGet(); - setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); - currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; - } - // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. - alternative->m_hasFixedSize = false; - break; - - case PatternTerm::TypeParentheticalAssertion: - term.inputPosition = currentInputPosition.unsafeGet(); - term.frameLocation = currentCallFrameSize; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet()); - break; - - case PatternTerm::TypeDotStarEnclosure: - alternative->m_hasFixedSize = false; - term.inputPosition = initialInputPosition; - break; - } - } - - alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); - return currentCallFrameSize; - } - - unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) - { - if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) - initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; - - unsigned minimumInputSize = UINT_MAX; - unsigned maximumCallFrameSize = 0; - bool hasFixedSize = true; - - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); - unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); - minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); - maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); - hasFixedSize &= alternative->m_hasFixedSize; - } - - ASSERT(minimumInputSize != UINT_MAX); - ASSERT(maximumCallFrameSize >= initialCallFrameSize); - - disjunction->m_hasFixedSize = hasFixedSize; - disjunction->m_minimumSize = minimumInputSize; - disjunction->m_callFrameSize = maximumCallFrameSize; - return maximumCallFrameSize; - } - - void setupOffsets() - { - setupDisjunctionOffsets(m_pattern.m_body, 0, 0); - } - - // This optimization identifies sets of parentheses that we will never need to backtrack. - // In these cases we do not need to store state from prior iterations. - // We can presently avoid backtracking for: - // * where the parens are at the end of the regular expression (last term in any of the - // alternatives of the main body disjunction). - // * where the parens are non-capturing, and quantified unbounded greedy (*). - // * where the parens do not contain any capturing subpatterns. - void checkForTerminalParentheses() - { - // This check is much too crude; should be just checking whether the candidate - // node contains nested capturing subpatterns, not the whole expression! - if (m_pattern.m_numSubpatterns) - return; - - Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives; - for (size_t i = 0; i < alternatives.size(); ++i) { - Vector<PatternTerm>& terms = alternatives[i]->m_terms; - if (terms.size()) { - PatternTerm& term = terms.last(); - if (term.type == PatternTerm::TypeParenthesesSubpattern - && term.quantityType == QuantifierGreedy - && term.quantityCount == quantifyInfinite - && !term.capture()) - term.parentheses.isTerminal = true; - } - } - } - - void optimizeBOL() - { - // Look for expressions containing beginning of line (^) anchoring and unroll them. - // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops - // This code relies on the parsing code tagging alternatives with m_containsBOL and - // m_startsWithBOL and rolling those up to containing alternatives. - // At this point, this is only valid for non-multiline expressions. - PatternDisjunction* disjunction = m_pattern.m_body; - - if (!m_pattern.m_containsBOL || m_pattern.m_multiline) - return; - - PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); - - // Set alternatives in disjunction to "onceThrough" - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) - disjunction->m_alternatives[alt]->setOnceThrough(); - - if (loopDisjunction) { - // Move alternatives from loopDisjunction to disjunction - for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) - disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release()); - - loopDisjunction->m_alternatives.clear(); - } - } - - bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) - { - Vector<PatternTerm>& terms = alternative->m_terms; - - for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { - PatternTerm& term = terms[termIndex]; - - if (term.m_capture) - return true; - - if (term.type == PatternTerm::TypeParenthesesSubpattern) { - PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; - for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { - if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) - return true; - } - } - } - - return false; - } - - // This optimization identifies alternatives in the form of - // [^].*[?]<expression>.*[$] for expressions that don't have any - // capturing terms. The alternative is changed to <expression> - // followed by processing of the dot stars to find and adjust the - // beginning and the end of the match. - void optimizeDotStarWrappedExpressions() - { - Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives; - if (alternatives.size() != 1) - return; - - PatternAlternative* alternative = alternatives[0].get(); - Vector<PatternTerm>& terms = alternative->m_terms; - if (terms.size() >= 3) { - bool startsWithBOL = false; - bool endsWithEOL = false; - size_t termIndex, firstExpressionTerm, lastExpressionTerm; - - termIndex = 0; - if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { - startsWithBOL = true; - ++termIndex; - } - - PatternTerm& firstNonAnchorTerm = terms[termIndex]; - if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) - return; - - firstExpressionTerm = termIndex + 1; - - termIndex = terms.size() - 1; - if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { - endsWithEOL = true; - --termIndex; - } - - PatternTerm& lastNonAnchorTerm = terms[termIndex]; - if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) - return; - - lastExpressionTerm = termIndex - 1; - - if (firstExpressionTerm > lastExpressionTerm) - return; - - if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { - for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) - terms.remove(termIndex); - - for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) - terms.remove(termIndex - 1); - - terms.append(PatternTerm(startsWithBOL, endsWithEOL)); - - m_pattern.m_containsBOL = false; - } - } - } - -private: - YarrPattern& m_pattern; - PatternAlternative* m_alternative; - CharacterClassConstructor m_characterClassConstructor; - bool m_invertCharacterClass; - bool m_invertParentheticalAssertion; -}; - -const char* YarrPattern::compile(const String& patternString) -{ - YarrPatternConstructor constructor(*this); - - if (const char* error = parse(constructor, patternString)) - return error; - - // If the pattern contains illegal backreferences reset & reparse. - // Quoting Netscape's "What's new in JavaScript 1.2", - // "Note: if the number of left parentheses is less than the number specified - // in \#, the \# is taken as an octal escape as described in the next row." - if (containsIllegalBackReference()) { - unsigned numSubpatterns = m_numSubpatterns; - - constructor.reset(); -#if !ASSERT_DISABLED - const char* error = -#endif - parse(constructor, patternString, numSubpatterns); - - ASSERT(!error); - ASSERT(numSubpatterns == m_numSubpatterns); - } - - constructor.checkForTerminalParentheses(); - constructor.optimizeDotStarWrappedExpressions(); - constructor.optimizeBOL(); - - constructor.setupOffsets(); - - return 0; -} - -YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) - : m_ignoreCase(ignoreCase) - , m_multiline(multiline) - , m_containsBackreferences(false) - , m_containsBOL(false) - , m_numSubpatterns(0) - , m_maxBackReference(0) - , newlineCached(0) - , digitsCached(0) - , spacesCached(0) - , wordcharCached(0) - , nondigitsCached(0) - , nonspacesCached(0) - , nonwordcharCached(0) -{ - *error = compile(pattern); -} - -} } diff --git a/third_party/WebKit/Source/yarr/YarrPattern.h b/third_party/WebKit/Source/yarr/YarrPattern.h deleted file mode 100644 index 4682c0b..0000000 --- a/third_party/WebKit/Source/yarr/YarrPattern.h +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef YarrPattern_h -#define YarrPattern_h - -#include <wtf/CheckedArithmetic.h> -#include <wtf/OwnPtr.h> -#include <wtf/PassOwnPtr.h> -#include <wtf/RefCounted.h> -#include <wtf/Vector.h> -#include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> - -namespace JSC { namespace Yarr { - -struct PatternDisjunction; - -struct CharacterRange { - UChar begin; - UChar end; - - CharacterRange(UChar begin, UChar end) - : begin(begin) - , end(end) - { - } -}; - -struct CharacterClassTable : RefCounted<CharacterClassTable> { - const char* m_table; - bool m_inverted; - static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) - { - return adoptRef(new CharacterClassTable(table, inverted)); - } - -private: - CharacterClassTable(const char* table, bool inverted) - : m_table(table) - , m_inverted(inverted) - { - } -}; - -struct CharacterClass { - WTF_MAKE_FAST_ALLOCATED; -public: - // All CharacterClass instances have to have the full set of matches and ranges, - // they may have an optional table for faster lookups (which must match the - // specified matches and ranges) - CharacterClass(PassRefPtr<CharacterClassTable> table) - : m_table(table) - { - } - Vector<UChar> m_matches; - Vector<CharacterRange> m_ranges; - Vector<UChar> m_matchesUnicode; - Vector<CharacterRange> m_rangesUnicode; - RefPtr<CharacterClassTable> m_table; -}; - -enum QuantifierType { - QuantifierFixedCount, - QuantifierGreedy, - QuantifierNonGreedy, -}; - -struct PatternTerm { - enum Type { - TypeAssertionBOL, - TypeAssertionEOL, - TypeAssertionWordBoundary, - TypePatternCharacter, - TypeCharacterClass, - TypeBackReference, - TypeForwardReference, - TypeParenthesesSubpattern, - TypeParentheticalAssertion, - TypeDotStarEnclosure, - } type; - bool m_capture :1; - bool m_invert :1; - union { - UChar patternCharacter; - CharacterClass* characterClass; - unsigned backReferenceSubpatternId; - struct { - PatternDisjunction* disjunction; - unsigned subpatternId; - unsigned lastSubpatternId; - bool isCopy; - bool isTerminal; - } parentheses; - struct { - bool bolAnchor : 1; - bool eolAnchor : 1; - } anchors; - }; - QuantifierType quantityType; - Checked<unsigned> quantityCount; - int inputPosition; - unsigned frameLocation; - - PatternTerm(UChar ch) - : type(PatternTerm::TypePatternCharacter) - , m_capture(false) - , m_invert(false) - { - patternCharacter = ch; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(CharacterClass* charClass, bool invert) - : type(PatternTerm::TypeCharacterClass) - , m_capture(false) - , m_invert(invert) - { - characterClass = charClass; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) - : type(type) - , m_capture(capture) - , m_invert(invert) - { - parentheses.disjunction = disjunction; - parentheses.subpatternId = subpatternId; - parentheses.isCopy = false; - parentheses.isTerminal = false; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(Type type, bool invert = false) - : type(type) - , m_capture(false) - , m_invert(invert) - { - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(unsigned spatternId) - : type(TypeBackReference) - , m_capture(false) - , m_invert(false) - { - backReferenceSubpatternId = spatternId; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - PatternTerm(bool bolAnchor, bool eolAnchor) - : type(TypeDotStarEnclosure) - , m_capture(false) - , m_invert(false) - { - anchors.bolAnchor = bolAnchor; - anchors.eolAnchor = eolAnchor; - quantityType = QuantifierFixedCount; - quantityCount = 1; - } - - static PatternTerm ForwardReference() - { - return PatternTerm(TypeForwardReference); - } - - static PatternTerm BOL() - { - return PatternTerm(TypeAssertionBOL); - } - - static PatternTerm EOL() - { - return PatternTerm(TypeAssertionEOL); - } - - static PatternTerm WordBoundary(bool invert) - { - return PatternTerm(TypeAssertionWordBoundary, invert); - } - - bool invert() - { - return m_invert; - } - - bool capture() - { - return m_capture; - } - - void quantify(unsigned count, QuantifierType type) - { - quantityCount = count; - quantityType = type; - } -}; - -struct PatternAlternative { - WTF_MAKE_FAST_ALLOCATED; -public: - PatternAlternative(PatternDisjunction* disjunction) - : m_parent(disjunction) - , m_onceThrough(false) - , m_hasFixedSize(false) - , m_startsWithBOL(false) - , m_containsBOL(false) - { - } - - PatternTerm& lastTerm() - { - ASSERT(m_terms.size()); - return m_terms[m_terms.size() - 1]; - } - - void removeLastTerm() - { - ASSERT(m_terms.size()); - m_terms.shrink(m_terms.size() - 1); - } - - void setOnceThrough() - { - m_onceThrough = true; - } - - bool onceThrough() - { - return m_onceThrough; - } - - Vector<PatternTerm> m_terms; - PatternDisjunction* m_parent; - unsigned m_minimumSize; - bool m_onceThrough : 1; - bool m_hasFixedSize : 1; - bool m_startsWithBOL : 1; - bool m_containsBOL : 1; -}; - -struct PatternDisjunction { - WTF_MAKE_FAST_ALLOCATED; -public: - PatternDisjunction(PatternAlternative* parent = 0) - : m_parent(parent) - , m_hasFixedSize(false) - { - } - - PatternAlternative* addNewAlternative() - { - PatternAlternative* alternative = new PatternAlternative(this); - m_alternatives.append(adoptPtr(alternative)); - return alternative; - } - - Vector<OwnPtr<PatternAlternative> > m_alternatives; - PatternAlternative* m_parent; - unsigned m_minimumSize; - unsigned m_callFrameSize; - bool m_hasFixedSize; -}; - -// You probably don't want to be calling these functions directly -// (please to be calling newlineCharacterClass() et al on your -// friendly neighborhood YarrPattern instance to get nicely -// cached copies). -CharacterClass* newlineCreate(); -CharacterClass* digitsCreate(); -CharacterClass* spacesCreate(); -CharacterClass* wordcharCreate(); -CharacterClass* nondigitsCreate(); -CharacterClass* nonspacesCreate(); -CharacterClass* nonwordcharCreate(); - -struct TermChain { - TermChain(PatternTerm term) - : term(term) - {} - - PatternTerm term; - Vector<TermChain> hotTerms; -}; - -struct YarrPattern { - YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error); - - void reset() - { - m_numSubpatterns = 0; - m_maxBackReference = 0; - - m_containsBackreferences = false; - m_containsBOL = false; - - newlineCached = 0; - digitsCached = 0; - spacesCached = 0; - wordcharCached = 0; - nondigitsCached = 0; - nonspacesCached = 0; - nonwordcharCached = 0; - - m_disjunctions.clear(); - m_userCharacterClasses.clear(); - } - - bool containsIllegalBackReference() - { - return m_maxBackReference > m_numSubpatterns; - } - - CharacterClass* newlineCharacterClass() - { - if (!newlineCached) - m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate())); - return newlineCached; - } - CharacterClass* digitsCharacterClass() - { - if (!digitsCached) - m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate())); - return digitsCached; - } - CharacterClass* spacesCharacterClass() - { - if (!spacesCached) - m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate())); - return spacesCached; - } - CharacterClass* wordcharCharacterClass() - { - if (!wordcharCached) - m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate())); - return wordcharCached; - } - CharacterClass* nondigitsCharacterClass() - { - if (!nondigitsCached) - m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate())); - return nondigitsCached; - } - CharacterClass* nonspacesCharacterClass() - { - if (!nonspacesCached) - m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate())); - return nonspacesCached; - } - CharacterClass* nonwordcharCharacterClass() - { - if (!nonwordcharCached) - m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate())); - return nonwordcharCached; - } - - bool m_ignoreCase : 1; - bool m_multiline : 1; - bool m_containsBackreferences : 1; - bool m_containsBOL : 1; - unsigned m_numSubpatterns; - unsigned m_maxBackReference; - PatternDisjunction* m_body; - Vector<OwnPtr<PatternDisjunction>, 4> m_disjunctions; - Vector<OwnPtr<CharacterClass> > m_userCharacterClasses; - -private: - const char* compile(const String& patternString); - - CharacterClass* newlineCached; - CharacterClass* digitsCached; - CharacterClass* spacesCached; - CharacterClass* wordcharCached; - CharacterClass* nondigitsCached; - CharacterClass* nonspacesCached; - CharacterClass* nonwordcharCached; -}; - -} } // namespace JSC::Yarr - -#endif // YarrPattern_h diff --git a/third_party/WebKit/Source/yarr/YarrSyntaxChecker.cpp b/third_party/WebKit/Source/yarr/YarrSyntaxChecker.cpp deleted file mode 100644 index b1f922f..0000000 --- a/third_party/WebKit/Source/yarr/YarrSyntaxChecker.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (C) 2011 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "yarr/YarrSyntaxChecker.h" - -#include "yarr/YarrParser.h" - -namespace JSC { namespace Yarr { - -class SyntaxChecker { -public: - void assertionBOL() {} - void assertionEOL() {} - void assertionWordBoundary(bool) {} - void atomPatternCharacter(UChar) {} - void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {} - void atomCharacterClassBegin(bool = false) {} - void atomCharacterClassAtom(UChar) {} - void atomCharacterClassRange(UChar, UChar) {} - void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {} - void atomCharacterClassEnd() {} - void atomParenthesesSubpatternBegin(bool = true) {} - void atomParentheticalAssertionBegin(bool = false) {} - void atomParenthesesEnd() {} - void atomBackReference(unsigned) {} - void quantifyAtom(unsigned, unsigned, bool) {} - void disjunction() {} -}; - -const char* checkSyntax(const String& pattern) -{ - SyntaxChecker syntaxChecker; - return parse(syntaxChecker, pattern); -} - -}} // JSC::YARR diff --git a/third_party/WebKit/Source/yarr/YarrSyntaxChecker.h b/third_party/WebKit/Source/yarr/YarrSyntaxChecker.h deleted file mode 100644 index 104ced3..0000000 --- a/third_party/WebKit/Source/yarr/YarrSyntaxChecker.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (C) 2011 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef YarrSyntaxChecker_h -#define YarrSyntaxChecker_h - -#include <wtf/text/WTFString.h> - -namespace JSC { namespace Yarr { - -const char* checkSyntax(const String& pattern); - -}} // JSC::YARR - -#endif // YarrSyntaxChecker_h - diff --git a/third_party/WebKit/Source/yarr/create_regex_tables b/third_party/WebKit/Source/yarr/create_regex_tables deleted file mode 100644 index bd799ba..0000000 --- a/third_party/WebKit/Source/yarr/create_regex_tables +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (C) 2010 Apple Inc. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys - -types = { - "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]}, - "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0xffff)]}, - "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]}, - "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]}, - "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0xffff)]}, - "digits": { "UseTable" : False, "data": [('0', '9')]}, - "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0xffff)] } -} -entriesPerLine = 50 -arrays = ""; -functions = ""; -emitTables = (len(sys.argv) < 2 or sys.argv[1] != "--no-tables") - -for name, classes in types.items(): - ranges = []; - size = 0; - for _class in classes["data"]: - if type(_class) == str: - ranges.append((ord(_class), ord(_class))) - elif type(_class) == int: - ranges.append((_class, _class)) - else: - (min, max) = _class; - if type(min) == str: - min = ord(min) - if type(max) == str: - max = ord(max) - if max > 0x7f and min <= 0x7f: - ranges.append((min, 0x7f)) - min = 0x80 - ranges.append((min,max)) - ranges.sort(); - - if emitTables and classes["UseTable"] and (not "Inverse" in classes): - array = ("static const char _%sData[65536] = {\n" % name); - i = 0 - for (min,max) in ranges: - while i < min: - i = i + 1 - array += ('0,') - if (i % entriesPerLine == 0) and (i != 0): - array += ('\n') - while i <= max: - i = i + 1 - if (i == 65536): - array += ("1") - else: - array += ('1,') - if (i % entriesPerLine == 0) and (i != 0): - array += ('\n') - while i < 0xffff: - array += ("0,") - i = i + 1; - if (i % entriesPerLine == 0) and (i != 0): - array += ('\n') - if i == 0xffff: - array += ("0") - array += ("\n};\n\n"); - arrays += array - - # Generate createFunction: - function = ""; - function += ("CharacterClass* %sCreate()\n" % name) - function += ("{\n") - if emitTables and classes["UseTable"]: - if "Inverse" in classes: - function += (" CharacterClass* characterClass = new CharacterClass(CharacterClassTable::create(_%sData, true));\n" % (classes["Inverse"])) - else: - function += (" CharacterClass* characterClass = new CharacterClass(CharacterClassTable::create(_%sData, false));\n" % (name)) - else: - function += (" CharacterClass* characterClass = new CharacterClass(0);\n") - for (min, max) in ranges: - if (min == max): - if (min > 127): - function += (" characterClass->m_matchesUnicode.append(0x%04x);\n" % min) - else: - function += (" characterClass->m_matches.append(0x%02x);\n" % min) - continue - if (min > 127) or (max > 127): - function += (" characterClass->m_rangesUnicode.append(CharacterRange(0x%04x, 0x%04x));\n" % (min, max)) - else: - function += (" characterClass->m_ranges.append(CharacterRange(0x%02x, 0x%02x));\n" % (min, max)) - function += (" return characterClass;\n") - function += ("}\n\n") - functions += function - -if (len(sys.argv) > 1): - f = open(sys.argv[-1], "w") - f.write(arrays) - f.write(functions) - f.close() -else: - print(arrays) - print(functions) - diff --git a/third_party/WebKit/Source/yarr/yarr.gyp b/third_party/WebKit/Source/yarr/yarr.gyp deleted file mode 100644 index 8f82ed2..0000000 --- a/third_party/WebKit/Source/yarr/yarr.gyp +++ /dev/null @@ -1,100 +0,0 @@ -# -# Copyright (C) 2013 Google Inc. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# - -{ - 'includes': [ - '../WebKit/chromium/WinPrecompile.gypi', - '../core/features.gypi', - ], - 'conditions': [ - ['os_posix == 1 and OS != "mac" and gcc_version>=46', { - 'target_defaults': { - # Disable warnings about c++0x compatibility, as some names (such as nullptr) conflict - # with upcoming c++0x types. - 'cflags_cc': ['-Wno-c++0x-compat'], - }, - }], - ], - 'targets': [ - { - 'target_name': 'yarr', - 'type': 'static_library', - 'dependencies': [ - '../wtf/wtf.gyp:wtf', - ], - 'variables': { 'optimize': 'max' }, - 'actions': [ - { - 'action_name': 'generate_regex_tables', - 'inputs': [ - 'create_regex_tables', - ], - 'arguments': [ - '--no-tables', - ], - 'outputs': [ - '<(INTERMEDIATE_DIR)/RegExpJitTables.h', - ], - 'action': ['python', '<@(_inputs)', '<@(_arguments)', '<@(_outputs)'], - }, - ], - 'include_dirs': [ - '<(INTERMEDIATE_DIR)', - '..', - ], - 'sources': [ - 'Yarr.h', - 'YarrCanonicalizeUCS2.cpp', - 'YarrCanonicalizeUCS2.h', - 'YarrInterpreter.cpp', - 'YarrInterpreter.h', - 'YarrParser.h', - 'YarrPattern.cpp', - 'YarrPattern.h', - 'YarrSyntaxChecker.cpp', - 'YarrSyntaxChecker.h', - ], - 'direct_dependent_settings': { - 'include_dirs': [ - '..', - ], - }, - 'export_dependent_settings': [ - '../wtf/wtf.gyp:wtf', - ], - 'conditions': [ - ['OS=="win"', { - # Disable c4267 warnings until we fix size_t to int truncations. - 'msvs_disabled_warnings': [4267, ], - }], - ], - }, - ], -} |