diff options
author | Simon Hausmann <simon.hausmann@nokia.com> | 2009-09-16 07:51:00 (GMT) |
---|---|---|
committer | Simon Hausmann <simon.hausmann@nokia.com> | 2009-09-16 07:58:16 (GMT) |
commit | 120329adb47dba60f532c1c2fd2ad0f37b812437 (patch) | |
tree | fc2311cef4b69fe7294e8f5aab27fe6af2546123 /src/3rdparty/javascriptcore/JavaScriptCore/yarr | |
parent | ce17ae5a6159d8ce3a5d2cc98f804a2debb860e5 (diff) | |
download | Qt-120329adb47dba60f532c1c2fd2ad0f37b812437.zip Qt-120329adb47dba60f532c1c2fd2ad0f37b812437.tar.gz Qt-120329adb47dba60f532c1c2fd2ad0f37b812437.tar.bz2 |
Separate the copy of JavaScriptCore that QtScript uses from the copy that
QtWebKit uses.
This is needed to decouple QtScript from QtWebKit, as discussed in the
WebKit team.
Reviewed-by: Kent Hansen
Diffstat (limited to 'src/3rdparty/javascriptcore/JavaScriptCore/yarr')
8 files changed, 5467 insertions, 0 deletions
diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.cpp b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.cpp new file mode 100644 index 0000000..c7b3c81 --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.cpp @@ -0,0 +1,728 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "RegexCompiler.h" + +#include "RegexInterpreter.h" +#include "RegexPattern.h" +#include <wtf/Vector.h> + +#if ENABLE(YARR) + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class CharacterClassConstructor { +public: + CharacterClassConstructor(bool isCaseInsensitive = false) + : m_isCaseInsensitive(isCaseInsensitive) + { + } + + void reset() + { + m_matches.clear(); + m_ranges.clear(); + m_matchesUnicode.clear(); + m_rangesUnicode.clear(); + } + + void append(const CharacterClass* other) + { + for (size_t i = 0; i < other->m_matches.size(); ++i) + addSorted(m_matches, other->m_matches[i]); + for (size_t i = 0; i < other->m_ranges.size(); ++i) + addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); + for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) + addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); + for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) + addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); + } + + void putChar(UChar ch) + { + if (ch <= 0x7f) { + if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + addSorted(m_matches, toASCIIUpper(ch)); + addSorted(m_matches, toASCIILower(ch)); + } else + addSorted(m_matches, ch); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { + addSorted(m_matchesUnicode, upper); + addSorted(m_matchesUnicode, lower); + } else + addSorted(m_matchesUnicode, ch); + } + } + + // returns true if this character has another case, and 'ch' is the upper case form. + static inline bool isUnicodeUpper(UChar ch) + { + return ch != Unicode::toLower(ch); + } + + // returns true if this character has another case, and 'ch' is the lower case form. + static inline bool isUnicodeLower(UChar ch) + { + return ch != Unicode::toUpper(ch); + } + + void putRange(UChar lo, UChar hi) + { + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi >= 0x80) { + uint32_t unicodeCurr = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, unicodeCurr, hi); + + if (m_isCaseInsensitive) { + while (unicodeCurr <= hi) { + // If the upper bound of the range (hi) is 0xffff, the increments to + // unicodeCurr in this loop may take it to 0x10000. This is fine + // (if so we won't re-enter the loop, since the loop condition above + // will definitely fail) - but this does mean we cannot use a UChar + // to represent unicodeCurr, we must use a 32-bit value instead. + ASSERT(unicodeCurr <= 0xffff); + + if (isUnicodeUpper(unicodeCurr)) { + UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); + UChar lowerCaseRangeEnd = lowerCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) + lowerCaseRangeEnd++; + addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); + } else if (isUnicodeLower(unicodeCurr)) { + UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); + UChar upperCaseRangeEnd = upperCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) + upperCaseRangeEnd++; + addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); + } else + ++unicodeCurr; + } + } + } + } + + CharacterClass* charClass() + { + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append(m_matches); + characterClass->m_ranges.append(m_ranges); + characterClass->m_matchesUnicode.append(m_matchesUnicode); + characterClass->m_rangesUnicode.append(m_rangesUnicode); + + reset(); + + return characterClass; + } + +private: + void addSorted(Vector<UChar>& matches, UChar ch) + { + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); + } + + void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + { + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + ranges.insert(i, CharacterRange(lo, hi)); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // CharacterRange comes after all existing ranges. + ranges.append(CharacterRange(lo, hi)); + } + + bool m_isCaseInsensitive; + + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; +}; + + +CharacterClass* newlineCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append('\n'); + characterClass->m_matches.append('\r'); + characterClass->m_matchesUnicode.append(0x2028); + characterClass->m_matchesUnicode.append(0x2029); + + return characterClass; +} + +CharacterClass* digitsCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_ranges.append(CharacterRange('0', '9')); + + return characterClass; +} + +CharacterClass* spacesCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append(' '); + characterClass->m_ranges.append(CharacterRange('\t', '\r')); + characterClass->m_matchesUnicode.append(0x00a0); + characterClass->m_matchesUnicode.append(0x1680); + characterClass->m_matchesUnicode.append(0x180e); + characterClass->m_matchesUnicode.append(0x2028); + characterClass->m_matchesUnicode.append(0x2029); + characterClass->m_matchesUnicode.append(0x202f); + characterClass->m_matchesUnicode.append(0x205f); + characterClass->m_matchesUnicode.append(0x3000); + characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a)); + + return characterClass; +} + +CharacterClass* wordcharCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append('_'); + characterClass->m_ranges.append(CharacterRange('0', '9')); + characterClass->m_ranges.append(CharacterRange('A', 'Z')); + characterClass->m_ranges.append(CharacterRange('a', 'z')); + + return characterClass; +} + +CharacterClass* nondigitsCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); + characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); + + return characterClass; +} + +CharacterClass* nonspacesCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_ranges.append(CharacterRange(0, '\t' - 1)); + characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1)); + characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d)); + characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff)); + characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027)); + characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e)); + characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e)); + characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff)); + characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff)); + + return characterClass; +} + +CharacterClass* nonwordcharCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append('`'); + characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); + characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1)); + characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1)); + characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); + + return characterClass; +} + + +class RegexPatternConstructor { +public: + RegexPatternConstructor(RegexPattern& pattern) + : m_pattern(pattern) + , m_characterClassConstructor(pattern.m_ignoreCase) + { + } + + ~RegexPatternConstructor() + { + } + + void reset() + { + m_pattern.reset(); + m_characterClassConstructor.reset(); + } + + void assertionBOL() + { + m_alternative->m_terms.append(PatternTerm::BOL()); + } + void assertionEOL() + { + m_alternative->m_terms.append(PatternTerm::EOL()); + } + void assertionWordBoundary(bool invert) + { + m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); + } + + void atomPatternCharacter(UChar ch) + { + // We handle case-insensitive checking of unicode characters which do have both + // cases by handling them as if they were defined using a CharacterClass. + if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { + atomCharacterClassBegin(); + atomCharacterClassAtom(ch); + atomCharacterClassEnd(); + } else + m_alternative->m_terms.append(PatternTerm(ch)); + } + + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (classID) { + case DigitClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); + break; + case SpaceClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); + break; + case WordClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case NewlineClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + break; + } + } + + void atomCharacterClassBegin(bool invert = false) + { + m_invertCharacterClass = invert; + } + + void atomCharacterClassAtom(UChar ch) + { + m_characterClassConstructor.putChar(ch); + } + + void atomCharacterClassRange(UChar begin, UChar end) + { + m_characterClassConstructor.putRange(begin, end); + } + + void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + { + ASSERT(classID != NewlineClassID); + + switch (classID) { + case DigitClassID: + m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); + break; + + case SpaceClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); + break; + + case WordClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + break; + + default: + ASSERT_NOT_REACHED(); + } + } + + void atomCharacterClassEnd() + { + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); + } + + void atomParenthesesSubpatternBegin(bool capture = true) + { + unsigned subpatternId = m_pattern.m_numSubpatterns + 1; + if (capture) + m_pattern.m_numSubpatterns++; + + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParentheticalAssertionBegin(bool invert = false) + { + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParenthesesEnd() + { + ASSERT(m_alternative->m_parent); + ASSERT(m_alternative->m_parent->m_parent); + m_alternative = m_alternative->m_parent->m_parent; + + m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + } + + void atomBackReference(unsigned subpatternId) + { + ASSERT(subpatternId); + m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); + + if (subpatternId > m_pattern.m_numSubpatterns) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + + PatternAlternative* currentAlternative = m_alternative; + ASSERT(currentAlternative); + + // Note to self: if we waited until the AST was baked, we could also remove forwards refs + while ((currentAlternative = currentAlternative->m_parent->m_parent)) { + PatternTerm& term = currentAlternative->lastTerm(); + ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); + + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + } + + m_alternative->m_terms.append(PatternTerm(subpatternId)); + } + + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) + { + PatternDisjunction* newDisjunction = new PatternDisjunction(); + + newDisjunction->m_parent = disjunction->m_parent; + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i])); + } + + m_pattern.m_disjunctions.append(newDisjunction); + return newDisjunction; + } + + PatternTerm copyTerm(PatternTerm& term) + { + if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) + return PatternTerm(term); + + PatternTerm termCopy = term; + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction); + return termCopy; + } + + void quantifyAtom(unsigned min, unsigned max, bool greedy) + { + ASSERT(min <= max); + ASSERT(m_alternative->m_terms.size()); + + if (!max) { + m_alternative->removeLastTerm(); + return; + } + + PatternTerm& term = m_alternative->lastTerm(); + ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); + ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + + // For any assertion with a zero minimum, not matching is valid and has no effect, + // remove it. Otherwise, we need to match as least once, but there is no point + // matching more than once, so remove the quantifier. It is not entirely clear + // from the spec whether or not this behavior is correct, but I believe this + // matches Firefox. :-/ + if (term.type == PatternTerm::TypeParentheticalAssertion) { + if (!min) + m_alternative->removeLastTerm(); + return; + } + + if (min == 0) + term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); + else if (min == max) + term.quantify(min, QuantifierFixedCount); + else { + term.quantify(min, QuantifierFixedCount); + m_alternative->m_terms.append(copyTerm(term)); + // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... + m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) + m_alternative->lastTerm().parentheses.isCopy = true; + } + } + + void disjunction() + { + m_alternative = m_alternative->m_parent->addNewAlternative(); + } + + void regexBegin() + { + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + void regexEnd() + { + } + void regexError() + { + } + + unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + { + alternative->m_hasFixedSize = true; + unsigned currentInputPosition = initialInputPosition; + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + term.inputPosition = currentInputPosition; + break; + + case PatternTerm::TypeBackReference: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypePatternCharacter: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeCharacterClass: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeParenthesesSubpattern: + // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. + term.frameLocation = currentCallFrameSize; + if ((term.quantityCount == 1) && !term.parentheses.isCopy) { + if (term.quantityType == QuantifierFixedCount) { + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + } else { + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + } + term.inputPosition = currentInputPosition; + } else { + term.inputPosition = currentInputPosition; + setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; + } + // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeParentheticalAssertion: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); + break; + } + } + + alternative->m_minimumSize = currentInputPosition - initialInputPosition; + return currentCallFrameSize; + } + + unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + { + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) + initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; + + unsigned minimumInputSize = UINT_MAX; + unsigned maximumCallFrameSize = 0; + bool hasFixedSize = true; + + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); + maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); + hasFixedSize &= alternative->m_hasFixedSize; + } + + ASSERT(minimumInputSize != UINT_MAX); + ASSERT(maximumCallFrameSize >= initialCallFrameSize); + + disjunction->m_hasFixedSize = hasFixedSize; + disjunction->m_minimumSize = minimumInputSize; + disjunction->m_callFrameSize = maximumCallFrameSize; + return maximumCallFrameSize; + } + + void setupOffsets() + { + setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + } + +private: + RegexPattern& m_pattern; + PatternAlternative* m_alternative; + CharacterClassConstructor m_characterClassConstructor; + bool m_invertCharacterClass; +}; + + +const char* compileRegex(const UString& patternString, RegexPattern& pattern) +{ + RegexPatternConstructor constructor(pattern); + + if (const char* error = parse(constructor, patternString)) + return error; + + // If the pattern contains illegal backreferences reset & reparse. + // Quoting Netscape's "What's new in JavaScript 1.2", + // "Note: if the number of left parentheses is less than the number specified + // in \#, the \# is taken as an octal escape as described in the next row." + if (pattern.containsIllegalBackReference()) { + unsigned numSubpatterns = pattern.m_numSubpatterns; + + constructor.reset(); +#ifndef NDEBUG + const char* error = +#endif + parse(constructor, patternString, numSubpatterns); + + ASSERT(!error); + ASSERT(numSubpatterns == pattern.m_numSubpatterns); + } + + constructor.setupOffsets(); + + return false; +}; + + +} } + +#endif diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.h b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.h new file mode 100644 index 0000000..3ed2be9 --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexCompiler_h +#define RegexCompiler_h + +#include <wtf/Platform.h> + +#if ENABLE(YARR) + +#include <wtf/unicode/Unicode.h> +#include "RegexParser.h" +#include "RegexPattern.h" + +namespace JSC { namespace Yarr { + +const char* compileRegex(const UString& patternString, RegexPattern& pattern); + +} } // namespace JSC::Yarr + +#endif + +#endif // RegexCompiler_h diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.cpp b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.cpp new file mode 100644 index 0000000..b0aae65 --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.cpp @@ -0,0 +1,1638 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "RegexInterpreter.h" + +#include "RegexCompiler.h" +#include "RegexPattern.h" + +#ifndef NDEBUG +#include <stdio.h> +#endif + +#if ENABLE(YARR) + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class Interpreter { +public: + struct ParenthesesDisjunctionContext; + + struct BackTrackInfoPatternCharacter { + uintptr_t matchAmount; + }; + struct BackTrackInfoCharacterClass { + uintptr_t matchAmount; + }; + struct BackTrackInfoBackReference { + uintptr_t begin; // Not really needed for greedy quantifiers. + uintptr_t matchAmount; // Not really needed for fixed quantifiers. + }; + struct BackTrackInfoAlternative { + uintptr_t offset; + }; + struct BackTrackInfoParentheticalAssertion { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesOnce { + uintptr_t inParentheses; + }; + struct BackTrackInfoParentheses { + uintptr_t matchAmount; + ParenthesesDisjunctionContext* lastContext; + uintptr_t prevBegin; + uintptr_t prevEnd; + }; + + static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) + { + context->next = backTrack->lastContext; + backTrack->lastContext = context; + ++backTrack->matchAmount; + } + + static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) + { + ASSERT(backTrack->matchAmount); + ASSERT(backTrack->lastContext); + backTrack->lastContext = backTrack->lastContext->next; + --backTrack->matchAmount; + } + + struct DisjunctionContext + { + DisjunctionContext() + : term(0) + { + } + + void* operator new(size_t, void* where) + { + return where; + } + + int term; + unsigned matchBegin; + unsigned matchEnd; + uintptr_t frame[1]; + }; + + DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) + { + return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); + } + + void freeDisjunctionContext(DisjunctionContext* context) + { + free(context); + } + + struct ParenthesesDisjunctionContext + { + ParenthesesDisjunctionContext(int* output, ByteTerm& term) + : next(0) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; + + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { + subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; + output[(firstSubpatternId << 1) + i] = -1; + } + + new(getDisjunctionContext(term)) DisjunctionContext(); + } + + void* operator new(size_t, void* where) + { + return where; + } + + void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) + { + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) + output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; + } + + DisjunctionContext* getDisjunctionContext(ByteTerm& term) + { + return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); + } + + ParenthesesDisjunctionContext* next; + int subpatternBackup[1]; + }; + + ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) + { + return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); + } + + void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) + { + free(context); + } + + class InputStream { + public: + InputStream(const UChar* input, unsigned start, unsigned length) + : input(input) + , pos(start) + , length(length) + { + } + + void next() + { + ++pos; + } + + void rewind(unsigned amount) + { + ASSERT(pos >= amount); + pos -= amount; + } + + int read() + { + ASSERT(pos < length); + if (pos < length) + return input[pos]; + return -1; + } + + int readChecked(int position) + { + ASSERT(position < 0); + ASSERT((unsigned)-position <= pos); + unsigned p = pos + position; + ASSERT(p < length); + return input[p]; + } + + int reread(unsigned from) + { + ASSERT(from < length); + return input[from]; + } + + int prev() + { + ASSERT(!(pos > length)); + if (pos && length) + return input[pos - 1]; + return -1; + } + + unsigned getPos() + { + return pos; + } + + void setPos(unsigned p) + { + pos = p; + } + + bool atStart() + { + return pos == 0; + } + + bool atEnd() + { + return pos == length; + } + + bool checkInput(int count) + { + if ((pos + count) <= length) { + pos += count; + return true; + } else + return false; + } + + void uncheckInput(int count) + { + pos -= count; + } + + bool atStart(int position) + { + return (pos + position) == 0; + } + + bool atEnd(int position) + { + return (pos + position) == length; + } + + private: + const UChar* input; + unsigned pos; + unsigned length; + }; + + bool testCharacterClass(CharacterClass* characterClass, int ch) + { + if (ch & 0xFF80) { + for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) + if (ch == characterClass->m_matchesUnicode[i]) + return true; + for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) + if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) + return true; + } else { + for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) + if (ch == characterClass->m_matches[i]) + return true; + for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) + if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) + return true; + } + + return false; + } + + bool tryConsumeCharacter(int testChar) + { + if (input.atEnd()) + return false; + + int ch = input.read(); + + if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) { + input.next(); + return true; + } + return false; + } + + bool checkCharacter(int testChar, int inputPosition) + { + return testChar == input.readChecked(inputPosition); + } + + bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) + { + int ch = input.readChecked(inputPosition); + return (loChar == ch) || (hiChar == ch); + } + + bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert) + { + if (input.atEnd()) + return false; + + bool match = testCharacterClass(characterClass, input.read()); + + if (invert) + match = !match; + + if (match) { + input.next(); + return true; + } + return false; + } + + bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) + { + bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); + return invert ? !match : match; + } + + bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) + { + int matchSize = matchEnd - matchBegin; + + if (!input.checkInput(matchSize)) + return false; + + for (int i = 0; i < matchSize; ++i) { + if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { + input.uncheckInput(matchSize); + return false; + } + } + + return true; + } + + bool matchAssertionBOL(ByteTerm& term) + { + return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); + } + + bool matchAssertionEOL(ByteTerm& term) + { + if (term.inputPosition) + return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); + else + return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); + } + + bool matchAssertionWordBoundary(ByteTerm& term) + { + bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); + bool readIsWordchar; + if (term.inputPosition) + readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); + else + readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); + + bool wordBoundary = prevIsWordchar != readIsWordchar; + return term.invert() ? !wordBoundary : wordBoundary; + } + + bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) + return false; + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + return true; + } + + case QuantifierNonGreedy: + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + ASSERT((matchBegin == -1) == (matchEnd == -1)); + ASSERT(matchBegin <= matchEnd); + + if (matchBegin == matchEnd) + return true; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + backTrack->begin = input.getPos(); + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + input.setPos(backTrack->begin); + return false; + } + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) + ++matchAmount; + backTrack->matchAmount = matchAmount; + return true; + } + + case QuantifierNonGreedy: + backTrack->begin = input.getPos(); + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + ASSERT((matchBegin == -1) == (matchEnd == -1)); + ASSERT(matchBegin <= matchEnd); + + if (matchBegin == matchEnd) + return false; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + // for quantityCount == 1, could rewind. + input.setPos(backTrack->begin); + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.rewind(matchEnd - matchBegin); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + ++backTrack->matchAmount; + return true; + } else + input.setPos(backTrack->begin); + break; + } + + return false; + } + + void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; + output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; + } + } + void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; + context->restoreOutput(output, firstSubpatternId, count); + } + void resetAssertionMatches(ByteTerm& term) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; + for (unsigned i = 0; i < (count << 1); ++i) + output[(firstSubpatternId << 1) + i] = -1; + } + bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) + { + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + + if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) + return true; + + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + popParenthesesDisjunctionContext(backTrack); + } + + return false; + } + + bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: { + // set this speculatively; if we get to the parens end this will be true. + backTrack->inParentheses = 1; + break; + } + case QuantifierNonGreedy: { + backTrack->inParentheses = 0; + context->term += term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = input.getPos() + term.inputPosition; + } + + return true; + } + + bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + } + return true; + } + + bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = -1; + output[(subpatternId << 1) + 1] = -1; + } + + switch (term.atom.quantityType) { + case QuantifierGreedy: + // if we backtrack to this point, there is another chance - try matching nothing. + ASSERT(backTrack->inParentheses); + backTrack->inParentheses = 0; + context->term += term.atom.parenthesesWidth; + return true; + case QuantifierNonGreedy: + ASSERT(backTrack->inParentheses); + case QuantifierFixedCount: + break; + } + + return false; + } + + bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: + if (!backTrack->inParentheses) { + context->term -= term.atom.parenthesesWidth; + return false; + } + case QuantifierNonGreedy: + if (!backTrack->inParentheses) { + // now try to match the parens; set this speculatively. + backTrack->inParentheses = 1; + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + } + context->term -= term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + return false; + } + + bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + backTrack->begin = input.getPos(); + return true; + } + + bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + // We've reached the end of the parens; if they are inverted, this is failure. + if (term.invert()) { + context->term -= term.atom.parenthesesWidth; + return false; + } + + return true; + } + + bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + // We've failed to match parens; if they are inverted, this is win! + if (term.invert()) { + context->term += term.atom.parenthesesWidth; + return true; + } + + return false; + } + + bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + context->term -= term.atom.parenthesesWidth; + return false; + } + + bool matchParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + + unsigned subpatternId = term.atom.subpatternId; + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + backTrack->prevBegin = output[(subpatternId << 1)]; + backTrack->prevEnd = output[(subpatternId << 1) + 1]; + + backTrack->matchAmount = 0; + backTrack->lastContext = 0; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + if (!parenthesesDoBacktrack(term, backTrack)) + return false; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return true; + } + + case QuantifierGreedy: { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + break; + } + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return true; + } + + case QuantifierNonGreedy: + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + // Rules for backtracking differ depending on whether this is greedy or non-greedy. + // + // Greedy matches never should try just adding more - you should already have done + // the 'more' cases. Always backtrack, at least a leetle bit. However cases where + // you backtrack an item off the list needs checking, since we'll never have matched + // the one less case. Tracking forwards, still add as much as possible. + // + // Non-greedy, we've already done the one less case, so don't match on popping. + // We haven't done the one more case, so always try to add that. + // + bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = backTrack->prevBegin; + output[(subpatternId << 1) + 1] = backTrack->prevEnd; + } + + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + + ParenthesesDisjunctionContext* context = 0; + + if (!parenthesesDoBacktrack(term, backTrack)) + return false; + + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + if (!parenthesesDoBacktrack(term, backTrack)) + return false; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return true; + } + + case QuantifierGreedy: { + if (!backTrack->matchAmount) + return false; + + ParenthesesDisjunctionContext* context = backTrack->lastContext; + if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + break; + } + } + } else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + popParenthesesDisjunctionContext(backTrack); + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return true; + } + + case QuantifierNonGreedy: { + // If we've not reached the limit, try to add one more match. + if (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { + appendParenthesesDisjunctionContext(backTrack, context); + recordParenthesesMatch(term, context); + return true; + } else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + } + } + + // Nope - okay backtrack looking for an alternative. + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { + // successful backtrack! we're back in the game! + if (backTrack->matchAmount) { + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return true; + } + + // pop a match off the stack + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + popParenthesesDisjunctionContext(backTrack); + } + + return false; + } + } + + ASSERT_NOT_REACHED(); + return false; + } + +#define MATCH_NEXT() { ++context->term; goto matchAgain; } +#define BACKTRACK() { --context->term; goto backtrack; } +#define currentTerm() (disjunction->terms[context->term]) + bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + if (btrack) + BACKTRACK(); + + context->matchBegin = input.getPos(); + context->term = 0; + + matchAgain: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + MATCH_NEXT(); + case ByteTerm::TypeSubpatternEnd: + context->matchEnd = input.getPos(); + return true; + + case ByteTerm::TypeBodyAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeBodyAlternativeDisjunction: + case ByteTerm::TypeBodyAlternativeEnd: + context->matchEnd = input.getPos(); + return true; + + case ByteTerm::TypeAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeAlternativeDisjunction: + case ByteTerm::TypeAlternativeEnd: { + int offset = currentTerm().alternative.end; + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + backTrack->offset = offset; + context->term += offset; + MATCH_NEXT(); + } + + case ByteTerm::TypeAssertionBOL: + if (matchAssertionBOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionEOL: + if (matchAssertionEOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionWordBoundary: + if (matchAssertionWordBoundary(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypeCharacterClass: + if (matchCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (matchBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: + if (matchParentheses(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (matchParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (matchParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (matchParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (matchParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + if (input.checkInput(currentTerm().checkInputCount)) + MATCH_NEXT(); + BACKTRACK(); + } + + // We should never fall-through to here. + ASSERT_NOT_REACHED(); + + backtrack: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + return false; + case ByteTerm::TypeSubpatternEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeBodyAlternativeBegin: + case ByteTerm::TypeBodyAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + + if (input.atEnd()) + return false; + + input.next(); + context->matchBegin = input.getPos(); + MATCH_NEXT(); + } + case ByteTerm::TypeBodyAlternativeEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeAlternativeBegin: + case ByteTerm::TypeAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + BACKTRACK(); + } + case ByteTerm::TypeAlternativeEnd: { + // We should never backtrack back into an alternative of the main body of the regex. + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + unsigned offset = backTrack->offset; + context->term -= offset; + BACKTRACK(); + } + + case ByteTerm::TypeAssertionBOL: + case ByteTerm::TypeAssertionEOL: + case ByteTerm::TypeAssertionWordBoundary: + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: + case ByteTerm::TypePatternCharacterGreedy: + case ByteTerm::TypePatternCharacterNonGreedy: + if (backtrackPatternCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: + case ByteTerm::TypePatternCasedCharacterGreedy: + case ByteTerm::TypePatternCasedCharacterNonGreedy: + if (backtrackPatternCasedCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeCharacterClass: + if (backtrackCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (backtrackBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: + if (backtrackParentheses(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (backtrackParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (backtrackParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (backtrackParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (backtrackParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + input.uncheckInput(currentTerm().checkInputCount); + BACKTRACK(); + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + if (matchDisjunction(disjunction, context, btrack)) { + while (context->matchBegin == context->matchEnd) { + if (!matchDisjunction(disjunction, context, true)) + return false; + } + return true; + } + + return false; + } + + int interpret() + { + for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) + output[i] = -1; + + DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); + + if (matchDisjunction(pattern->m_body.get(), context)) { + output[0] = context->matchBegin; + output[1] = context->matchEnd; + } + + freeDisjunctionContext(context); + + return output[0]; + } + + Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) + : pattern(pattern) + , output(output) + , input(inputChar, start, length) + { + } + +private: + BytecodePattern *pattern; + int* output; + InputStream input; +}; + + + +class ByteCompiler { + struct ParenthesesStackEntry { + unsigned beginTerm; + unsigned savedAlternativeIndex; + ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) + : beginTerm(beginTerm) + , savedAlternativeIndex(savedAlternativeIndex) + { + } + }; + +public: + ByteCompiler(RegexPattern& pattern) + : m_pattern(pattern) + { + bodyDisjunction = 0; + currentAlternativeIndex = 0; + } + + BytecodePattern* compile() + { + regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); + emitDisjunction(m_pattern.m_body); + regexEnd(); + + return new BytecodePattern(bodyDisjunction, m_allParenthesesInfo, m_pattern); + } + + void checkInput(unsigned count) + { + bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); + } + + void assertionBOL(int inputPosition) + { + bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); + } + + void assertionEOL(int inputPosition) + { + bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); + } + + void assertionWordBoundary(bool invert, int inputPosition) + { + bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); + } + + void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + if (m_pattern.m_ignoreCase) { + UChar lo = Unicode::toLower(ch); + UChar hi = Unicode::toUpper(ch); + + if (lo != hi) { + bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); + return; + } + } + + bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); + } + + void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); + + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + { + ASSERT(subpatternId); + + bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); + + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = bodyDisjunction->terms.size(); + + bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, currentAlternativeIndex)); + currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = bodyDisjunction->terms.size(); + + bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + bodyDisjunction->terms[bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, currentAlternativeIndex)); + currentAlternativeIndex = beginTerm + 1; + } + + unsigned popParenthesesStack() + { + ASSERT(m_parenthesesStack.size()); + int stackEnd = m_parenthesesStack.size() - 1; + unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; + currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; + m_parenthesesStack.shrink(stackEnd); + + ASSERT(beginTerm < bodyDisjunction->terms.size()); + ASSERT(currentAlternativeIndex < bodyDisjunction->terms.size()); + + return beginTerm; + } + +#ifndef NDEBUG + void dumpDisjunction(ByteDisjunction* disjunction) + { + printf("ByteDisjunction(%p):\n\t", disjunction); + for (unsigned i = 0; i < disjunction->terms.size(); ++i) + printf("{ %d } ", disjunction->terms[i].type); + printf("\n"); + } +#endif + + void closeAlternative(int beginTerm) + { + int origBeginTerm = beginTerm; + ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); + int endIndex = bodyDisjunction->terms.size(); + + unsigned frameLocation = bodyDisjunction->terms[beginTerm].frameLocation; + + if (!bodyDisjunction->terms[beginTerm].alternative.next) + bodyDisjunction->terms.remove(beginTerm); + else { + while (bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); + bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); + bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + } + + void closeBodyAlternative() + { + int beginTerm = 0; + int origBeginTerm = 0; + ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); + int endIndex = bodyDisjunction->terms.size(); + + unsigned frameLocation = bodyDisjunction->terms[beginTerm].frameLocation; + + while (bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); + bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); + bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + + void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = bodyDisjunction->terms.size(); + + bool isAssertion = bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; + bool invertOrCapture = bodyDisjunction->terms[beginTerm].invertOrCapture; + unsigned subpatternId = bodyDisjunction->terms[beginTerm].atom.subpatternId; + + bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); + bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + if (doInline) { + bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } else { + ByteTerm& parenthesesBegin = bodyDisjunction->terms[beginTerm]; + ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + bool invertOrCapture = parenthesesBegin.invertOrCapture; + unsigned subpatternId = parenthesesBegin.atom.subpatternId; + + unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; + ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + + parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); + for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + parenthesesDisjunction->terms.append(bodyDisjunction->terms[termInParentheses]); + parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + + bodyDisjunction->terms.shrink(beginTerm); + + m_allParenthesesInfo.append(parenthesesDisjunction); + bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); + + bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + } + + void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) + { + bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); + bodyDisjunction->terms[0].frameLocation = 0; + currentAlternativeIndex = 0; + } + + void regexEnd() + { + closeBodyAlternative(); + } + + void alterantiveBodyDisjunction() + { + int newAlternativeIndex = bodyDisjunction->terms.size(); + bodyDisjunction->terms[currentAlternativeIndex].alternative.next = newAlternativeIndex - currentAlternativeIndex; + bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); + + currentAlternativeIndex = newAlternativeIndex; + } + + void alterantiveDisjunction() + { + int newAlternativeIndex = bodyDisjunction->terms.size(); + bodyDisjunction->terms[currentAlternativeIndex].alternative.next = newAlternativeIndex - currentAlternativeIndex; + bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); + + currentAlternativeIndex = newAlternativeIndex; + } + + void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; + + if (alt) { + if (disjunction == m_pattern.m_body) + alterantiveBodyDisjunction(); + else + alterantiveDisjunction(); + } + + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + unsigned minimumSize = alternative->m_minimumSize; + + ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); + unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; + if (countToCheck) + checkInput(countToCheck); + currentCountAlreadyChecked += countToCheck; + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + assertionBOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionEOL: + assertionEOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionWordBoundary: + assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypePatternCharacter: + atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeCharacterClass: + atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeBackReference: + atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: { + unsigned disjunctionAlreadyCheckedCount = 0; + if ((term.quantityCount == 1) && !term.parentheses.isCopy) { + if (term.quantityType == QuantifierFixedCount) { + disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); + atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + } else { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); + atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + } + } else { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); + atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + } + break; + } + + case PatternTerm::TypeParentheticalAssertion: { + unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion; + + atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); + atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); + break; + } + } + } + } + } + +private: + RegexPattern& m_pattern; + ByteDisjunction* bodyDisjunction; + unsigned currentAlternativeIndex; + Vector<ParenthesesStackEntry> m_parenthesesStack; + Vector<ByteDisjunction*> m_allParenthesesInfo; +}; + + +BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) +{ + RegexPattern pattern(ignoreCase, multiline); + + if ((error = compileRegex(patternString, pattern))) + return 0; + + numSubpatterns = pattern.m_numSubpatterns; + + return ByteCompiler(pattern).compile(); +} + +int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) +{ + return Interpreter(regex, output, input, start, length).interpret(); +} + + +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); + + +} } + +#endif diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.h b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.h new file mode 100644 index 0000000..a8c122a --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.h @@ -0,0 +1,337 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexInterpreter_h +#define RegexInterpreter_h + +#include <wtf/Platform.h> + +#if ENABLE(YARR) + +#include <wtf/unicode/Unicode.h> +#include "RegexParser.h" +#include "RegexPattern.h" + +namespace JSC { namespace Yarr { + +class ByteDisjunction; + +struct ByteTerm { + enum Type { + TypeBodyAlternativeBegin, + TypeBodyAlternativeDisjunction, + TypeBodyAlternativeEnd, + TypeAlternativeBegin, + TypeAlternativeDisjunction, + TypeAlternativeEnd, + TypeSubpatternBegin, + TypeSubpatternEnd, + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacterOnce, + TypePatternCharacterFixed, + TypePatternCharacterGreedy, + TypePatternCharacterNonGreedy, + TypePatternCasedCharacterOnce, + TypePatternCasedCharacterFixed, + TypePatternCasedCharacterGreedy, + TypePatternCasedCharacterNonGreedy, + TypeCharacterClass, + TypeBackReference, + TypeParenthesesSubpattern, + TypeParenthesesSubpatternOnceBegin, + TypeParenthesesSubpatternOnceEnd, + TypeParentheticalAssertionBegin, + TypeParentheticalAssertionEnd, + TypeCheckInput, + } type; + bool invertOrCapture; + union { + struct { + union { + UChar patternCharacter; + struct { + UChar lo; + UChar hi; + } casedCharacter; + CharacterClass* characterClass; + unsigned subpatternId; + }; + union { + ByteDisjunction* parenthesesDisjunction; + unsigned parenthesesWidth; + }; + QuantifierType quantityType; + unsigned quantityCount; + } atom; + struct { + int next; + int end; + } alternative; + unsigned checkInputCount; + }; + unsigned frameLocation; + int inputPosition; + + ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCharacterNonGreedy; + break; + } + + atom.patternCharacter = ch; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount; + inputPosition = inputPos; + } + + ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCasedCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCasedCharacterNonGreedy; + break; + } + + atom.casedCharacter.lo = lo; + atom.casedCharacter.hi = hi; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount; + inputPosition = inputPos; + } + + ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) + : type(ByteTerm::TypeCharacterClass) + , invertOrCapture(invert) + { + atom.characterClass = characterClass; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos) + : type(type) + , invertOrCapture(invertOrCapture) + { + atom.subpatternId = subpatternId; + atom.parenthesesDisjunction = parenthesesInfo; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, bool invert = false) + : type(type) + , invertOrCapture(invert) + { + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + } + + ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos) + : type(type) + , invertOrCapture(invertOrCapture) + { + atom.subpatternId = subpatternId; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + static ByteTerm BOL(int inputPos) + { + ByteTerm term(TypeAssertionBOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm CheckInput(unsigned count) + { + ByteTerm term(TypeCheckInput); + term.checkInputCount = count; + return term; + } + + static ByteTerm EOL(int inputPos) + { + ByteTerm term(TypeAssertionEOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm WordBoundary(bool invert, int inputPos) + { + ByteTerm term(TypeAssertionWordBoundary, invert); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm BackReference(unsigned subpatternId, int inputPos) + { + return ByteTerm(TypeBackReference, subpatternId, false, inputPos); + } + + static ByteTerm BodyAlternativeBegin() + { + ByteTerm term(TypeBodyAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + return term; + } + + static ByteTerm BodyAlternativeDisjunction() + { + ByteTerm term(TypeBodyAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + return term; + } + + static ByteTerm BodyAlternativeEnd() + { + ByteTerm term(TypeBodyAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + return term; + } + + static ByteTerm AlternativeBegin() + { + ByteTerm term(TypeAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + return term; + } + + static ByteTerm AlternativeDisjunction() + { + ByteTerm term(TypeAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + return term; + } + + static ByteTerm AlternativeEnd() + { + ByteTerm term(TypeAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + return term; + } + + static ByteTerm SubpatternBegin() + { + return ByteTerm(TypeSubpatternBegin); + } + + static ByteTerm SubpatternEnd() + { + return ByteTerm(TypeSubpatternEnd); + } + + bool invert() + { + return invertOrCapture; + } + + bool capture() + { + return invertOrCapture; + } +}; + +class ByteDisjunction { +public: + ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) + : m_numSubpatterns(numSubpatterns) + , m_frameSize(frameSize) + { + } + + Vector<ByteTerm> terms; + unsigned m_numSubpatterns; + unsigned m_frameSize; +}; + +struct BytecodePattern { + BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern) + : m_body(body) + , m_ignoreCase(pattern.m_ignoreCase) + , m_multiline(pattern.m_multiline) + { + newlineCharacterClass = pattern.newlineCharacterClass(); + wordcharCharacterClass = pattern.wordcharCharacterClass(); + + m_allParenthesesInfo.append(allParenthesesInfo); + m_userCharacterClasses.append(pattern.m_userCharacterClasses); + // 'Steal' the RegexPattern's CharacterClasses! We clear its + // array, so that it won't delete them on destruction. We'll + // take responsibility for that. + pattern.m_userCharacterClasses.clear(); + } + + ~BytecodePattern() + { + deleteAllValues(m_allParenthesesInfo); + deleteAllValues(m_userCharacterClasses); + } + + OwnPtr<ByteDisjunction> m_body; + bool m_ignoreCase; + bool m_multiline; + + CharacterClass* newlineCharacterClass; + CharacterClass* wordcharCharacterClass; +private: + Vector<ByteDisjunction*> m_allParenthesesInfo; + Vector<CharacterClass*> m_userCharacterClasses; +}; + +BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); +int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); + +} } // namespace JSC::Yarr + +#endif + +#endif // RegexInterpreter_h diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexJIT.cpp b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexJIT.cpp new file mode 100644 index 0000000..663a524 --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexJIT.cpp @@ -0,0 +1,1418 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "RegexJIT.h" + +#include "ASCIICType.h" +#include "JSGlobalData.h" +#include "LinkBuffer.h" +#include "MacroAssembler.h" +#include "RegexCompiler.h" + +#include "pcre.h" // temporary, remove when fallback is removed. + +#if ENABLE(YARR_JIT) + +using namespace WTF; + +namespace JSC { namespace Yarr { + + +class RegexGenerator : private MacroAssembler { + friend void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); + +#if PLATFORM(ARM) + static const RegisterID input = ARM::r0; + static const RegisterID index = ARM::r1; + static const RegisterID length = ARM::r2; + static const RegisterID output = ARM::r4; + + static const RegisterID regT0 = ARM::r5; + static const RegisterID regT1 = ARM::r6; + + static const RegisterID returnRegister = ARM::r0; +#elif PLATFORM(X86) + static const RegisterID input = X86::eax; + static const RegisterID index = X86::edx; + static const RegisterID length = X86::ecx; + static const RegisterID output = X86::edi; + + static const RegisterID regT0 = X86::ebx; + static const RegisterID regT1 = X86::esi; + + static const RegisterID returnRegister = X86::eax; +#elif PLATFORM(X86_64) + static const RegisterID input = X86::edi; + static const RegisterID index = X86::esi; + static const RegisterID length = X86::edx; + static const RegisterID output = X86::ecx; + + static const RegisterID regT0 = X86::eax; + static const RegisterID regT1 = X86::ebx; + + static const RegisterID returnRegister = X86::eax; +#endif + + void optimizeAlternative(PatternAlternative* alternative) + { + if (!alternative->m_terms.size()) + return; + + for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) { + PatternTerm& term = alternative->m_terms[i]; + PatternTerm& nextTerm = alternative->m_terms[i + 1]; + + if ((term.type == PatternTerm::TypeCharacterClass) + && (term.quantityType == QuantifierFixedCount) + && (nextTerm.type == PatternTerm::TypePatternCharacter) + && (nextTerm.quantityType == QuantifierFixedCount)) { + PatternTerm termCopy = term; + alternative->m_terms[i] = nextTerm; + alternative->m_terms[i + 1] = termCopy; + } + } + } + + void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) + { + do { + // pick which range we're going to generate + int which = count >> 1; + char lo = ranges[which].begin; + char hi = ranges[which].end; + + // check if there are any ranges or matches below lo. If not, just jl to failure - + // if there is anything else to check, check that first, if it falls through jmp to failure. + if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + // generate code for all ranges before this one + if (which) + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex]))); + ++*matchIndex; + } + failures.append(jump()); + + loOrAbove.link(this); + } else if (which) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + failures.append(jump()); + + loOrAbove.link(this); + } else + failures.append(branch32(LessThan, character, Imm32((unsigned short)lo))); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) + ++*matchIndex; + + matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi))); + // fall through to here, the value is above hi. + + // shuffle along & loop around if there are any more matches to handle. + unsigned next = which + 1; + ranges += next; + count -= next; + } while (count); + } + + void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) + { + Jump unicodeFail; + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { + Jump isAscii = branch32(LessThanOrEqual, character, Imm32(0x7f)); + + if (charClass->m_matchesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { + UChar ch = charClass->m_matchesUnicode[i]; + matchDest.append(branch32(Equal, character, Imm32(ch))); + } + } + + if (charClass->m_rangesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { + UChar lo = charClass->m_rangesUnicode[i].begin; + UChar hi = charClass->m_rangesUnicode[i].end; + + Jump below = branch32(LessThan, character, Imm32(lo)); + matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); + below.link(this); + } + } + + unicodeFail = jump(); + isAscii.link(this); + } + + if (charClass->m_ranges.size()) { + unsigned matchIndex = 0; + JumpList failures; + matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size()); + while (matchIndex < charClass->m_matches.size()) + matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); + + failures.link(this); + } else if (charClass->m_matches.size()) { + // optimization: gather 'a','A' etc back together, can mask & test once. + Vector<char> matchesAZaz; + + for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { + char ch = charClass->m_matches[i]; + if (m_pattern.m_ignoreCase) { + if (isASCIILower(ch)) { + matchesAZaz.append(ch); + continue; + } + if (isASCIIUpper(ch)) + continue; + } + matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); + } + + if (unsigned countAZaz = matchesAZaz.size()) { + or32(Imm32(32), character); + for (unsigned i = 0; i < countAZaz; ++i) + matchDest.append(branch32(Equal, character, Imm32(matchesAZaz[i]))); + } + } + + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) + unicodeFail.link(this); + } + + // Jumps if input not available; will have (incorrectly) incremented already! + Jump jumpIfNoAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(Above, index, length); + } + + Jump jumpIfAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(BelowOrEqual, index, length); + } + + Jump checkInput() + { + return branch32(BelowOrEqual, index, length); + } + + Jump atEndOfInput() + { + return branch32(Equal, index, length); + } + + Jump notAtEndOfInput() + { + return branch32(NotEqual, index, length); + } + + Jump jumpIfCharEquals(UChar ch, int inputPosition) + { + return branch16(Equal, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); + } + + Jump jumpIfCharNotEquals(UChar ch, int inputPosition) + { + return branch16(NotEqual, BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), Imm32(ch)); + } + + void readCharacter(int inputPosition, RegisterID reg) + { + load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); + } + + void storeToFrame(RegisterID reg, unsigned frameLocation) + { + poke(reg, frameLocation); + } + + void storeToFrame(Imm32 imm, unsigned frameLocation) + { + poke(imm, frameLocation); + } + + DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) + { + return storePtrWithPatch(ImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + void loadFromFrame(unsigned frameLocation, RegisterID reg) + { + peek(reg, frameLocation); + } + + void loadFromFrameAndJump(unsigned frameLocation) + { + jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + struct AlternativeBacktrackRecord { + DataLabelPtr dataLabel; + Label backtrackLocation; + + AlternativeBacktrackRecord(DataLabelPtr dataLabel, Label backtrackLocation) + : dataLabel(dataLabel) + , backtrackLocation(backtrackLocation) + { + } + }; + + struct TermGenerationState { + TermGenerationState(PatternDisjunction* disjunction, unsigned checkedTotal) + : disjunction(disjunction) + , checkedTotal(checkedTotal) + { + } + + void resetAlternative() + { + isBackTrackGenerated = false; + alt = 0; + } + bool alternativeValid() + { + return alt < disjunction->m_alternatives.size(); + } + void nextAlternative() + { + ++alt; + } + PatternAlternative* alternative() + { + return disjunction->m_alternatives[alt]; + } + + void resetTerm() + { + ASSERT(alternativeValid()); + t = 0; + } + bool termValid() + { + ASSERT(alternativeValid()); + return t < alternative()->m_terms.size(); + } + void nextTerm() + { + ASSERT(alternativeValid()); + ++t; + } + PatternTerm& term() + { + ASSERT(alternativeValid()); + return alternative()->m_terms[t]; + } + + PatternTerm& lookaheadTerm() + { + ASSERT(alternativeValid()); + ASSERT((t + 1) < alternative()->m_terms.size()); + return alternative()->m_terms[t + 1]; + } + bool isSinglePatternCharacterLookaheadTerm() + { + ASSERT(alternativeValid()); + return ((t + 1) < alternative()->m_terms.size()) + && (lookaheadTerm().type == PatternTerm::TypePatternCharacter) + && (lookaheadTerm().quantityType == QuantifierFixedCount) + && (lookaheadTerm().quantityCount == 1); + } + + int inputOffset() + { + return term().inputPosition - checkedTotal; + } + + void jumpToBacktrack(Jump jump, MacroAssembler* masm) + { + if (isBackTrackGenerated) + jump.linkTo(backtrackLabel, masm); + else + backTrackJumps.append(jump); + } + void jumpToBacktrack(JumpList& jumps, MacroAssembler* masm) + { + if (isBackTrackGenerated) + jumps.linkTo(backtrackLabel, masm); + else + backTrackJumps.append(jumps); + } + bool plantJumpToBacktrackIfExists(MacroAssembler* masm) + { + if (isBackTrackGenerated) { + masm->jump(backtrackLabel); + return true; + } + return false; + } + void addBacktrackJump(Jump jump) + { + backTrackJumps.append(jump); + } + void setBacktrackGenerated(Label label) + { + isBackTrackGenerated = true; + backtrackLabel = label; + } + void linkAlternativeBacktracks(MacroAssembler* masm) + { + isBackTrackGenerated = false; + backTrackJumps.link(masm); + } + void linkAlternativeBacktracksTo(Label label, MacroAssembler* masm) + { + isBackTrackGenerated = false; + backTrackJumps.linkTo(label, masm); + } + void propagateBacktrackingFrom(TermGenerationState& nestedParenthesesState, MacroAssembler* masm) + { + jumpToBacktrack(nestedParenthesesState.backTrackJumps, masm); + if (nestedParenthesesState.isBackTrackGenerated) + setBacktrackGenerated(nestedParenthesesState.backtrackLabel); + } + + PatternDisjunction* disjunction; + int checkedTotal; + private: + unsigned alt; + unsigned t; + JumpList backTrackJumps; + Label backtrackLabel; + bool isBackTrackGenerated; + }; + + void generateAssertionBOL(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (!term.inputPosition) + matchDest.append(branch32(Equal, index, Imm32(state.checkedTotal))); + + readCharacter(state.inputOffset() - 1, character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + state.jumpToBacktrack(jump(), this); + + matchDest.link(this); + } else { + // Erk, really should poison out these alternatives early. :-/ + if (term.inputPosition) + state.jumpToBacktrack(jump(), this); + else + state.jumpToBacktrack(branch32(NotEqual, index, Imm32(state.checkedTotal)), this); + } + } + + void generateAssertionEOL(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (term.inputPosition == state.checkedTotal) + matchDest.append(atEndOfInput()); + + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + state.jumpToBacktrack(jump(), this); + + matchDest.link(this); + } else { + if (term.inputPosition == state.checkedTotal) + state.jumpToBacktrack(notAtEndOfInput(), this); + // Erk, really should poison out these alternatives early. :-/ + else + state.jumpToBacktrack(jump(), this); + } + } + + // Also falls though on nextIsNotWordChar. + void matchAssertionWordchar(TermGenerationState& state, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + if (term.inputPosition == state.checkedTotal) + nextIsNotWordChar.append(atEndOfInput()); + + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); + } + + void generateAssertionWordBoundary(TermGenerationState& state) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + Jump atBegin; + JumpList matchDest; + if (!term.inputPosition) + atBegin = branch32(Equal, index, Imm32(state.checkedTotal)); + readCharacter(state.inputOffset() - 1, character); + matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); + if (!term.inputPosition) + atBegin.link(this); + + // We fall through to here if the last character was not a wordchar. + JumpList nonWordCharThenWordChar; + JumpList nonWordCharThenNonWordChar; + if (term.invertOrCapture) { + matchAssertionWordchar(state, nonWordCharThenNonWordChar, nonWordCharThenWordChar); + nonWordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(state, nonWordCharThenWordChar, nonWordCharThenNonWordChar); + nonWordCharThenNonWordChar.append(jump()); + } + state.jumpToBacktrack(nonWordCharThenNonWordChar, this); + + // We jump here if the last character was a wordchar. + matchDest.link(this); + JumpList wordCharThenWordChar; + JumpList wordCharThenNonWordChar; + if (term.invertOrCapture) { + matchAssertionWordchar(state, wordCharThenNonWordChar, wordCharThenWordChar); + wordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(state, wordCharThenWordChar, wordCharThenNonWordChar); + // This can fall-though! + } + + state.jumpToBacktrack(wordCharThenWordChar, this); + + nonWordCharThenWordChar.link(this); + wordCharThenNonWordChar.link(this); + } + + void generatePatternCharacterSingle(TermGenerationState& state) + { + const RegisterID character = regT0; + UChar ch = state.term().patternCharacter; + + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + state.jumpToBacktrack(branch32(NotEqual, character, Imm32(Unicode::toLower(ch))), this); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + state.jumpToBacktrack(jumpIfCharNotEquals(ch, state.inputOffset()), this); + } + } + + void generatePatternCharacterPair(TermGenerationState& state) + { + const RegisterID character = regT0; + UChar ch1 = state.term().patternCharacter; + UChar ch2 = state.lookaheadTerm().patternCharacter; + + int mask = 0; + int chPair = ch1 | (ch2 << 16); + + if (m_pattern.m_ignoreCase) { + if (isASCIIAlpha(ch1)) + mask |= 32; + if (isASCIIAlpha(ch2)) + mask |= 32 << 16; + } + + if (mask) { + load32(BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), character); + or32(Imm32(mask), character); + state.jumpToBacktrack(branch32(NotEqual, character, Imm32(chPair | mask)), this); + } else + state.jumpToBacktrack(branch32(NotEqual, BaseIndex(input, index, TimesTwo, state.inputOffset() * sizeof(UChar)), Imm32(chPair)), this); + } + + void generatePatternCharacterFixed(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(index, countRegister); + sub32(Imm32(term.quantityCount), countRegister); + + Label loop(this); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); + or32(Imm32(32), character); + state.jumpToBacktrack(branch32(NotEqual, character, Imm32(Unicode::toLower(ch))), this); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + state.jumpToBacktrack(branch16(NotEqual, BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), Imm32(ch)), this); + } + add32(Imm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + + void generatePatternCharacterGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(Imm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + failures.append(branch32(NotEqual, character, Imm32(Unicode::toLower(ch)))); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + failures.append(jumpIfCharNotEquals(ch, state.inputOffset())); + } + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + state.jumpToBacktrack(branchTest32(Zero, countRegister), this); + sub32(Imm32(1), countRegister); + sub32(Imm32(1), index); + + failures.link(this); + + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackGenerated(backtrackBegin); + } + + void generatePatternCharacterNonGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + UChar ch = term.patternCharacter; + + move(Imm32(0), countRegister); + + Jump firstTimeDoNothing = jump(); + + Label hardFail(this); + sub32(countRegister, index); + state.jumpToBacktrack(jump(), this); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + + atEndOfInput().linkTo(hardFail, this); + branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + readCharacter(state.inputOffset(), character); + or32(Imm32(32), character); + branch32(NotEqual, character, Imm32(Unicode::toLower(ch))).linkTo(hardFail, this); + } else { + ASSERT(!m_pattern.m_ignoreCase || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + jumpIfCharNotEquals(ch, state.inputOffset()).linkTo(hardFail, this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + + firstTimeDoNothing.link(this); + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackGenerated(backtrackBegin); + } + + void generateCharacterClassSingle(TermGenerationState& state) + { + const RegisterID character = regT0; + PatternTerm& term = state.term(); + + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invertOrCapture) + state.jumpToBacktrack(matchDest, this); + else { + state.jumpToBacktrack(jump(), this); + matchDest.link(this); + } + } + + void generateCharacterClassFixed(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(index, countRegister); + sub32(Imm32(term.quantityCount), countRegister); + + Label loop(this); + JumpList matchDest; + load16(BaseIndex(input, countRegister, TimesTwo, (state.inputOffset() + term.quantityCount) * sizeof(UChar)), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invertOrCapture) + state.jumpToBacktrack(matchDest, this); + else { + state.jumpToBacktrack(jump(), this); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + + void generateCharacterClassGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(Imm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + + if (term.invertOrCapture) { + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, failures, term.characterClass); + } else { + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + failures.append(jump()); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + branch32(NotEqual, countRegister, Imm32(term.quantityCount)).linkTo(loop, this); + failures.append(jump()); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + state.jumpToBacktrack(branchTest32(Zero, countRegister), this); + sub32(Imm32(1), countRegister); + sub32(Imm32(1), index); + + failures.link(this); + + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackGenerated(backtrackBegin); + } + + void generateCharacterClassNonGreedy(TermGenerationState& state) + { + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + PatternTerm& term = state.term(); + + move(Imm32(0), countRegister); + + Jump firstTimeDoNothing = jump(); + + Label hardFail(this); + sub32(countRegister, index); + state.jumpToBacktrack(jump(), this); + + Label backtrackBegin(this); + loadFromFrame(term.frameLocation, countRegister); + + atEndOfInput().linkTo(hardFail, this); + branch32(Equal, countRegister, Imm32(term.quantityCount), hardFail); + + JumpList matchDest; + readCharacter(state.inputOffset(), character); + matchCharacterClass(character, matchDest, term.characterClass); + + if (term.invertOrCapture) + matchDest.linkTo(hardFail, this); + else { + jump(hardFail); + matchDest.link(this); + } + + add32(Imm32(1), countRegister); + add32(Imm32(1), index); + + firstTimeDoNothing.link(this); + storeToFrame(countRegister, term.frameLocation); + + state.setBacktrackGenerated(backtrackBegin); + } + + void generateParenthesesDisjunction(PatternTerm& parenthesesTerm, TermGenerationState& state, unsigned alternativeFrameLocation) + { + ASSERT((parenthesesTerm.type == PatternTerm::TypeParenthesesSubpattern) || (parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion)); + ASSERT(parenthesesTerm.quantityCount == 1); + + PatternDisjunction* disjunction = parenthesesTerm.parentheses.disjunction; + unsigned preCheckedCount = ((parenthesesTerm.quantityType == QuantifierFixedCount) && (parenthesesTerm.type != PatternTerm::TypeParentheticalAssertion)) ? disjunction->m_minimumSize : 0; + + if (disjunction->m_alternatives.size() == 1) { + state.resetAlternative(); + ASSERT(state.alternativeValid()); + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + int countToCheck = alternative->m_minimumSize - preCheckedCount; + if (countToCheck) { + ASSERT((parenthesesTerm.type == PatternTerm::TypeParentheticalAssertion) || (parenthesesTerm.quantityType != QuantifierFixedCount)); + + // FIXME: This is quite horrible. The call to 'plantJumpToBacktrackIfExists' + // will be forced to always trampoline into here, just to decrement the index. + // Ick. + Jump skip = jump(); + + Label backtrackBegin(this); + sub32(Imm32(countToCheck), index); + state.addBacktrackJump(jump()); + + skip.link(this); + + state.setBacktrackGenerated(backtrackBegin); + + state.jumpToBacktrack(jumpIfNoAvailableInput(countToCheck), this); + state.checkedTotal += countToCheck; + } + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + state.checkedTotal -= countToCheck; + } else { + JumpList successes; + + for (state.resetAlternative(); state.alternativeValid(); state.nextAlternative()) { + + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + ASSERT(alternative->m_minimumSize >= preCheckedCount); + int countToCheck = alternative->m_minimumSize - preCheckedCount; + if (countToCheck) { + state.addBacktrackJump(jumpIfNoAvailableInput(countToCheck)); + state.checkedTotal += countToCheck; + } + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + // Matched an alternative. + DataLabelPtr dataLabel = storeToFrameWithPatch(alternativeFrameLocation); + successes.append(jump()); + + // Alternative did not match. + Label backtrackLocation(this); + + // Can we backtrack the alternative? - if so, do so. If not, just fall through to the next one. + state.plantJumpToBacktrackIfExists(this); + + state.linkAlternativeBacktracks(this); + + if (countToCheck) { + sub32(Imm32(countToCheck), index); + state.checkedTotal -= countToCheck; + } + + m_backtrackRecords.append(AlternativeBacktrackRecord(dataLabel, backtrackLocation)); + } + // We fall through to here when the last alternative fails. + // Add a backtrack out of here for the parenthese handling code to link up. + state.addBacktrackJump(jump()); + + // Generate a trampoline for the parens code to backtrack to, to retry the + // next alternative. + state.setBacktrackGenerated(label()); + loadFromFrameAndJump(alternativeFrameLocation); + + // FIXME: both of the above hooks are a little inefficient, in that you + // may end up trampolining here, just to trampoline back out to the + // parentheses code, or vice versa. We can probably eliminate a jump + // by restructuring, but coding this way for now for simplicity during + // development. + + successes.link(this); + } + } + + void generateParenthesesSingle(TermGenerationState& state) + { + const RegisterID indexTemporary = regT0; + PatternTerm& term = state.term(); + PatternDisjunction* disjunction = term.parentheses.disjunction; + ASSERT(term.quantityCount == 1); + + unsigned preCheckedCount = ((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)) ? disjunction->m_minimumSize : 0; + + unsigned parenthesesFrameLocation = term.frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term.quantityType != QuantifierFixedCount) + alternativeFrameLocation += RegexStackSpaceForBackTrackInfoParenthesesOnce; + + // optimized case - no capture & no quantifier can be handled in a light-weight manner. + if (!term.invertOrCapture && (term.quantityType == QuantifierFixedCount)) { + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // this expects that any backtracks back out of the parentheses will be in the + // parenthesesState's backTrackJumps vector, and that if they need backtracking + // they will have set an entry point on the parenthesesState's backtrackLabel. + state.propagateBacktrackingFrom(parenthesesState, this); + } else { + Jump nonGreedySkipParentheses; + Label nonGreedyTryParentheses; + if (term.quantityType == QuantifierGreedy) + storeToFrame(Imm32(1), parenthesesFrameLocation); + else if (term.quantityType == QuantifierNonGreedy) { + storeToFrame(Imm32(0), parenthesesFrameLocation); + nonGreedySkipParentheses = jump(); + nonGreedyTryParentheses = label(); + storeToFrame(Imm32(1), parenthesesFrameLocation); + } + + // store the match start index + if (term.invertOrCapture) { + int inputOffset = state.inputOffset() - preCheckedCount; + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(inputOffset), indexTemporary); + store32(indexTemporary, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); + } else + store32(index, Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); + } + + // generate the body of the parentheses + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + + // store the match end index + if (term.invertOrCapture) { + int inputOffset = state.inputOffset(); + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(state.inputOffset()), indexTemporary); + store32(indexTemporary, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } else + store32(index, Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } + Jump success = jump(); + + // A failure AFTER the parens jumps here + Label backtrackFromAfterParens(this); + + if (term.quantityType == QuantifierGreedy) { + // If this is zero we have now tested with both with and without the parens. + loadFromFrame(parenthesesFrameLocation, indexTemporary); + state.jumpToBacktrack(branchTest32(Zero, indexTemporary), this); + } else if (term.quantityType == QuantifierNonGreedy) { + // If this is zero we have now tested with both with and without the parens. + loadFromFrame(parenthesesFrameLocation, indexTemporary); + branchTest32(Zero, indexTemporary).linkTo(nonGreedyTryParentheses, this); + } + + parenthesesState.plantJumpToBacktrackIfExists(this); + // A failure WITHIN the parens jumps here + parenthesesState.linkAlternativeBacktracks(this); + if (term.invertOrCapture) { + store32(Imm32(-1), Address(output, (term.parentheses.subpatternId << 1) * sizeof(int))); + store32(Imm32(-1), Address(output, ((term.parentheses.subpatternId << 1) + 1) * sizeof(int))); + } + + if (term.quantityType == QuantifierGreedy) + storeToFrame(Imm32(0), parenthesesFrameLocation); + else + state.jumpToBacktrack(jump(), this); + + state.setBacktrackGenerated(backtrackFromAfterParens); + if (term.quantityType == QuantifierNonGreedy) + nonGreedySkipParentheses.link(this); + success.link(this); + } + } + + void generateParentheticalAssertion(TermGenerationState& state) + { + PatternTerm& term = state.term(); + PatternDisjunction* disjunction = term.parentheses.disjunction; + ASSERT(term.quantityCount == 1); + ASSERT(term.quantityType == QuantifierFixedCount); + + unsigned parenthesesFrameLocation = term.frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; + + int countCheckedAfterAssertion = state.checkedTotal - term.inputPosition; + + if (term.invertOrCapture) { + // Inverted case + storeToFrame(index, parenthesesFrameLocation); + + state.checkedTotal -= countCheckedAfterAssertion; + if (countCheckedAfterAssertion) + sub32(Imm32(countCheckedAfterAssertion), index); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // Success! - which means - Fail! + loadFromFrame(parenthesesFrameLocation, index); + state.jumpToBacktrack(jump(), this); + + // And fail means success. + parenthesesState.linkAlternativeBacktracks(this); + loadFromFrame(parenthesesFrameLocation, index); + + state.checkedTotal += countCheckedAfterAssertion; + } else { + // Normal case + storeToFrame(index, parenthesesFrameLocation); + + state.checkedTotal -= countCheckedAfterAssertion; + if (countCheckedAfterAssertion) + sub32(Imm32(countCheckedAfterAssertion), index); + + TermGenerationState parenthesesState(disjunction, state.checkedTotal); + generateParenthesesDisjunction(state.term(), parenthesesState, alternativeFrameLocation); + // Success! - which means - Success! + loadFromFrame(parenthesesFrameLocation, index); + Jump success = jump(); + + parenthesesState.linkAlternativeBacktracks(this); + loadFromFrame(parenthesesFrameLocation, index); + state.jumpToBacktrack(jump(), this); + + success.link(this); + + state.checkedTotal += countCheckedAfterAssertion; + } + } + + void generateTerm(TermGenerationState& state) + { + PatternTerm& term = state.term(); + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + generateAssertionBOL(state); + break; + + case PatternTerm::TypeAssertionEOL: + generateAssertionEOL(state); + break; + + case PatternTerm::TypeAssertionWordBoundary: + generateAssertionWordBoundary(state); + break; + + case PatternTerm::TypePatternCharacter: + switch (term.quantityType) { + case QuantifierFixedCount: + if (term.quantityCount == 1) { + if (state.isSinglePatternCharacterLookaheadTerm() && (state.lookaheadTerm().inputPosition == (term.inputPosition + 1))) { + generatePatternCharacterPair(state); + state.nextTerm(); + } else + generatePatternCharacterSingle(state); + } else + generatePatternCharacterFixed(state); + break; + case QuantifierGreedy: + generatePatternCharacterGreedy(state); + break; + case QuantifierNonGreedy: + generatePatternCharacterNonGreedy(state); + break; + } + break; + + case PatternTerm::TypeCharacterClass: + switch (term.quantityType) { + case QuantifierFixedCount: + if (term.quantityCount == 1) + generateCharacterClassSingle(state); + else + generateCharacterClassFixed(state); + break; + case QuantifierGreedy: + generateCharacterClassGreedy(state); + break; + case QuantifierNonGreedy: + generateCharacterClassNonGreedy(state); + break; + } + break; + + case PatternTerm::TypeBackReference: + m_generationFailed = true; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: + if ((term.quantityCount == 1) && !term.parentheses.isCopy) + generateParenthesesSingle(state); + else + m_generationFailed = true; + break; + + case PatternTerm::TypeParentheticalAssertion: + generateParentheticalAssertion(state); + break; + } + } + + void generateDisjunction(PatternDisjunction* disjunction) + { + TermGenerationState state(disjunction, 0); + state.resetAlternative(); + + // Plant a check to see if there is sufficient input available to run the first alternative. + // Jumping back to the label 'firstAlternative' will get to this check, jumping to + // 'firstAlternativeInputChecked' will jump directly to matching the alternative having + // skipped this check. + + Label firstAlternative(this); + + // check availability for the next alternative + int countCheckedForCurrentAlternative = 0; + int countToCheckForFirstAlternative = 0; + bool hasShorterAlternatives = false; + JumpList notEnoughInputForPreviousAlternative; + + if (state.alternativeValid()) { + PatternAlternative* alternative = state.alternative(); + countToCheckForFirstAlternative = alternative->m_minimumSize; + state.checkedTotal += countToCheckForFirstAlternative; + if (countToCheckForFirstAlternative) + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForFirstAlternative)); + countCheckedForCurrentAlternative = countToCheckForFirstAlternative; + } + + Label firstAlternativeInputChecked(this); + + while (state.alternativeValid()) { + // Track whether any alternatives are shorter than the first one. + hasShorterAlternatives = hasShorterAlternatives || (countCheckedForCurrentAlternative < countToCheckForFirstAlternative); + + PatternAlternative* alternative = state.alternative(); + optimizeAlternative(alternative); + + for (state.resetTerm(); state.termValid(); state.nextTerm()) + generateTerm(state); + + // If we get here, the alternative matched. + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + ASSERT(index != returnRegister); + if (m_pattern.m_body->m_hasFixedSize) { + move(index, returnRegister); + if (alternative->m_minimumSize) + sub32(Imm32(alternative->m_minimumSize), returnRegister); + } else + pop(returnRegister); + store32(index, Address(output, 4)); + store32(returnRegister, output); + + generateReturn(); + + state.nextAlternative(); + + // if there are any more alternatives, plant the check for input before looping. + if (state.alternativeValid()) { + PatternAlternative* nextAlternative = state.alternative(); + int countToCheckForNextAlternative = nextAlternative->m_minimumSize; + + if (countCheckedForCurrentAlternative > countToCheckForNextAlternative) { // CASE 1: current alternative was longer than the next one. + // If we get here, there the last input checked failed. + notEnoughInputForPreviousAlternative.link(this); + + // Check if sufficent input available to run the next alternative + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + // We are now in the correct state to enter the next alternative; this add is only required + // to mirror and revert operation of the sub32, just below. + add32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + + // If we get here, there the last input checked passed. + state.linkAlternativeBacktracks(this); + // No need to check if we can run the next alternative, since it is shorter - + // just update index. + sub32(Imm32(countCheckedForCurrentAlternative - countToCheckForNextAlternative), index); + } else if (countCheckedForCurrentAlternative < countToCheckForNextAlternative) { // CASE 2: next alternative is longer than the current one. + // If we get here, there the last input checked failed. + // If there is insufficient input to run the current alternative, and the next alternative is longer, + // then there is definitely not enough input to run it - don't even check. Just adjust index, as if + // we had checked. + notEnoughInputForPreviousAlternative.link(this); + add32(Imm32(countToCheckForNextAlternative - countCheckedForCurrentAlternative), index); + notEnoughInputForPreviousAlternative.append(jump()); + + // The next alternative is longer than the current one; check the difference. + state.linkAlternativeBacktracks(this); + notEnoughInputForPreviousAlternative.append(jumpIfNoAvailableInput(countToCheckForNextAlternative - countCheckedForCurrentAlternative)); + } else { // CASE 3: Both alternatives are the same length. + ASSERT(countCheckedForCurrentAlternative == countToCheckForNextAlternative); + + // If the next alterative is the same length as this one, then no need to check the input - + // if there was sufficent input to run the current alternative then there is sufficient + // input to run the next one; if not, there isn't. + state.linkAlternativeBacktracks(this); + } + + state.checkedTotal -= countCheckedForCurrentAlternative; + countCheckedForCurrentAlternative = countToCheckForNextAlternative; + state.checkedTotal += countCheckedForCurrentAlternative; + } + } + + // If we get here, all Alternatives failed... + + state.checkedTotal -= countCheckedForCurrentAlternative; + + // How much more input need there be to be able to retry from the first alternative? + // examples: + // /yarr_jit/ or /wrec|pcre/ + // In these examples we need check for one more input before looping. + // /yarr_jit|pcre/ + // In this case we need check for 5 more input to loop (+4 to allow for the first alterative + // being four longer than the last alternative checked, and another +1 to effectively move + // the start position along by one). + // /yarr|rules/ or /wrec|notsomuch/ + // In these examples, provided that there was sufficient input to have just been matching for + // the second alternative we can loop without checking for available input (since the second + // alternative is longer than the first). In the latter example we need to decrement index + // (by 4) so the start position is only progressed by 1 from the last iteration. + int incrementForNextIter = (countToCheckForFirstAlternative - countCheckedForCurrentAlternative) + 1; + + // First, deal with the cases where there was sufficient input to try the last alternative. + if (incrementForNextIter > 0) // We need to check for more input anyway, fall through to the checking below. + state.linkAlternativeBacktracks(this); + else if (m_pattern.m_body->m_hasFixedSize && !incrementForNextIter) // No need to update anything, link these backtracks straight to the to pof the loop! + state.linkAlternativeBacktracksTo(firstAlternativeInputChecked, this); + else { // no need to check the input, but we do have some bookkeeping to do first. + state.linkAlternativeBacktracks(this); + + // Where necessary update our preserved start position. + if (!m_pattern.m_body->m_hasFixedSize) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + poke(regT0, m_pattern.m_body->m_callFrameSize); + } + + // Update index if necessary, and loop (without checking). + if (incrementForNextIter) + add32(Imm32(incrementForNextIter), index); + jump().linkTo(firstAlternativeInputChecked, this); + } + + notEnoughInputForPreviousAlternative.link(this); + // Update our idea of the start position, if we're tracking this. + if (!m_pattern.m_body->m_hasFixedSize) { + if (countCheckedForCurrentAlternative - 1) { + move(index, regT0); + sub32(Imm32(countCheckedForCurrentAlternative - 1), regT0); + poke(regT0, m_pattern.m_body->m_callFrameSize); + } else + poke(index, m_pattern.m_body->m_callFrameSize); + } + // Check if there is sufficent input to run the first alternative again. + jumpIfAvailableInput(incrementForNextIter).linkTo(firstAlternativeInputChecked, this); + // No - insufficent input to run the first alteranative, are there any other alternatives we + // might need to check? If so, the last check will have left the index incremented by + // (countToCheckForFirstAlternative + 1), so we need test whether countToCheckForFirstAlternative + // LESS input is available, to have the effect of just progressing the start position by 1 + // from the last iteration. If this check passes we can just jump up to the check associated + // with the first alternative in the loop. This is a bit sad, since we'll end up trying the + // first alternative again, and this check will fail (otherwise the check planted just above + // here would have passed). This is a bit sad, however it saves trying to do something more + // complex here in compilation, and in the common case we should end up coallescing the checks. + // + // FIXME: a nice improvement here may be to stop trying to match sooner, based on the least + // of the minimum-alterantive-lengths. E.g. if I have two alternatives of length 200 and 150, + // and a string of length 100, we'll end up looping index from 0 to 100, checking whether there + // is sufficient input to run either alternative (constantly failing). If there had been only + // one alternative, or if the shorter alternative had come first, we would have terminated + // immediately. :-/ + if (hasShorterAlternatives) + jumpIfAvailableInput(-countToCheckForFirstAlternative).linkTo(firstAlternative, this); + // index will now be a bit garbled (depending on whether 'hasShorterAlternatives' is true, + // it has either been incremented by 1 or by (countToCheckForFirstAlternative + 1) ... + // but since we're about to return a failure this doesn't really matter!) + + unsigned frameSize = m_pattern.m_body->m_callFrameSize; + if (!m_pattern.m_body->m_hasFixedSize) + ++frameSize; + if (frameSize) + addPtr(Imm32(frameSize * sizeof(void*)), stackPointerRegister); + + move(Imm32(-1), returnRegister); + + generateReturn(); + } + + void generateEnter() + { +#if PLATFORM(X86_64) + push(X86::ebp); + move(stackPointerRegister, X86::ebp); + push(X86::ebx); +#elif PLATFORM(X86) + push(X86::ebp); + move(stackPointerRegister, X86::ebp); + // TODO: do we need spill registers to fill the output pointer if there are no sub captures? + push(X86::ebx); + push(X86::edi); + push(X86::esi); + // load output into edi (2 = saved ebp + return address). + #if COMPILER(MSVC) + loadPtr(Address(X86::ebp, 2 * sizeof(void*)), input); + loadPtr(Address(X86::ebp, 3 * sizeof(void*)), index); + loadPtr(Address(X86::ebp, 4 * sizeof(void*)), length); + loadPtr(Address(X86::ebp, 5 * sizeof(void*)), output); + #else + loadPtr(Address(X86::ebp, 2 * sizeof(void*)), output); + #endif +#elif PLATFORM(ARM) +#if !PLATFORM_ARM_ARCH(7) + push(ARM::lr); +#endif + push(ARM::r4); + push(ARM::r5); + push(ARM::r6); + move(ARM::r3, output); +#endif + } + + void generateReturn() + { +#if PLATFORM(X86_64) + pop(X86::ebx); + pop(X86::ebp); +#elif PLATFORM(X86) + pop(X86::esi); + pop(X86::edi); + pop(X86::ebx); + pop(X86::ebp); +#elif PLATFORM(ARM) + pop(ARM::r6); + pop(ARM::r5); + pop(ARM::r4); +#endif + ret(); + } + +public: + RegexGenerator(RegexPattern& pattern) + : m_pattern(pattern) + , m_generationFailed(false) + { + } + + void generate() + { + generateEnter(); + + // TODO: do I really want this on the stack? + if (!m_pattern.m_body->m_hasFixedSize) + push(index); + + if (m_pattern.m_body->m_callFrameSize) + subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + generateDisjunction(m_pattern.m_body); + } + + void compile(JSGlobalData* globalData, RegexCodeBlock& jitObject) + { + generate(); + + LinkBuffer patchBuffer(this, globalData->executableAllocator.poolForSize(size())); + + for (unsigned i = 0; i < m_backtrackRecords.size(); ++i) + patchBuffer.patch(m_backtrackRecords[i].dataLabel, patchBuffer.locationOf(m_backtrackRecords[i].backtrackLocation)); + + jitObject.set(patchBuffer.finalizeCode()); + } + + bool generationFailed() + { + return m_generationFailed; + } + +private: + RegexPattern& m_pattern; + Vector<AlternativeBacktrackRecord> m_backtrackRecords; + bool m_generationFailed; +}; + +void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) +{ + RegexPattern pattern(ignoreCase, multiline); + + if ((error = compileRegex(patternString, pattern))) + return; + + numSubpatterns = pattern.m_numSubpatterns; + + RegexGenerator generator(pattern); + generator.compile(globalData, jitObject); + + if (generator.generationFailed()) { + JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase; + JSRegExpMultilineOption multilineOption = multiline ? JSRegExpMultiline : JSRegExpSingleLine; + jitObject.setFallback(jsRegExpCompile(reinterpret_cast<const UChar*>(patternString.data()), patternString.size(), ignoreCaseOption, multilineOption, &numSubpatterns, &error)); + } +} + +int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output, int outputArraySize) +{ + if (JSRegExp* fallback = jitObject.getFallback()) + return (jsRegExpExecute(fallback, input, length, start, output, outputArraySize) < 0) ? -1 : output[0]; + + return jitObject.execute(input, start, length, output); +} + +}} + +#endif + + + + + diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexJIT.h b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexJIT.h new file mode 100644 index 0000000..5b0df9d --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexJIT.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexJIT_h +#define RegexJIT_h + +#include <wtf/Platform.h> + +#if ENABLE(YARR_JIT) + +#include "MacroAssembler.h" +#include "RegexPattern.h" +#include <UString.h> + +#include <pcre.h> +struct JSRegExp; // temporary, remove when fallback is removed. + +#if PLATFORM(X86) && !COMPILER(MSVC) +#define YARR_CALL __attribute__ ((regparm (3))) +#else +#define YARR_CALL +#endif + +namespace JSC { + +class JSGlobalData; +class ExecutablePool; + +namespace Yarr { + +class RegexCodeBlock { + typedef int (*RegexJITCode)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + +public: + RegexCodeBlock() + : m_fallback(0) + { + } + + ~RegexCodeBlock() + { + if (m_fallback) + jsRegExpFree(m_fallback); + } + + JSRegExp* getFallback() { return m_fallback; } + void setFallback(JSRegExp* fallback) { m_fallback = fallback; } + + bool operator!() { return !m_ref.m_code.executableAddress(); } + void set(MacroAssembler::CodeRef ref) { m_ref = ref; } + + int execute(const UChar* input, unsigned start, unsigned length, int* output) + { + return reinterpret_cast<RegexJITCode>(m_ref.m_code.executableAddress())(input, start, length, output); + } + +private: + MacroAssembler::CodeRef m_ref; + JSRegExp* m_fallback; +}; + +void jitCompileRegex(JSGlobalData* globalData, RegexCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); +int executeRegex(RegexCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output, int outputArraySize); + +} } // namespace JSC::Yarr + +#endif + +#endif // RegexJIT_h diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h new file mode 100644 index 0000000..64e8463 --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h @@ -0,0 +1,854 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexParser_h +#define RegexParser_h + +#include <wtf/Platform.h> + +#if ENABLE(YARR) + +#include <UString.h> +#include <wtf/ASCIICType.h> +#include <wtf/unicode/Unicode.h> +#include <limits.h> + +namespace JSC { namespace Yarr { + +enum BuiltInCharacterClassID { + DigitClassID, + SpaceClassID, + WordClassID, + NewlineClassID, +}; + +// The Parser class should not be used directly - only via the Yarr::parse() method. +template<class Delegate> +class Parser { +private: + template<class FriendDelegate> + friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); + + enum ErrorCode { + NoError, + PatternTooLarge, + QuantifierOutOfOrder, + QuantifierWithoutAtom, + MissingParentheses, + ParenthesesUnmatched, + ParenthesesTypeInvalid, + CharacterClassUnmatched, + CharacterClassOutOfOrder, + EscapeUnterminated, + NumberOfErrorCodes + }; + + /* + * CharacterClassParserDelegate: + * + * The class CharacterClassParserDelegate is used in the parsing of character + * classes. This class handles detection of character ranges. This class + * implements enough of the delegate interface such that it can be passed to + * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused + * to perform the parsing of escape characters in character sets. + */ + class CharacterClassParserDelegate { + public: + CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) + : m_delegate(delegate) + , m_err(err) + , m_state(empty) + { + } + + /* + * begin(): + * + * Called at beginning of construction. + */ + void begin(bool invert) + { + m_delegate.atomCharacterClassBegin(invert); + } + + /* + * atomPatternCharacterUnescaped(): + * + * This method is called directly from parseCharacterClass(), to report a new + * pattern character token. This method differs from atomPatternCharacter(), + * which will be called from parseEscape(), since a hypen provided via this + * method may be indicating a character range, but a hyphen parsed by + * parseEscape() cannot be interpreted as doing so. + */ + void atomPatternCharacterUnescaped(UChar ch) + { + switch (m_state) { + case empty: + m_character = ch; + m_state = cachedCharacter; + break; + + case cachedCharacter: + if (ch == '-') + m_state = cachedCharacterHyphen; + else { + m_delegate.atomCharacterClassAtom(m_character); + m_character = ch; + } + break; + + case cachedCharacterHyphen: + if (ch >= m_character) + m_delegate.atomCharacterClassRange(m_character, ch); + else + m_err = CharacterClassOutOfOrder; + m_state = empty; + } + } + + /* + * atomPatternCharacter(): + * + * Adds a pattern character, called by parseEscape(), as such will not + * interpret a hyphen as indicating a character range. + */ + void atomPatternCharacter(UChar ch) + { + // Flush if a character is already pending to prevent the + // hyphen from begin interpreted as indicating a range. + if((ch == '-') && (m_state == cachedCharacter)) + flush(); + + atomPatternCharacterUnescaped(ch); + } + + /* + * atomBuiltInCharacterClass(): + * + * Adds a built-in character class, called by parseEscape(). + */ + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + flush(); + m_delegate.atomCharacterClassBuiltIn(classID, invert); + } + + /* + * end(): + * + * Called at end of construction. + */ + void end() + { + flush(); + m_delegate.atomCharacterClassEnd(); + } + + // parseEscape() should never call these delegate methods when + // invoked with inCharacterClass set. + void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } + void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } + + private: + void flush() + { + if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen + m_delegate.atomCharacterClassAtom(m_character); + if (m_state == cachedCharacterHyphen) + m_delegate.atomCharacterClassAtom('-'); + m_state = empty; + } + + Delegate& m_delegate; + ErrorCode& m_err; + enum CharacterClassConstructionState { + empty, + cachedCharacter, + cachedCharacterHyphen, + } m_state; + UChar m_character; + }; + + Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) + : m_delegate(delegate) + , m_backReferenceLimit(backReferenceLimit) + , m_err(NoError) + , m_data(pattern.data()) + , m_size(pattern.size()) + , m_index(0) + , m_parenthesesNestingDepth(0) + { + } + + /* + * parseEscape(): + * + * Helper for parseTokens() AND parseCharacterClass(). + * Unlike the other parser methods, this function does not report tokens + * directly to the member delegate (m_delegate), instead tokens are + * emitted to the delegate provided as an argument. In the case of atom + * escapes, parseTokens() will call parseEscape() passing m_delegate as + * an argument, and as such the escape will be reported to the delegate. + * + * However this method may also be used by parseCharacterClass(), in which + * case a CharacterClassParserDelegate will be passed as the delegate that + * tokens should be added to. A boolean flag is also provided to indicate + * whether that an escape in a CharacterClass is being parsed (some parsing + * rules change in this context). + * + * The boolean value returned by this method indicates whether the token + * parsed was an atom (outside of a characted class \b and \B will be + * interpreted as assertions). + */ + template<bool inCharacterClass, class EscapeDelegate> + bool parseEscape(EscapeDelegate& delegate) + { + ASSERT(!m_err); + ASSERT(peek() == '\\'); + consume(); + + if (atEndOfPattern()) { + m_err = EscapeUnterminated; + return false; + } + + switch (peek()) { + // Assertions + case 'b': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('\b'); + else { + delegate.assertionWordBoundary(false); + return false; + } + break; + case 'B': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('B'); + else { + delegate.assertionWordBoundary(true); + return false; + } + break; + + // CharacterClassEscape + case 'd': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, false); + break; + case 's': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, false); + break; + case 'w': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, false); + break; + case 'D': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, true); + break; + case 'S': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, true); + break; + case 'W': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, true); + break; + + // DecimalEscape + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. + // First, try to parse this as backreference. + if (!inCharacterClass) { + ParseState state = saveState(); + + unsigned backReference = consumeNumber(); + if (backReference <= m_backReferenceLimit) { + delegate.atomBackReference(backReference); + break; + } + + restoreState(state); + } + + // Not a backreference, and not octal. + if (peek() >= '8') { + delegate.atomPatternCharacter('\\'); + break; + } + + // Fall-through to handle this as an octal escape. + } + + // Octal escape + case '0': + delegate.atomPatternCharacter(consumeOctal()); + break; + + // ControlEscape + case 'f': + consume(); + delegate.atomPatternCharacter('\f'); + break; + case 'n': + consume(); + delegate.atomPatternCharacter('\n'); + break; + case 'r': + consume(); + delegate.atomPatternCharacter('\r'); + break; + case 't': + consume(); + delegate.atomPatternCharacter('\t'); + break; + case 'v': + consume(); + delegate.atomPatternCharacter('\v'); + break; + + // ControlLetter + case 'c': { + ParseState state = saveState(); + consume(); + if (!atEndOfPattern()) { + int control = consume(); + + // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. + if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { + delegate.atomPatternCharacter(control & 0x1f); + break; + } + } + restoreState(state); + delegate.atomPatternCharacter('\\'); + break; + } + + // HexEscape + case 'x': { + consume(); + int x = tryConsumeHex(2); + if (x == -1) + delegate.atomPatternCharacter('x'); + else + delegate.atomPatternCharacter(x); + break; + } + + // UnicodeEscape + case 'u': { + consume(); + int u = tryConsumeHex(4); + if (u == -1) + delegate.atomPatternCharacter('u'); + else + delegate.atomPatternCharacter(u); + break; + } + + // IdentityEscape + default: + delegate.atomPatternCharacter(consume()); + } + + return true; + } + + /* + * parseAtomEscape(), parseCharacterClassEscape(): + * + * These methods alias to parseEscape(). + */ + bool parseAtomEscape() + { + return parseEscape<false>(m_delegate); + } + void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) + { + parseEscape<true>(delegate); + } + + /* + * parseCharacterClass(): + * + * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) + * to an instance of CharacterClassParserDelegate, to describe the character class to the + * delegate. + */ + void parseCharacterClass() + { + ASSERT(!m_err); + ASSERT(peek() == '['); + consume(); + + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); + + characterClassConstructor.begin(tryConsume('^')); + + while (!atEndOfPattern()) { + switch (peek()) { + case ']': + consume(); + characterClassConstructor.end(); + return; + + case '\\': + parseCharacterClassEscape(characterClassConstructor); + break; + + default: + characterClassConstructor.atomPatternCharacterUnescaped(consume()); + } + + if (m_err) + return; + } + + m_err = CharacterClassUnmatched; + } + + /* + * parseParenthesesBegin(): + * + * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. + */ + void parseParenthesesBegin() + { + ASSERT(!m_err); + ASSERT(peek() == '('); + consume(); + + if (tryConsume('?')) { + if (atEndOfPattern()) { + m_err = ParenthesesTypeInvalid; + return; + } + + switch (consume()) { + case ':': + m_delegate.atomParenthesesSubpatternBegin(false); + break; + + case '=': + m_delegate.atomParentheticalAssertionBegin(); + break; + + case '!': + m_delegate.atomParentheticalAssertionBegin(true); + break; + + default: + m_err = ParenthesesTypeInvalid; + } + } else + m_delegate.atomParenthesesSubpatternBegin(); + + ++m_parenthesesNestingDepth; + } + + /* + * parseParenthesesEnd(): + * + * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). + */ + void parseParenthesesEnd() + { + ASSERT(!m_err); + ASSERT(peek() == ')'); + consume(); + + if (m_parenthesesNestingDepth > 0) + m_delegate.atomParenthesesEnd(); + else + m_err = ParenthesesUnmatched; + + --m_parenthesesNestingDepth; + } + + /* + * parseQuantifier(): + * + * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. + */ + void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) + { + ASSERT(!m_err); + ASSERT(min <= max); + + if (lastTokenWasAnAtom) + m_delegate.quantifyAtom(min, max, !tryConsume('?')); + else + m_err = QuantifierWithoutAtom; + } + + /* + * parseTokens(): + * + * This method loops over the input pattern reporting tokens to the delegate. + * The method returns when a parse error is detected, or the end of the pattern + * is reached. One piece of state is tracked around the loop, which is whether + * the last token passed to the delegate was an atom (this is necessary to detect + * a parse error when a quantifier provided without an atom to quantify). + */ + void parseTokens() + { + bool lastTokenWasAnAtom = false; + + while (!atEndOfPattern()) { + switch (peek()) { + case '|': + consume(); + m_delegate.disjunction(); + lastTokenWasAnAtom = false; + break; + + case '(': + parseParenthesesBegin(); + lastTokenWasAnAtom = false; + break; + + case ')': + parseParenthesesEnd(); + lastTokenWasAnAtom = true; + break; + + case '^': + consume(); + m_delegate.assertionBOL(); + lastTokenWasAnAtom = false; + break; + + case '$': + consume(); + m_delegate.assertionEOL(); + lastTokenWasAnAtom = false; + break; + + case '.': + consume(); + m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); + lastTokenWasAnAtom = true; + break; + + case '[': + parseCharacterClass(); + lastTokenWasAnAtom = true; + break; + + case '\\': + lastTokenWasAnAtom = parseAtomEscape(); + break; + + case '*': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); + lastTokenWasAnAtom = false; + break; + + case '+': + consume(); + parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); + lastTokenWasAnAtom = false; + break; + + case '?': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, 1); + lastTokenWasAnAtom = false; + break; + + case '{': { + ParseState state = saveState(); + + consume(); + if (peekIsDigit()) { + unsigned min = consumeNumber(); + unsigned max = min; + + if (tryConsume(',')) + max = peekIsDigit() ? consumeNumber() : UINT_MAX; + + if (tryConsume('}')) { + if (min <= max) + parseQuantifier(lastTokenWasAnAtom, min, max); + else + m_err = QuantifierOutOfOrder; + lastTokenWasAnAtom = false; + break; + } + } + + restoreState(state); + } // if we did not find a complete quantifer, fall through to the default case. + + default: + m_delegate.atomPatternCharacter(consume()); + lastTokenWasAnAtom = true; + } + + if (m_err) + return; + } + + if (m_parenthesesNestingDepth > 0) + m_err = MissingParentheses; + } + + /* + * parse(): + * + * This method calls regexBegin(), calls parseTokens() to parse over the input + * patterns, calls regexEnd() or regexError() as appropriate, and converts any + * error code to a const char* for a result. + */ + const char* parse() + { + m_delegate.regexBegin(); + + if (m_size > MAX_PATTERN_SIZE) + m_err = PatternTooLarge; + else + parseTokens(); + ASSERT(atEndOfPattern() || m_err); + + if (m_err) + m_delegate.regexError(); + else + m_delegate.regexEnd(); + + // The order of this array must match the ErrorCode enum. + static const char* errorMessages[NumberOfErrorCodes] = { + 0, // NoError + "regular expression too large", + "numbers out of order in {} quantifier", + "nothing to repeat", + "missing )", + "unmatched parentheses", + "unrecognized character after (?", + "missing terminating ] for character class", + "range out of order in character class", + "\\ at end of pattern" + }; + + return errorMessages[m_err]; + } + + + // Misc helper functions: + + typedef unsigned ParseState; + + ParseState saveState() + { + return m_index; + } + + void restoreState(ParseState state) + { + m_index = state; + } + + bool atEndOfPattern() + { + ASSERT(m_index <= m_size); + return m_index == m_size; + } + + int peek() + { + ASSERT(m_index < m_size); + return m_data[m_index]; + } + + bool peekIsDigit() + { + return !atEndOfPattern() && WTF::isASCIIDigit(peek()); + } + + unsigned peekDigit() + { + ASSERT(peekIsDigit()); + return peek() - '0'; + } + + int consume() + { + ASSERT(m_index < m_size); + return m_data[m_index++]; + } + + unsigned consumeDigit() + { + ASSERT(peekIsDigit()); + return consume() - '0'; + } + + unsigned consumeNumber() + { + unsigned n = consumeDigit(); + // check for overflow. + for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { + n = newValue; + consume(); + } + return n; + } + + unsigned consumeOctal() + { + ASSERT(WTF::isASCIIOctalDigit(peek())); + + unsigned n = consumeDigit(); + while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) + n = n * 8 + consumeDigit(); + return n; + } + + bool tryConsume(UChar ch) + { + if (atEndOfPattern() || (m_data[m_index] != ch)) + return false; + ++m_index; + return true; + } + + int tryConsumeHex(int count) + { + ParseState state = saveState(); + + int n = 0; + while (count--) { + if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { + restoreState(state); + return -1; + } + n = (n << 4) | WTF::toASCIIHexValue(consume()); + } + return n; + } + + Delegate& m_delegate; + unsigned m_backReferenceLimit; + ErrorCode m_err; + const UChar* m_data; + unsigned m_size; + unsigned m_index; + unsigned m_parenthesesNestingDepth; + + // Derived by empirical testing of compile time in PCRE and WREC. + static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; +}; + +/* + * Yarr::parse(): + * + * The parse method is passed a pattern to be parsed and a delegate upon which + * callbacks will be made to record the parsed tokens forming the regex. + * Yarr::parse() returns null on success, or a const C string providing an error + * message where a parse error occurs. + * + * The Delegate must implement the following interface: + * + * void assertionBOL(); + * void assertionEOL(); + * void assertionWordBoundary(bool invert); + * + * void atomPatternCharacter(UChar ch); + * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); + * void atomCharacterClassBegin(bool invert) + * void atomCharacterClassAtom(UChar ch) + * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + * void atomCharacterClassEnd() + * void atomParenthesesSubpatternBegin(bool capture = true); + * void atomParentheticalAssertionBegin(bool invert = false); + * void atomParenthesesEnd(); + * void atomBackReference(unsigned subpatternId); + * + * void quantifyAtom(unsigned min, unsigned max, bool greedy); + * + * void disjunction(); + * + * void regexBegin(); + * void regexEnd(); + * void regexError(); + * + * Before any call recording tokens are made, regexBegin() will be called on the + * delegate once. Once parsing is complete either regexEnd() or regexError() will + * be called, as appropriate. + * + * The regular expression is described by a sequence of assertion*() and atom*() + * callbacks to the delegate, describing the terms in the regular expression. + * Following an atom a quantifyAtom() call may occur to indicate that the previous + * atom should be quantified. In the case of atoms described across multiple + * calls (parentheses and character classes) the call to quantifyAtom() will come + * after the call to the atom*End() method, never after atom*Begin(). + * + * Character classes may either be described by a single call to + * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. + * In the latter case, ...Begin() will be called, followed by a sequence of + * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). + * + * Sequences of atoms and assertions are broken into alternatives via calls to + * disjunction(). Assertions, atoms, and disjunctions emitted between calls to + * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. + * atomParenthesesBegin() is passed a subpatternId. In the case of a regular + * capturing subpattern, this will be the subpatternId associated with these + * parentheses, and will also by definition be the lowest subpatternId of these + * parentheses and of any nested paretheses. The atomParenthesesEnd() method + * is passed the subpatternId of the last capturing subexpression nested within + * these paretheses. In the case of a capturing subpattern with no nested + * capturing subpatterns, the same subpatternId will be passed to the begin and + * end functions. In the case of non-capturing subpatterns the subpatternId + * passed to the begin method is also the first possible subpatternId that might + * be nested within these paretheses. If a set of non-capturing parentheses does + * not contain any capturing subpatterns, then the subpatternId passed to begin + * will be greater than the subpatternId passed to end. + */ + +template<class Delegate> +const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) +{ + return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); +} + +} } // namespace JSC::Yarr + +#endif + +#endif // RegexParser_h diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexPattern.h b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexPattern.h new file mode 100644 index 0000000..a451131 --- /dev/null +++ b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexPattern.h @@ -0,0 +1,356 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RegexPattern_h +#define RegexPattern_h + +#include <wtf/Platform.h> + +#if ENABLE(YARR) + +#include <wtf/Vector.h> +#include <wtf/unicode/Unicode.h> + + +namespace JSC { namespace Yarr { + +#define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoBackReference 2 +#define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. +#define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 +#define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define RegexStackSpaceForBackTrackInfoParentheses 4 + +struct PatternDisjunction; + +struct CharacterRange { + UChar begin; + UChar end; + + CharacterRange(UChar begin, UChar end) + : begin(begin) + , end(end) + { + } +}; + +struct CharacterClass : FastAllocBase { + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; +}; + +enum QuantifierType { + QuantifierFixedCount, + QuantifierGreedy, + QuantifierNonGreedy, +}; + +struct PatternTerm { + enum Type { + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacter, + TypeCharacterClass, + TypeBackReference, + TypeForwardReference, + TypeParenthesesSubpattern, + TypeParentheticalAssertion, + } type; + bool invertOrCapture; + union { + UChar patternCharacter; + CharacterClass* characterClass; + unsigned subpatternId; + struct { + PatternDisjunction* disjunction; + unsigned subpatternId; + unsigned lastSubpatternId; + bool isCopy; + } parentheses; + }; + QuantifierType quantityType; + unsigned quantityCount; + int inputPosition; + unsigned frameLocation; + + PatternTerm(UChar ch) + : type(PatternTerm::TypePatternCharacter) + { + patternCharacter = ch; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(CharacterClass* charClass, bool invert) + : type(PatternTerm::TypeCharacterClass) + , invertOrCapture(invert) + { + characterClass = charClass; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture) + : type(type) + , invertOrCapture(invertOrCapture) + { + parentheses.disjunction = disjunction; + parentheses.subpatternId = subpatternId; + parentheses.isCopy = false; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, bool invert = false) + : type(type) + , invertOrCapture(invert) + { + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(unsigned spatternId) + : type(TypeBackReference) + , invertOrCapture(invertOrCapture) + { + subpatternId = spatternId; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + static PatternTerm ForwardReference() + { + return PatternTerm(TypeForwardReference); + } + + static PatternTerm BOL() + { + return PatternTerm(TypeAssertionBOL); + } + + static PatternTerm EOL() + { + return PatternTerm(TypeAssertionEOL); + } + + static PatternTerm WordBoundary(bool invert) + { + return PatternTerm(TypeAssertionWordBoundary, invert); + } + + bool invert() + { + return invertOrCapture; + } + + bool capture() + { + return invertOrCapture; + } + + void quantify(unsigned count, QuantifierType type) + { + quantityCount = count; + quantityType = type; + } +}; + +struct PatternAlternative : FastAllocBase { + PatternAlternative(PatternDisjunction* disjunction) + : m_parent(disjunction) + { + } + + PatternTerm& lastTerm() + { + ASSERT(m_terms.size()); + return m_terms[m_terms.size() - 1]; + } + + void removeLastTerm() + { + ASSERT(m_terms.size()); + m_terms.shrink(m_terms.size() - 1); + } + + Vector<PatternTerm> m_terms; + PatternDisjunction* m_parent; + unsigned m_minimumSize; + bool m_hasFixedSize; +}; + +struct PatternDisjunction : FastAllocBase { + PatternDisjunction(PatternAlternative* parent = 0) + : m_parent(parent) + { + } + + ~PatternDisjunction() + { + deleteAllValues(m_alternatives); + } + + PatternAlternative* addNewAlternative() + { + PatternAlternative* alternative = new PatternAlternative(this); + m_alternatives.append(alternative); + return alternative; + } + + Vector<PatternAlternative*> m_alternatives; + PatternAlternative* m_parent; + unsigned m_minimumSize; + unsigned m_callFrameSize; + bool m_hasFixedSize; +}; + +// You probably don't want to be calling these functions directly +// (please to be calling newlineCharacterClass() et al on your +// friendly neighborhood RegexPattern instance to get nicely +// cached copies). +CharacterClass* newlineCreate(); +CharacterClass* digitsCreate(); +CharacterClass* spacesCreate(); +CharacterClass* wordcharCreate(); +CharacterClass* nondigitsCreate(); +CharacterClass* nonspacesCreate(); +CharacterClass* nonwordcharCreate(); + +struct RegexPattern { + RegexPattern(bool ignoreCase, bool multiline) + : m_ignoreCase(ignoreCase) + , m_multiline(multiline) + , m_numSubpatterns(0) + , m_maxBackReference(0) + , newlineCached(0) + , digitsCached(0) + , spacesCached(0) + , wordcharCached(0) + , nondigitsCached(0) + , nonspacesCached(0) + , nonwordcharCached(0) + { + } + + ~RegexPattern() + { + deleteAllValues(m_disjunctions); + deleteAllValues(m_userCharacterClasses); + } + + void reset() + { + m_numSubpatterns = 0; + m_maxBackReference = 0; + + newlineCached = 0; + digitsCached = 0; + spacesCached = 0; + wordcharCached = 0; + nondigitsCached = 0; + nonspacesCached = 0; + nonwordcharCached = 0; + + deleteAllValues(m_disjunctions); + m_disjunctions.clear(); + deleteAllValues(m_userCharacterClasses); + m_userCharacterClasses.clear(); + } + + bool containsIllegalBackReference() + { + return m_maxBackReference > m_numSubpatterns; + } + + CharacterClass* newlineCharacterClass() + { + if (!newlineCached) + m_userCharacterClasses.append(newlineCached = newlineCreate()); + return newlineCached; + } + CharacterClass* digitsCharacterClass() + { + if (!digitsCached) + m_userCharacterClasses.append(digitsCached = digitsCreate()); + return digitsCached; + } + CharacterClass* spacesCharacterClass() + { + if (!spacesCached) + m_userCharacterClasses.append(spacesCached = spacesCreate()); + return spacesCached; + } + CharacterClass* wordcharCharacterClass() + { + if (!wordcharCached) + m_userCharacterClasses.append(wordcharCached = wordcharCreate()); + return wordcharCached; + } + CharacterClass* nondigitsCharacterClass() + { + if (!nondigitsCached) + m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); + return nondigitsCached; + } + CharacterClass* nonspacesCharacterClass() + { + if (!nonspacesCached) + m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); + return nonspacesCached; + } + CharacterClass* nonwordcharCharacterClass() + { + if (!nonwordcharCached) + m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); + return nonwordcharCached; + } + + bool m_ignoreCase; + bool m_multiline; + unsigned m_numSubpatterns; + unsigned m_maxBackReference; + PatternDisjunction* m_body; + Vector<PatternDisjunction*, 4> m_disjunctions; + Vector<CharacterClass*> m_userCharacterClasses; + +private: + CharacterClass* newlineCached; + CharacterClass* digitsCached; + CharacterClass* spacesCached; + CharacterClass* wordcharCached; + CharacterClass* nondigitsCached; + CharacterClass* nonspacesCached; + CharacterClass* nonwordcharCached; +}; + +} } // namespace JSC::Yarr + +#endif + +#endif // RegexPattern_h |