diff options
author | Bradley T. Hughes <bradley.hughes@nokia.com> | 2009-06-15 09:57:36 (GMT) |
---|---|---|
committer | Bradley T. Hughes <bradley.hughes@nokia.com> | 2009-06-15 09:57:36 (GMT) |
commit | 336dfcef05cb63df0a6d550b59a4badc7a0f01c1 (patch) | |
tree | a218ec97413e0c8ebc9600ac5db9b2adea485b32 /src/3rdparty/webkit/JavaScriptCore/yarr/RegexCompiler.cpp | |
parent | e44d64510e019e5d3b379b704cfb824e0d7ccc9d (diff) | |
download | Qt-336dfcef05cb63df0a6d550b59a4badc7a0f01c1.zip Qt-336dfcef05cb63df0a6d550b59a4badc7a0f01c1.tar.gz Qt-336dfcef05cb63df0a6d550b59a4badc7a0f01c1.tar.bz2 |
Merge of master
Diffstat (limited to 'src/3rdparty/webkit/JavaScriptCore/yarr/RegexCompiler.cpp')
-rw-r--r-- | src/3rdparty/webkit/JavaScriptCore/yarr/RegexCompiler.cpp | 728 |
1 files changed, 728 insertions, 0 deletions
diff --git a/src/3rdparty/webkit/JavaScriptCore/yarr/RegexCompiler.cpp b/src/3rdparty/webkit/JavaScriptCore/yarr/RegexCompiler.cpp new file mode 100644 index 0000000..c7b3c81 --- /dev/null +++ b/src/3rdparty/webkit/JavaScriptCore/yarr/RegexCompiler.cpp @@ -0,0 +1,728 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "RegexCompiler.h" + +#include "RegexInterpreter.h" +#include "RegexPattern.h" +#include <wtf/Vector.h> + +#if ENABLE(YARR) + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class CharacterClassConstructor { +public: + CharacterClassConstructor(bool isCaseInsensitive = false) + : m_isCaseInsensitive(isCaseInsensitive) + { + } + + void reset() + { + m_matches.clear(); + m_ranges.clear(); + m_matchesUnicode.clear(); + m_rangesUnicode.clear(); + } + + void append(const CharacterClass* other) + { + for (size_t i = 0; i < other->m_matches.size(); ++i) + addSorted(m_matches, other->m_matches[i]); + for (size_t i = 0; i < other->m_ranges.size(); ++i) + addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); + for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) + addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); + for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) + addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); + } + + void putChar(UChar ch) + { + if (ch <= 0x7f) { + if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + addSorted(m_matches, toASCIIUpper(ch)); + addSorted(m_matches, toASCIILower(ch)); + } else + addSorted(m_matches, ch); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { + addSorted(m_matchesUnicode, upper); + addSorted(m_matchesUnicode, lower); + } else + addSorted(m_matchesUnicode, ch); + } + } + + // returns true if this character has another case, and 'ch' is the upper case form. + static inline bool isUnicodeUpper(UChar ch) + { + return ch != Unicode::toLower(ch); + } + + // returns true if this character has another case, and 'ch' is the lower case form. + static inline bool isUnicodeLower(UChar ch) + { + return ch != Unicode::toUpper(ch); + } + + void putRange(UChar lo, UChar hi) + { + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi >= 0x80) { + uint32_t unicodeCurr = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, unicodeCurr, hi); + + if (m_isCaseInsensitive) { + while (unicodeCurr <= hi) { + // If the upper bound of the range (hi) is 0xffff, the increments to + // unicodeCurr in this loop may take it to 0x10000. This is fine + // (if so we won't re-enter the loop, since the loop condition above + // will definitely fail) - but this does mean we cannot use a UChar + // to represent unicodeCurr, we must use a 32-bit value instead. + ASSERT(unicodeCurr <= 0xffff); + + if (isUnicodeUpper(unicodeCurr)) { + UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); + UChar lowerCaseRangeEnd = lowerCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) + lowerCaseRangeEnd++; + addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); + } else if (isUnicodeLower(unicodeCurr)) { + UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); + UChar upperCaseRangeEnd = upperCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) + upperCaseRangeEnd++; + addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); + } else + ++unicodeCurr; + } + } + } + } + + CharacterClass* charClass() + { + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append(m_matches); + characterClass->m_ranges.append(m_ranges); + characterClass->m_matchesUnicode.append(m_matchesUnicode); + characterClass->m_rangesUnicode.append(m_rangesUnicode); + + reset(); + + return characterClass; + } + +private: + void addSorted(Vector<UChar>& matches, UChar ch) + { + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); + } + + void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + { + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + ranges.insert(i, CharacterRange(lo, hi)); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // CharacterRange comes after all existing ranges. + ranges.append(CharacterRange(lo, hi)); + } + + bool m_isCaseInsensitive; + + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; +}; + + +CharacterClass* newlineCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append('\n'); + characterClass->m_matches.append('\r'); + characterClass->m_matchesUnicode.append(0x2028); + characterClass->m_matchesUnicode.append(0x2029); + + return characterClass; +} + +CharacterClass* digitsCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_ranges.append(CharacterRange('0', '9')); + + return characterClass; +} + +CharacterClass* spacesCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append(' '); + characterClass->m_ranges.append(CharacterRange('\t', '\r')); + characterClass->m_matchesUnicode.append(0x00a0); + characterClass->m_matchesUnicode.append(0x1680); + characterClass->m_matchesUnicode.append(0x180e); + characterClass->m_matchesUnicode.append(0x2028); + characterClass->m_matchesUnicode.append(0x2029); + characterClass->m_matchesUnicode.append(0x202f); + characterClass->m_matchesUnicode.append(0x205f); + characterClass->m_matchesUnicode.append(0x3000); + characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a)); + + return characterClass; +} + +CharacterClass* wordcharCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append('_'); + characterClass->m_ranges.append(CharacterRange('0', '9')); + characterClass->m_ranges.append(CharacterRange('A', 'Z')); + characterClass->m_ranges.append(CharacterRange('a', 'z')); + + return characterClass; +} + +CharacterClass* nondigitsCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); + characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); + + return characterClass; +} + +CharacterClass* nonspacesCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_ranges.append(CharacterRange(0, '\t' - 1)); + characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1)); + characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d)); + characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff)); + characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027)); + characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e)); + characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e)); + characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff)); + characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff)); + + return characterClass; +} + +CharacterClass* nonwordcharCreate() +{ + CharacterClass* characterClass = new CharacterClass(); + + characterClass->m_matches.append('`'); + characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); + characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1)); + characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1)); + characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); + + return characterClass; +} + + +class RegexPatternConstructor { +public: + RegexPatternConstructor(RegexPattern& pattern) + : m_pattern(pattern) + , m_characterClassConstructor(pattern.m_ignoreCase) + { + } + + ~RegexPatternConstructor() + { + } + + void reset() + { + m_pattern.reset(); + m_characterClassConstructor.reset(); + } + + void assertionBOL() + { + m_alternative->m_terms.append(PatternTerm::BOL()); + } + void assertionEOL() + { + m_alternative->m_terms.append(PatternTerm::EOL()); + } + void assertionWordBoundary(bool invert) + { + m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); + } + + void atomPatternCharacter(UChar ch) + { + // We handle case-insensitive checking of unicode characters which do have both + // cases by handling them as if they were defined using a CharacterClass. + if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { + atomCharacterClassBegin(); + atomCharacterClassAtom(ch); + atomCharacterClassEnd(); + } else + m_alternative->m_terms.append(PatternTerm(ch)); + } + + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (classID) { + case DigitClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); + break; + case SpaceClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); + break; + case WordClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case NewlineClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + break; + } + } + + void atomCharacterClassBegin(bool invert = false) + { + m_invertCharacterClass = invert; + } + + void atomCharacterClassAtom(UChar ch) + { + m_characterClassConstructor.putChar(ch); + } + + void atomCharacterClassRange(UChar begin, UChar end) + { + m_characterClassConstructor.putRange(begin, end); + } + + void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + { + ASSERT(classID != NewlineClassID); + + switch (classID) { + case DigitClassID: + m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); + break; + + case SpaceClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); + break; + + case WordClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + break; + + default: + ASSERT_NOT_REACHED(); + } + } + + void atomCharacterClassEnd() + { + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); + } + + void atomParenthesesSubpatternBegin(bool capture = true) + { + unsigned subpatternId = m_pattern.m_numSubpatterns + 1; + if (capture) + m_pattern.m_numSubpatterns++; + + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParentheticalAssertionBegin(bool invert = false) + { + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParenthesesEnd() + { + ASSERT(m_alternative->m_parent); + ASSERT(m_alternative->m_parent->m_parent); + m_alternative = m_alternative->m_parent->m_parent; + + m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + } + + void atomBackReference(unsigned subpatternId) + { + ASSERT(subpatternId); + m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); + + if (subpatternId > m_pattern.m_numSubpatterns) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + + PatternAlternative* currentAlternative = m_alternative; + ASSERT(currentAlternative); + + // Note to self: if we waited until the AST was baked, we could also remove forwards refs + while ((currentAlternative = currentAlternative->m_parent->m_parent)) { + PatternTerm& term = currentAlternative->lastTerm(); + ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); + + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + } + + m_alternative->m_terms.append(PatternTerm(subpatternId)); + } + + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) + { + PatternDisjunction* newDisjunction = new PatternDisjunction(); + + newDisjunction->m_parent = disjunction->m_parent; + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i])); + } + + m_pattern.m_disjunctions.append(newDisjunction); + return newDisjunction; + } + + PatternTerm copyTerm(PatternTerm& term) + { + if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) + return PatternTerm(term); + + PatternTerm termCopy = term; + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction); + return termCopy; + } + + void quantifyAtom(unsigned min, unsigned max, bool greedy) + { + ASSERT(min <= max); + ASSERT(m_alternative->m_terms.size()); + + if (!max) { + m_alternative->removeLastTerm(); + return; + } + + PatternTerm& term = m_alternative->lastTerm(); + ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); + ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + + // For any assertion with a zero minimum, not matching is valid and has no effect, + // remove it. Otherwise, we need to match as least once, but there is no point + // matching more than once, so remove the quantifier. It is not entirely clear + // from the spec whether or not this behavior is correct, but I believe this + // matches Firefox. :-/ + if (term.type == PatternTerm::TypeParentheticalAssertion) { + if (!min) + m_alternative->removeLastTerm(); + return; + } + + if (min == 0) + term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); + else if (min == max) + term.quantify(min, QuantifierFixedCount); + else { + term.quantify(min, QuantifierFixedCount); + m_alternative->m_terms.append(copyTerm(term)); + // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... + m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) + m_alternative->lastTerm().parentheses.isCopy = true; + } + } + + void disjunction() + { + m_alternative = m_alternative->m_parent->addNewAlternative(); + } + + void regexBegin() + { + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + void regexEnd() + { + } + void regexError() + { + } + + unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + { + alternative->m_hasFixedSize = true; + unsigned currentInputPosition = initialInputPosition; + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + term.inputPosition = currentInputPosition; + break; + + case PatternTerm::TypeBackReference: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypePatternCharacter: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeCharacterClass: + term.inputPosition = currentInputPosition; + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeParenthesesSubpattern: + // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. + term.frameLocation = currentCallFrameSize; + if ((term.quantityCount == 1) && !term.parentheses.isCopy) { + if (term.quantityType == QuantifierFixedCount) { + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + } else { + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); + } + term.inputPosition = currentInputPosition; + } else { + term.inputPosition = currentInputPosition; + setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); + currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; + } + // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeParentheticalAssertion: + term.inputPosition = currentInputPosition; + term.frameLocation = currentCallFrameSize; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); + break; + } + } + + alternative->m_minimumSize = currentInputPosition - initialInputPosition; + return currentCallFrameSize; + } + + unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + { + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) + initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; + + unsigned minimumInputSize = UINT_MAX; + unsigned maximumCallFrameSize = 0; + bool hasFixedSize = true; + + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); + maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); + hasFixedSize &= alternative->m_hasFixedSize; + } + + ASSERT(minimumInputSize != UINT_MAX); + ASSERT(maximumCallFrameSize >= initialCallFrameSize); + + disjunction->m_hasFixedSize = hasFixedSize; + disjunction->m_minimumSize = minimumInputSize; + disjunction->m_callFrameSize = maximumCallFrameSize; + return maximumCallFrameSize; + } + + void setupOffsets() + { + setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + } + +private: + RegexPattern& m_pattern; + PatternAlternative* m_alternative; + CharacterClassConstructor m_characterClassConstructor; + bool m_invertCharacterClass; +}; + + +const char* compileRegex(const UString& patternString, RegexPattern& pattern) +{ + RegexPatternConstructor constructor(pattern); + + if (const char* error = parse(constructor, patternString)) + return error; + + // If the pattern contains illegal backreferences reset & reparse. + // Quoting Netscape's "What's new in JavaScript 1.2", + // "Note: if the number of left parentheses is less than the number specified + // in \#, the \# is taken as an octal escape as described in the next row." + if (pattern.containsIllegalBackReference()) { + unsigned numSubpatterns = pattern.m_numSubpatterns; + + constructor.reset(); +#ifndef NDEBUG + const char* error = +#endif + parse(constructor, patternString, numSubpatterns); + + ASSERT(!error); + ASSERT(numSubpatterns == pattern.m_numSubpatterns); + } + + constructor.setupOffsets(); + + return false; +}; + + +} } + +#endif |