diff options
Diffstat (limited to 'src/3rdparty/webkit/JavaScriptCore/parser/Lexer.cpp')
-rw-r--r-- | src/3rdparty/webkit/JavaScriptCore/parser/Lexer.cpp | 1499 |
1 files changed, 795 insertions, 704 deletions
diff --git a/src/3rdparty/webkit/JavaScriptCore/parser/Lexer.cpp b/src/3rdparty/webkit/JavaScriptCore/parser/Lexer.cpp index 0bacb22..8e89c18 100644 --- a/src/3rdparty/webkit/JavaScriptCore/parser/Lexer.cpp +++ b/src/3rdparty/webkit/JavaScriptCore/parser/Lexer.cpp @@ -31,14 +31,12 @@ #include <ctype.h> #include <limits.h> #include <string.h> -#include <wtf/ASCIICType.h> #include <wtf/Assertions.h> -#include <wtf/unicode/Unicode.h> using namespace WTF; using namespace Unicode; -// we can't specify the namespace in yacc's C output, so do it here +// We can't specify the namespace in yacc's C output, so do it here instead. using namespace JSC; #ifndef KDE_USE_FINAL @@ -48,7 +46,7 @@ using namespace JSC; #include "Lookup.h" #include "Lexer.lut.h" -// a bridge for yacc from the C world to C++ +// A bridge for yacc from the C world to the C++ world. int jscyylex(void* lvalp, void* llocp, void* globalData) { return static_cast<JSGlobalData*>(globalData)->lexer->lex(lvalp, llocp); @@ -56,825 +54,895 @@ int jscyylex(void* lvalp, void* llocp, void* globalData) namespace JSC { -static bool isDecimalDigit(int); +static const UChar byteOrderMark = 0xFEFF; Lexer::Lexer(JSGlobalData* globalData) - : yylineno(1) - , m_restrKeyword(false) - , m_eatNextIdentifier(false) - , m_stackToken(-1) - , m_lastToken(-1) - , m_position(0) - , m_code(0) - , m_length(0) - , m_isReparsing(false) - , m_atLineStart(true) - , m_current(0) - , m_next1(0) - , m_next2(0) - , m_next3(0) - , m_currentOffset(0) - , m_nextOffset1(0) - , m_nextOffset2(0) - , m_nextOffset3(0) + : m_isReparsing(false) , m_globalData(globalData) - , m_mainTable(JSC::mainTable) + , m_keywordTable(JSC::mainTable) { - m_buffer8.reserveCapacity(initialReadBufferCapacity); - m_buffer16.reserveCapacity(initialReadBufferCapacity); + m_buffer8.reserveInitialCapacity(initialReadBufferCapacity); + m_buffer16.reserveInitialCapacity(initialReadBufferCapacity); } Lexer::~Lexer() { - m_mainTable.deleteTable(); + m_keywordTable.deleteTable(); +} + +inline const UChar* Lexer::currentCharacter() const +{ + return m_code - 4; +} + +inline int Lexer::currentOffset() const +{ + return currentCharacter() - m_codeStart; +} + +ALWAYS_INLINE void Lexer::shift1() +{ + m_current = m_next1; + m_next1 = m_next2; + m_next2 = m_next3; + if (LIKELY(m_code < m_codeEnd)) + m_next3 = m_code[0]; + else + m_next3 = -1; + + ++m_code; +} + +ALWAYS_INLINE void Lexer::shift2() +{ + m_current = m_next2; + m_next1 = m_next3; + if (LIKELY(m_code + 1 < m_codeEnd)) { + m_next2 = m_code[0]; + m_next3 = m_code[1]; + } else { + m_next2 = m_code < m_codeEnd ? m_code[0] : -1; + m_next3 = -1; + } + + m_code += 2; +} + +ALWAYS_INLINE void Lexer::shift3() +{ + m_current = m_next3; + if (LIKELY(m_code + 2 < m_codeEnd)) { + m_next1 = m_code[0]; + m_next2 = m_code[1]; + m_next3 = m_code[2]; + } else { + m_next1 = m_code < m_codeEnd ? m_code[0] : -1; + m_next2 = m_code + 1 < m_codeEnd ? m_code[1] : -1; + m_next3 = -1; + } + + m_code += 3; +} + +ALWAYS_INLINE void Lexer::shift4() +{ + if (LIKELY(m_code + 3 < m_codeEnd)) { + m_current = m_code[0]; + m_next1 = m_code[1]; + m_next2 = m_code[2]; + m_next3 = m_code[3]; + } else { + m_current = m_code < m_codeEnd ? m_code[0] : -1; + m_next1 = m_code + 1 < m_codeEnd ? m_code[1] : -1; + m_next2 = m_code + 2 < m_codeEnd ? m_code[2] : -1; + m_next3 = -1; + } + + m_code += 4; } void Lexer::setCode(const SourceCode& source) { - yylineno = source.firstLine(); - m_restrKeyword = false; + m_lineNumber = source.firstLine(); m_delimited = false; - m_eatNextIdentifier = false; - m_stackToken = -1; m_lastToken = -1; - m_position = source.startOffset(); + const UChar* data = source.provider()->data(); + m_source = &source; - m_code = source.provider()->data(); - m_length = source.endOffset(); - m_skipLF = false; - m_skipCR = false; + m_codeStart = data; + m_code = data + source.startOffset(); + m_codeEnd = data + source.endOffset(); m_error = false; m_atLineStart = true; - // read first characters - shift(4); + // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters. + // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details. + if (source.provider()->hasBOMs()) { + for (const UChar* p = m_codeStart; p < m_codeEnd; ++p) { + if (UNLIKELY(*p == byteOrderMark)) { + copyCodeWithoutBOMs(); + break; + } + } + } + + // Read the first characters into the 4-character buffer. + shift4(); + ASSERT(currentOffset() == source.startOffset()); } -void Lexer::shift(unsigned p) +void Lexer::copyCodeWithoutBOMs() { - // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM, - // see <https://bugs.webkit.org/show_bug.cgi?id=4931>. - - while (p--) { - m_current = m_next1; - m_next1 = m_next2; - m_next2 = m_next3; - m_currentOffset = m_nextOffset1; - m_nextOffset1 = m_nextOffset2; - m_nextOffset2 = m_nextOffset3; - do { - if (m_position >= m_length) { - m_nextOffset3 = m_position; - m_position++; - m_next3 = -1; - break; - } - m_nextOffset3 = m_position; - m_next3 = m_code[m_position++]; - } while (m_next3 == 0xFEFF); + // Note: In this case, the character offset data for debugging will be incorrect. + // If it's important to correctly debug code with extraneous BOMs, then the caller + // should strip the BOMs when creating the SourceProvider object and do its own + // mapping of offsets within the stripped text to original text offset. + + m_codeWithoutBOMs.reserveCapacity(m_codeEnd - m_code); + for (const UChar* p = m_code; p < m_codeEnd; ++p) { + UChar c = *p; + if (c != byteOrderMark) + m_codeWithoutBOMs.append(c); + } + ptrdiff_t startDelta = m_codeStart - m_code; + m_code = m_codeWithoutBOMs.data(); + m_codeStart = m_code + startDelta; + m_codeEnd = m_codeWithoutBOMs.data() + m_codeWithoutBOMs.size(); +} + +void Lexer::shiftLineTerminator() +{ + ASSERT(isLineTerminator(m_current)); + + // Allow both CRLF and LFCR. + if (m_current + m_next1 == '\n' + '\r') + shift2(); + else + shift1(); + + ++m_lineNumber; +} + +ALWAYS_INLINE Identifier* Lexer::makeIdentifier(const UChar* characters, size_t length) +{ + m_identifiers.append(Identifier(m_globalData, characters, length)); + return &m_identifiers.last(); +} + +inline bool Lexer::lastTokenWasRestrKeyword() const +{ + return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW; +} + +static NEVER_INLINE bool isNonASCIIIdentStart(int c) +{ + return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other); +} + +static inline bool isIdentStart(int c) +{ + return isASCII(c) ? isASCIIAlpha(c) || c == '$' || c == '_' : isNonASCIIIdentStart(c); +} + +static NEVER_INLINE bool isNonASCIIIdentPart(int c) +{ + return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other + | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector); +} + +static inline bool isIdentPart(int c) +{ + return isASCII(c) ? isASCIIAlphanumeric(c) || c == '$' || c == '_' : isNonASCIIIdentPart(c); +} + +static inline int singleEscape(int c) +{ + switch (c) { + case 'b': + return 0x08; + case 't': + return 0x09; + case 'n': + return 0x0A; + case 'v': + return 0x0B; + case 'f': + return 0x0C; + case 'r': + return 0x0D; + default: + return c; } } -// called on each new line -void Lexer::nextLine() +inline void Lexer::record8(int c) { - yylineno++; - m_atLineStart = true; + ASSERT(c >= 0); + ASSERT(c <= 0xFF); + m_buffer8.append(static_cast<char>(c)); } -void Lexer::setDone(State s) +inline void Lexer::record16(UChar c) { - m_state = s; - m_done = true; + m_buffer16.append(c); +} + +inline void Lexer::record16(int c) +{ + ASSERT(c >= 0); + ASSERT(c <= USHRT_MAX); + record16(UChar(static_cast<unsigned short>(c))); } int Lexer::lex(void* p1, void* p2) { + ASSERT(!m_error); + ASSERT(m_buffer8.isEmpty()); + ASSERT(m_buffer16.isEmpty()); + YYSTYPE* lvalp = static_cast<YYSTYPE*>(p1); YYLTYPE* llocp = static_cast<YYLTYPE*>(p2); int token = 0; - m_state = Start; - unsigned short stringType = 0; // either single or double quotes - m_buffer8.clear(); - m_buffer16.clear(); - m_done = false; m_terminator = false; - m_skipLF = false; - m_skipCR = false; - - // did we push a token on the stack previously ? - // (after an automatic semicolon insertion) - if (m_stackToken >= 0) { - setDone(Other); - token = m_stackToken; - m_stackToken = 0; - } - int startOffset = m_currentOffset; - while (!m_done) { - if (m_skipLF && m_current != '\n') // found \r but not \n afterwards - m_skipLF = false; - if (m_skipCR && m_current != '\r') // found \n but not \r afterwards - m_skipCR = false; - if (m_skipLF || m_skipCR) { // found \r\n or \n\r -> eat the second one - m_skipLF = false; - m_skipCR = false; - shift(1); + +start: + while (isWhiteSpace(m_current)) + shift1(); + + int startOffset = currentOffset(); + + if (m_current == -1) { + if (!m_terminator && !m_delimited && !m_isReparsing) { + // automatic semicolon insertion if program incomplete + token = ';'; + goto doneSemicolon; } - switch (m_state) { - case Start: - startOffset = m_currentOffset; - if (isWhiteSpace()) { - // do nothing - } else if (m_current == '/' && m_next1 == '/') { - shift(1); - m_state = InSingleLineComment; - } else if (m_current == '/' && m_next1 == '*') { - shift(1); - m_state = InMultiLineComment; - } else if (m_current == -1) { - if (!m_terminator && !m_delimited && !m_isReparsing) { - // automatic semicolon insertion if program incomplete - token = ';'; - m_stackToken = 0; - setDone(Other); - } else - setDone(Eof); - } else if (isLineTerminator()) { - nextLine(); - m_terminator = true; - if (m_restrKeyword) { - token = ';'; - setDone(Other); - } - } else if (m_current == '"' || m_current == '\'') { - m_state = InString; - stringType = static_cast<unsigned short>(m_current); - } else if (isIdentStart(m_current)) { - record16(m_current); - m_state = InIdentifierOrKeyword; - } else if (m_current == '\\') - m_state = InIdentifierStartUnicodeEscapeStart; - else if (m_current == '0') { - record8(m_current); - m_state = InNum0; - } else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InNum; - } else if (m_current == '.' && isDecimalDigit(m_next1)) { - record8(m_current); - m_state = InDecimal; - // <!-- marks the beginning of a line comment (for www usage) - } else if (m_current == '<' && m_next1 == '!' && m_next2 == '-' && m_next3 == '-') { - shift(3); - m_state = InSingleLineComment; - // same for --> - } else if (m_atLineStart && m_current == '-' && m_next1 == '-' && m_next2 == '>') { - shift(2); - m_state = InSingleLineComment; - } else { - token = matchPunctuator(lvalp->intValue, m_current, m_next1, m_next2, m_next3); - if (token != -1) - setDone(Other); - else - setDone(Bad); + return 0; + } + + m_delimited = false; + switch (m_current) { + case '>': + if (m_next1 == '>' && m_next2 == '>') { + if (m_next3 == '=') { + shift4(); + token = URSHIFTEQUAL; + break; } + shift3(); + token = URSHIFT; break; - case InString: - if (m_current == stringType) { - shift(1); - setDone(String); - } else if (isLineTerminator() || m_current == -1) - setDone(Bad); - else if (m_current == '\\') - m_state = InEscapeSequence; - else - record16(m_current); + } + if (m_next1 == '>') { + if (m_next2 == '=') { + shift3(); + token = RSHIFTEQUAL; + break; + } + shift2(); + token = RSHIFT; break; - // Escape Sequences inside of strings - case InEscapeSequence: - if (isOctalDigit(m_current)) { - if (m_current >= '0' && m_current <= '3' && - isOctalDigit(m_next1) && isOctalDigit(m_next2)) { - record16(convertOctal(m_current, m_next1, m_next2)); - shift(2); - m_state = InString; - } else if (isOctalDigit(m_current) && isOctalDigit(m_next1)) { - record16(convertOctal('0', m_current, m_next1)); - shift(1); - m_state = InString; - } else if (isOctalDigit(m_current)) { - record16(convertOctal('0', '0', m_current)); - m_state = InString; - } else - setDone(Bad); - } else if (m_current == 'x') - m_state = InHexEscape; - else if (m_current == 'u') - m_state = InUnicodeEscape; - else if (isLineTerminator()) { - nextLine(); - m_state = InString; - } else { - record16(singleEscape(static_cast<unsigned short>(m_current))); - m_state = InString; + } + if (m_next1 == '=') { + shift2(); + token = GE; + break; + } + shift1(); + token = '>'; + break; + case '=': + if (m_next1 == '=') { + if (m_next2 == '=') { + shift3(); + token = STREQ; + break; } + shift2(); + token = EQEQ; break; - case InHexEscape: - if (isHexDigit(m_current) && isHexDigit(m_next1)) { - m_state = InString; - record16(convertHex(m_current, m_next1)); - shift(1); - } else if (m_current == stringType) { - record16('x'); - shift(1); - setDone(String); - } else { - record16('x'); - record16(m_current); - m_state = InString; + } + shift1(); + token = '='; + break; + case '!': + if (m_next1 == '=') { + if (m_next2 == '=') { + shift3(); + token = STRNEQ; + break; } + shift2(); + token = NE; break; - case InUnicodeEscape: - if (isHexDigit(m_current) && isHexDigit(m_next1) && isHexDigit(m_next2) && isHexDigit(m_next3)) { - record16(convertUnicode(m_current, m_next1, m_next2, m_next3)); - shift(3); - m_state = InString; - } else if (m_current == stringType) { - record16('u'); - shift(1); - setDone(String); - } else - setDone(Bad); + } + shift1(); + token = '!'; + break; + case '<': + if (m_next1 == '!' && m_next2 == '-' && m_next3 == '-') { + // <!-- marks the beginning of a line comment (for www usage) + shift4(); + goto inSingleLineComment; + } + if (m_next1 == '<') { + if (m_next2 == '=') { + shift3(); + token = LSHIFTEQUAL; + break; + } + shift2(); + token = LSHIFT; break; - case InSingleLineComment: - if (isLineTerminator()) { - nextLine(); - m_terminator = true; - if (m_restrKeyword) { - token = ';'; - setDone(Other); - } else - m_state = Start; - } else if (m_current == -1) - setDone(Eof); + } + if (m_next1 == '=') { + shift2(); + token = LE; break; - case InMultiLineComment: - if (m_current == -1) - setDone(Bad); - else if (isLineTerminator()) - nextLine(); - else if (m_current == '*' && m_next1 == '/') { - m_state = Start; - shift(1); + } + shift1(); + token = '<'; + break; + case '+': + if (m_next1 == '+') { + shift2(); + if (m_terminator) { + token = AUTOPLUSPLUS; + break; } + token = PLUSPLUS; break; - case InIdentifierOrKeyword: - case InIdentifier: - if (isIdentPart(m_current)) - record16(m_current); - else if (m_current == '\\') - m_state = InIdentifierPartUnicodeEscapeStart; - else - setDone(m_state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier); + } + if (m_next1 == '=') { + shift2(); + token = PLUSEQUAL; break; - case InNum0: - if (m_current == 'x' || m_current == 'X') { - record8(m_current); - m_state = InHex; - } else if (m_current == '.') { - record8(m_current); - m_state = InDecimal; - } else if (m_current == 'e' || m_current == 'E') { - record8(m_current); - m_state = InExponentIndicator; - } else if (isOctalDigit(m_current)) { - record8(m_current); - m_state = InOctal; - } else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InDecimal; - } else - setDone(Number); + } + shift1(); + token = '+'; + break; + case '-': + if (m_next1 == '-') { + if (m_atLineStart && m_next2 == '>') { + shift3(); + goto inSingleLineComment; + } + shift2(); + if (m_terminator) { + token = AUTOMINUSMINUS; + break; + } + token = MINUSMINUS; break; - case InHex: - if (isHexDigit(m_current)) - record8(m_current); - else - setDone(Hex); + } + if (m_next1 == '=') { + shift2(); + token = MINUSEQUAL; break; - case InOctal: - if (isOctalDigit(m_current)) - record8(m_current); - else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InDecimal; - } else - setDone(Octal); + } + shift1(); + token = '-'; + break; + case '*': + if (m_next1 == '=') { + shift2(); + token = MULTEQUAL; break; - case InNum: - if (isDecimalDigit(m_current)) - record8(m_current); - else if (m_current == '.') { - record8(m_current); - m_state = InDecimal; - } else if (m_current == 'e' || m_current == 'E') { - record8(m_current); - m_state = InExponentIndicator; - } else - setDone(Number); + } + shift1(); + token = '*'; + break; + case '/': + if (m_next1 == '/') { + shift2(); + goto inSingleLineComment; + } + if (m_next1 == '*') + goto inMultiLineComment; + if (m_next1 == '=') { + shift2(); + token = DIVEQUAL; break; - case InDecimal: - if (isDecimalDigit(m_current)) - record8(m_current); - else if (m_current == 'e' || m_current == 'E') { - record8(m_current); - m_state = InExponentIndicator; - } else - setDone(Number); + } + shift1(); + token = '/'; + break; + case '&': + if (m_next1 == '&') { + shift2(); + token = AND; break; - case InExponentIndicator: - if (m_current == '+' || m_current == '-') - record8(m_current); - else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InExponent; - } else - setDone(Bad); + } + if (m_next1 == '=') { + shift2(); + token = ANDEQUAL; break; - case InExponent: - if (isDecimalDigit(m_current)) - record8(m_current); - else - setDone(Number); + } + shift1(); + token = '&'; + break; + case '^': + if (m_next1 == '=') { + shift2(); + token = XOREQUAL; break; - case InIdentifierStartUnicodeEscapeStart: - if (m_current == 'u') - m_state = InIdentifierStartUnicodeEscape; - else - setDone(Bad); + } + shift1(); + token = '^'; + break; + case '%': + if (m_next1 == '=') { + shift2(); + token = MODEQUAL; break; - case InIdentifierPartUnicodeEscapeStart: - if (m_current == 'u') - m_state = InIdentifierPartUnicodeEscape; - else - setDone(Bad); + } + shift1(); + token = '%'; + break; + case '|': + if (m_next1 == '=') { + shift2(); + token = OREQUAL; break; - case InIdentifierStartUnicodeEscape: - if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) { - setDone(Bad); - break; - } - token = convertUnicode(m_current, m_next1, m_next2, m_next3); - shift(3); - if (!isIdentStart(token)) { - setDone(Bad); - break; - } - record16(token); - m_state = InIdentifier; + } + if (m_next1 == '|') { + shift2(); + token = OR; break; - case InIdentifierPartUnicodeEscape: - if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) { - setDone(Bad); - break; - } - token = convertUnicode(m_current, m_next1, m_next2, m_next3); - shift(3); - if (!isIdentPart(token)) { - setDone(Bad); - break; + } + shift1(); + token = '|'; + break; + case '.': + if (isASCIIDigit(m_next1)) { + record8('.'); + shift1(); + goto inNumberAfterDecimalPoint; + } + token = '.'; + shift1(); + break; + case ',': + case '~': + case '?': + case ':': + case '(': + case ')': + case '[': + case ']': + token = m_current; + shift1(); + break; + case ';': + shift1(); + m_delimited = true; + token = ';'; + break; + case '{': + lvalp->intValue = currentOffset(); + shift1(); + token = OPENBRACE; + break; + case '}': + lvalp->intValue = currentOffset(); + shift1(); + m_delimited = true; + token = CLOSEBRACE; + break; + case '\\': + goto startIdentifierWithBackslash; + case '0': + goto startNumberWithZeroDigit; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + goto startNumber; + case '"': + case '\'': + goto startString; + default: + if (isIdentStart(m_current)) + goto startIdentifierOrKeyword; + if (isLineTerminator(m_current)) { + shiftLineTerminator(); + m_atLineStart = true; + m_terminator = true; + if (lastTokenWasRestrKeyword()) { + token = ';'; + goto doneSemicolon; } - record16(token); - m_state = InIdentifier; - break; - default: - ASSERT(!"Unhandled state in switch statement"); - } - - // move on to the next character - if (!m_done) - shift(1); - if (m_state != Start && m_state != InSingleLineComment) - m_atLineStart = false; + goto start; + } + goto returnError; } - // no identifiers allowed directly after numeric literal, e.g. "3in" is bad - if ((m_state == Number || m_state == Octal || m_state == Hex) && isIdentStart(m_current)) - m_state = Bad; + m_atLineStart = false; + goto returnToken; - // terminate string - m_buffer8.append('\0'); - -#ifdef JSC_DEBUG_LEX - fprintf(stderr, "line: %d ", lineNo()); - fprintf(stderr, "yytext (%x): ", m_buffer8[0]); - fprintf(stderr, "%s ", m_buffer8.data()); -#endif +startString: { + int stringQuoteCharacter = m_current; + shift1(); - double dval = 0; - if (m_state == Number) - dval = WTF::strtod(m_buffer8.data(), 0L); - else if (m_state == Hex) { // scan hex numbers - const char* p = m_buffer8.data() + 2; - while (char c = *p++) { - dval *= 16; - dval += convertHex(c); + const UChar* stringStart = currentCharacter(); + while (m_current != stringQuoteCharacter) { + // Fast check for characters that require special handling. + // Catches -1, \n, \r, \, 0x2028, and 0x2029 as efficiently + // as possible, and lets through all common ASCII characters. + if (UNLIKELY(m_current == '\\') || UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) { + m_buffer16.append(stringStart, currentCharacter() - stringStart); + goto inString; + } + shift1(); + } + lvalp->ident = makeIdentifier(stringStart, currentCharacter() - stringStart); + shift1(); + m_atLineStart = false; + m_delimited = false; + token = STRING; + goto returnToken; + +inString: + while (m_current != stringQuoteCharacter) { + if (m_current == '\\') + goto inStringEscapeSequence; + if (UNLIKELY(isLineTerminator(m_current))) + goto returnError; + if (UNLIKELY(m_current == -1)) + goto returnError; + record16(m_current); + shift1(); + } + goto doneString; + +inStringEscapeSequence: + shift1(); + if (m_current == 'x') { + shift1(); + if (isASCIIHexDigit(m_current) && isASCIIHexDigit(m_next1)) { + record16(convertHex(m_current, m_next1)); + shift2(); + goto inString; } + record16('x'); + if (m_current == stringQuoteCharacter) + goto doneString; + goto inString; + } + if (m_current == 'u') { + shift1(); + if (isASCIIHexDigit(m_current) && isASCIIHexDigit(m_next1) && isASCIIHexDigit(m_next2) && isASCIIHexDigit(m_next3)) { + record16(convertUnicode(m_current, m_next1, m_next2, m_next3)); + shift4(); + goto inString; + } + if (m_current == stringQuoteCharacter) { + record16('u'); + goto doneString; + } + goto returnError; + } + if (isASCIIOctalDigit(m_current)) { + if (m_current >= '0' && m_current <= '3' && isASCIIOctalDigit(m_next1) && isASCIIOctalDigit(m_next2)) { + record16((m_current - '0') * 64 + (m_next1 - '0') * 8 + m_next2 - '0'); + shift3(); + goto inString; + } + if (isASCIIOctalDigit(m_next1)) { + record16((m_current - '0') * 8 + m_next1 - '0'); + shift2(); + goto inString; + } + record16(m_current - '0'); + shift1(); + goto inString; + } + if (isLineTerminator(m_current)) { + shiftLineTerminator(); + goto inString; + } + record16(singleEscape(m_current)); + shift1(); + goto inString; +} - if (dval >= mantissaOverflowLowerBound) - dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16); +startIdentifierWithBackslash: + shift1(); + if (UNLIKELY(m_current != 'u')) + goto returnError; + shift1(); + if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(m_next1) || !isASCIIHexDigit(m_next2) || !isASCIIHexDigit(m_next3))) + goto returnError; + token = convertUnicode(m_current, m_next1, m_next2, m_next3); + if (UNLIKELY(!isIdentStart(token))) + goto returnError; + goto inIdentifierAfterCharacterCheck; + +startIdentifierOrKeyword: { + const UChar* identifierStart = currentCharacter(); + shift1(); + while (isIdentPart(m_current)) + shift1(); + if (LIKELY(m_current != '\\')) { + lvalp->ident = makeIdentifier(identifierStart, currentCharacter() - identifierStart); + goto doneIdentifierOrKeyword; + } + m_buffer16.append(identifierStart, currentCharacter() - identifierStart); +} - m_state = Number; - } else if (m_state == Octal) { // scan octal number - const char* p = m_buffer8.data() + 1; - while (char c = *p++) { - dval *= 8; - dval += c - '0'; + do { + shift1(); + if (UNLIKELY(m_current != 'u')) + goto returnError; + shift1(); + if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(m_next1) || !isASCIIHexDigit(m_next2) || !isASCIIHexDigit(m_next3))) + goto returnError; + token = convertUnicode(m_current, m_next1, m_next2, m_next3); + if (UNLIKELY(!isIdentPart(token))) + goto returnError; +inIdentifierAfterCharacterCheck: + record16(token); + shift4(); + + while (isIdentPart(m_current)) { + record16(m_current); + shift1(); } + } while (UNLIKELY(m_current == '\\')); + goto doneIdentifier; - if (dval >= mantissaOverflowLowerBound) - dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8); - - m_state = Number; +inSingleLineComment: + while (!isLineTerminator(m_current)) { + if (UNLIKELY(m_current == -1)) + return 0; + shift1(); } - -#ifdef JSC_DEBUG_LEX - switch (m_state) { - case Eof: - printf("(EOF)\n"); - break; - case Other: - printf("(Other)\n"); - break; - case Identifier: - printf("(Identifier)/(Keyword)\n"); - break; - case String: - printf("(String)\n"); - break; - case Number: - printf("(Number)\n"); - break; - default: - printf("(unknown)"); + shiftLineTerminator(); + m_atLineStart = true; + m_terminator = true; + if (lastTokenWasRestrKeyword()) + goto doneSemicolon; + goto start; + +inMultiLineComment: + shift2(); + while (m_current != '*' || m_next1 != '/') { + if (isLineTerminator(m_current)) + shiftLineTerminator(); + else { + shift1(); + if (UNLIKELY(m_current == -1)) + goto returnError; + } } -#endif + shift2(); + m_atLineStart = false; + goto start; + +startNumberWithZeroDigit: + shift1(); + if ((m_current | 0x20) == 'x' && isASCIIHexDigit(m_next1)) { + shift1(); + goto inHex; + } + if (m_current == '.') { + record8('0'); + record8('.'); + shift1(); + goto inNumberAfterDecimalPoint; + } + if ((m_current | 0x20) == 'e') { + record8('0'); + record8('e'); + shift1(); + goto inExponentIndicator; + } + if (isASCIIOctalDigit(m_current)) + goto inOctal; + if (isASCIIDigit(m_current)) + goto startNumber; + lvalp->doubleValue = 0; + goto doneNumeric; + +inNumberAfterDecimalPoint: + while (isASCIIDigit(m_current)) { + record8(m_current); + shift1(); + } + if ((m_current | 0x20) == 'e') { + record8('e'); + shift1(); + goto inExponentIndicator; + } + goto doneNumber; + +inExponentIndicator: + if (m_current == '+' || m_current == '-') { + record8(m_current); + shift1(); + } + if (!isASCIIDigit(m_current)) + goto returnError; + do { + record8(m_current); + shift1(); + } while (isASCIIDigit(m_current)); + goto doneNumber; + +inOctal: { + do { + record8(m_current); + shift1(); + } while (isASCIIOctalDigit(m_current)); + if (isASCIIDigit(m_current)) + goto startNumber; - if (m_state != Identifier) - m_eatNextIdentifier = false; + double dval = 0; - m_restrKeyword = false; - m_delimited = false; - llocp->first_line = yylineno; - llocp->last_line = yylineno; - llocp->first_column = startOffset; - llocp->last_column = m_currentOffset; - switch (m_state) { - case Eof: - token = 0; - break; - case Other: - if (token == '}' || token == ';') - m_delimited = true; - break; - case Identifier: - // Apply anonymous-function hack below (eat the identifier). - if (m_eatNextIdentifier) { - m_eatNextIdentifier = false; - token = lex(lvalp, llocp); - break; - } - lvalp->ident = makeIdentifier(m_buffer16); - token = IDENT; - break; - case IdentifierOrKeyword: { - lvalp->ident = makeIdentifier(m_buffer16); - const HashEntry* entry = m_mainTable.entry(m_globalData, *lvalp->ident); - if (!entry) { - // Lookup for keyword failed, means this is an identifier. - token = IDENT; - break; - } - token = entry->lexerValue(); - // Hack for "f = function somename() { ... }"; too hard to get into the grammar. - m_eatNextIdentifier = token == FUNCTION && m_lastToken == '='; - if (token == CONTINUE || token == BREAK || token == RETURN || token == THROW) - m_restrKeyword = true; - break; - } - case String: - // Atomize constant strings in case they're later used in property lookup. - lvalp->ident = makeIdentifier(m_buffer16); - token = STRING; - break; - case Number: - lvalp->doubleValue = dval; - token = NUMBER; - break; - case Bad: -#ifdef JSC_DEBUG_LEX - fprintf(stderr, "yylex: ERROR.\n"); -#endif - m_error = true; - return -1; - default: - ASSERT(!"unhandled numeration value in switch"); - m_error = true; - return -1; + const char* end = m_buffer8.end(); + for (const char* p = m_buffer8.data(); p < end; ++p) { + dval *= 8; + dval += *p - '0'; } - m_lastToken = token; - return token; -} + if (dval >= mantissaOverflowLowerBound) + dval = parseIntOverflow(m_buffer8.data(), end - m_buffer8.data(), 8); -bool Lexer::isWhiteSpace() const -{ - return m_current == '\t' || m_current == 0x0b || m_current == 0x0c || isSeparatorSpace(m_current); -} + m_buffer8.resize(0); -bool Lexer::isLineTerminator() -{ - bool cr = (m_current == '\r'); - bool lf = (m_current == '\n'); - if (cr) - m_skipLF = true; - else if (lf) - m_skipCR = true; - return cr || lf || m_current == 0x2028 || m_current == 0x2029; + lvalp->doubleValue = dval; + goto doneNumeric; } -bool Lexer::isIdentStart(int c) -{ - return isASCIIAlpha(c) || c == '$' || c == '_' || (!isASCII(c) && (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other))); -} +inHex: { + do { + record8(m_current); + shift1(); + } while (isASCIIHexDigit(m_current)); -bool Lexer::isIdentPart(int c) -{ - return isASCIIAlphanumeric(c) || c == '$' || c == '_' || (!isASCII(c) && (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other - | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector))); -} + double dval = 0; -static bool isDecimalDigit(int c) -{ - return isASCIIDigit(c); -} + const char* end = m_buffer8.end(); + for (const char* p = m_buffer8.data(); p < end; ++p) { + dval *= 16; + dval += toASCIIHexValue(*p); + } + if (dval >= mantissaOverflowLowerBound) + dval = parseIntOverflow(m_buffer8.data(), end - m_buffer8.data(), 16); -bool Lexer::isHexDigit(int c) -{ - return isASCIIHexDigit(c); -} + m_buffer8.resize(0); -bool Lexer::isOctalDigit(int c) -{ - return isASCIIOctalDigit(c); + lvalp->doubleValue = dval; + goto doneNumeric; } -int Lexer::matchPunctuator(int& charPos, int c1, int c2, int c3, int c4) -{ - if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') { - shift(4); - return URSHIFTEQUAL; - } - if (c1 == '=' && c2 == '=' && c3 == '=') { - shift(3); - return STREQ; - } - if (c1 == '!' && c2 == '=' && c3 == '=') { - shift(3); - return STRNEQ; - } - if (c1 == '>' && c2 == '>' && c3 == '>') { - shift(3); - return URSHIFT; - } - if (c1 == '<' && c2 == '<' && c3 == '=') { - shift(3); - return LSHIFTEQUAL; - } - if (c1 == '>' && c2 == '>' && c3 == '=') { - shift(3); - return RSHIFTEQUAL; - } - if (c1 == '<' && c2 == '=') { - shift(2); - return LE; - } - if (c1 == '>' && c2 == '=') { - shift(2); - return GE; - } - if (c1 == '!' && c2 == '=') { - shift(2); - return NE; - } - if (c1 == '+' && c2 == '+') { - shift(2); - if (m_terminator) - return AUTOPLUSPLUS; - return PLUSPLUS; - } - if (c1 == '-' && c2 == '-') { - shift(2); - if (m_terminator) - return AUTOMINUSMINUS; - return MINUSMINUS; - } - if (c1 == '=' && c2 == '=') { - shift(2); - return EQEQ; - } - if (c1 == '+' && c2 == '=') { - shift(2); - return PLUSEQUAL; - } - if (c1 == '-' && c2 == '=') { - shift(2); - return MINUSEQUAL; - } - if (c1 == '*' && c2 == '=') { - shift(2); - return MULTEQUAL; - } - if (c1 == '/' && c2 == '=') { - shift(2); - return DIVEQUAL; - } - if (c1 == '&' && c2 == '=') { - shift(2); - return ANDEQUAL; - } - if (c1 == '^' && c2 == '=') { - shift(2); - return XOREQUAL; - } - if (c1 == '%' && c2 == '=') { - shift(2); - return MODEQUAL; - } - if (c1 == '|' && c2 == '=') { - shift(2); - return OREQUAL; - } - if (c1 == '<' && c2 == '<') { - shift(2); - return LSHIFT; - } - if (c1 == '>' && c2 == '>') { - shift(2); - return RSHIFT; +startNumber: + record8(m_current); + shift1(); + while (isASCIIDigit(m_current)) { + record8(m_current); + shift1(); } - if (c1 == '&' && c2 == '&') { - shift(2); - return AND; + if (m_current == '.') { + record8('.'); + shift1(); + goto inNumberAfterDecimalPoint; } - if (c1 == '|' && c2 == '|') { - shift(2); - return OR; + if ((m_current | 0x20) == 'e') { + record8('e'); + shift1(); + goto inExponentIndicator; } - switch (c1) { - case '=': - case '>': - case '<': - case ',': - case '!': - case '~': - case '?': - case ':': - case '.': - case '+': - case '-': - case '*': - case '/': - case '&': - case '|': - case '^': - case '%': - case '(': - case ')': - case '[': - case ']': - case ';': - shift(1); - return static_cast<int>(c1); - case '{': - charPos = m_position - 4; - shift(1); - return OPENBRACE; - case '}': - charPos = m_position - 4; - shift(1); - return CLOSEBRACE; - default: - return -1; - } -} + // Fall through into doneNumber. -unsigned short Lexer::singleEscape(unsigned short c) -{ - switch (c) { - case 'b': - return 0x08; - case 't': - return 0x09; - case 'n': - return 0x0A; - case 'v': - return 0x0B; - case 'f': - return 0x0C; - case 'r': - return 0x0D; - case '"': - return 0x22; - case '\'': - return 0x27; - case '\\': - return 0x5C; - default: - return c; - } -} +doneNumber: + // Null-terminate string for strtod. + m_buffer8.append('\0'); + lvalp->doubleValue = WTF::strtod(m_buffer8.data(), 0); + m_buffer8.resize(0); -unsigned short Lexer::convertOctal(int c1, int c2, int c3) -{ - return static_cast<unsigned short>((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0'); -} + // Fall through into doneNumeric. -unsigned char Lexer::convertHex(int c) -{ - if (c >= '0' && c <= '9') - return static_cast<unsigned char>(c - '0'); - if (c >= 'a' && c <= 'f') - return static_cast<unsigned char>(c - 'a' + 10); - return static_cast<unsigned char>(c - 'A' + 10); -} +doneNumeric: + // No identifiers allowed directly after numeric literal, e.g. "3in" is bad. + if (UNLIKELY(isIdentStart(m_current))) + goto returnError; -unsigned char Lexer::convertHex(int c1, int c2) -{ - return ((convertHex(c1) << 4) + convertHex(c2)); -} + m_atLineStart = false; + m_delimited = false; + token = NUMBER; + goto returnToken; -UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4) -{ - unsigned char highByte = (convertHex(c1) << 4) + convertHex(c2); - unsigned char lowByte = (convertHex(c3) << 4) + convertHex(c4); - return (highByte << 8 | lowByte); -} +doneSemicolon: + token = ';'; + m_delimited = true; + goto returnToken; -void Lexer::record8(int c) -{ - ASSERT(c >= 0); - ASSERT(c <= 0xff); - m_buffer8.append(static_cast<char>(c)); +doneIdentifier: + m_atLineStart = false; + m_delimited = false; + lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size()); + m_buffer16.resize(0); + token = IDENT; + goto returnToken; + +doneIdentifierOrKeyword: { + m_atLineStart = false; + m_delimited = false; + m_buffer16.resize(0); + const HashEntry* entry = m_keywordTable.entry(m_globalData, *lvalp->ident); + token = entry ? entry->lexerValue() : IDENT; + goto returnToken; } -void Lexer::record16(int c) -{ - ASSERT(c >= 0); - ASSERT(c <= USHRT_MAX); - record16(UChar(static_cast<unsigned short>(c))); +doneString: + // Atomize constant strings in case they're later used in property lookup. + shift1(); + m_atLineStart = false; + m_delimited = false; + lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size()); + m_buffer16.resize(0); + token = STRING; + + // Fall through into returnToken. + +returnToken: { + int lineNumber = m_lineNumber; + llocp->first_line = lineNumber; + llocp->last_line = lineNumber; + llocp->first_column = startOffset; + llocp->last_column = currentOffset(); + + m_lastToken = token; + return token; } -void Lexer::record16(UChar c) -{ - m_buffer16.append(c); +returnError: + m_error = true; + return -1; } bool Lexer::scanRegExp() { - m_buffer16.clear(); + ASSERT(m_buffer16.isEmpty()); + bool lastWasEscape = false; bool inBrackets = false; - while (1) { - if (isLineTerminator() || m_current == -1) + while (true) { + if (isLineTerminator(m_current) || m_current == -1) return false; - else if (m_current != '/' || lastWasEscape == true || inBrackets == true) { + if (m_current != '/' || lastWasEscape || inBrackets) { // keep track of '[' and ']' if (!lastWasEscape) { - if ( m_current == '[' && !inBrackets ) + if (m_current == '[' && !inBrackets) inBrackets = true; - if ( m_current == ']' && inBrackets ) + if (m_current == ']' && inBrackets) inBrackets = false; } record16(m_current); - lastWasEscape = - !lastWasEscape && (m_current == '\\'); + lastWasEscape = !lastWasEscape && m_current == '\\'; } else { // end of regexp m_pattern = UString(m_buffer16); - m_buffer16.clear(); - shift(1); + m_buffer16.resize(0); + shift1(); break; } - shift(1); + shift1(); } while (isIdentPart(m_current)) { record16(m_current); - shift(1); + shift1(); } m_flags = UString(m_buffer16); + m_buffer16.resize(0); return true; } @@ -882,19 +950,42 @@ bool Lexer::scanRegExp() void Lexer::clear() { m_identifiers.clear(); + m_codeWithoutBOMs.clear(); Vector<char> newBuffer8; - newBuffer8.reserveCapacity(initialReadBufferCapacity); + newBuffer8.reserveInitialCapacity(initialReadBufferCapacity); m_buffer8.swap(newBuffer8); Vector<UChar> newBuffer16; - newBuffer16.reserveCapacity(initialReadBufferCapacity); + newBuffer16.reserveInitialCapacity(initialReadBufferCapacity); m_buffer16.swap(newBuffer16); m_isReparsing = false; - m_pattern = 0; - m_flags = 0; + m_pattern = UString(); + m_flags = UString(); +} + +SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine) +{ + if (m_codeWithoutBOMs.isEmpty()) + return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine); + + const UChar* data = m_source->provider()->data(); + + ASSERT(openBrace < closeBrace); + + int numBOMsBeforeOpenBrace = 0; + int numBOMsBetweenBraces = 0; + + int i; + for (i = m_source->startOffset(); i < openBrace; ++i) + numBOMsBeforeOpenBrace += data[i] == byteOrderMark; + for (; i < closeBrace; ++i) + numBOMsBetweenBraces += data[i] == byteOrderMark; + + return SourceCode(m_source->provider(), openBrace + numBOMsBeforeOpenBrace, + closeBrace + numBOMsBeforeOpenBrace + numBOMsBetweenBraces + 1, firstLine); } } // namespace JSC |