diff options
Diffstat (limited to 'deps/v8/src/scanner.h')
-rw-r--r-- | deps/v8/src/scanner.h | 303 |
1 files changed, 44 insertions, 259 deletions
diff --git a/deps/v8/src/scanner.h b/deps/v8/src/scanner.h index df5cd72949..acb9b47bd9 100644 --- a/deps/v8/src/scanner.h +++ b/deps/v8/src/scanner.h @@ -35,65 +35,6 @@ namespace v8 { namespace internal { - -class UTF8Buffer { - public: - UTF8Buffer(); - ~UTF8Buffer(); - - inline void AddChar(uc32 c) { - if (recording_) { - if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { - buffer_.Add(static_cast<char>(c)); - } else { - AddCharSlow(c); - } - } - } - - void StartLiteral() { - buffer_.StartSequence(); - recording_ = true; - } - - Vector<const char> EndLiteral() { - if (recording_) { - recording_ = false; - buffer_.Add(kEndMarker); - Vector<char> sequence = buffer_.EndSequence(); - return Vector<const char>(sequence.start(), sequence.length()); - } - return Vector<const char>(); - } - - void DropLiteral() { - if (recording_) { - recording_ = false; - buffer_.DropSequence(); - } - } - - void Reset() { - buffer_.Reset(); - } - - // The end marker added after a parsed literal. - // Using zero allows the usage of strlen and similar functions on - // identifiers and numbers (but not strings, since they may contain zero - // bytes). - // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside - // an utf-8 string. This requires changes in all places that uses - // str-functions on the literals, but allows a single pointer to represent - // the literal, even if it contains embedded zeros. - static const char kEndMarker = '\x00'; - private: - static const int kInitialCapacity = 256; - SequenceCollector<char, 4> buffer_; - bool recording_; - void AddCharSlow(uc32 c); -}; - - // UTF16 buffer to read characters from a character stream. class CharacterStreamUTF16Buffer: public UTF16Buffer { public: @@ -134,175 +75,65 @@ class ExternalStringUTF16Buffer: public UTF16Buffer { }; -enum ParserLanguage { JAVASCRIPT, JSON }; - - -class Scanner { +// Initializes a UTF16Buffer as input stream, using one of a number +// of strategies depending on the available character sources. +class StreamInitializer { public: - typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; + UTF16Buffer* Init(Handle<String> source, + unibrow::CharacterStream* stream, + int start_position, + int end_position); + private: + // Different UTF16 buffers used to pull characters from. Based on input one of + // these will be initialized as the actual data source. + CharacterStreamUTF16Buffer char_stream_buffer_; + ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> + two_byte_string_buffer_; + ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; - class LiteralScope { - public: - explicit LiteralScope(Scanner* self); - ~LiteralScope(); - void Complete(); + // Used to convert the source string into a character stream when a stream + // is not passed to the scanner. + SafeStringInputBuffer safe_string_input_buffer_; +}; - private: - Scanner* scanner_; - bool complete_; - }; +// ---------------------------------------------------------------------------- +// V8JavaScriptScanner +// JavaScript scanner getting its input from either a V8 String or a unicode +// CharacterStream. - Scanner(); +class V8JavaScriptScanner : public JavaScriptScanner { + public: + V8JavaScriptScanner() {} + + Token::Value NextCheckStack(); // Initialize the Scanner to scan source. - void Initialize(Handle<String> source, - ParserLanguage language); + void Initialize(Handle<String> source, int literal_flags = kAllLiterals); void Initialize(Handle<String> source, unibrow::CharacterStream* stream, - ParserLanguage language); + int literal_flags = kAllLiterals); void Initialize(Handle<String> source, int start_position, int end_position, - ParserLanguage language); - - // Returns the next token. - Token::Value Next(); - - // Returns the current token again. - Token::Value current_token() { return current_.token; } - - // One token look-ahead (past the token returned by Next()). - Token::Value peek() const { return next_.token; } - - // Returns true if there was a line terminator before the peek'ed token. - bool has_line_terminator_before_next() const { - return has_line_terminator_before_next_; - } - - struct Location { - Location(int b, int e) : beg_pos(b), end_pos(e) { } - Location() : beg_pos(0), end_pos(0) { } - int beg_pos; - int end_pos; - }; - - // Returns the location information for the current token - // (the token returned by Next()). - Location location() const { return current_.location; } - Location peek_location() const { return next_.location; } - - // Returns the literal string, if any, for the current token (the - // token returned by Next()). The string is 0-terminated and in - // UTF-8 format; they may contain 0-characters. Literal strings are - // collected for identifiers, strings, and numbers. - // These functions only give the correct result if the literal - // was scanned between calls to StartLiteral() and TerminateLiteral(). - const char* literal_string() const { - return current_.literal_chars.start(); - } - - int literal_length() const { - // Excluding terminal '\x00' added by TerminateLiteral(). - return current_.literal_chars.length() - 1; - } - - Vector<const char> literal() const { - return Vector<const char>(literal_string(), literal_length()); - } - - // Returns the literal string for the next token (the token that - // would be returned if Next() were called). - const char* next_literal_string() const { - return next_.literal_chars.start(); - } - - - // Returns the length of the next token (that would be returned if - // Next() were called). - int next_literal_length() const { - // Excluding terminal '\x00' added by TerminateLiteral(). - return next_.literal_chars.length() - 1; - } - - Vector<const char> next_literal() const { - return Vector<const char>(next_literal_string(), next_literal_length()); - } - - // Scans the input as a regular expression pattern, previous - // character(s) must be /(=). Returns true if a pattern is scanned. - bool ScanRegExpPattern(bool seen_equal); - // Returns true if regexp flags are scanned (always since flags can - // be empty). - bool ScanRegExpFlags(); + int literal_flags = kAllLiterals); - // Seek forward to the given position. This operation does not - // work in general, for instance when there are pushed back - // characters, but works for seeking forward until simple delimiter - // tokens, which is what it is used for. - void SeekForward(int pos); - - bool stack_overflow() { return stack_overflow_; } + protected: + StreamInitializer stream_initializer_; +}; - // Tells whether the buffer contains an identifier (no escapes). - // Used for checking if a property name is an identifier. - static bool IsIdentifier(unibrow::CharacterStream* buffer); - static const int kCharacterLookaheadBufferSize = 1; - static const int kNoEndPosition = 1; +class JsonScanner : public Scanner { + public: + JsonScanner(); - private: - // The current and look-ahead token. - struct TokenDesc { - Token::Value token; - Location location; - Vector<const char> literal_chars; - }; - - void Init(Handle<String> source, - unibrow::CharacterStream* stream, - int start_position, int end_position, - ParserLanguage language); - - // Literal buffer support - inline void StartLiteral(); - inline void AddLiteralChar(uc32 ch); - inline void AddLiteralCharAdvance(); - inline void TerminateLiteral(); - // Stops scanning of a literal, e.g., due to an encountered error. - inline void DropLiteral(); - - // Low-level scanning support. - void Advance() { c0_ = source_->Advance(); } - void PushBack(uc32 ch) { - source_->PushBack(ch); - c0_ = ch; - } + // Initialize the Scanner to scan source. + void Initialize(Handle<String> source); - bool SkipWhiteSpace() { - if (is_parsing_json_) { - return SkipJsonWhiteSpace(); - } else { - return SkipJavaScriptWhiteSpace(); - } - } + // Returns the next token. + Token::Value Next(); - bool SkipJavaScriptWhiteSpace(); + protected: + // Skip past JSON whitespace (only space, tab, newline and carrige-return). bool SkipJsonWhiteSpace(); - Token::Value SkipSingleLineComment(); - Token::Value SkipMultiLineComment(); - - inline Token::Value Select(Token::Value tok); - inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); - - inline void Scan() { - if (is_parsing_json_) { - ScanJson(); - } else { - ScanJavaScript(); - } - } - - // Scans a single JavaScript token. - void ScanJavaScript(); // Scan a single JSON token. The JSON lexical grammar is specified in the // ECMAScript 5 standard, section 15.12.1.1. @@ -331,53 +162,7 @@ class Scanner { // JSONNullLiteral). Token::Value ScanJsonIdentifier(const char* text, Token::Value token); - void ScanDecimalDigits(); - Token::Value ScanNumber(bool seen_period); - Token::Value ScanIdentifier(); - uc32 ScanHexEscape(uc32 c, int length); - uc32 ScanOctalEscape(uc32 c, int length); - void ScanEscape(); - Token::Value ScanString(); - - // Scans a possible HTML comment -- begins with '<!'. - Token::Value ScanHtmlComment(); - - // Return the current source position. - int source_pos() { - return source_->pos() - kCharacterLookaheadBufferSize; - } - - // Decodes a unicode escape-sequence which is part of an identifier. - // If the escape sequence cannot be decoded the result is kBadRune. - uc32 ScanIdentifierUnicodeEscape(); - - TokenDesc current_; // desc for current token (as returned by Next()) - TokenDesc next_; // desc for next token (one token look-ahead) - bool has_line_terminator_before_next_; - bool is_parsing_json_; - - // Different UTF16 buffers used to pull characters from. Based on input one of - // these will be initialized as the actual data source. - CharacterStreamUTF16Buffer char_stream_buffer_; - ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> - two_byte_string_buffer_; - ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; - - // Source. Will point to one of the buffers declared above. - UTF16Buffer* source_; - - // Used to convert the source string into a character stream when a stream - // is not passed to the scanner. - SafeStringInputBuffer safe_string_input_buffer_; - - // Buffer to hold literal values (identifiers, strings, numbers) - // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. - UTF8Buffer literal_buffer_; - - bool stack_overflow_; - - // One Unicode character look-ahead; c0_ < 0 at the end of the input. - uc32 c0_; + StreamInitializer stream_initializer_; }; @@ -400,7 +185,7 @@ void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( SeekForward(start_position); } end_ = - end_position != Scanner::kNoEndPosition ? end_position : data->length(); + end_position != kNoEndPosition ? end_position : data->length(); } |