/* valascanner.vala * * Copyright (C) 2008 Jürg Billeter * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * Author: * Jürg Billeter */ using GLib; using Gee; /** * Lexical scanner for Vala source files. */ public class Vala.Scanner : Object { public SourceFile source_file { get; construct; } char* begin; char* current; char* end; int line; int column; string _comment; public Scanner (SourceFile source_file) { this.source_file = source_file; } construct { begin = source_file.get_mapped_contents (); end = begin + source_file.get_mapped_length (); current = begin; line = 1; column = 1; } bool is_ident_char (char c) { return (c.isalnum () || c == '_'); } TokenType get_identifier_or_keyword (char* begin, int len) { switch (len) { case 2: switch (begin[0]) { case 'a': if (matches (begin, "as")) return TokenType.AS; break; case 'd': if (matches (begin, "do")) return TokenType.DO; break; case 'i': switch (begin[1]) { case 'f': return TokenType.IF; case 'n': return TokenType.IN; case 's': return TokenType.IS; } break; } break; case 3: switch (begin[0]) { case 'f': if (matches (begin, "for")) return TokenType.FOR; break; case 'g': if (matches (begin, "get")) return TokenType.GET; break; case 'n': if (matches (begin, "new")) return TokenType.NEW; break; case 'o': if (matches (begin, "out")) return TokenType.OUT; break; case 'r': if (matches (begin, "ref")) return TokenType.REF; break; case 's': if (matches (begin, "set")) return TokenType.SET; break; case 't': if (matches (begin, "try")) return TokenType.TRY; break; case 'v': if (matches (begin, "var")) return TokenType.VAR; break; } break; case 4: switch (begin[0]) { case 'b': if (matches (begin, "base")) return TokenType.BASE; break; case 'c': if (matches (begin, "case")) return TokenType.CASE; break; case 'e': switch (begin[1]) { case 'l': if (matches (begin, "else")) return TokenType.ELSE; break; case 'n': if (matches (begin, "enum")) return TokenType.ENUM; break; } break; case 'l': if (matches (begin, "lock")) return TokenType.LOCK; break; case 'n': if (matches (begin, "null")) return TokenType.NULL; break; case 't': switch (begin[1]) { case 'h': if (matches (begin, "this")) return TokenType.THIS; break; case 'r': if (matches (begin, "true")) return TokenType.TRUE; break; } break; case 'v': if (matches (begin, "void")) return TokenType.VOID; break; case 'w': if (matches (begin, "weak")) return TokenType.WEAK; break; } break; case 5: switch (begin[0]) { case 'b': if (matches (begin, "break")) return TokenType.BREAK; break; case 'c': switch (begin[1]) { case 'a': if (matches (begin, "catch")) return TokenType.CATCH; break; case 'l': if (matches (begin, "class")) return TokenType.CLASS; break; case 'o': if (matches (begin, "const")) return TokenType.CONST; break; } break; case 'f': if (matches (begin, "false")) return TokenType.FALSE; break; case 't': if (matches (begin, "throw")) return TokenType.THROW; break; case 'u': if (matches (begin, "using")) return TokenType.USING; break; case 'w': if (matches (begin, "while")) return TokenType.WHILE; break; } break; case 6: switch (begin[0]) { case 'd': if (matches (begin, "delete")) return TokenType.DELETE; break; case 'i': if (matches (begin, "inline")) return TokenType.INLINE; break; case 'p': if (matches (begin, "public")) return TokenType.PUBLIC; break; case 'r': if (matches (begin, "return")) return TokenType.RETURN; break; case 's': switch (begin[1]) { case 'i': switch (begin[2]) { case 'g': if (matches (begin, "signal")) return TokenType.SIGNAL; break; case 'z': if (matches (begin, "sizeof")) return TokenType.SIZEOF; break; } break; case 't': switch (begin[2]) { case 'a': if (matches (begin, "static")) return TokenType.STATIC; break; case 'r': if (matches (begin, "struct")) return TokenType.STRUCT; break; } break; case 'w': if (matches (begin, "switch")) return TokenType.SWITCH; break; } break; case 't': switch (begin[1]) { case 'h': if (matches (begin, "throws")) return TokenType.THROWS; break; case 'y': if (matches (begin, "typeof")) return TokenType.TYPEOF; break; } break; } break; case 7: switch (begin[0]) { case 'd': if (matches (begin, "default")) return TokenType.DEFAULT; break; case 'e': if (matches (begin, "ensures")) return TokenType.ENSURES; break; case 'f': switch (begin[1]) { case 'i': if (matches (begin, "finally")) return TokenType.FINALLY; break; case 'o': if (matches (begin, "foreach")) return TokenType.FOREACH; break; } break; case 'p': if (matches (begin, "private")) return TokenType.PRIVATE; break; case 'v': if (matches (begin, "virtual")) return TokenType.VIRTUAL; break; } break; case 8: switch (begin[0]) { case 'a': if (matches (begin, "abstract")) return TokenType.ABSTRACT; break; case 'c': if (matches (begin, "continue")) return TokenType.CONTINUE; break; case 'd': if (matches (begin, "delegate")) return TokenType.DELEGATE; break; case 'o': if (matches (begin, "override")) return TokenType.OVERRIDE; break; case 'r': if (matches (begin, "requires")) return TokenType.REQUIRES; break; case 'v': if (matches (begin, "volatile")) return TokenType.VOLATILE; break; } break; case 9: switch (begin[0]) { case 'c': if (matches (begin, "construct")) return TokenType.CONSTRUCT; break; case 'i': if (matches (begin, "interface")) return TokenType.INTERFACE; break; case 'n': if (matches (begin, "namespace")) return TokenType.NAMESPACE; break; case 'p': if (matches (begin, "protected")) return TokenType.PROTECTED; break; } break; case 11: if (matches (begin, "errordomain")) return TokenType.ERRORDOMAIN; break; } return TokenType.IDENTIFIER; } public TokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) { space (); TokenType type; char* begin = current; token_begin.pos = begin; token_begin.line = line; token_begin.column = column; int token_length_in_chars = -1; if (current >= end) { type = TokenType.EOF; } else if (current[0].isalpha () || current[0] == '_') { int len = 0; while (current < end && is_ident_char (current[0])) { current++; len++; } type = get_identifier_or_keyword (begin, len); } else if (current[0] == '@') { token_begin.pos++; // @ is not part of the identifier current++; int len = 0; while (current < end && is_ident_char (current[0])) { current++; len++; } type = TokenType.IDENTIFIER; } else if (current[0].isdigit ()) { while (current < end && current[0].isdigit ()) { current++; } type = TokenType.INTEGER_LITERAL; if (current < end && current[0].tolower () == 'l') { current++; if (current < end && current[0].tolower () == 'l') { current++; } } else if (current < end && current[0].tolower () == 'u') { current++; if (current < end && current[0].tolower () == 'l') { current++; if (current < end && current[0].tolower () == 'l') { current++; } } } else if (current < end && current[0] == '.') { current++; while (current < end && current[0].isdigit ()) { current++; } if (current < end && current[0].tolower () == 'e') { current++; if (current < end && (current[0] == '+' || current[0] == '-')) { current++; } while (current < end && current[0].isdigit ()) { current++; } } if (current < end && current[0].tolower () == 'f') { current++; } type = TokenType.REAL_LITERAL; } else if (current < end && current == begin + 1 && begin[0] == '0' && begin[1] == 'x' && begin[2].isxdigit ()) { // hexadecimal integer literal current++; while (current < end && current[0].isxdigit ()) { current++; } } else if (current < end && is_ident_char (current[0])) { // allow identifiers to start with a digit // as long as they contain at least one char while (current < end && is_ident_char (current[0])) { current++; } type = TokenType.IDENTIFIER; } } else { switch (current[0]) { case '{': type = TokenType.OPEN_BRACE; current++; break; case '}': type = TokenType.CLOSE_BRACE; current++; break; case '(': type = TokenType.OPEN_PARENS; current++; break; case ')': type = TokenType.CLOSE_PARENS; current++; break; case '[': type = TokenType.OPEN_BRACKET; current++; break; case ']': type = TokenType.CLOSE_BRACKET; current++; break; case '.': type = TokenType.DOT; current++; if (current < end - 1) { if (current[0] == '.' && current[1] == '.') { type = TokenType.ELLIPSIS; current += 2; } } break; case ':': type = TokenType.COLON; current++; break; case ',': type = TokenType.COMMA; current++; break; case ';': type = TokenType.SEMICOLON; current++; break; case '#': type = TokenType.HASH; current++; break; case '?': type = TokenType.INTERR; current++; break; case '|': type = TokenType.BITWISE_OR; current++; if (current < end) { switch (current[0]) { case '=': type = TokenType.ASSIGN_BITWISE_OR; current++; break; case '|': type = TokenType.OP_OR; current++; break; } } break; case '&': type = TokenType.BITWISE_AND; current++; if (current < end) { switch (current[0]) { case '=': type = TokenType.ASSIGN_BITWISE_AND; current++; break; case '&': type = TokenType.OP_AND; current++; break; } } break; case '^': type = TokenType.CARRET; current++; if (current < end && current[0] == '=') { type = TokenType.ASSIGN_BITWISE_XOR; current++; } break; case '~': type = TokenType.TILDE; current++; break; case '=': type = TokenType.ASSIGN; current++; if (current < end) { switch (current[0]) { case '=': type = TokenType.OP_EQ; current++; break; case '>': type = TokenType.LAMBDA; current++; break; } } break; case '<': type = TokenType.OP_LT; current++; if (current < end) { switch (current[0]) { case '=': type = TokenType.OP_LE; current++; break; case '<': type = TokenType.OP_SHIFT_LEFT; current++; if (current < end && current[0] == '=') { type = TokenType.ASSIGN_SHIFT_LEFT; current++; } break; } } break; case '>': type = TokenType.OP_GT; current++; if (current < end && current[0] == '=') { type = TokenType.OP_GE; current++; } break; case '!': type = TokenType.OP_NEG; current++; if (current < end && current[0] == '=') { type = TokenType.OP_NE; current++; } break; case '+': type = TokenType.PLUS; current++; if (current < end) { switch (current[0]) { case '=': type = TokenType.ASSIGN_ADD; current++; break; case '+': type = TokenType.OP_INC; current++; break; } } break; case '-': type = TokenType.MINUS; current++; if (current < end) { switch (current[0]) { case '=': type = TokenType.ASSIGN_SUB; current++; break; case '-': type = TokenType.OP_DEC; current++; break; case '>': type = TokenType.OP_PTR; current++; break; } } break; case '*': type = TokenType.STAR; current++; if (current < end && current[0] == '=') { type = TokenType.ASSIGN_MUL; current++; } break; case '/': type = TokenType.DIV; current++; if (current < end && current[0] == '=') { type = TokenType.ASSIGN_DIV; current++; } break; case '%': type = TokenType.PERCENT; current++; if (current < end && current[0] == '=') { type = TokenType.ASSIGN_PERCENT; current++; } break; case '\'': case '"': if (begin[0] == '\'') { type = TokenType.CHARACTER_LITERAL; } else { type = TokenType.STRING_LITERAL; } token_length_in_chars = 2; current++; while (current < end && current[0] != begin[0]) { if (current[0] == '\\') { current++; token_length_in_chars++; if (current < end && current[0] == 'x') { // hexadecimal escape character current++; token_length_in_chars++; while (current < end && current[0].isxdigit ()) { current++; token_length_in_chars++; } } else { current++; token_length_in_chars++; } } else if (current[0] == '\n') { break; } else { unichar u = ((string) current).get_char_validated ((long) (end - current)); if (u != (unichar) (-1)) { current += u.to_utf8 (null); token_length_in_chars++; } else { Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "invalid UTF-8 character"); } } } if (current < end && current[0] != '\n') { current++; } else { Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "syntax error, expected %c".printf (begin[0])); } break; default: unichar u = ((string) current).get_char_validated ((long) (end - current)); if (u != (unichar) (-1)) { current += u.to_utf8 (null); Report.error (new SourceReference (source_file, line, column, line, column), "syntax error, unexpected character"); } else { current++; Report.error (new SourceReference (source_file, line, column, line, column), "invalid UTF-8 character"); } column++; return read_token (out token_begin, out token_end); } } if (token_length_in_chars < 0) { column += (int) (current - begin); } else { column += token_length_in_chars; } token_end.pos = current; token_end.line = line; token_end.column = column - 1; return type; } bool matches (char* begin, string keyword) { char* keyword_array = keyword; long len = keyword.len (); for (int i = 0; i < len; i++) { if (begin[i] != keyword_array[i]) { return false; } } return true; } bool whitespace () { bool found = false; while (current < end && current[0].isspace ()) { if (current[0] == '\n') { line++; column = 0; } found = true; current++; column++; } return found; } bool comment () { if (current > end - 2 || current[0] != '/' || (current[1] != '/' && current[1] != '*')) { return false; } if (current[1] == '/') { // single-line comment current += 2; char* begin = current; // skip until end of line or end of file while (current < end && current[0] != '\n') { current++; } push_comment (((string) begin).ndup ((long) (current - begin)), line == 1); } else { // delimited comment current += 2; char* begin = current; int begin_line = line; while (current < end - 1 && (current[0] != '*' || current[1] != '/')) { if (current[0] == '\n') { line++; column = 0; } current++; column++; } if (current == end - 1) { Report.error (new SourceReference (source_file, line, column, line, column), "syntax error, expected */"); return true; } push_comment (((string) begin).ndup ((long) (current - begin)), begin_line == 1); current += 2; column += 2; } return true; } void space () { while (whitespace () || comment ()) { } } void push_comment (string comment_item, bool file_comment) { if (_comment == null) { _comment = comment_item; } else { _comment = "%s\n%s".printf (_comment, comment_item); } if (file_comment) { source_file.comment = _comment; _comment = null; } } /** * Clears and returns the content of the comment stack. * * @return saved comment */ public string pop_comment () { if (_comment == null) { return null; } var result = new StringBuilder (_comment); _comment = null; weak string index; while ((index = result.str.chr (-1, '\t')) != null) { result.erase (result.str.pointer_to_offset (index), 1); } return result.str; } }