diff options
author | Nikolai Kosjar <nikolai.kosjar@digia.com> | 2014-02-25 13:44:11 -0300 |
---|---|---|
committer | Nikolai Kosjar <nikolai.kosjar@digia.com> | 2014-05-23 14:23:15 +0200 |
commit | 70122b3061ee3fbb07442beb0158edf849ceb98e (patch) | |
tree | e8c272ec1df948acd27378a44764dd683ab5b426 | |
parent | 4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff) | |
download | qt-creator-70122b3061ee3fbb07442beb0158edf849ceb98e.tar.gz |
C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.
API-wise the following functions are added to Token. In follow-up
patches these will come in handy in combination with QStrings.
utf16chars() - equivalent of bytes()
utf16charsBegin() - equivalent of bytesBegin()
utf16charsEnd() - equivalent of bytesEnd()
Next steps:
* Adapt functions from TranslationUnit. They should work with utf16
chars in order to calculate lines and columns correctly also for
UTF-8 multi-byte code points.
* Adapt the higher level clients:
* Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
* Cpp{Tools,Editor}: When dealing with identifiers on the
QString/QTextDocument layer, code points
represented by two QChars need to be respected, too.
* Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
offsets usable in CppEditor/CppTools.
Addresses QTCREATORBUG-7356.
Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
-rw-r--r-- | src/libs/3rdparty/cplusplus/Lexer.cpp | 20 | ||||
-rw-r--r-- | src/libs/3rdparty/cplusplus/Lexer.h | 31 | ||||
-rw-r--r-- | src/libs/3rdparty/cplusplus/Token.cpp | 1 | ||||
-rw-r--r-- | src/libs/3rdparty/cplusplus/Token.h | 20 | ||||
-rw-r--r-- | src/libs/cplusplus/SimpleLexer.cpp | 4 | ||||
-rw-r--r-- | src/libs/cplusplus/SimpleLexer.h | 2 | ||||
-rw-r--r-- | tests/auto/cplusplus/cplusplus.pro | 3 | ||||
-rw-r--r-- | tests/auto/cplusplus/cplusplus.qbs | 1 | ||||
-rw-r--r-- | tests/auto/cplusplus/lexer/tst_lexer.cpp | 215 | ||||
-rw-r--r-- | tests/auto/cplusplus/translationunit/translationunit.pro | 2 | ||||
-rw-r--r-- | tests/auto/cplusplus/translationunit/translationunit.qbs | 7 | ||||
-rw-r--r-- | tests/auto/cplusplus/translationunit/tst_translationunit.cpp | 225 |
12 files changed, 503 insertions, 28 deletions
diff --git a/src/libs/3rdparty/cplusplus/Lexer.cpp b/src/libs/3rdparty/cplusplus/Lexer.cpp index f2729fa531..914b3c2275 100644 --- a/src/libs/3rdparty/cplusplus/Lexer.cpp +++ b/src/libs/3rdparty/cplusplus/Lexer.cpp @@ -29,6 +29,13 @@ using namespace CPlusPlus; +/*! + \class Lexer + \brief The Lexer generates tokens from an UTF-8 encoded source text. + + \sa Token +*/ + Lexer::Lexer(TranslationUnit *unit) : _translationUnit(unit), _control(unit->control()), @@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar) _firstChar = firstChar; _lastChar = lastChar; _currentChar = _firstChar - 1; + _currentCharUtf16 = -1; _tokenStart = _currentChar; _yychar = '\n'; } @@ -109,6 +117,7 @@ void Lexer::scan(Token *tok) tok->reset(); scan_helper(tok); tok->f.bytes = _currentChar - _tokenStart; + tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16; } void Lexer::scan_helper(Token *tok) @@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok) _tokenStart = _currentChar; tok->byteOffset = _currentChar - _firstChar; + _tokenStartUtf16 = _currentCharUtf16; + tok->utf16charOffset = _currentCharUtf16; + if (_yychar) { s._newlineExpected = false; } else if (s._tokenKind) { @@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok) } else { scanIdentifier(tok); } - } else if (std::isalpha(ch) || ch == '_' || ch == '$') { - scanIdentifier(tok); + } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) { + scanIdentifier(tok, _currentChar - _tokenStart - 1); } else if (std::isdigit(ch)) { scanNumericLiteral(tok); } else { @@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok) void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars) { const char *yytext = _currentChar - 1 - extraProcessedChars; - while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$') + while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$' + || isByteOfMultiByteCodePoint(_yychar)) { yyinp(); + } int yylen = _currentChar - 
yytext; if (f._scanKeywords) tok->f.kind = classify(yytext, yylen, _languageFeatures); diff --git a/src/libs/3rdparty/cplusplus/Lexer.h b/src/libs/3rdparty/cplusplus/Lexer.h index 43a877e7a8..8d63d2ba1d 100644 --- a/src/libs/3rdparty/cplusplus/Lexer.h +++ b/src/libs/3rdparty/cplusplus/Lexer.h @@ -62,6 +62,7 @@ public: void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; } private: + void pushLineStartOffset(); void scan_helper(Token *tok); void setSource(const char *firstChar, const char *lastChar); static int classify(const char *string, int length, LanguageFeatures features); @@ -77,15 +78,32 @@ private: void scanBackslash(Kind type); void scanCppComment(Kind type); - inline void yyinp() + static bool isByteOfMultiByteCodePoint(unsigned char byte) + { return byte & 0x80; } // Check if most significant bit is set + + void yyinp() { - _yychar = *++_currentChar; + ++_currentCharUtf16; + + // Process multi-byte UTF-8 code point (non-latin1) + if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) { + unsigned trailingBytesCurrentCodePoint = 1; + for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1) + ++trailingBytesCurrentCodePoint; + // Code points >= 0x00010000 are represented by two UTF16 code units + if (trailingBytesCurrentCodePoint >= 3) + ++_currentCharUtf16; + _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1); + + // Process single-byte UTF-8 code point (latin1) + } else { + _yychar = *++_currentChar; + } + if (CPLUSPLUS_UNLIKELY(_yychar == '\n')) pushLineStartOffset(); } - void pushLineStartOffset(); - private: struct Flags { unsigned _scanCommentTokens: 1; @@ -105,6 +123,10 @@ private: const char *_lastChar; const char *_tokenStart; unsigned char _yychar; + + unsigned _currentCharUtf16; + unsigned _tokenStartUtf16; + union { unsigned char _state; State s; @@ -113,6 +135,7 @@ private: unsigned _flags; Flags f; }; + unsigned _currentLine; LanguageFeatures _languageFeatures; }; diff --git 
a/src/libs/3rdparty/cplusplus/Token.cpp b/src/libs/3rdparty/cplusplus/Token.cpp index 57e36c3ea5..8be6757131 100644 --- a/src/libs/3rdparty/cplusplus/Token.cpp +++ b/src/libs/3rdparty/cplusplus/Token.cpp @@ -85,6 +85,7 @@ void Token::reset() { flags = 0; byteOffset = 0; + utf16charOffset = 0; ptr = 0; } diff --git a/src/libs/3rdparty/cplusplus/Token.h b/src/libs/3rdparty/cplusplus/Token.h index 02d7f5ebe9..ec10483852 100644 --- a/src/libs/3rdparty/cplusplus/Token.h +++ b/src/libs/3rdparty/cplusplus/Token.h @@ -285,7 +285,7 @@ enum Kind { class CPLUSPLUS_EXPORT Token { public: - Token() : flags(0), byteOffset(0), ptr(0) {} + Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {} inline bool is(unsigned k) const { return f.kind == k; } inline bool isNot(unsigned k) const { return f.kind != k; } @@ -298,13 +298,14 @@ public: inline bool joined() const { return f.joined; } inline bool expanded() const { return f.expanded; } inline bool generated() const { return f.generated; } - inline unsigned bytes() const { return f.bytes; } - inline unsigned bytesBegin() const - { return byteOffset; } + inline unsigned bytes() const { return f.bytes; } + inline unsigned bytesBegin() const { return byteOffset; } + inline unsigned bytesEnd() const { return byteOffset + f.bytes; } - inline unsigned bytesEnd() const - { return byteOffset + f.bytes; } + inline unsigned utf16chars() const { return f.utf16chars; } + inline unsigned utf16charsBegin() const { return utf16charOffset; } + inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; } inline bool isLiteral() const { return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; } @@ -354,15 +355,17 @@ public: unsigned generated : 1; // Unused... unsigned pad : 3; - // The token length in bytes. + // The token length in bytes and UTF16 chars. 
unsigned bytes : 16; + unsigned utf16chars : 16; }; union { - unsigned flags; + unsigned long flags; Flags f; }; unsigned byteOffset; + unsigned utf16charOffset; union { void *ptr; @@ -393,5 +396,4 @@ struct LanguageFeatures } // namespace CPlusPlus - #endif // CPLUSPLUS_TOKEN_H diff --git a/src/libs/cplusplus/SimpleLexer.cpp b/src/libs/cplusplus/SimpleLexer.cpp index 8e539acb84..95c6c051a5 100644 --- a/src/libs/cplusplus/SimpleLexer.cpp +++ b/src/libs/cplusplus/SimpleLexer.cpp @@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const return _endedJoined; } -QList<Token> SimpleLexer::operator()(const QString &text, int state) +QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8) { QList<Token> tokens; - const QByteArray bytes = text.toLatin1(); + const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1(); const char *firstChar = bytes.constData(); const char *lastChar = firstChar + bytes.size(); diff --git a/src/libs/cplusplus/SimpleLexer.h b/src/libs/cplusplus/SimpleLexer.h index 1eb4ab6c3b..a5b7d3e4ac 100644 --- a/src/libs/cplusplus/SimpleLexer.h +++ b/src/libs/cplusplus/SimpleLexer.h @@ -54,7 +54,7 @@ public: bool endedJoined() const; - QList<Token> operator()(const QString &text, int state = 0); + QList<Token> operator()(const QString &text, int state = 0, bool convertToUtf8 = false); int state() const { return _lastState; } diff --git a/tests/auto/cplusplus/cplusplus.pro b/tests/auto/cplusplus/cplusplus.pro index 2a783402a0..3cbc0dfdf2 100644 --- a/tests/auto/cplusplus/cplusplus.pro +++ b/tests/auto/cplusplus/cplusplus.pro @@ -12,4 +12,5 @@ SUBDIRS = \ misc \ cxx11 \ checksymbols \ - lexer + lexer \ + translationunit diff --git a/tests/auto/cplusplus/cplusplus.qbs b/tests/auto/cplusplus/cplusplus.qbs index c5ec28e1c6..338a7c9b4b 100644 --- a/tests/auto/cplusplus/cplusplus.qbs +++ b/tests/auto/cplusplus/cplusplus.qbs @@ -13,6 +13,7 @@ Project { "misc/misc.qbs", "preprocessor/preprocessor.qbs", 
"semantic/semantic.qbs", + "translationunit/translationunit.qbs", "typeprettyprinter/typeprettyprinter.qbs" ] } diff --git a/tests/auto/cplusplus/lexer/tst_lexer.cpp b/tests/auto/cplusplus/lexer/tst_lexer.cpp index 54d58c01f1..7218f07fe8 100644 --- a/tests/auto/cplusplus/lexer/tst_lexer.cpp +++ b/tests/auto/cplusplus/lexer/tst_lexer.cpp @@ -52,28 +52,49 @@ class tst_SimpleLexer: public QObject public: tst_SimpleLexer() : _state(0) {} + enum TokenCompareFlag { + CompareKind = 1 << 1, + CompareBytes = 1 << 2, + CompareBytesBegin = 1 << 3, + CompareBytesEnd = 1 << 4, + CompareUtf16Chars = 1 << 5, + CompareUtf16CharsBegin = 1 << 6, + CompareUtf16CharsEnd = 1 << 7 + }; + Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag) + private slots: void basic(); void basic_data(); void incremental(); void incremental_data(); + // + // The following "non-latin1" code points are used in the tests following this comment: + // + // U+00FC - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS + // U+4E8C - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C + // U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE + // + + void bytes_and_utf16chars(); + void bytes_and_utf16chars_data(); + void offsets(); + void offsets_data(); + private: static TokenList toTokenList(const TokenKindList &tokenKinds); - enum TokenCompareFlag { - CompareKind = 1 << 1 - }; - Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag) - void run(const QByteArray &source, const TokenList &expectedTokenList, bool preserveState, - TokenCompareFlag compareFlags); + TokenCompareFlags compareFlags); int _state; }; +Q_DECLARE_OPERATORS_FOR_FLAGS(tst_SimpleLexer::TokenCompareFlags) + TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds) { TokenList tokens; @@ -88,10 +109,13 @@ TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds) void tst_SimpleLexer::run(const QByteArray &source, const TokenList &expectedTokenList, bool preserveState, - 
TokenCompareFlag compareFlags) + TokenCompareFlags compareFlags) { + QVERIFY(compareFlags); + SimpleLexer lexer; - const QList<Token> tokenList = lexer(source, preserveState ? _state : 0); + const QList<Token> tokenList = lexer(source, preserveState ? _state : 0, + /*convertToUtf8=*/ true); if (preserveState) _state = lexer.state(); @@ -108,6 +132,20 @@ void tst_SimpleLexer::run(const QByteArray &source, #endif if (compareFlags & CompareKind) QCOMPARE(token.kind(), expectedToken.kind()); + + if (compareFlags & CompareBytes) + QCOMPARE(token.bytes(), expectedToken.bytes()); + if (compareFlags & CompareBytesBegin) + QCOMPARE(token.bytesBegin(), expectedToken.bytesBegin()); + if (compareFlags & CompareBytesEnd) + QCOMPARE(token.bytesEnd(), expectedToken.bytesEnd()); + + if (compareFlags & CompareUtf16Chars) + QCOMPARE(token.utf16chars(), expectedToken.utf16chars()); + if (compareFlags & CompareUtf16CharsBegin) + QCOMPARE(token.utf16charsBegin(), expectedToken.utf16charsBegin()); + if (compareFlags & CompareUtf16CharsEnd) + QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd()); } QVERIFY2(i == expectedTokenList.size(), "Less tokens than expected."); } @@ -221,7 +259,168 @@ void tst_SimpleLexer::basic_data() << T_LBRACKET << T_RBRACKET << T_LBRACE << T_RBRACE << T_IDENTIFIER << T_QUESTION << T_IDENTIFIER << T_COLON << T_IDENTIFIER; QTest::newRow(source) << source << expectedTokenKindList; +} +void tst_SimpleLexer::bytes_and_utf16chars() +{ + QFETCH(QByteArray, source); + QFETCH(QList<Token>, expectedTokenList); + + const TokenCompareFlags compareFlags = CompareKind | CompareBytes | CompareUtf16Chars; + run(source, expectedTokenList, false, compareFlags); +} + +static QList<Token> createToken(unsigned kind, unsigned bytes, unsigned utf16chars) +{ + Token t; + t.f.kind = kind; + t.f.bytes = bytes; + t.f.utf16chars = utf16chars; + return QList<Token>() << t; +} + +void tst_SimpleLexer::bytes_and_utf16chars_data() +{ + QTest::addColumn<QByteArray>("source"); + 
QTest::addColumn<QList<Token> >("expectedTokenList"); + + typedef QByteArray _; + + // LATIN1 Identifier + QTest::newRow("latin1 identifier") + << _("var") << createToken(T_IDENTIFIER, 3, 3); + + // NON-LATIN1 identifier (code point with 2 UTF8 code units) + QTest::newRow("non-latin1 identifier (2-byte code unit at start)") + << _("\u00FC_var") << createToken(T_IDENTIFIER, 6, 5); + QTest::newRow("non-latin1 identifier (2-byte code unit in center)") + << _("_v\u00FCr_") << createToken(T_IDENTIFIER, 6, 5); + QTest::newRow("non-latin1 identifier (2-byte code unit at end)") + << _("var_\u00FC") << createToken(T_IDENTIFIER, 6, 5); + QTest::newRow("non-latin1 identifier (2-byte code unit only)") + << _("\u00FC") << createToken(T_IDENTIFIER, 2, 1); + + // NON-LATIN1 identifier (code point with 3 UTF8 code units) + QTest::newRow("non-latin1 identifier (3-byte code unit at start)") + << _("\u4E8C_var") << createToken(T_IDENTIFIER, 7, 5); + QTest::newRow("non-latin1 identifier (3-byte code unit in center)") + << _("_v\u4E8Cr_") << createToken(T_IDENTIFIER, 7, 5); + QTest::newRow("non-latin1 identifier (3-byte code unit at end)") + << _("var_\u4E8C") << createToken(T_IDENTIFIER, 7, 5); + QTest::newRow("non-latin1 identifier (3-byte code unit only)") + << _("\u4E8C") << createToken(T_IDENTIFIER, 3, 1); + + // NON-LATIN1 identifier (code point with 4 UTF8 code units) + QTest::newRow("non-latin1 identifier (4-byte code unit at start)") + << _("\U00010302_var") << createToken(T_IDENTIFIER, 8, 6); + QTest::newRow("non-latin1 identifier (4-byte code unit in center)") + << _("_v\U00010302r_") << createToken(T_IDENTIFIER, 8, 6); + QTest::newRow("non-latin1 identifier (4-byte code unit at end)") + << _("var_\U00010302") << createToken(T_IDENTIFIER, 8, 6); + QTest::newRow("non-latin1 identifier (4-byte code unit only)") + << _("\U00010302") << createToken(T_IDENTIFIER, 4, 2); + + // NON-LATIN1 identifier (code points with several multi-byte UTF8 code units) + QTest::newRow("non-latin1 
identifier (mixed multi-byte code units at start)") + << _("\u00FC\u4E8C\U00010302_var") << createToken(T_IDENTIFIER, 13, 8); + QTest::newRow("non-latin1 identifier (mixed multi-byte code units in center)") + << _("_v\u00FC\u4E8C\U00010302r_") << createToken(T_IDENTIFIER, 13, 8); + QTest::newRow("non-latin1 identifier (mixed multi-byte code units at end)") + << _("var_\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 13, 8); + QTest::newRow("non-latin1 identifier (mixed multi-byte code units only)") + << _("\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 9, 4); + + // Comments + QTest::newRow("ascii comment /* ... */") + << _("/* hello world */") << createToken(T_COMMENT, 17, 17); + QTest::newRow("latin1 comment //") + << _("// hello world") << createToken(T_CPP_COMMENT, 14, 14); + QTest::newRow("non-latin1 comment /* ... */ (1)") + << _("/* \u00FC\u4E8C\U00010302 */") << createToken(T_COMMENT, 15, 10); + QTest::newRow("non-latin1 comment /* ... */ (2)") + << _("/*\u00FC\u4E8C\U00010302*/") << createToken(T_COMMENT, 13, 8); + QTest::newRow("non-latin1 comment // (1)") + << _("// \u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 12, 7); + QTest::newRow("non-latin1 comment // (2)") + << _("//\u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 11, 6); + + // String Literals + QTest::newRow("latin1 string literal") + << _("\"hello\"") << createToken(T_STRING_LITERAL, 7, 7); + QTest::newRow("non-latin1 string literal") + << _("\"\u00FC\u4E8C\U00010302\"") << createToken(T_STRING_LITERAL, 11, 6); +} + +static Token createToken(unsigned kind, unsigned byteOffset, unsigned bytes, + unsigned utf16charsOffset, unsigned utf16chars) +{ + Token t; + t.f.kind = kind; + t.byteOffset = byteOffset; + t.f.bytes = bytes; + t.utf16charOffset = utf16charsOffset; + t.f.utf16chars = utf16chars; + return t; +} + +void tst_SimpleLexer::offsets() +{ + QFETCH(QByteArray, source); + QFETCH(QList<Token>, expectedTokenList); + + const TokenCompareFlags compareFlags = 
CompareKind + | CompareBytesBegin + | CompareBytesEnd + | CompareUtf16CharsBegin + | CompareUtf16CharsEnd + ; + run(source, expectedTokenList, false, compareFlags); +} + +void tst_SimpleLexer::offsets_data() +{ + QTest::addColumn<QByteArray>("source"); + QTest::addColumn<QList<Token> >("expectedTokenList"); + + typedef QByteArray _; + + // LATIN1 Identifier + QTest::newRow("latin1 identifiers") + << _("var var") << (QList<Token>() + << createToken(T_IDENTIFIER, 0, 3, 0, 3) + << createToken(T_IDENTIFIER, 4, 3, 4, 3) + ); + + // NON-LATIN1 identifier + QTest::newRow("non-latin1 identifiers 1") + << _("var_\u00FC var_\u00FC") << (QList<Token>() + << createToken(T_IDENTIFIER, 0, 6, 0, 5) + << createToken(T_IDENTIFIER, 7, 6, 6, 5) + ); + QTest::newRow("non-latin1 identifiers 2") + << _("\u00FC\u4E8C\U00010302 \u00FC\u4E8C\U00010302") << (QList<Token>() + << createToken(T_IDENTIFIER, 0, 9, 0, 4) + << createToken(T_IDENTIFIER, 10, 9, 5, 4) + ); + + QTest::newRow("non-latin1 identifiers 3") // first code unit on line: <bytes> / <utf16char> + << _("class v\u00FC\u4E8C\U00010302\n" // 0 / 0 + "{\n" // 17 / 12 + "public:\n" // 19 / 14 + " v\u00FC\u4E8C\U00010302();\n" // 27 / 22 + "};\n") << (QList<Token>() // 45 / 35 + << createToken(T_CLASS, 0, 5, 0, 5) // class + << createToken(T_IDENTIFIER, 6, 10, 6, 5) // non-latin1 id + << createToken(T_LBRACE, 17, 1, 12, 1) // { + << createToken(T_PUBLIC, 19, 6, 14, 6) // public + << createToken(T_COLON, 25, 1, 20, 1) // : + << createToken(T_IDENTIFIER, 31, 10, 26, 5) // id + << createToken(T_LPAREN, 41, 1, 31, 1) // ( + << createToken(T_RPAREN, 42, 1, 32, 1) // ) + << createToken(T_SEMICOLON, 43, 1, 33, 1) // ; + << createToken(T_RBRACE, 45, 1, 35, 1) // } + << createToken(T_SEMICOLON, 46, 1, 36, 1) // ; + ); } void tst_SimpleLexer::incremental() diff --git a/tests/auto/cplusplus/translationunit/translationunit.pro b/tests/auto/cplusplus/translationunit/translationunit.pro new file mode 100644 index 0000000000..f17babbf9e --- 
/dev/null +++ b/tests/auto/cplusplus/translationunit/translationunit.pro @@ -0,0 +1,2 @@ +include(../shared/shared.pri) +SOURCES += tst_translationunit.cpp diff --git a/tests/auto/cplusplus/translationunit/translationunit.qbs b/tests/auto/cplusplus/translationunit/translationunit.qbs new file mode 100644 index 0000000000..8420c28872 --- /dev/null +++ b/tests/auto/cplusplus/translationunit/translationunit.qbs @@ -0,0 +1,7 @@ +import qbs +import "../cplusplusautotest.qbs" as CPlusPlusAutotest + +CPlusPlusAutotest { + name: "CPlusPlus translation unit autotest" + files: "tst_translationunit.cpp" +} diff --git a/tests/auto/cplusplus/translationunit/tst_translationunit.cpp b/tests/auto/cplusplus/translationunit/tst_translationunit.cpp new file mode 100644 index 0000000000..d89d979bab --- /dev/null +++ b/tests/auto/cplusplus/translationunit/tst_translationunit.cpp @@ -0,0 +1,225 @@ +/**************************************************************************** +** +** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies). +** Contact: http://www.qt-project.org/legal +** +** This file is part of Qt Creator. +** +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and Digia. For licensing terms and +** conditions see http://qt.digia.com/licensing. For further information +** use the contact form at http://qt.digia.com/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. 
Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Digia gives you certain additional +** rights. These rights are described in the Digia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +****************************************************************************/ + +#include <cplusplus/PreprocessorClient.h> +#include <cplusplus/PreprocessorEnvironment.h> +#include <cplusplus/Token.h> +#include <cplusplus/TranslationUnit.h> +#include <cplusplus/pp-engine.h> + +#include <QtTest> +#include <QDebug> + +//TESTED_COMPONENT=src/libs/cplusplus +using namespace CPlusPlus; + +class tst_TranslationUnit: public QObject +{ + Q_OBJECT +private slots: + + // + // The following "non-latin1" code points are used in the tests following this comment: + // + // U+00FC - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS + // U+4E8C - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C + // U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE + // + + void unicodeIdentifier(); + void unicodeIdentifier_data(); + + void unicodeStringLiteral(); + void unicodeStringLiteral_data(); + +private: + class Document + { + public: + typedef QSharedPointer<Document> Ptr; + + static Document::Ptr create(const QByteArray &source) + { + LanguageFeatures features; + features.objCEnabled = true; + features.qtEnabled = false; + features.qtKeywordsEnabled = false; + features.qtMocRunEnabled = false; + + Document::Ptr document = Document::Ptr(new Document); + document->translationUnit()->setLanguageFeatures(features); + const QByteArray preprocessedSource = preprocess(source); + document->translationUnit()->setSource(preprocessedSource.constData(), + preprocessedSource.length()); + document->translationUnit()->parse(); + + if 
(document->hasParsingErrors()) + return Document::Ptr(); + return document; + } + + public: + Document() + : m_translationUnit(&m_control, m_control.stringLiteral("testFile")) + { + m_control.setDiagnosticClient(&m_diagnosticClient); + } + + TranslationUnit *translationUnit() + { return &m_translationUnit; } + + bool hasParsingErrors() const + { return m_diagnosticClient.errorCount != 0; } + + const Identifier *lastIdentifier() const + { return *(m_control.lastIdentifier() - 1); } + + const StringLiteral *lastStringLiteral() const + { return *(m_control.lastStringLiteral() - 1); } + + private: + static QByteArray preprocess(const QByteArray &source) + { + Client *client = 0; // no client. + Environment env; + Preprocessor preprocess(client, &env); + preprocess.setKeepComments(true); + return preprocess.run(QLatin1String("<stdin>"), source); + } + + private: + Control m_control; + TranslationUnit m_translationUnit; + + class Diagnostic: public DiagnosticClient { + public: + int errorCount; + + Diagnostic() : errorCount(0) {} + + void report(int /*level*/, const StringLiteral *fileName, unsigned line, + unsigned column, const char *format, va_list ap) + { + ++errorCount; + qDebug() << fileName->chars() << ':' << line << ':' << column + << ' ' << QString().vsprintf(format, ap); + } + } m_diagnosticClient; + }; +}; + +void tst_TranslationUnit::unicodeIdentifier() +{ + QFETCH(QByteArray, identifierText); + + Document::Ptr document = Document::create("void " + identifierText + ";"); + QVERIFY(document); + + const Identifier *actual = document->lastIdentifier(); + QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()), + QString::fromUtf8(identifierText)); +} + +void tst_TranslationUnit::unicodeIdentifier_data() +{ + QTest::addColumn<QByteArray>("identifierText"); + + typedef QByteArray _; + + QTest::newRow("latin1 identifier") << _("var"); + + QTest::newRow("non-latin1 identifier 1") << _("prefix\u00FC\u4E8C\U00010302"); + QTest::newRow("non-latin1 identifier 2") << 
_("prefix\U00010302\u00FC\u4E8C"); + QTest::newRow("non-latin1 identifier 3") << _("\U00010302\u00FC\u4E8C"); + QTest::newRow("non-latin1 identifier 4") << _("\u4E8C\U00010302\u00FC"); + QTest::newRow("non-latin1 identifier 5") << _("\u4E8C\U00010302\u00FCsuffix"); + QTest::newRow("non-latin1 identifier 6") << _("\U00010302\u00FC\u4E8Csuffix"); + + // Some special cases (different code path inside lexer) + QTest::newRow("non-latin1 identifier 7") << _("LR\U00010302\u00FC\u4E8C"); + QTest::newRow("non-latin1 identifier 8") << _("u8R\U00010302\u00FC\u4E8C"); + QTest::newRow("non-latin1 identifier 9") << _("u8\U00010302\u00FC\u4E8C"); + QTest::newRow("non-latin1 identifier 10") << _("u\U00010302\u00FC\u4E8C"); +} + +static QByteArray stripQuotesFromLiteral(const QByteArray literal) +{ + QByteArray result = literal; + + // Strip front + while (!result.isEmpty() && result[0] != '"') + result = result.mid(1); + if (result.isEmpty()) + return QByteArray(); + result = result.mid(1); + + // Strip end + while (result.size() >= 2 + && (std::isspace(result[result.size() - 1]) || result[result.size()-1] == '"')) { + result.chop(1); + } + + return result; +} + +void tst_TranslationUnit::unicodeStringLiteral() +{ + QFETCH(QByteArray, literalText); + + Document::Ptr document = Document::create("char t[] = " + literalText + ";"); + QVERIFY(document); + + const StringLiteral *actual = document->lastStringLiteral(); + QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()), + QString::fromUtf8(stripQuotesFromLiteral(literalText))); +} + +void tst_TranslationUnit::unicodeStringLiteral_data() +{ + QTest::addColumn<QByteArray>("literalText"); + + typedef QByteArray _; + + QTest::newRow("latin1 literal") << _("\"var\""); + + QTest::newRow("non-latin1 literal 1") << _("\"prefix\u00FC\u4E8C\U00010302\""); + QTest::newRow("non-latin1 literal 2") << _("\"prefix\U00010302\u00FC\u4E8C\""); + QTest::newRow("non-latin1 literal 3") << _("\"\U00010302\u00FC\u4E8C\""); + 
QTest::newRow("non-latin1 literal 4") << _("\"\u4E8C\U00010302\u00FC\""); + QTest::newRow("non-latin1 literal 5") << _("\"\u4E8C\U00010302\u00FCsuffix\""); + QTest::newRow("non-latin1 literal 6") << _("\"\U00010302\u00FC\u4E8Csuffix\""); + + QTest::newRow("non-latin1 literal 7") << _("L\"\U00010302\u00FC\u4E8C\""); + QTest::newRow("non-latin1 literal 8") << _("u8\"\U00010302\u00FC\u4E8C\""); + QTest::newRow("non-latin1 literal 9") << _("u\"\U00010302\u00FC\u4E8C\""); + QTest::newRow("non-latin1 literal 10") << _("U\"\U00010302\u00FC\u4E8C\""); +} + +QTEST_APPLESS_MAIN(tst_TranslationUnit) +#include "tst_translationunit.moc" |