summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nikolai Kosjar <nikolai.kosjar@digia.com> 2014-02-25 13:44:11 -0300
committer Nikolai Kosjar <nikolai.kosjar@digia.com> 2014-05-23 14:23:15 +0200
commit 70122b3061ee3fbb07442beb0158edf849ceb98e (patch)
tree e8c272ec1df948acd27378a44764dd683ab5b426
parent 4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff)
download qt-creator-70122b3061ee3fbb07442beb0158edf849ceb98e.tar.gz
C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus the loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will come in handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
-rw-r--r-- src/libs/3rdparty/cplusplus/Lexer.cpp 20
-rw-r--r-- src/libs/3rdparty/cplusplus/Lexer.h 31
-rw-r--r-- src/libs/3rdparty/cplusplus/Token.cpp 1
-rw-r--r-- src/libs/3rdparty/cplusplus/Token.h 20
-rw-r--r-- src/libs/cplusplus/SimpleLexer.cpp 4
-rw-r--r-- src/libs/cplusplus/SimpleLexer.h 2
-rw-r--r-- tests/auto/cplusplus/cplusplus.pro 3
-rw-r--r-- tests/auto/cplusplus/cplusplus.qbs 1
-rw-r--r-- tests/auto/cplusplus/lexer/tst_lexer.cpp 215
-rw-r--r-- tests/auto/cplusplus/translationunit/translationunit.pro 2
-rw-r--r-- tests/auto/cplusplus/translationunit/translationunit.qbs 7
-rw-r--r-- tests/auto/cplusplus/translationunit/tst_translationunit.cpp 225
12 files changed, 503 insertions, 28 deletions
diff --git a/src/libs/3rdparty/cplusplus/Lexer.cpp b/src/libs/3rdparty/cplusplus/Lexer.cpp
index f2729fa531..914b3c2275 100644
--- a/src/libs/3rdparty/cplusplus/Lexer.cpp
+++ b/src/libs/3rdparty/cplusplus/Lexer.cpp
@@ -29,6 +29,13 @@
using namespace CPlusPlus;
+/*!
+ \class Lexer
+ \brief The Lexer generates tokens from a UTF-8 encoded source text.
+
+ \sa Token
+*/
+
Lexer::Lexer(TranslationUnit *unit)
: _translationUnit(unit),
_control(unit->control()),
@@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar)
_firstChar = firstChar;
_lastChar = lastChar;
_currentChar = _firstChar - 1;
+ _currentCharUtf16 = -1;
_tokenStart = _currentChar;
_yychar = '\n';
}
@@ -109,6 +117,7 @@ void Lexer::scan(Token *tok)
tok->reset();
scan_helper(tok);
tok->f.bytes = _currentChar - _tokenStart;
+ tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
}
void Lexer::scan_helper(Token *tok)
@@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok)
_tokenStart = _currentChar;
tok->byteOffset = _currentChar - _firstChar;
+ _tokenStartUtf16 = _currentCharUtf16;
+ tok->utf16charOffset = _currentCharUtf16;
+
if (_yychar) {
s._newlineExpected = false;
} else if (s._tokenKind) {
@@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok)
} else {
scanIdentifier(tok);
}
- } else if (std::isalpha(ch) || ch == '_' || ch == '$') {
- scanIdentifier(tok);
+ } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
+ scanIdentifier(tok, _currentChar - _tokenStart - 1);
} else if (std::isdigit(ch)) {
scanNumericLiteral(tok);
} else {
@@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok)
void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
{
const char *yytext = _currentChar - 1 - extraProcessedChars;
- while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
+ while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
+ || isByteOfMultiByteCodePoint(_yychar)) {
yyinp();
+ }
int yylen = _currentChar - yytext;
if (f._scanKeywords)
tok->f.kind = classify(yytext, yylen, _languageFeatures);
diff --git a/src/libs/3rdparty/cplusplus/Lexer.h b/src/libs/3rdparty/cplusplus/Lexer.h
index 43a877e7a8..8d63d2ba1d 100644
--- a/src/libs/3rdparty/cplusplus/Lexer.h
+++ b/src/libs/3rdparty/cplusplus/Lexer.h
@@ -62,6 +62,7 @@ public:
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
private:
+ void pushLineStartOffset();
void scan_helper(Token *tok);
void setSource(const char *firstChar, const char *lastChar);
static int classify(const char *string, int length, LanguageFeatures features);
@@ -77,15 +78,32 @@ private:
void scanBackslash(Kind type);
void scanCppComment(Kind type);
- inline void yyinp()
+ static bool isByteOfMultiByteCodePoint(unsigned char byte)
+ { return byte & 0x80; } // Check if most significant bit is set
+
+ void yyinp()
{
- _yychar = *++_currentChar;
+ ++_currentCharUtf16;
+
+ // Process multi-byte UTF-8 code point (non-latin1)
+ if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
+ unsigned trailingBytesCurrentCodePoint = 1;
+ for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
+ ++trailingBytesCurrentCodePoint;
+ // Code points >= 0x00010000 are represented by two UTF16 code units
+ if (trailingBytesCurrentCodePoint >= 3)
+ ++_currentCharUtf16;
+ _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
+
+ // Process single-byte UTF-8 code point (latin1)
+ } else {
+ _yychar = *++_currentChar;
+ }
+
if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
pushLineStartOffset();
}
- void pushLineStartOffset();
-
private:
struct Flags {
unsigned _scanCommentTokens: 1;
@@ -105,6 +123,10 @@ private:
const char *_lastChar;
const char *_tokenStart;
unsigned char _yychar;
+
+ unsigned _currentCharUtf16;
+ unsigned _tokenStartUtf16;
+
union {
unsigned char _state;
State s;
@@ -113,6 +135,7 @@ private:
unsigned _flags;
Flags f;
};
+
unsigned _currentLine;
LanguageFeatures _languageFeatures;
};
diff --git a/src/libs/3rdparty/cplusplus/Token.cpp b/src/libs/3rdparty/cplusplus/Token.cpp
index 57e36c3ea5..8be6757131 100644
--- a/src/libs/3rdparty/cplusplus/Token.cpp
+++ b/src/libs/3rdparty/cplusplus/Token.cpp
@@ -85,6 +85,7 @@ void Token::reset()
{
flags = 0;
byteOffset = 0;
+ utf16charOffset = 0;
ptr = 0;
}
diff --git a/src/libs/3rdparty/cplusplus/Token.h b/src/libs/3rdparty/cplusplus/Token.h
index 02d7f5ebe9..ec10483852 100644
--- a/src/libs/3rdparty/cplusplus/Token.h
+++ b/src/libs/3rdparty/cplusplus/Token.h
@@ -285,7 +285,7 @@ enum Kind {
class CPLUSPLUS_EXPORT Token
{
public:
- Token() : flags(0), byteOffset(0), ptr(0) {}
+ Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
inline bool is(unsigned k) const { return f.kind == k; }
inline bool isNot(unsigned k) const { return f.kind != k; }
@@ -298,13 +298,14 @@ public:
inline bool joined() const { return f.joined; }
inline bool expanded() const { return f.expanded; }
inline bool generated() const { return f.generated; }
- inline unsigned bytes() const { return f.bytes; }
- inline unsigned bytesBegin() const
- { return byteOffset; }
+ inline unsigned bytes() const { return f.bytes; }
+ inline unsigned bytesBegin() const { return byteOffset; }
+ inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
- inline unsigned bytesEnd() const
- { return byteOffset + f.bytes; }
+ inline unsigned utf16chars() const { return f.utf16chars; }
+ inline unsigned utf16charsBegin() const { return utf16charOffset; }
+ inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
inline bool isLiteral() const
{ return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
@@ -354,15 +355,17 @@ public:
unsigned generated : 1;
// Unused...
unsigned pad : 3;
- // The token length in bytes.
+ // The token length in bytes and UTF16 chars.
unsigned bytes : 16;
+ unsigned utf16chars : 16;
};
union {
- unsigned flags;
+ unsigned long flags;
Flags f;
};
unsigned byteOffset;
+ unsigned utf16charOffset;
union {
void *ptr;
@@ -393,5 +396,4 @@ struct LanguageFeatures
} // namespace CPlusPlus
-
#endif // CPLUSPLUS_TOKEN_H
diff --git a/src/libs/cplusplus/SimpleLexer.cpp b/src/libs/cplusplus/SimpleLexer.cpp
index 8e539acb84..95c6c051a5 100644
--- a/src/libs/cplusplus/SimpleLexer.cpp
+++ b/src/libs/cplusplus/SimpleLexer.cpp
@@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const
return _endedJoined;
}
-QList<Token> SimpleLexer::operator()(const QString &text, int state)
+QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8)
{
QList<Token> tokens;
- const QByteArray bytes = text.toLatin1();
+ const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1();
const char *firstChar = bytes.constData();
const char *lastChar = firstChar + bytes.size();
diff --git a/src/libs/cplusplus/SimpleLexer.h b/src/libs/cplusplus/SimpleLexer.h
index 1eb4ab6c3b..a5b7d3e4ac 100644
--- a/src/libs/cplusplus/SimpleLexer.h
+++ b/src/libs/cplusplus/SimpleLexer.h
@@ -54,7 +54,7 @@ public:
bool endedJoined() const;
- QList<Token> operator()(const QString &text, int state = 0);
+ QList<Token> operator()(const QString &text, int state = 0, bool convertToUtf8 = false);
int state() const
{ return _lastState; }
diff --git a/tests/auto/cplusplus/cplusplus.pro b/tests/auto/cplusplus/cplusplus.pro
index 2a783402a0..3cbc0dfdf2 100644
--- a/tests/auto/cplusplus/cplusplus.pro
+++ b/tests/auto/cplusplus/cplusplus.pro
@@ -12,4 +12,5 @@ SUBDIRS = \
misc \
cxx11 \
checksymbols \
- lexer
+ lexer \
+ translationunit
diff --git a/tests/auto/cplusplus/cplusplus.qbs b/tests/auto/cplusplus/cplusplus.qbs
index c5ec28e1c6..338a7c9b4b 100644
--- a/tests/auto/cplusplus/cplusplus.qbs
+++ b/tests/auto/cplusplus/cplusplus.qbs
@@ -13,6 +13,7 @@ Project {
"misc/misc.qbs",
"preprocessor/preprocessor.qbs",
"semantic/semantic.qbs",
+ "translationunit/translationunit.qbs",
"typeprettyprinter/typeprettyprinter.qbs"
]
}
diff --git a/tests/auto/cplusplus/lexer/tst_lexer.cpp b/tests/auto/cplusplus/lexer/tst_lexer.cpp
index 54d58c01f1..7218f07fe8 100644
--- a/tests/auto/cplusplus/lexer/tst_lexer.cpp
+++ b/tests/auto/cplusplus/lexer/tst_lexer.cpp
@@ -52,28 +52,49 @@ class tst_SimpleLexer: public QObject
public:
tst_SimpleLexer() : _state(0) {}
+ enum TokenCompareFlag {
+ CompareKind = 1 << 1,
+ CompareBytes = 1 << 2,
+ CompareBytesBegin = 1 << 3,
+ CompareBytesEnd = 1 << 4,
+ CompareUtf16Chars = 1 << 5,
+ CompareUtf16CharsBegin = 1 << 6,
+ CompareUtf16CharsEnd = 1 << 7
+ };
+ Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
+
private slots:
void basic();
void basic_data();
void incremental();
void incremental_data();
+ //
+ // The following "non-latin1" code points are used in the tests following this comment:
+ //
+ // U+00FC - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
+ // U+4E8C - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
+ // U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
+ //
+
+ void bytes_and_utf16chars();
+ void bytes_and_utf16chars_data();
+ void offsets();
+ void offsets_data();
+
private:
static TokenList toTokenList(const TokenKindList &tokenKinds);
- enum TokenCompareFlag {
- CompareKind = 1 << 1
- };
- Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
-
void run(const QByteArray &source,
const TokenList &expectedTokenList,
bool preserveState,
- TokenCompareFlag compareFlags);
+ TokenCompareFlags compareFlags);
int _state;
};
+Q_DECLARE_OPERATORS_FOR_FLAGS(tst_SimpleLexer::TokenCompareFlags)
+
TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
{
TokenList tokens;
@@ -88,10 +109,13 @@ TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
void tst_SimpleLexer::run(const QByteArray &source,
const TokenList &expectedTokenList,
bool preserveState,
- TokenCompareFlag compareFlags)
+ TokenCompareFlags compareFlags)
{
+ QVERIFY(compareFlags);
+
SimpleLexer lexer;
- const QList<Token> tokenList = lexer(source, preserveState ? _state : 0);
+ const QList<Token> tokenList = lexer(source, preserveState ? _state : 0,
+ /*convertToUtf8=*/ true);
if (preserveState)
_state = lexer.state();
@@ -108,6 +132,20 @@ void tst_SimpleLexer::run(const QByteArray &source,
#endif
if (compareFlags & CompareKind)
QCOMPARE(token.kind(), expectedToken.kind());
+
+ if (compareFlags & CompareBytes)
+ QCOMPARE(token.bytes(), expectedToken.bytes());
+ if (compareFlags & CompareBytesBegin)
+ QCOMPARE(token.bytesBegin(), expectedToken.bytesBegin());
+ if (compareFlags & CompareBytesEnd)
+ QCOMPARE(token.bytesEnd(), expectedToken.bytesEnd());
+
+ if (compareFlags & CompareUtf16Chars)
+ QCOMPARE(token.utf16chars(), expectedToken.utf16chars());
+ if (compareFlags & CompareUtf16CharsBegin)
+ QCOMPARE(token.utf16charsBegin(), expectedToken.utf16charsBegin());
+ if (compareFlags & CompareUtf16CharsEnd)
+ QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
}
QVERIFY2(i == expectedTokenList.size(), "Less tokens than expected.");
}
@@ -221,7 +259,168 @@ void tst_SimpleLexer::basic_data()
<< T_LBRACKET << T_RBRACKET << T_LBRACE << T_RBRACE
<< T_IDENTIFIER << T_QUESTION << T_IDENTIFIER << T_COLON << T_IDENTIFIER;
QTest::newRow(source) << source << expectedTokenKindList;
+}
+void tst_SimpleLexer::bytes_and_utf16chars()
+{
+ QFETCH(QByteArray, source);
+ QFETCH(QList<Token>, expectedTokenList);
+
+ const TokenCompareFlags compareFlags = CompareKind | CompareBytes | CompareUtf16Chars;
+ run(source, expectedTokenList, false, compareFlags);
+}
+
+static QList<Token> createToken(unsigned kind, unsigned bytes, unsigned utf16chars)
+{
+ Token t;
+ t.f.kind = kind;
+ t.f.bytes = bytes;
+ t.f.utf16chars = utf16chars;
+ return QList<Token>() << t;
+}
+
+void tst_SimpleLexer::bytes_and_utf16chars_data()
+{
+ QTest::addColumn<QByteArray>("source");
+ QTest::addColumn<QList<Token> >("expectedTokenList");
+
+ typedef QByteArray _;
+
+ // LATIN1 Identifier
+ QTest::newRow("latin1 identifier")
+ << _("var") << createToken(T_IDENTIFIER, 3, 3);
+
+ // NON-LATIN1 identifier (code point with 2 UTF8 code units)
+ QTest::newRow("non-latin1 identifier (2-byte code unit at start)")
+ << _("\u00FC_var") << createToken(T_IDENTIFIER, 6, 5);
+ QTest::newRow("non-latin1 identifier (2-byte code unit in center)")
+ << _("_v\u00FCr_") << createToken(T_IDENTIFIER, 6, 5);
+ QTest::newRow("non-latin1 identifier (2-byte code unit at end)")
+ << _("var_\u00FC") << createToken(T_IDENTIFIER, 6, 5);
+ QTest::newRow("non-latin1 identifier (2-byte code unit only)")
+ << _("\u00FC") << createToken(T_IDENTIFIER, 2, 1);
+
+ // NON-LATIN1 identifier (code point with 3 UTF8 code units)
+ QTest::newRow("non-latin1 identifier (3-byte code unit at start)")
+ << _("\u4E8C_var") << createToken(T_IDENTIFIER, 7, 5);
+ QTest::newRow("non-latin1 identifier (3-byte code unit in center)")
+ << _("_v\u4E8Cr_") << createToken(T_IDENTIFIER, 7, 5);
+ QTest::newRow("non-latin1 identifier (3-byte code unit at end)")
+ << _("var_\u4E8C") << createToken(T_IDENTIFIER, 7, 5);
+ QTest::newRow("non-latin1 identifier (3-byte code unit only)")
+ << _("\u4E8C") << createToken(T_IDENTIFIER, 3, 1);
+
+ // NON-LATIN1 identifier (code point with 4 UTF8 code units)
+ QTest::newRow("non-latin1 identifier (4-byte code unit at start)")
+ << _("\U00010302_var") << createToken(T_IDENTIFIER, 8, 6);
+ QTest::newRow("non-latin1 identifier (4-byte code unit in center)")
+ << _("_v\U00010302r_") << createToken(T_IDENTIFIER, 8, 6);
+ QTest::newRow("non-latin1 identifier (4-byte code unit at end)")
+ << _("var_\U00010302") << createToken(T_IDENTIFIER, 8, 6);
+ QTest::newRow("non-latin1 identifier (4-byte code unit only)")
+ << _("\U00010302") << createToken(T_IDENTIFIER, 4, 2);
+
+ // NON-LATIN1 identifier (code points with several multi-byte UTF8 code units)
+ QTest::newRow("non-latin1 identifier (mixed multi-byte code units at start)")
+ << _("\u00FC\u4E8C\U00010302_var") << createToken(T_IDENTIFIER, 13, 8);
+ QTest::newRow("non-latin1 identifier (mixed multi-byte code units in center)")
+ << _("_v\u00FC\u4E8C\U00010302r_") << createToken(T_IDENTIFIER, 13, 8);
+ QTest::newRow("non-latin1 identifier (mixed multi-byte code units at end)")
+ << _("var_\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 13, 8);
+ QTest::newRow("non-latin1 identifier (mixed multi-byte code units only)")
+ << _("\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 9, 4);
+
+ // Comments
+ QTest::newRow("ascii comment /* ... */")
+ << _("/* hello world */") << createToken(T_COMMENT, 17, 17);
+ QTest::newRow("latin1 comment //")
+ << _("// hello world") << createToken(T_CPP_COMMENT, 14, 14);
+ QTest::newRow("non-latin1 comment /* ... */ (1)")
+ << _("/* \u00FC\u4E8C\U00010302 */") << createToken(T_COMMENT, 15, 10);
+ QTest::newRow("non-latin1 comment /* ... */ (2)")
+ << _("/*\u00FC\u4E8C\U00010302*/") << createToken(T_COMMENT, 13, 8);
+ QTest::newRow("non-latin1 comment // (1)")
+ << _("// \u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 12, 7);
+ QTest::newRow("non-latin1 comment // (2)")
+ << _("//\u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 11, 6);
+
+ // String Literals
+ QTest::newRow("latin1 string literal")
+ << _("\"hello\"") << createToken(T_STRING_LITERAL, 7, 7);
+ QTest::newRow("non-latin1 string literal")
+ << _("\"\u00FC\u4E8C\U00010302\"") << createToken(T_STRING_LITERAL, 11, 6);
+}
+
+static Token createToken(unsigned kind, unsigned byteOffset, unsigned bytes,
+ unsigned utf16charsOffset, unsigned utf16chars)
+{
+ Token t;
+ t.f.kind = kind;
+ t.byteOffset = byteOffset;
+ t.f.bytes = bytes;
+ t.utf16charOffset = utf16charsOffset;
+ t.f.utf16chars = utf16chars;
+ return t;
+}
+
+void tst_SimpleLexer::offsets()
+{
+ QFETCH(QByteArray, source);
+ QFETCH(QList<Token>, expectedTokenList);
+
+ const TokenCompareFlags compareFlags = CompareKind
+ | CompareBytesBegin
+ | CompareBytesEnd
+ | CompareUtf16CharsBegin
+ | CompareUtf16CharsEnd
+ ;
+ run(source, expectedTokenList, false, compareFlags);
+}
+
+void tst_SimpleLexer::offsets_data()
+{
+ QTest::addColumn<QByteArray>("source");
+ QTest::addColumn<QList<Token> >("expectedTokenList");
+
+ typedef QByteArray _;
+
+ // LATIN1 Identifier
+ QTest::newRow("latin1 identifiers")
+ << _("var var") << (QList<Token>()
+ << createToken(T_IDENTIFIER, 0, 3, 0, 3)
+ << createToken(T_IDENTIFIER, 4, 3, 4, 3)
+ );
+
+ // NON-LATIN1 identifier
+ QTest::newRow("non-latin1 identifiers 1")
+ << _("var_\u00FC var_\u00FC") << (QList<Token>()
+ << createToken(T_IDENTIFIER, 0, 6, 0, 5)
+ << createToken(T_IDENTIFIER, 7, 6, 6, 5)
+ );
+ QTest::newRow("non-latin1 identifiers 2")
+ << _("\u00FC\u4E8C\U00010302 \u00FC\u4E8C\U00010302") << (QList<Token>()
+ << createToken(T_IDENTIFIER, 0, 9, 0, 4)
+ << createToken(T_IDENTIFIER, 10, 9, 5, 4)
+ );
+
+ QTest::newRow("non-latin1 identifiers 3") // first code unit on line: <bytes> / <utf16char>
+ << _("class v\u00FC\u4E8C\U00010302\n" // 0 / 0
+ "{\n" // 17 / 12
+ "public:\n" // 19 / 14
+ " v\u00FC\u4E8C\U00010302();\n" // 27 / 22
+ "};\n") << (QList<Token>() // 45 / 35
+ << createToken(T_CLASS, 0, 5, 0, 5) // class
+ << createToken(T_IDENTIFIER, 6, 10, 6, 5) // non-latin1 id
+ << createToken(T_LBRACE, 17, 1, 12, 1) // {
+ << createToken(T_PUBLIC, 19, 6, 14, 6) // public
+ << createToken(T_COLON, 25, 1, 20, 1) // :
+ << createToken(T_IDENTIFIER, 31, 10, 26, 5) // id
+ << createToken(T_LPAREN, 41, 1, 31, 1) // (
+ << createToken(T_RPAREN, 42, 1, 32, 1) // )
+ << createToken(T_SEMICOLON, 43, 1, 33, 1) // ;
+ << createToken(T_RBRACE, 45, 1, 35, 1) // }
+ << createToken(T_SEMICOLON, 46, 1, 36, 1) // ;
+ );
}
void tst_SimpleLexer::incremental()
diff --git a/tests/auto/cplusplus/translationunit/translationunit.pro b/tests/auto/cplusplus/translationunit/translationunit.pro
new file mode 100644
index 0000000000..f17babbf9e
--- /dev/null
+++ b/tests/auto/cplusplus/translationunit/translationunit.pro
@@ -0,0 +1,2 @@
+include(../shared/shared.pri)
+SOURCES += tst_translationunit.cpp
diff --git a/tests/auto/cplusplus/translationunit/translationunit.qbs b/tests/auto/cplusplus/translationunit/translationunit.qbs
new file mode 100644
index 0000000000..8420c28872
--- /dev/null
+++ b/tests/auto/cplusplus/translationunit/translationunit.qbs
@@ -0,0 +1,7 @@
+import qbs
+import "../cplusplusautotest.qbs" as CPlusPlusAutotest
+
+CPlusPlusAutotest {
+ name: "CPlusPlus translation unit autotest"
+ files: "tst_translationunit.cpp"
+}
diff --git a/tests/auto/cplusplus/translationunit/tst_translationunit.cpp b/tests/auto/cplusplus/translationunit/tst_translationunit.cpp
new file mode 100644
index 0000000000..d89d979bab
--- /dev/null
+++ b/tests/auto/cplusplus/translationunit/tst_translationunit.cpp
@@ -0,0 +1,225 @@
+/****************************************************************************
+**
+** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+****************************************************************************/
+
+#include <cplusplus/PreprocessorClient.h>
+#include <cplusplus/PreprocessorEnvironment.h>
+#include <cplusplus/Token.h>
+#include <cplusplus/TranslationUnit.h>
+#include <cplusplus/pp-engine.h>
+
+#include <QtTest>
+#include <QDebug>
+
+//TESTED_COMPONENT=src/libs/cplusplus
+using namespace CPlusPlus;
+
+class tst_TranslationUnit: public QObject
+{
+ Q_OBJECT
+private slots:
+
+ //
+ // The following "non-latin1" code points are used in the tests following this comment:
+ //
+ // U+00FC - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
+ // U+4E8C - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
+ // U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
+ //
+
+ void unicodeIdentifier();
+ void unicodeIdentifier_data();
+
+ void unicodeStringLiteral();
+ void unicodeStringLiteral_data();
+
+private:
+ class Document
+ {
+ public:
+ typedef QSharedPointer<Document> Ptr;
+
+ static Document::Ptr create(const QByteArray &source)
+ {
+ LanguageFeatures features;
+ features.objCEnabled = true;
+ features.qtEnabled = false;
+ features.qtKeywordsEnabled = false;
+ features.qtMocRunEnabled = false;
+
+ Document::Ptr document = Document::Ptr(new Document);
+ document->translationUnit()->setLanguageFeatures(features);
+ const QByteArray preprocessedSource = preprocess(source);
+ document->translationUnit()->setSource(preprocessedSource.constData(),
+ preprocessedSource.length());
+ document->translationUnit()->parse();
+
+ if (document->hasParsingErrors())
+ return Document::Ptr();
+ return document;
+ }
+
+ public:
+ Document()
+ : m_translationUnit(&m_control, m_control.stringLiteral("testFile"))
+ {
+ m_control.setDiagnosticClient(&m_diagnosticClient);
+ }
+
+ TranslationUnit *translationUnit()
+ { return &m_translationUnit; }
+
+ bool hasParsingErrors() const
+ { return m_diagnosticClient.errorCount != 0; }
+
+ const Identifier *lastIdentifier() const
+ { return *(m_control.lastIdentifier() - 1); }
+
+ const StringLiteral *lastStringLiteral() const
+ { return *(m_control.lastStringLiteral() - 1); }
+
+ private:
+ static QByteArray preprocess(const QByteArray &source)
+ {
+ Client *client = 0; // no client.
+ Environment env;
+ Preprocessor preprocess(client, &env);
+ preprocess.setKeepComments(true);
+ return preprocess.run(QLatin1String("<stdin>"), source);
+ }
+
+ private:
+ Control m_control;
+ TranslationUnit m_translationUnit;
+
+ class Diagnostic: public DiagnosticClient {
+ public:
+ int errorCount;
+
+ Diagnostic() : errorCount(0) {}
+
+ void report(int /*level*/, const StringLiteral *fileName, unsigned line,
+ unsigned column, const char *format, va_list ap)
+ {
+ ++errorCount;
+ qDebug() << fileName->chars() << ':' << line << ':' << column
+ << ' ' << QString().vsprintf(format, ap);
+ }
+ } m_diagnosticClient;
+ };
+};
+
+void tst_TranslationUnit::unicodeIdentifier()
+{
+ QFETCH(QByteArray, identifierText);
+
+ Document::Ptr document = Document::create("void " + identifierText + ";");
+ QVERIFY(document);
+
+ const Identifier *actual = document->lastIdentifier();
+ QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
+ QString::fromUtf8(identifierText));
+}
+
+void tst_TranslationUnit::unicodeIdentifier_data()
+{
+ QTest::addColumn<QByteArray>("identifierText");
+
+ typedef QByteArray _;
+
+ QTest::newRow("latin1 identifier") << _("var");
+
+ QTest::newRow("non-latin1 identifier 1") << _("prefix\u00FC\u4E8C\U00010302");
+ QTest::newRow("non-latin1 identifier 2") << _("prefix\U00010302\u00FC\u4E8C");
+ QTest::newRow("non-latin1 identifier 3") << _("\U00010302\u00FC\u4E8C");
+ QTest::newRow("non-latin1 identifier 4") << _("\u4E8C\U00010302\u00FC");
+ QTest::newRow("non-latin1 identifier 5") << _("\u4E8C\U00010302\u00FCsuffix");
+ QTest::newRow("non-latin1 identifier 6") << _("\U00010302\u00FC\u4E8Csuffix");
+
+ // Some special cases (different code path inside lexer)
+ QTest::newRow("non-latin1 identifier 7") << _("LR\U00010302\u00FC\u4E8C");
+ QTest::newRow("non-latin1 identifier 8") << _("u8R\U00010302\u00FC\u4E8C");
+ QTest::newRow("non-latin1 identifier 9") << _("u8\U00010302\u00FC\u4E8C");
+ QTest::newRow("non-latin1 identifier 10") << _("u\U00010302\u00FC\u4E8C");
+}
+
+static QByteArray stripQuotesFromLiteral(const QByteArray literal)
+{
+ QByteArray result = literal;
+
+ // Strip front
+ while (!result.isEmpty() && result[0] != '"')
+ result = result.mid(1);
+ if (result.isEmpty())
+ return QByteArray();
+ result = result.mid(1);
+
+ // Strip end
+ while (result.size() >= 2
+ && (std::isspace(result[result.size() - 1]) || result[result.size()-1] == '"')) {
+ result.chop(1);
+ }
+
+ return result;
+}
+
+void tst_TranslationUnit::unicodeStringLiteral()
+{
+ QFETCH(QByteArray, literalText);
+
+ Document::Ptr document = Document::create("char t[] = " + literalText + ";");
+ QVERIFY(document);
+
+ const StringLiteral *actual = document->lastStringLiteral();
+ QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
+ QString::fromUtf8(stripQuotesFromLiteral(literalText)));
+}
+
+void tst_TranslationUnit::unicodeStringLiteral_data()
+{
+ QTest::addColumn<QByteArray>("literalText");
+
+ typedef QByteArray _;
+
+ QTest::newRow("latin1 literal") << _("\"var\"");
+
+ QTest::newRow("non-latin1 literal 1") << _("\"prefix\u00FC\u4E8C\U00010302\"");
+ QTest::newRow("non-latin1 literal 2") << _("\"prefix\U00010302\u00FC\u4E8C\"");
+ QTest::newRow("non-latin1 literal 3") << _("\"\U00010302\u00FC\u4E8C\"");
+ QTest::newRow("non-latin1 literal 4") << _("\"\u4E8C\U00010302\u00FC\"");
+ QTest::newRow("non-latin1 literal 5") << _("\"\u4E8C\U00010302\u00FCsuffix\"");
+ QTest::newRow("non-latin1 literal 6") << _("\"\U00010302\u00FC\u4E8Csuffix\"");
+
+ QTest::newRow("non-latin1 literal 7") << _("L\"\U00010302\u00FC\u4E8C\"");
+ QTest::newRow("non-latin1 literal 8") << _("u8\"\U00010302\u00FC\u4E8C\"");
+ QTest::newRow("non-latin1 literal 9") << _("u\"\U00010302\u00FC\u4E8C\"");
+ QTest::newRow("non-latin1 literal 10") << _("U\"\U00010302\u00FC\u4E8C\"");
+}
+
+QTEST_APPLESS_MAIN(tst_TranslationUnit)
+#include "tst_translationunit.moc"