summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Vitaly Cheptsov <4348897+vit9696@users.noreply.github.com> 2021-10-05 15:36:06 +0300
committer: GitHub <noreply@github.com> 2021-10-05 05:36:06 -0700
commit: 277c6066c8b76f084a238d28bd6193871eb0b74f (patch)
tree: ef4b91364f5606437e38ba2da7b6ad704492cc8c
parent: d4c2922407c486da1ba9c6cd769990cc64308be6 (diff)
download: pycparser-277c6066c8b76f084a238d28bd6193871eb0b74f.tar.gz
Implement u8, u, and U strings from C11 (#439)
* Implement u8, u, and U strings from C11
* Also add u8, u, and U chars from C11 and C23

Co-authored-by: vit9696 <vit9696@users.noreply.github.com>
-rw-r--r-- pycparser/c_lexer.py | 36
-rw-r--r-- pycparser/c_parser.py | 9
-rw-r--r-- tests/c_files/c11.c | 11
-rw-r--r-- tests/test_c_lexer.py | 6
-rw-r--r-- utils/fake_libc_include/_fake_typedefs.h | 2
5 files changed, 64 insertions, 0 deletions
diff --git a/pycparser/c_lexer.py b/pycparser/c_lexer.py
index 8fdd3d7..f861b5c 100644
--- a/pycparser/c_lexer.py
+++ b/pycparser/c_lexer.py
@@ -139,10 +139,16 @@ class CLexer(object):
'FLOAT_CONST', 'HEX_FLOAT_CONST',
'CHAR_CONST',
'WCHAR_CONST',
+ 'U8CHAR_CONST',
+ 'U16CHAR_CONST',
+ 'U32CHAR_CONST',
# String literals
'STRING_LITERAL',
'WSTRING_LITERAL',
+ 'U8STRING_LITERAL',
+ 'U16STRING_LITERAL',
+ 'U32STRING_LITERAL',
# Operators
'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
@@ -244,6 +250,9 @@ class CLexer(object):
cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
char_const = "'"+cconst_char+"'"
wchar_const = 'L'+char_const
+ u8char_const = 'u8'+char_const
+ u16char_const = 'u'+char_const
+ u32char_const = 'U'+char_const
multicharacter_constant = "'"+cconst_char+"{2,4}'"
unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
@@ -252,6 +261,9 @@ class CLexer(object):
string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
string_literal = '"'+string_char+'*"'
wstring_literal = 'L'+string_literal
+ u8string_literal = 'u8'+string_literal
+ u16string_literal = 'u'+string_literal
+ u32string_literal = 'U'+string_literal
bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
# floating constants (K&R2: A.2.5.3)
@@ -486,6 +498,18 @@ class CLexer(object):
def t_WCHAR_CONST(self, t):
return t
+ @TOKEN(u8char_const)
+ def t_U8CHAR_CONST(self, t):
+ return t
+
+ @TOKEN(u16char_const)
+ def t_U16CHAR_CONST(self, t):
+ return t
+
+ @TOKEN(u32char_const)
+ def t_U32CHAR_CONST(self, t):
+ return t
+
@TOKEN(unmatched_quote)
def t_UNMATCHED_QUOTE(self, t):
msg = "Unmatched '"
@@ -500,6 +524,18 @@ class CLexer(object):
def t_WSTRING_LITERAL(self, t):
return t
+ @TOKEN(u8string_literal)
+ def t_U8STRING_LITERAL(self, t):
+ return t
+
+ @TOKEN(u16string_literal)
+ def t_U16STRING_LITERAL(self, t):
+ return t
+
+ @TOKEN(u32string_literal)
+ def t_U32STRING_LITERAL(self, t):
+ return t
+
# unmatched string literals are caught by the preprocessor
@TOKEN(bad_string_literal)
diff --git a/pycparser/c_parser.py b/pycparser/c_parser.py
index 4bbeeca..61e4471 100644
--- a/pycparser/c_parser.py
+++ b/pycparser/c_parser.py
@@ -1865,6 +1865,9 @@ class CParser(PLYParser):
def p_constant_3(self, p):
""" constant : CHAR_CONST
| WCHAR_CONST
+ | U8CHAR_CONST
+ | U16CHAR_CONST
+ | U32CHAR_CONST
"""
p[0] = c_ast.Constant(
'char', p[1], self._token_coord(p, 1))
@@ -1887,7 +1890,13 @@ class CParser(PLYParser):
def p_unified_wstring_literal(self, p):
""" unified_wstring_literal : WSTRING_LITERAL
+ | U8STRING_LITERAL
+ | U16STRING_LITERAL
+ | U32STRING_LITERAL
| unified_wstring_literal WSTRING_LITERAL
+ | unified_wstring_literal U8STRING_LITERAL
+ | unified_wstring_literal U16STRING_LITERAL
+ | unified_wstring_literal U32STRING_LITERAL
"""
if len(p) == 2: # single literal
p[0] = c_ast.Constant(
diff --git a/tests/c_files/c11.c b/tests/c_files/c11.c
index 4f97a87..1c7ec08 100644
--- a/tests/c_files/c11.c
+++ b/tests/c_files/c11.c
@@ -5,6 +5,7 @@
#include <assert.h>
#include <stdatomic.h>
#include <stdalign.h>
+#include <wchar.h>
/* C11 thread locals */
_Thread_local int flag;
@@ -38,6 +39,16 @@ int main()
static_assert(_Alignof(int) == sizeof(int), "Unexpected int alignment");
static_assert(alignof(int) == sizeof(int), "Unexpected int alignment");
+ wchar_t *w = L"12345";
+ char16_t *c16 = u"12345";
+ char32_t *c32 = U"12345";
+ char *u8 = u8"12345";
+
+ wchar_t wc = L'1';
+ char16_t c16c = u'1';
+ char32_t c32c = U'1';
+ char u8c = u8'1';
+
printf("Flag: %d\n", flag);
printf("Flag2: %d\n", flag2);
func();
diff --git a/tests/test_c_lexer.py b/tests/test_c_lexer.py
index 1d3c39b..03fd838 100644
--- a/tests/test_c_lexer.py
+++ b/tests/test_c_lexer.py
@@ -122,6 +122,9 @@ class TestCLexerNoErrors(unittest.TestCase):
def test_char_constants(self):
self.assertTokensTypes(r"""'x'""", ['CHAR_CONST'])
self.assertTokensTypes(r"""L'x'""", ['WCHAR_CONST'])
+ self.assertTokensTypes(r"""u8'x'""", ['U8CHAR_CONST'])
+ self.assertTokensTypes(r"""u'x'""", ['U16CHAR_CONST'])
+ self.assertTokensTypes(r"""U'x'""", ['U32CHAR_CONST'])
self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\''""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST'])
@@ -147,6 +150,9 @@ class TestCLexerNoErrors(unittest.TestCase):
def test_string_literal(self):
self.assertTokensTypes('"a string"', ['STRING_LITERAL'])
self.assertTokensTypes('L"ing"', ['WSTRING_LITERAL'])
+ self.assertTokensTypes('u8"ing"', ['U8STRING_LITERAL'])
+ self.assertTokensTypes('u"ing"', ['U16STRING_LITERAL'])
+ self.assertTokensTypes('U"ing"', ['U32STRING_LITERAL'])
self.assertTokensTypes(
'"i am a string too \t"',
['STRING_LITERAL'])
diff --git a/utils/fake_libc_include/_fake_typedefs.h b/utils/fake_libc_include/_fake_typedefs.h
index 9a85d40..3be1462 100644
--- a/utils/fake_libc_include/_fake_typedefs.h
+++ b/utils/fake_libc_include/_fake_typedefs.h
@@ -42,6 +42,8 @@ typedef int __ULong;
typedef int __FILE;
typedef int ptrdiff_t;
typedef int wchar_t;
+typedef int char16_t;
+typedef int char32_t;
typedef int __off_t;
typedef int __pid_t;
typedef int __loff_t;