diff options
author | Vitaly Cheptsov <4348897+vit9696@users.noreply.github.com> | 2021-10-05 15:36:06 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-05 05:36:06 -0700 |
commit | 277c6066c8b76f084a238d28bd6193871eb0b74f (patch) | |
tree | ef4b91364f5606437e38ba2da7b6ad704492cc8c | |
parent | d4c2922407c486da1ba9c6cd769990cc64308be6 (diff) | |
download | pycparser-277c6066c8b76f084a238d28bd6193871eb0b74f.tar.gz |
Implement u8, u, and U strings from C11 (#439)
* Implement u8, u, and U strings from C11
* Also add u8, u, and U chars from C11 and C23
Co-authored-by: vit9696 <vit9696@users.noreply.github.com>
-rw-r--r-- | pycparser/c_lexer.py | 36 | ||||
-rw-r--r-- | pycparser/c_parser.py | 9 | ||||
-rw-r--r-- | tests/c_files/c11.c | 11 | ||||
-rw-r--r-- | tests/test_c_lexer.py | 6 | ||||
-rw-r--r-- | utils/fake_libc_include/_fake_typedefs.h | 2 |
5 files changed, 64 insertions, 0 deletions
```diff
diff --git a/pycparser/c_lexer.py b/pycparser/c_lexer.py
index 8fdd3d7..f861b5c 100644
--- a/pycparser/c_lexer.py
+++ b/pycparser/c_lexer.py
@@ -139,10 +139,16 @@ class CLexer(object):
         'FLOAT_CONST', 'HEX_FLOAT_CONST',
         'CHAR_CONST',
         'WCHAR_CONST',
+        'U8CHAR_CONST',
+        'U16CHAR_CONST',
+        'U32CHAR_CONST',
 
         # String literals
         'STRING_LITERAL',
         'WSTRING_LITERAL',
+        'U8STRING_LITERAL',
+        'U16STRING_LITERAL',
+        'U32STRING_LITERAL',
 
         # Operators
         'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
@@ -244,6 +250,9 @@ class CLexer(object):
     cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
     char_const = "'"+cconst_char+"'"
     wchar_const = 'L'+char_const
+    u8char_const = 'u8'+char_const
+    u16char_const = 'u'+char_const
+    u32char_const = 'U'+char_const
     multicharacter_constant = "'"+cconst_char+"{2,4}'"
     unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
     bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
@@ -252,6 +261,9 @@ class CLexer(object):
     string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
     string_literal = '"'+string_char+'*"'
     wstring_literal = 'L'+string_literal
+    u8string_literal = 'u8'+string_literal
+    u16string_literal = 'u'+string_literal
+    u32string_literal = 'U'+string_literal
     bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
 
     # floating constants (K&R2: A.2.5.3)
@@ -486,6 +498,18 @@ class CLexer(object):
     def t_WCHAR_CONST(self, t):
         return t
 
+    @TOKEN(u8char_const)
+    def t_U8CHAR_CONST(self, t):
+        return t
+
+    @TOKEN(u16char_const)
+    def t_U16CHAR_CONST(self, t):
+        return t
+
+    @TOKEN(u32char_const)
+    def t_U32CHAR_CONST(self, t):
+        return t
+
     @TOKEN(unmatched_quote)
     def t_UNMATCHED_QUOTE(self, t):
         msg = "Unmatched '"
@@ -500,6 +524,18 @@ class CLexer(object):
     def t_WSTRING_LITERAL(self, t):
         return t
 
+    @TOKEN(u8string_literal)
+    def t_U8STRING_LITERAL(self, t):
+        return t
+
+    @TOKEN(u16string_literal)
+    def t_U16STRING_LITERAL(self, t):
+        return t
+
+    @TOKEN(u32string_literal)
+    def t_U32STRING_LITERAL(self, t):
+        return t
+
     # unmatched string literals are caught by the preprocessor
 
     @TOKEN(bad_string_literal)
diff --git a/pycparser/c_parser.py b/pycparser/c_parser.py
index 4bbeeca..61e4471 100644
--- a/pycparser/c_parser.py
+++ b/pycparser/c_parser.py
@@ -1865,6 +1865,9 @@ class CParser(PLYParser):
     def p_constant_3(self, p):
         """ constant    : CHAR_CONST
                         | WCHAR_CONST
+                        | U8CHAR_CONST
+                        | U16CHAR_CONST
+                        | U32CHAR_CONST
         """
         p[0] = c_ast.Constant(
             'char', p[1], self._token_coord(p, 1))
@@ -1887,7 +1890,13 @@ class CParser(PLYParser):
 
     def p_unified_wstring_literal(self, p):
         """ unified_wstring_literal : WSTRING_LITERAL
+            | U8STRING_LITERAL
+            | U16STRING_LITERAL
+            | U32STRING_LITERAL
             | unified_wstring_literal WSTRING_LITERAL
+            | unified_wstring_literal U8STRING_LITERAL
+            | unified_wstring_literal U16STRING_LITERAL
+            | unified_wstring_literal U32STRING_LITERAL
         """
         if len(p) == 2: # single literal
             p[0] = c_ast.Constant(
diff --git a/tests/c_files/c11.c b/tests/c_files/c11.c
index 4f97a87..1c7ec08 100644
--- a/tests/c_files/c11.c
+++ b/tests/c_files/c11.c
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <stdatomic.h>
 #include <stdalign.h>
+#include <wchar.h>
 
 /* C11 thread locals */
 _Thread_local int flag;
@@ -38,6 +39,16 @@ int main()
     static_assert(_Alignof(int) == sizeof(int), "Unexpected int alignment");
     static_assert(alignof(int) == sizeof(int), "Unexpected int alignment");
 
+    wchar_t *w = L"12345";
+    char16_t *c16 = u"12345";
+    char32_t *c32 = U"12345";
+    char *u8 = u8"12345";
+
+    wchar_t wc = L'1';
+    char16_t c16c = u'1';
+    char32_t c32c = U'1';
+    char u8c = u8'1';
+
     printf("Flag: %d\n", flag);
     printf("Flag2: %d\n", flag2);
     func();
diff --git a/tests/test_c_lexer.py b/tests/test_c_lexer.py
index 1d3c39b..03fd838 100644
--- a/tests/test_c_lexer.py
+++ b/tests/test_c_lexer.py
@@ -122,6 +122,9 @@ class TestCLexerNoErrors(unittest.TestCase):
     def test_char_constants(self):
         self.assertTokensTypes(r"""'x'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""L'x'""", ['WCHAR_CONST'])
+        self.assertTokensTypes(r"""u8'x'""", ['U8CHAR_CONST'])
+        self.assertTokensTypes(r"""u'x'""", ['U16CHAR_CONST'])
+        self.assertTokensTypes(r"""U'x'""", ['U32CHAR_CONST'])
         self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\''""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST'])
@@ -147,6 +150,9 @@ class TestCLexerNoErrors(unittest.TestCase):
     def test_string_literal(self):
         self.assertTokensTypes('"a string"', ['STRING_LITERAL'])
         self.assertTokensTypes('L"ing"', ['WSTRING_LITERAL'])
+        self.assertTokensTypes('u8"ing"', ['U8STRING_LITERAL'])
+        self.assertTokensTypes('u"ing"', ['U16STRING_LITERAL'])
+        self.assertTokensTypes('U"ing"', ['U32STRING_LITERAL'])
         self.assertTokensTypes(
             '"i am a string too \t"',
             ['STRING_LITERAL'])
diff --git a/utils/fake_libc_include/_fake_typedefs.h b/utils/fake_libc_include/_fake_typedefs.h
index 9a85d40..3be1462 100644
--- a/utils/fake_libc_include/_fake_typedefs.h
+++ b/utils/fake_libc_include/_fake_typedefs.h
@@ -42,6 +42,8 @@ typedef int __ULong;
 typedef int __FILE;
 typedef int ptrdiff_t;
 typedef int wchar_t;
+typedef int char16_t;
+typedef int char32_t;
 typedef int __off_t;
 typedef int __pid_t;
 typedef int __loff_t;
```