Implement u8, u, and U strings from C11 (#439)

* Implement u8, u, and U strings from C11

* Also add u8, u, and U chars from C11 and C23

Co-authored-by: vit9696 <vit9696@users.noreply.github.com>
author: Vitaly Cheptsov <4348897+vit9696@users.noreply.github.com> 2021-10-05 15:36:06 +0300
committer: GitHub <noreply@github.com> 2021-10-05 05:36:06 -0700
commit: 277c6066c8b76f084a238d28bd6193871eb0b74f (patch)
tree: ef4b91364f5606437e38ba2da7b6ad704492cc8c
parent: d4c2922407c486da1ba9c6cd769990cc64308be6 (diff)
download: pycparser-277c6066c8b76f084a238d28bd6193871eb0b74f.tar.gz
5 files changed, 64 insertions, 0 deletions
diff --git a/pycparser/c_lexer.py b/pycparser/c_lexer.py
index 8fdd3d7..f861b5c 100644
--- a/pycparser/c_lexer.py
+++ b/pycparser/c_lexer.py
@@ -139,10 +139,16 @@ class CLexer(object):
         'FLOAT_CONST', 'HEX_FLOAT_CONST',
         'CHAR_CONST',
         'WCHAR_CONST',
+        'U8CHAR_CONST',
+        'U16CHAR_CONST',
+        'U32CHAR_CONST',
 
         # String literals
         'STRING_LITERAL',
         'WSTRING_LITERAL',
+        'U8STRING_LITERAL',
+        'U16STRING_LITERAL',
+        'U32STRING_LITERAL',
 
         # Operators
         'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
@@ -244,6 +250,9 @@ class CLexer(object):
     cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
     char_const = "'"+cconst_char+"'"
     wchar_const = 'L'+char_const
+    u8char_const = 'u8'+char_const
+    u16char_const = 'u'+char_const
+    u32char_const = 'U'+char_const
     multicharacter_constant = "'"+cconst_char+"{2,4}'"
     unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
     bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
@@ -252,6 +261,9 @@ class CLexer(object):
     string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
     string_literal = '"'+string_char+'*"'
     wstring_literal = 'L'+string_literal
+    u8string_literal = 'u8'+string_literal
+    u16string_literal = 'u'+string_literal
+    u32string_literal = 'U'+string_literal
     bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
 
     # floating constants (K&R2: A.2.5.3)
@@ -486,6 +498,18 @@ class CLexer(object):
     def t_WCHAR_CONST(self, t):
         return t
 
+    @TOKEN(u8char_const)
+    def t_U8CHAR_CONST(self, t):
+        return t
+
+    @TOKEN(u16char_const)
+    def t_U16CHAR_CONST(self, t):
+        return t
+
+    @TOKEN(u32char_const)
+    def t_U32CHAR_CONST(self, t):
+        return t
+
     @TOKEN(unmatched_quote)
     def t_UNMATCHED_QUOTE(self, t):
         msg = "Unmatched '"
@@ -500,6 +524,18 @@ class CLexer(object):
     def t_WSTRING_LITERAL(self, t):
         return t
 
+    @TOKEN(u8string_literal)
+    def t_U8STRING_LITERAL(self, t):
+        return t
+
+    @TOKEN(u16string_literal)
+    def t_U16STRING_LITERAL(self, t):
+        return t
+
+    @TOKEN(u32string_literal)
+    def t_U32STRING_LITERAL(self, t):
+        return t
+
     # unmatched string literals are caught by the preprocessor
 
     @TOKEN(bad_string_literal)
diff --git a/pycparser/c_parser.py b/pycparser/c_parser.py
index 4bbeeca..61e4471 100644
--- a/pycparser/c_parser.py
+++ b/pycparser/c_parser.py
@@ -1865,6 +1865,9 @@ class CParser(PLYParser):
     def p_constant_3(self, p):
         """ constant    : CHAR_CONST
                         | WCHAR_CONST
+                        | U8CHAR_CONST
+                        | U16CHAR_CONST
+                        | U32CHAR_CONST
         """
         p[0] = c_ast.Constant(
             'char', p[1], self._token_coord(p, 1))
@@ -1887,7 +1890,13 @@ class CParser(PLYParser):
 
     def p_unified_wstring_literal(self, p):
         """ unified_wstring_literal : WSTRING_LITERAL
+                                    | U8STRING_LITERAL
+                                    | U16STRING_LITERAL
+                                    | U32STRING_LITERAL
                                     | unified_wstring_literal WSTRING_LITERAL
+                                    | unified_wstring_literal U8STRING_LITERAL
+                                    | unified_wstring_literal U16STRING_LITERAL
+                                    | unified_wstring_literal U32STRING_LITERAL
         """
         if len(p) == 2: # single literal
             p[0] = c_ast.Constant(
diff --git a/tests/c_files/c11.c b/tests/c_files/c11.c
index 4f97a87..1c7ec08 100644
--- a/tests/c_files/c11.c
+++ b/tests/c_files/c11.c
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <stdatomic.h>
 #include <stdalign.h>
+#include <wchar.h>
 
 /* C11 thread locals */
 _Thread_local int flag;
@@ -38,6 +39,16 @@ int main()
   static_assert(_Alignof(int) == sizeof(int), "Unexpected int alignment");
   static_assert(alignof(int) == sizeof(int), "Unexpected int alignment");
 
+  wchar_t *w = L"12345";
+  char16_t *c16 = u"12345";
+  char32_t *c32 = U"12345";
+  char *u8 = u8"12345";
+
+  wchar_t wc = L'1';
+  char16_t c16c = u'1';
+  char32_t c32c = U'1';
+  char u8c = u8'1';
+
   printf("Flag: %d\n", flag);
   printf("Flag2: %d\n", flag2);
   func();
diff --git a/tests/test_c_lexer.py b/tests/test_c_lexer.py
index 1d3c39b..03fd838 100644
--- a/tests/test_c_lexer.py
+++ b/tests/test_c_lexer.py
@@ -122,6 +122,9 @@ class TestCLexerNoErrors(unittest.TestCase):
     def test_char_constants(self):
         self.assertTokensTypes(r"""'x'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""L'x'""", ['WCHAR_CONST'])
+        self.assertTokensTypes(r"""u8'x'""", ['U8CHAR_CONST'])
+        self.assertTokensTypes(r"""u'x'""", ['U16CHAR_CONST'])
+        self.assertTokensTypes(r"""U'x'""", ['U32CHAR_CONST'])
         self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\''""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST'])
@@ -147,6 +150,9 @@ class TestCLexerNoErrors(unittest.TestCase):
     def test_string_literal(self):
         self.assertTokensTypes('"a string"', ['STRING_LITERAL'])
         self.assertTokensTypes('L"ing"', ['WSTRING_LITERAL'])
+        self.assertTokensTypes('u8"ing"', ['U8STRING_LITERAL'])
+        self.assertTokensTypes('u"ing"', ['U16STRING_LITERAL'])
+        self.assertTokensTypes('U"ing"', ['U32STRING_LITERAL'])
         self.assertTokensTypes(
             '"i am a string too \t"',
             ['STRING_LITERAL'])
diff --git a/utils/fake_libc_include/_fake_typedefs.h b/utils/fake_libc_include/_fake_typedefs.h
index 9a85d40..3be1462 100644
--- a/utils/fake_libc_include/_fake_typedefs.h
+++ b/utils/fake_libc_include/_fake_typedefs.h
@@ -42,6 +42,8 @@ typedef int __ULong;
 typedef int __FILE;
 typedef int ptrdiff_t;
 typedef int wchar_t;
+typedef int char16_t;
+typedef int char32_t;
 typedef int __off_t;
 typedef int __pid_t;
 typedef int __loff_t;
author	Vitaly Cheptsov <4348897+vit9696@users.noreply.github.com>	2021-10-05 15:36:06 +0300
committer	GitHub <noreply@github.com>	2021-10-05 05:36:06 -0700
commit	277c6066c8b76f084a238d28bd6193871eb0b74f (patch)
tree	ef4b91364f5606437e38ba2da7b6ad704492cc8c
parent	d4c2922407c486da1ba9c6cd769990cc64308be6 (diff)
download	pycparser-277c6066c8b76f084a238d28bd6193871eb0b74f.tar.gz