summaryrefslogtreecommitdiff
path: root/Tools/c-analyzer/c_parser/parser/_regexes.py
diff options
context:
space:
mode:
Diffstat (limited to 'Tools/c-analyzer/c_parser/parser/_regexes.py')
-rw-r--r--Tools/c-analyzer/c_parser/parser/_regexes.py796
1 files changed, 796 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_parser/parser/_regexes.py b/Tools/c-analyzer/c_parser/parser/_regexes.py
new file mode 100644
index 0000000000..e9bc31d335
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_regexes.py
@@ -0,0 +1,796 @@
+# Regular expression patterns for C syntax.
+#
+# None of these patterns has any capturing. However, a number of them
+# have capturing markers compatible with utils.set_capture_groups().
+
+import textwrap
+
+
+def _ind(text, level=1, edges='both'):
+ indent = ' ' * level
+ text = textwrap.indent(text, indent)
+ if edges == 'pre' or edges == 'both':
+ text = '\n' + indent + text.lstrip()
+ if edges == 'post' or edges == 'both':
+ text = text.rstrip() + '\n' + ' ' * (level - 1)
+ return text
+
+
+#######################################
+# general
+
+HEX = r'(?: [0-9a-zA-Z] )'
+
+STRING_LITERAL = textwrap.dedent(rf'''
+ (?:
+ # character literal
+ (?:
+ ['] [^'] [']
+ |
+ ['] \\ . [']
+ |
+ ['] \\x{HEX}{HEX} [']
+ |
+ ['] \\0\d\d [']
+ |
+ (?:
+ ['] \\o[01]\d\d [']
+ |
+ ['] \\o2[0-4]\d [']
+ |
+ ['] \\o25[0-5] [']
+ )
+ )
+ |
+ # string literal
+ (?:
+ ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
+ )
+ # end string literal
+ )
+ ''')
+
+_KEYWORD = textwrap.dedent(r'''
+ (?:
+ \b
+ (?:
+ auto |
+ extern |
+ register |
+ static |
+ typedef |
+
+ const |
+ volatile |
+
+ signed |
+ unsigned |
+ char |
+ short |
+ int |
+ long |
+ float |
+ double |
+ void |
+
+ struct |
+ union |
+ enum |
+
+ goto |
+ return |
+ sizeof |
+ break |
+ continue |
+ if |
+ else |
+ for |
+ do |
+ while |
+ switch |
+ case |
+ default |
+ entry
+ )
+ \b
+ )
+ ''')
+KEYWORD = rf'''
+ # keyword
+ {_KEYWORD}
+ # end keyword
+ '''
+_KEYWORD = ''.join(_KEYWORD.split())
+
+IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
+# We use a negative lookahead to filter out keywords.
+STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
+ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
+
+
+#######################################
+# types
+
+SIMPLE_TYPE = textwrap.dedent(rf'''
+ # simple type
+ (?:
+ \b
+ (?:
+ void
+ |
+ (?: signed | unsigned ) # implies int
+ |
+ (?:
+ (?: (?: signed | unsigned ) \s+ )?
+ (?: (?: long | short ) \s+ )?
+ (?: char | short | int | long | float | double )
+ )
+ )
+ \b
+ )
+ # end simple type
+ ''')
+
+COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
+
+
+#######################################
+# variable declarations
+
+STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )'
+TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
+PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'
+
+TYPE_SPEC = textwrap.dedent(rf'''
+ # type spec
+ (?:
+ {_ind(SIMPLE_TYPE, 2)}
+ |
+ (?:
+ [_]*typeof[_]*
+ \s* [(]
+ (?: \s* [*&] )*
+ \s* {STRICT_IDENTIFIER}
+ \s* [)]
+ )
+ |
+ # reference to a compound type
+ (?:
+ {COMPOUND_TYPE_KIND}
+ (?: \s* {ANON_IDENTIFIER} )?
+ )
+ |
+ # reference to a typedef
+ {STRICT_IDENTIFIER}
+ )
+ # end type spec
+ ''')
+
+DECLARATOR = textwrap.dedent(rf'''
+ # declarator (possibly abstract)
+ (?:
+ (?: {PTR_QUALIFIER} \s* )*
+ (?:
+ (?:
+ (?: # <IDENTIFIER>
+ {STRICT_IDENTIFIER}
+ )
+ (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
+ )
+ |
+ (?:
+ [(] \s*
+ (?: # <WRAPPED_IDENTIFIER>
+ {STRICT_IDENTIFIER}
+ )
+ (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
+ \s* [)]
+ )
+ |
+ # func ptr
+ (?:
+ [(] (?: \s* {PTR_QUALIFIER} )? \s*
+ (?: # <FUNC_IDENTIFIER>
+ {STRICT_IDENTIFIER}
+ )
+ (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
+ \s* [)]
+ # We allow for a single level of paren nesting in parameters.
+ \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
+ )
+ )
+ )
+ # end declarator
+ ''')
+
+VAR_DECL = textwrap.dedent(rf'''
+ # var decl (and typedef and func return type)
+ (?:
+ (?:
+ (?: # <STORAGE>
+ {STORAGE_CLASS}
+ )
+ \s*
+ )?
+ (?:
+ (?: # <TYPE_QUAL>
+ {TYPE_QUALIFIER}
+ )
+ \s*
+ )?
+ (?:
+ (?: # <TYPE_SPEC>
+ {_ind(TYPE_SPEC, 4)}
+ )
+ )
+ \s*
+ (?:
+ (?: # <DECLARATOR>
+ {_ind(DECLARATOR, 4)}
+ )
+ )
+ )
+ # end var decl
+ ''')
+
+INITIALIZER = textwrap.dedent(rf'''
+ # initializer
+ (?:
+ (?:
+ [(]
+ # no nested parens (e.g. func ptr)
+ [^)]*
+ [)]
+ \s*
+ )?
+ (?:
+ # a string literal
+ (?:
+ (?: {_ind(STRING_LITERAL, 4)} \s* )*
+ {_ind(STRING_LITERAL, 4)}
+ )
+ |
+
+ # a simple initializer
+ (?:
+ (?:
+ [^'",;{{]*
+ {_ind(STRING_LITERAL, 4)}
+ )*
+ [^'",;{{]*
+ )
+ |
+
+ # a struct/array literal
+ (?:
+ # We only expect compound initializers with
+ # single-variable declarations.
+ {{
+ (?:
+ [^'";]*?
+ {_ind(STRING_LITERAL, 5)}
+ )*
+ [^'";]*?
+ }}
+ (?= \s* ; ) # Note this lookahead.
+ )
+ )
+ )
+ # end initializer
+ ''')
+
+
+#######################################
+# compound type declarations
+
+STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
+ (?:
+ # inline compound type decl
+ (?:
+ (?: # <COMPOUND_TYPE_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <COMPOUND_TYPE_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ (?:
+ # typed member
+ (?:
+ # Technically it doesn't have to have a type...
+ (?: # <SPECIFIER_QUALIFIER>
+ (?: {TYPE_QUALIFIER} \s* )?
+ {_ind(TYPE_SPEC, 5)}
+ )
+ (?:
+ # If it doesn't have a declarator then it will have
+ # a size and vice versa.
+ \s*
+ (?: # <DECLARATOR>
+ {_ind(DECLARATOR, 6)}
+ )
+ )?
+ )
+
+ # sized member
+ (?:
+ \s* [:] \s*
+ (?: # <SIZE>
+ \d+
+ )
+ )?
+ \s*
+ (?: # <ENDING>
+ [,;]
+ )
+ )
+ |
+ (?:
+ \s*
+ (?: # <CLOSE>
+ }}
+ )
+ )
+ )
+ ''')
+
+ENUM_MEMBER_DECL = textwrap.dedent(rf'''
+ (?:
+ (?:
+ \s*
+ (?: # <CLOSE>
+ }}
+ )
+ )
+ |
+ (?:
+ \s*
+ (?: # <NAME>
+ {IDENTIFIER}
+ )
+ (?:
+ \s* = \s*
+ (?: # <INIT>
+ {_ind(STRING_LITERAL, 4)}
+ |
+ [^'",}}]+
+ )
+ )?
+ \s*
+ (?: # <ENDING>
+ , | }}
+ )
+ )
+ )
+ ''')
+
+
+#######################################
+# statements
+
+SIMPLE_STMT_BODY = textwrap.dedent(rf'''
+ # simple statement body
+ (?:
+ (?:
+ [^'"{{}};]*
+ {_ind(STRING_LITERAL, 3)}
+ )*
+ [^'"{{}};]*
+ #(?= [;{{] ) # Note this lookahead.
+ )
+ # end simple statement body
+ ''')
+SIMPLE_STMT = textwrap.dedent(rf'''
+ # simple statement
+ (?:
+ (?: # <SIMPLE_STMT>
+ # stmt-inline "initializer"
+ (?:
+ return \b
+ (?:
+ \s*
+ {_ind(INITIALIZER, 5)}
+ )?
+ )
+ |
+ # variable assignment
+ (?:
+ (?: [*] \s* )?
+ (?:
+ {STRICT_IDENTIFIER} \s*
+ (?: . | -> ) \s*
+ )*
+ {STRICT_IDENTIFIER}
+ (?: \s* \[ \s* \d+ \s* \] )?
+ \s* = \s*
+ {_ind(INITIALIZER, 4)}
+ )
+ |
+ # catchall return statement
+ (?:
+ return \b
+ (?:
+ (?:
+ [^'";]*
+ {_ind(STRING_LITERAL, 6)}
+ )*
+ \s* [^'";]*
+ )?
+ )
+ |
+ # simple statement
+ (?:
+ {_ind(SIMPLE_STMT_BODY, 4)}
+ )
+ )
+ \s*
+ (?: # <SIMPLE_ENDING>
+ ;
+ )
+ )
+ # end simple statement
+ ''')
+COMPOUND_STMT = textwrap.dedent(rf'''
+ # compound statement
+ (?:
+ \b
+ (?:
+ (?:
+ (?: # <COMPOUND_BARE>
+ else | do
+ )
+ \b
+ )
+ |
+ (?:
+ (?: # <COMPOUND_LABELED>
+ (?:
+ case \b
+ (?:
+ [^'":]*
+ {_ind(STRING_LITERAL, 7)}
+ )*
+ \s* [^'":]*
+ )
+ |
+ default
+ |
+ {STRICT_IDENTIFIER}
+ )
+ \s* [:]
+ )
+ |
+ (?:
+ (?: # <COMPOUND_PAREN>
+ for | while | if | switch
+ )
+ \s* (?= [(] ) # Note this lookahead.
+ )
+ )
+ \s*
+ )
+ # end compound statement
+ ''')
+
+
+#######################################
+# function bodies
+
+LOCAL = textwrap.dedent(rf'''
+ (?:
+ # an empty statement
+ (?: # <EMPTY>
+ ;
+ )
+ |
+ # inline type decl
+ (?:
+ (?:
+ (?: # <INLINE_LEADING>
+ [^;{{}}]+?
+ )
+ \s*
+ )?
+ (?: # <INLINE_PRE>
+ (?: {STORAGE_CLASS} \s* )?
+ (?: {TYPE_QUALIFIER} \s* )?
+ )? # </INLINE_PRE>
+ (?: # <INLINE_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <INLINE_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ # var decl
+ (?:
+ (?: # <STORAGE>
+ {STORAGE_CLASS}
+ )? # </STORAGE>
+ (?:
+ \s*
+ (?: # <VAR_DECL>
+ {_ind(VAR_DECL, 5)}
+ )
+ )
+ (?:
+ (?:
+ # initializer
+ # We expect only basic initializers.
+ \s* = \s*
+ (?: # <VAR_INIT>
+ {_ind(INITIALIZER, 6)}
+ )
+ )?
+ (?:
+ \s*
+ (?: # <VAR_ENDING>
+ [,;]
+ )
+ )
+ )
+ )
+ |
+ {_ind(COMPOUND_STMT, 2)}
+ |
+ # start-of-block
+ (?:
+ (?: # <BLOCK_LEADING>
+ (?:
+ [^'"{{}};]*
+ {_ind(STRING_LITERAL, 5)}
+ )*
+ [^'"{{}};]*
+ # Presumably we will not see "== {{".
+ [^\s='"{{}});]
+ \s*
+ )? # </BLOCK_LEADING>
+ (?: # <BLOCK_OPEN>
+ {{
+ )
+ )
+ |
+ {_ind(SIMPLE_STMT, 2)}
+ |
+ # end-of-block
+ (?: # <BLOCK_CLOSE>
+ }}
+ )
+ )
+ ''')
+
+LOCAL_STATICS = textwrap.dedent(rf'''
+ (?:
+ # inline type decl
+ (?:
+ (?:
+ (?: # <INLINE_LEADING>
+ [^;{{}}]+?
+ )
+ \s*
+ )?
+ (?: # <INLINE_PRE>
+ (?: {STORAGE_CLASS} \s* )?
+ (?: {TYPE_QUALIFIER} \s* )?
+ )?
+ (?: # <INLINE_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <INLINE_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ # var decl
+ (?:
+ # We only look for static variables.
+ (?: # <STATIC_DECL>
+ static \b
+ (?: \s* {TYPE_QUALIFIER} )?
+ \s* {_ind(TYPE_SPEC, 4)}
+ \s* {_ind(DECLARATOR, 4)}
+ )
+ \s*
+ (?:
+ (?: # <STATIC_INIT>
+ = \s*
+ {_ind(INITIALIZER, 4)}
+ \s*
+ [,;{{]
+ )
+ |
+ (?: # <STATIC_ENDING>
+ [,;]
+ )
+ )
+ )
+ |
+ # everything else
+ (?:
+ (?: # <DELIM_LEADING>
+ (?:
+ [^'"{{}};]*
+ {_ind(STRING_LITERAL, 4)}
+ )*
+ \s* [^'"{{}};]*
+ )
+ (?:
+ (?: # <BLOCK_OPEN>
+ {{
+ )
+ |
+ (?: # <BLOCK_CLOSE>
+ }}
+ )
+ |
+ (?: # <STMT_END>
+ ;
+ )
+ )
+ )
+ )
+ ''')
+
+
+#######################################
+# global declarations
+
+GLOBAL = textwrap.dedent(rf'''
+ (?:
+ # an empty statement
+ (?: # <EMPTY>
+ ;
+ )
+ |
+
+ # compound type decl (maybe inline)
+ (?:
+ (?:
+ (?: # <COMPOUND_LEADING>
+ [^;{{}}]+?
+ )
+ \s*
+ )?
+ (?: # <COMPOUND_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <COMPOUND_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ # bogus inline decl artifact
+ # This simplifies resolving the relative syntactic ambiguity of
+ # inline structs.
+ (?:
+ (?: # <FORWARD_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ \s*
+ (?: # <FORWARD_NAME>
+ {ANON_IDENTIFIER}
+ )
+ (?: # <MAYBE_INLINE_ACTUAL>
+ [^=,;({{[*\]]*
+ [=,;({{]
+ )
+ )
+ |
+
+ # typedef
+ (?:
+ \b typedef \b \s*
+ (?: # <TYPEDEF_DECL>
+ {_ind(VAR_DECL, 4)}
+ )
+ (?:
+ # We expect no inline type definitions in the parameters.
+ \s* [(] \s*
+ (?: # <TYPEDEF_FUNC_PARAMS>
+ [^{{;]*
+ )
+ \s* [)]
+ )?
+ \s* ;
+ )
+ |
+
+ # func decl/definition & var decls
+ # XXX dedicated pattern for funcs (more restricted)?
+ (?:
+ (?:
+ (?: # <VAR_STORAGE>
+ {STORAGE_CLASS}
+ )
+ \s*
+ )?
+ (?:
+ (?: # <FUNC_INLINE>
+ \b inline \b
+ )
+ \s*
+ )?
+ (?: # <VAR_DECL>
+ {_ind(VAR_DECL, 4)}
+ )
+ (?:
+ # func decl / definition
+ (?:
+ (?:
+ # We expect no inline type definitions in the parameters.
+ \s* [(] \s*
+ (?: # <FUNC_PARAMS>
+ [^{{;]*
+ )
+ \s* [)] \s*
+ (?: # <FUNC_DELIM>
+ [{{;]
+ )
+ )
+ |
+ (?:
+ # This is some old-school syntax!
+ \s* [(] \s*
+ # We throw away the bare names:
+ {STRICT_IDENTIFIER}
+ (?: \s* , \s* {STRICT_IDENTIFIER} )*
+ \s* [)] \s*
+
+ # We keep the trailing param declarations:
+ (?: # <FUNC_LEGACY_PARAMS>
+ # There's at least one!
+ (?: {TYPE_QUALIFIER} \s* )?
+ {_ind(TYPE_SPEC, 7)}
+ \s*
+ {_ind(DECLARATOR, 7)}
+ \s* ;
+ (?:
+ \s*
+ (?: {TYPE_QUALIFIER} \s* )?
+ {_ind(TYPE_SPEC, 8)}
+ \s*
+ {_ind(DECLARATOR, 8)}
+ \s* ;
+ )*
+ )
+ \s* {{
+ )
+ )
+ |
+ # var / typedef
+ (?:
+ (?:
+ # initializer
+ # We expect only basic initializers.
+ \s* = \s*
+ (?: # <VAR_INIT>
+ {_ind(INITIALIZER, 6)}
+ )
+ )?
+ \s*
+ (?: # <VAR_ENDING>
+ [,;]
+ )
+ )
+ )
+ )
+ )
+ ''')