summaryrefslogtreecommitdiff
path: root/Tools/c-analyzer/c_parser/parser/_regexes.py
diff options
context:
space:
mode:
authorEric Snow <ericsnowcurrently@gmail.com>2020-10-22 18:42:51 -0600
committerGitHub <noreply@github.com>2020-10-22 18:42:51 -0600
commit345cd37abe324ad4f60f80e2c3133b8849e54e9b (patch)
tree5d965e662dca9dcac19e7eddd63a3d9d0b816fed /Tools/c-analyzer/c_parser/parser/_regexes.py
parentec388cfb4ede56dace2bb78851ff6f38fa2a6abe (diff)
downloadcpython-git-345cd37abe324ad4f60f80e2c3133b8849e54e9b.tar.gz
bpo-36876: Fix the C analyzer tool. (GH-22841)
The original tool wasn't working right and it was simpler to create a new one, partially re-using some of the old code. At this point the tool runs properly on the master branch. (Try: ./python Tools/c-analyzer/c-analyzer.py analyze.) It takes ~40 seconds on my machine to analyze the full CPython code base. Note that we'll need to iron out some OS-specific stuff (e.g. preprocessor). We're okay though since this tool isn't used yet in our workflow. We will also need to verify the analysis results in detail before activating the check in CI, though I'm pretty sure it's close. https://bugs.python.org/issue36876
Diffstat (limited to 'Tools/c-analyzer/c_parser/parser/_regexes.py')
-rw-r--r--Tools/c-analyzer/c_parser/parser/_regexes.py796
1 files changed, 796 insertions, 0 deletions
diff --git a/Tools/c-analyzer/c_parser/parser/_regexes.py b/Tools/c-analyzer/c_parser/parser/_regexes.py
new file mode 100644
index 0000000000..e9bc31d335
--- /dev/null
+++ b/Tools/c-analyzer/c_parser/parser/_regexes.py
@@ -0,0 +1,796 @@
+# Regular expression patterns for C syntax.
+#
+# None of these patterns has any capturing. However, a number of them
+# have capturing markers compatible with utils.set_capture_groups().
+
+import textwrap
+
+
+def _ind(text, level=1, edges='both'):
+    """Indent text by `level` spaces; `edges` says which ends get a newline."""
+    pad = ' ' * level
+    body = textwrap.indent(text, pad)
+    if edges in ('pre', 'both'):
+        body = '\n' + pad + body.lstrip()
+    tail = '\n' + ' ' * (level - 1)
+    return body.rstrip() + tail if edges in ('post', 'both') else body
+
+
+#######################################
+# general
+
+HEX = r'(?: [0-9a-fA-F] )'  # exactly one hexadecimal digit (not g-z / G-Z)
+
+STRING_LITERAL = textwrap.dedent(rf'''
+ (?:
+ # character literal
+ (?:
+ ['] [^'] [']
+ |
+ ['] \\ . [']
+ |
+ ['] \\x{HEX}{HEX} [']
+ |
+ ['] \\0\d\d [']
+ |
+ (?:
+ ['] \\o[01]\d\d [']
+ |
+ ['] \\o2[0-4]\d [']
+ |
+ ['] \\o25[0-5] [']
+ )
+ )
+ |
+ # string literal
+ (?:
+ ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
+ )
+ # end string literal
+ )
+ ''')  # a '...' char literal or a "..." string; NOTE(review): \d admits 8/9 in the octal forms - confirm
+
+_KEYWORD = textwrap.dedent(r'''
+ (?:
+ \b
+ (?:
+ auto |
+ extern |
+ register |
+ static |
+ typedef |
+
+ const |
+ volatile |
+
+ signed |
+ unsigned |
+ char |
+ short |
+ int |
+ long |
+ float |
+ double |
+ void |
+
+ struct |
+ union |
+ enum |
+
+ goto |
+ return |
+ sizeof |
+ break |
+ continue |
+ if |
+ else |
+ for |
+ do |
+ while |
+ switch |
+ case |
+ default |
+ entry
+ )
+ \b
+ )
+ ''')  # alternation of reserved words, with \b on both sides
+KEYWORD = rf'''
+ # keyword
+ {_KEYWORD}
+ # end keyword
+ '''  # the multi-line _KEYWORD wrapped in "keyword" marker comments
+_KEYWORD = ''.join(_KEYWORD.split())  # drop all whitespace; KEYWORD above keeps the spaced form
+
+IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'  # any C-style name, keywords included
+# We use a negative lookahead to filter out keywords.
+STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
+ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'  # allows a "-<digits>" suffix
+
+
+#######################################
+# types
+
+SIMPLE_TYPE = textwrap.dedent(rf'''
+ # simple type
+ (?:
+ \b
+ (?:
+ void
+ |
+ (?: signed | unsigned ) # implies int
+ |
+ (?:
+ (?: (?: signed | unsigned ) \s+ )?
+ (?: (?: long | short ) \s+ )?
+ (?: char | short | int | long | float | double )
+ )
+ )
+ \b
+ )
+ # end simple type
+ ''')  # void, bare signed/unsigned, or a base type with optional sign and long/short prefixes
+
+COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'  # the keyword that introduces a compound type
+
+
+#######################################
+# variable declarations
+
+STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )'  # one storage-class keyword
+TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'  # one type-qualifier keyword
+PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'  # "*" optionally followed by const/volatile
+
+TYPE_SPEC = textwrap.dedent(rf'''
+ # type spec
+ (?:
+ {_ind(SIMPLE_TYPE, 2)}
+ |
+ (?:
+ [_]*typeof[_]*
+ \s* [(]
+ (?: \s* [*&] )*
+ \s* {STRICT_IDENTIFIER}
+ \s* [)]
+ )
+ |
+ # reference to a compound type
+ (?:
+ {COMPOUND_TYPE_KIND}
+ (?: \s* {ANON_IDENTIFIER} )?
+ )
+ |
+ # reference to a typedef
+ {STRICT_IDENTIFIER}
+ )
+ # end type spec
+ ''')  # a simple type, typeof(<name>), a struct/union/enum reference, or a typedef name
+
+DECLARATOR = textwrap.dedent(rf'''
+ # declarator (possibly abstract)
+ (?:
+ (?: {PTR_QUALIFIER} \s* )*
+ (?:
+ (?:
+ (?: # <IDENTIFIER>
+ {STRICT_IDENTIFIER}
+ )
+ (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
+ )
+ |
+ (?:
+ [(] \s*
+ (?: # <WRAPPED_IDENTIFIER>
+ {STRICT_IDENTIFIER}
+ )
+ (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
+ \s* [)]
+ )
+ |
+ # func ptr
+ (?:
+ [(] (?: \s* {PTR_QUALIFIER} )? \s*
+ (?: # <FUNC_IDENTIFIER>
+ {STRICT_IDENTIFIER}
+ )
+ (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
+ \s* [)]
+ # We allow for a single level of paren nesting in parameters.
+ \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
+ )
+ )
+ )
+ # end declarator
+ ''')  # pointer prefixes, then a bare, parenthesized, or func-ptr name, each with optional [..] suffixes
+
+VAR_DECL = textwrap.dedent(rf'''
+ # var decl (and typedef and func return type)
+ (?:
+ (?:
+ (?: # <STORAGE>
+ {STORAGE_CLASS}
+ )
+ \s*
+ )?
+ (?:
+ (?: # <TYPE_QUAL>
+ {TYPE_QUALIFIER}
+ )
+ \s*
+ )?
+ (?:
+ (?: # <TYPE_SPEC>
+ {_ind(TYPE_SPEC, 4)}
+ )
+ )
+ \s*
+ (?:
+ (?: # <DECLARATOR>
+ {_ind(DECLARATOR, 4)}
+ )
+ )
+ )
+ # end var decl
+ ''')  # optional storage class and qualifier, then a required TYPE_SPEC and DECLARATOR
+
+INITIALIZER = textwrap.dedent(rf'''
+ # initializer
+ (?:
+ (?:
+ [(]
+ # no nested parens (e.g. func ptr)
+ [^)]*
+ [)]
+ \s*
+ )?
+ (?:
+ # a string literal
+ (?:
+ (?: {_ind(STRING_LITERAL, 4)} \s* )*
+ {_ind(STRING_LITERAL, 4)}
+ )
+ |
+
+ # a simple initializer
+ (?:
+ (?:
+ [^'",;{{]*
+ {_ind(STRING_LITERAL, 4)}
+ )*
+ [^'",;{{]*
+ )
+ |
+
+ # a struct/array literal
+ (?:
+ # We only expect compound initializers with
+ # single-variable declarations.
+ {{
+ (?:
+ [^'";]*?
+ {_ind(STRING_LITERAL, 5)}
+ )*
+ [^'";]*?
+ }}
+ (?= \s* ; ) # Note this lookahead.
+ )
+ )
+ )
+ # end initializer
+ ''')  # optional leading (...) group, then string literal(s), a simple value, or a braced literal before ";"
+
+
+#######################################
+# compound type declarations
+
+STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
+ (?:
+ # inline compound type decl
+ (?:
+ (?: # <COMPOUND_TYPE_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <COMPOUND_TYPE_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ (?:
+ # typed member
+ (?:
+ # Technically it doesn't have to have a type...
+ (?: # <SPECIFIER_QUALIFIER>
+ (?: {TYPE_QUALIFIER} \s* )?
+ {_ind(TYPE_SPEC, 5)}
+ )
+ (?:
+ # If it doesn't have a declarator then it will have
+ # a size and vice versa.
+ \s*
+ (?: # <DECLARATOR>
+ {_ind(DECLARATOR, 6)}
+ )
+ )?
+ )
+
+ # sized member
+ (?:
+ \s* [:] \s*
+ (?: # <SIZE>
+ \d+
+ )
+ )?
+ \s*
+ (?: # <ENDING>
+ [,;]
+ )
+ )
+ |
+ (?:
+ \s*
+ (?: # <CLOSE>
+ }}
+ )
+ )
+ )
+ ''')  # an inline compound-type opener, a member (optional ": <digits>" size) ending in "," or ";", or "}"
+
+ENUM_MEMBER_DECL = textwrap.dedent(rf'''
+ (?:
+ (?:
+ \s*
+ (?: # <CLOSE>
+ }}
+ )
+ )
+ |
+ (?:
+ \s*
+ (?: # <NAME>
+ {IDENTIFIER}
+ )
+ (?:
+ \s* = \s*
+ (?: # <INIT>
+ {_ind(STRING_LITERAL, 4)}
+ |
+ [^'",}}]+
+ )
+ )?
+ \s*
+ (?: # <ENDING>
+ , | }}
+ )
+ )
+ )
+ ''')  # either the closing "}" or a name with an optional "= value", ending in "," or "}"
+
+
+#######################################
+# statements
+
+SIMPLE_STMT_BODY = textwrap.dedent(rf'''
+ # simple statement body
+ (?:
+ (?:
+ [^'"{{}};]*
+ {_ind(STRING_LITERAL, 3)}
+ )*
+ [^'"{{}};]*
+ #(?= [;{{] ) # Note this lookahead.
+ )
+ # end simple statement body
+ ''')  # a run of text with no "{", "}" or ";" outside string/char literals (may be empty)
+SIMPLE_STMT = textwrap.dedent(rf'''
+ # simple statement
+ (?:
+ (?: # <SIMPLE_STMT>
+ # stmt-inline "initializer"
+ (?:
+ return \b
+ (?:
+ \s*
+ {_ind(INITIALIZER, 5)}
+ )?
+ )
+ |
+ # variable assignment
+ (?:
+ (?: [*] \s* )?
+ (?:
+ {STRICT_IDENTIFIER} \s*
+ (?: [.] | -> ) \s*
+ )*
+ {STRICT_IDENTIFIER}
+ (?: \s* \[ \s* \d+ \s* \] )?
+ \s* = \s*
+ {_ind(INITIALIZER, 4)}
+ )
+ |
+ # catchall return statement
+ (?:
+ return \b
+ (?:
+ (?:
+ [^'";]*
+ {_ind(STRING_LITERAL, 6)}
+ )*
+ \s* [^'";]*
+ )?
+ )
+ |
+ # simple statement
+ (?:
+ {_ind(SIMPLE_STMT_BODY, 4)}
+ )
+ )
+ \s*
+ (?: # <SIMPLE_ENDING>
+ ;
+ )
+ )
+ # end simple statement
+ ''')  # a return, an assignment (member access via a literal "." or "->"), or any simple body, ending in ";"
+COMPOUND_STMT = textwrap.dedent(rf'''
+ # compound statement
+ (?:
+ \b
+ (?:
+ (?:
+ (?: # <COMPOUND_BARE>
+ else | do
+ )
+ \b
+ )
+ |
+ (?:
+ (?: # <COMPOUND_LABELED>
+ (?:
+ case \b
+ (?:
+ [^'":]*
+ {_ind(STRING_LITERAL, 7)}
+ )*
+ \s* [^'":]*
+ )
+ |
+ default
+ |
+ {STRICT_IDENTIFIER}
+ )
+ \s* [:]
+ )
+ |
+ (?:
+ (?: # <COMPOUND_PAREN>
+ for | while | if | switch
+ )
+ \s* (?= [(] ) # Note this lookahead.
+ )
+ )
+ \s*
+ )
+ # end compound statement
+ ''')  # else/do, a "case ...:" / "default:" / "<name>:" label, or for/while/if/switch just before "("
+
+
+#######################################
+# function bodies
+
+LOCAL = textwrap.dedent(rf'''
+ (?:
+ # an empty statement
+ (?: # <EMPTY>
+ ;
+ )
+ |
+ # inline type decl
+ (?:
+ (?:
+ (?: # <INLINE_LEADING>
+ [^;{{}}]+?
+ )
+ \s*
+ )?
+ (?: # <INLINE_PRE>
+ (?: {STORAGE_CLASS} \s* )?
+ (?: {TYPE_QUALIFIER} \s* )?
+ )? # </INLINE_PRE>
+ (?: # <INLINE_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <INLINE_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ # var decl
+ (?:
+ (?: # <STORAGE>
+ {STORAGE_CLASS}
+ )? # </STORAGE>
+ (?:
+ \s*
+ (?: # <VAR_DECL>
+ {_ind(VAR_DECL, 5)}
+ )
+ )
+ (?:
+ (?:
+ # initializer
+ # We expect only basic initializers.
+ \s* = \s*
+ (?: # <VAR_INIT>
+ {_ind(INITIALIZER, 6)}
+ )
+ )?
+ (?:
+ \s*
+ (?: # <VAR_ENDING>
+ [,;]
+ )
+ )
+ )
+ )
+ |
+ {_ind(COMPOUND_STMT, 2)}
+ |
+ # start-of-block
+ (?:
+ (?: # <BLOCK_LEADING>
+ (?:
+ [^'"{{}};]*
+ {_ind(STRING_LITERAL, 5)}
+ )*
+ [^'"{{}};]*
+ # Presumably we will not see "== {{".
+ [^\s='"{{}});]
+ \s*
+ )? # </BLOCK_LEADING>
+ (?: # <BLOCK_OPEN>
+ {{
+ )
+ )
+ |
+ {_ind(SIMPLE_STMT, 2)}
+ |
+ # end-of-block
+ (?: # <BLOCK_CLOSE>
+ }}
+ )
+ )
+ ''')  # empty stmt, inline type decl, var decl, compound stmt, block open, simple stmt, or block close
+
+LOCAL_STATICS = textwrap.dedent(rf'''
+ (?:
+ # inline type decl
+ (?:
+ (?:
+ (?: # <INLINE_LEADING>
+ [^;{{}}]+?
+ )
+ \s*
+ )?
+ (?: # <INLINE_PRE>
+ (?: {STORAGE_CLASS} \s* )?
+ (?: {TYPE_QUALIFIER} \s* )?
+ )?
+ (?: # <INLINE_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <INLINE_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ # var decl
+ (?:
+ # We only look for static variables.
+ (?: # <STATIC_DECL>
+ static \b
+ (?: \s* {TYPE_QUALIFIER} )?
+ \s* {_ind(TYPE_SPEC, 4)}
+ \s* {_ind(DECLARATOR, 4)}
+ )
+ \s*
+ (?:
+ (?: # <STATIC_INIT>
+ = \s*
+ {_ind(INITIALIZER, 4)}
+ \s*
+ [,;{{]
+ )
+ |
+ (?: # <STATIC_ENDING>
+ [,;]
+ )
+ )
+ )
+ |
+ # everything else
+ (?:
+ (?: # <DELIM_LEADING>
+ (?:
+ [^'"{{}};]*
+ {_ind(STRING_LITERAL, 4)}
+ )*
+ \s* [^'"{{}};]*
+ )
+ (?:
+ (?: # <BLOCK_OPEN>
+ {{
+ )
+ |
+ (?: # <BLOCK_CLOSE>
+ }}
+ )
+ |
+ (?: # <STMT_END>
+ ;
+ )
+ )
+ )
+ )
+ ''')  # an inline type decl, a "static" var decl, or any other text up to the next "{", "}" or ";"
+
+
+#######################################
+# global declarations
+
+GLOBAL = textwrap.dedent(rf'''
+ (?:
+ # an empty statement
+ (?: # <EMPTY>
+ ;
+ )
+ |
+
+ # compound type decl (maybe inline)
+ (?:
+ (?:
+ (?: # <COMPOUND_LEADING>
+ [^;{{}}]+?
+ )
+ \s*
+ )?
+ (?: # <COMPOUND_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ (?:
+ \s+
+ (?: # <COMPOUND_NAME>
+ {STRICT_IDENTIFIER}
+ )
+ )?
+ \s* {{
+ )
+ |
+ # bogus inline decl artifact
+ # This simplifies resolving the relative syntactic ambiguity of
+ # inline structs.
+ (?:
+ (?: # <FORWARD_KIND>
+ {COMPOUND_TYPE_KIND}
+ )
+ \s*
+ (?: # <FORWARD_NAME>
+ {ANON_IDENTIFIER}
+ )
+ (?: # <MAYBE_INLINE_ACTUAL>
+ [^=,;({{[*\]]*
+ [=,;({{]
+ )
+ )
+ |
+
+ # typedef
+ (?:
+ \b typedef \b \s*
+ (?: # <TYPEDEF_DECL>
+ {_ind(VAR_DECL, 4)}
+ )
+ (?:
+ # We expect no inline type definitions in the parameters.
+ \s* [(] \s*
+ (?: # <TYPEDEF_FUNC_PARAMS>
+ [^{{;]*
+ )
+ \s* [)]
+ )?
+ \s* ;
+ )
+ |
+
+ # func decl/definition & var decls
+ # XXX dedicated pattern for funcs (more restricted)?
+ (?:
+ (?:
+ (?: # <VAR_STORAGE>
+ {STORAGE_CLASS}
+ )
+ \s*
+ )?
+ (?:
+ (?: # <FUNC_INLINE>
+ \b inline \b
+ )
+ \s*
+ )?
+ (?: # <VAR_DECL>
+ {_ind(VAR_DECL, 4)}
+ )
+ (?:
+ # func decl / definition
+ (?:
+ (?:
+ # We expect no inline type definitions in the parameters.
+ \s* [(] \s*
+ (?: # <FUNC_PARAMS>
+ [^{{;]*
+ )
+ \s* [)] \s*
+ (?: # <FUNC_DELIM>
+ [{{;]
+ )
+ )
+ |
+ (?:
+ # This is some old-school syntax!
+ \s* [(] \s*
+ # We throw away the bare names:
+ {STRICT_IDENTIFIER}
+ (?: \s* , \s* {STRICT_IDENTIFIER} )*
+ \s* [)] \s*
+
+ # We keep the trailing param declarations:
+ (?: # <FUNC_LEGACY_PARAMS>
+ # There's at least one!
+ (?: {TYPE_QUALIFIER} \s* )?
+ {_ind(TYPE_SPEC, 7)}
+ \s*
+ {_ind(DECLARATOR, 7)}
+ \s* ;
+ (?:
+ \s*
+ (?: {TYPE_QUALIFIER} \s* )?
+ {_ind(TYPE_SPEC, 8)}
+ \s*
+ {_ind(DECLARATOR, 8)}
+ \s* ;
+ )*
+ )
+ \s* {{
+ )
+ )
+ |
+ # var / typedef
+ (?:
+ (?:
+ # initializer
+ # We expect only basic initializers.
+ \s* = \s*
+ (?: # <VAR_INIT>
+ {_ind(INITIALIZER, 6)}
+ )
+ )?
+ \s*
+ (?: # <VAR_ENDING>
+ [,;]
+ )
+ )
+ )
+ )
+ )
+ ''')  # an empty stmt, compound type decl, forward-decl artifact, typedef, or a func/var declaration