From e50de09886bfdf4febb75accdd1549952787b5e0 Mon Sep 17 00:00:00 2001 From: Akim Demaille Date: Fri, 10 Apr 2020 18:31:07 +0200 Subject: tokens: properly define the YYEOF token kind Currently EOF is handled in an ad hoc way, with a #define YYEOF 0 in the implementation file. As a result, the user has to define her own EOF token if she wants to use it, which is a pity. Give the $end token a visible kind name, YYEOF. Except that in C, where enums are not scoped, we would have collisions between all the definitions of YYEOF in the header files, so in C, make it the api.PREFIX followed by EOF (e.g., GRAM_EOF). * data/skeletons/c.m4 (YYEOF): Override its name to avoid collisions, unless the user already gave it a different name. * data/skeletons/glr.c (YYEOF): Remove. Use ]b4_symbol(0, [id])[ instead. Add support for "pre_epilogue", for glr.cc. * data/skeletons/glr.cc: Remove dead code (never emitted #undefs). * data/skeletons/yacc.c * src/parse-gram.c * src/reader.c * src/symtab.c * tests/actions.at * tests/input.at --- TODO | 16 +++++++++++++++- data/skeletons/bison.m4 | 8 +++----- data/skeletons/c.m4 | 5 ++++- data/skeletons/glr.c | 10 +++++----- data/skeletons/glr.cc | 17 ++++++++++++++--- data/skeletons/yacc.c | 9 ++++----- src/parse-gram.c | 9 ++++----- src/reader.c | 7 ++++++- src/symtab.c | 5 ++++- tests/actions.at | 9 +++++++++ tests/input.at | 3 ++- 11 files changed, 70 insertions(+), 28 deletions(-) diff --git a/TODO b/TODO index 37a80460..4da4100d 100644 --- a/TODO +++ b/TODO @@ -6,7 +6,7 @@ should not have to dispatch to several APIs. ** Documentation - yyexpected_tokens in all the languages. - YYENOMEM -- YYERRCODE? +- YYERRCODE, YYUNDEF, YYEOF - i18n in Java - symbol.type_get should be kind_get, and it's not documented. @@ -85,6 +85,20 @@ push parsers on top of pull parser. Which is currently not relevant, since push parsers are measurably slower. * Bison 3.7 +** Counter example generation +See https://github.com/akimd/bison/pull/15. + +** Clean up +Rename user_token_number for tokens as "code". 
It's not a "user number", +it's the token code, and the user can control it, but this code always +exists. + +Rename endtoken as eoftoken. + +Don't rename in Bison 3.6 (it would be logical to do so) because that +would probably create many conflicts in Vincent's work (see previous point). + +* Bison 3.8 ** Unit rules / Injection rules (Akim Demaille) Maybe we could expand unit rules (or "injections", see https://homepages.cwi.nl/~daybuild/daily-books/syntax/2-sdf/sdf.html), i.e., diff --git a/data/skeletons/bison.m4 b/data/skeletons/bison.m4 index 4739f0fb..c1b04077 100644 --- a/data/skeletons/bison.m4 +++ b/data/skeletons/bison.m4 @@ -537,11 +537,9 @@ m4_define([b4_symbol_map], # Whether NUM denotes a token that has an exported definition (i.e., # shows in enum yytokentype). m4_define([b4_token_visible_if], -[m4_case(b4_symbol([$1], [tag]), - [$undefined], [$2], - [b4_symbol_if([$1], [is_token], - [b4_symbol_if([$1], [has_id], [$2], [$3])], - [$3])])]) +[b4_symbol_if([$1], [is_token], + [b4_symbol_if([$1], [has_id], [$2], [$3])], + [$3])]) # b4_token_has_definition(NUM) diff --git a/data/skeletons/c.m4 b/data/skeletons/c.m4 index 8013141d..e517259d 100644 --- a/data/skeletons/c.m4 +++ b/data/skeletons/c.m4 @@ -431,8 +431,11 @@ static const b4_int_type_for([$2]) yy$1[[]] = ## ------------- ## # Because C enums are not scoped, because tokens are exposed in the -# header, and because these tokens are common to all the parser, we +# header, and because these tokens are common to all the parsers, we # need to make sure their names don't collide: use the api.prefix. +# YYEOF is special, since the user may give it a different name. 
+m4_if(b4_symbol(0, id), [YYEOF], + [m4_define([b4_symbol(0, id)], [b4_api_PREFIX[][EOF]])]) m4_define([b4_symbol(1, id)], [b4_api_PREFIX[][ERRCODE]]) m4_define([b4_symbol(2, id)], [b4_api_PREFIX[][UNDEF]]) diff --git a/data/skeletons/glr.c b/data/skeletons/glr.c index 10ee46c4..1fd042fb 100644 --- a/data/skeletons/glr.c +++ b/data/skeletons/glr.c @@ -429,7 +429,6 @@ int yychar;])[ enum { YYENOMEM = -2 }; -static const int YYEOF = 0; static const int YYEMPTY = -2; typedef enum { yyok, yyaccept, yyabort, yyerr } YYRESULTTAG; @@ -833,9 +832,9 @@ yygetToken (int *yycharp][]b4_pure_if([, yyGLRStack* yystackp])[]b4_user_formals #endif // YY_EXCEPTIONS]], [[ *yycharp = ]b4_lex[;]])[ } - if (*yycharp <= YYEOF) + if (*yycharp <= ]b4_symbol(0, [id])[) { - *yycharp = YYEOF; + *yycharp = ]b4_symbol(0, [id])[; yytoken = ]b4_symbol_prefix[YYEOF; YY_DPRINTF ((stderr, "Now at end of input.\n")); } @@ -2311,7 +2310,7 @@ yyrecoverSyntaxError (yyGLRStack* yystackp]b4_user_formals[) { yysymbol_kind_t yytoken; int yyj; - if (yychar == YYEOF) + if (yychar == ]b4_symbol(0, [id])[) yyFail (yystackp][]b4_lpure_args[, YY_NULLPTR); if (yychar != YYEMPTY) {]b4_locations_if([[ @@ -2724,6 +2723,7 @@ m4_if(b4_prefix, [yy], [], #define yynerrs ]b4_prefix[nerrs]b4_locations_if([[ #define yylloc ]b4_prefix[lloc]])])[ -]b4_percent_code_get([[epilogue]])[]dnl +]m4_ifdef([b4_pre_epilogue], [b4_pre_epilogue])[]dnl This is a hack for glr.cc. To remove when we have a better glr.cc. 
+b4_percent_code_get([[epilogue]])[]dnl b4_epilogue[]dnl b4_output_end diff --git a/data/skeletons/glr.cc b/data/skeletons/glr.cc index 9d3b07a5..c6159c3a 100644 --- a/data/skeletons/glr.cc +++ b/data/skeletons/glr.cc @@ -105,6 +105,12 @@ yyerror (]b4_locations_if([[const ]b4_namespace_ref::b4_parser_class[::location_ ]])[]m4_ifset([b4_parse_param], [b4_formals(b4_parse_param), ])[const char* msg);]])[ +]b4_percent_define_flag_if([[global_tokens_and_yystype]], [], +[m4_define([b4_pre_epilogue], +[[/* The user is using the C++ token type, not the C one. */ +#undef ]b4_symbol(0, [id]) +])])[ + # Hijack the epilogue to define implementations (yyerror, parser member # functions etc.). ]m4_append([b4_epilogue], @@ -329,8 +335,14 @@ b4_percent_code_get([[requires]])[ ]dnl Redirections for glr.c. b4_percent_define_flag_if([[global_tokens_and_yystype]], -[b4_token_defines]) -[ +[b4_token_defines +])[ +]b4_namespace_close[ + +]dnl Map the name used in c.m4 to the one used in c++.m4. +[#undef ]b4_symbol(0, [id])[ +#define ]b4_symbol(0, [id])[ ]b4_namespace_ref[::]b4_parser_class[::token::]b4_symbol(0, [id])[ + #ifndef ]b4_api_PREFIX[STYPE # define ]b4_api_PREFIX[STYPE ]b4_namespace_ref[::]b4_parser_class[::semantic_type #endif @@ -338,7 +350,6 @@ b4_percent_define_flag_if([[global_tokens_and_yystype]], # define ]b4_api_PREFIX[LTYPE ]b4_namespace_ref[::]b4_parser_class[::location_type #endif -]b4_namespace_close[ ]m4_define([b4_declare_symbol_enum], [[typedef ]b4_namespace_ref[::]b4_parser_class[::symbol_kind_type yysymbol_kind_t; #define ]b4_symbol_prefix[YYEMPTY ]b4_namespace_ref[::]b4_parser_class[::symbol_kind::]b4_symbol_prefix[YYEMPTY diff --git a/data/skeletons/yacc.c b/data/skeletons/yacc.c index 12bb7fe1..73679084 100644 --- a/data/skeletons/yacc.c +++ b/data/skeletons/yacc.c @@ -716,7 +716,6 @@ enum { YYENOMEM = -2 }; #define yyerrok (yyerrstatus = 0) #define yyclearin (yychar = YYEMPTY) #define YYEMPTY (-2) -#define YYEOF 0 #define YYACCEPT goto yyacceptlab #define 
YYABORT goto yyabortlab @@ -1760,9 +1759,9 @@ yyread_pushed_token:]])[ yychar = ]b4_lex[;]])[ } - if (yychar <= YYEOF) + if (yychar <= ]b4_symbol(0, [id])[) { - yychar = YYEOF; + yychar = ]b4_symbol(0, [id])[; yytoken = ]b4_symbol_prefix[YYEOF; YYDPRINTF ((stderr, "Now at end of input.\n")); } @@ -1957,10 +1956,10 @@ yyerrlab: /* If just tried and failed to reuse lookahead token after an error, discard it. */ - if (yychar <= YYEOF) + if (yychar <= ]b4_symbol(0, [id])[) { /* Return failure if at end of input. */ - if (yychar == YYEOF) + if (yychar == ]b4_symbol(0, [id])[) YYABORT; } else diff --git a/src/parse-gram.c b/src/parse-gram.c index ed6794b2..d3bb2ea4 100644 --- a/src/parse-gram.c +++ b/src/parse-gram.c @@ -900,7 +900,6 @@ enum { YYENOMEM = -2 }; #define yyerrok (yyerrstatus = 0) #define yyclearin (yychar = YYEMPTY) #define YYEMPTY (-2) -#define YYEOF 0 #define YYACCEPT goto yyacceptlab #define YYABORT goto yyabortlab @@ -1950,9 +1949,9 @@ yybackup: yychar = yylex (&yylval, &yylloc); } - if (yychar <= YYEOF) + if (yychar <= GRAM_EOF) { - yychar = YYEOF; + yychar = GRAM_EOF; yytoken = YYSYMBOL_YYEOF; YYDPRINTF ((stderr, "Now at end of input.\n")); } @@ -2643,10 +2642,10 @@ yyerrlab: /* If just tried and failed to reuse lookahead token after an error, discard it. */ - if (yychar <= YYEOF) + if (yychar <= GRAM_EOF) { /* Return failure if at end of input. */ - if (yychar == YYEOF) + if (yychar == GRAM_EOF) YYABORT; } else diff --git a/src/reader.c b/src/reader.c index c386f433..bffca16d 100644 --- a/src/reader.c +++ b/src/reader.c @@ -778,11 +778,16 @@ check_and_convert_grammar (void) /* If the user did not define her ENDTOKEN, do it now. */ if (!endtoken) { - endtoken = symbol_get ("$end", empty_loc); + endtoken = symbol_get ("YYEOF", empty_loc); endtoken->content->class = token_sym; endtoken->content->number = 0; /* Value specified by POSIX. 
*/ endtoken->content->user_token_number = 0; + { + symbol *alias = symbol_get ("$end", empty_loc); + symbol_class_set (alias, token_sym, empty_loc, false); + symbol_make_alias (endtoken, alias, empty_loc); + } } /* Report any undefined symbols and consider them nonterminals. */ diff --git a/src/symtab.c b/src/symtab.c index fd96b827..b236fc36 100644 --- a/src/symtab.c +++ b/src/symtab.c @@ -70,9 +70,12 @@ bool tag_seen = false; static bool symbol_is_user_defined (symbol *sym) { + const bool eof_is_user_defined + = !endtoken->alias || STRNEQ (endtoken->alias->tag, "$end"); return sym->tag[0] != '$' + && (eof_is_user_defined || (sym != endtoken && sym->alias != errtoken)) && sym != errtoken && sym->alias != errtoken - && sym != undeftoken && sym->alias != undeftoken; + && sym != undeftoken && sym->alias != undeftoken; } diff --git a/tests/actions.at b/tests/actions.at index 9fa630df..223d00d2 100644 --- a/tests/actions.at +++ b/tests/actions.at @@ -1303,6 +1303,15 @@ AT_CLEANUP AT_SETUP([Default %printer and %destructor for user-defined end token]) +# Enable declaration of default %printer/%destructor. Make the parser +# use these for all user-declared grammar symbols for which the user +# does not declare a specific %printer/%destructor. Thus, the parser +# uses it for token 0 if the user declares it but not if Bison +# generates it as $end. Discussed starting at +# , +# , and +# . 
+ # AT_TEST(TYPED) # -------------- m4_pushdef([AT_TEST], diff --git a/tests/input.at b/tests/input.at index 6d434350..2250406d 100644 --- a/tests/input.at +++ b/tests/input.at @@ -344,6 +344,7 @@ exp: int main (void) { assert (YYERRCODE == 123); + assert (YYTRANSLATE (YYEOF) == YYSYMBOL_YYEOF); assert (YYTRANSLATE (YYERRCODE) == YYSYMBOL_YYERROR); assert (YYTRANSLATE (YYUNDEF) == YYSYMBOL_YYUNDEF); return 0; @@ -430,7 +431,7 @@ exp:; AT_BISON_CHECK([-Wno-other -S./dump-symbols.m4 input.y]) AT_CHECK([cat symbols.csv], [], [[number, class, tag, id, user_number, type, -0, Token, $end, , 0, , +0, Token, $end, YYEOF, 0, , 1, Token, error, YYERRCODE, 256, , 2, Token, $undefined, YYUNDEF, 257, , 3, Token, 'a', , 97, , -- cgit v1.2.1