summaryrefslogtreecommitdiff
path: root/Parser
diff options
context:
space:
mode:
authorPablo Galindo <Pablogsal@gmail.com>2019-03-01 15:34:44 -0800
committerGitHub <noreply@github.com>2019-03-01 15:34:44 -0800
commit1f24a719e7be5e49b876a5dc7daf21d01ee69faa (patch)
tree8f8f56cab78ef671a8cb7f54b8ec2495d9a435e6 /Parser
parent7eebbbd5b3907447eddadf5cb7cb1cc9230d15b2 (diff)
downloadcpython-git-1f24a719e7be5e49b876a5dc7daf21d01ee69faa.tar.gz
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814)
Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions.
Diffstat (limited to 'Parser')
-rw-r--r--Parser/bitset.c66
-rw-r--r--Parser/firstsets.c113
-rw-r--r--Parser/grammar.c273
-rw-r--r--Parser/metagrammar.c159
-rw-r--r--Parser/parsetok.c16
-rw-r--r--Parser/parsetok_pgen.c2
-rw-r--r--Parser/pgen.c724
-rw-r--r--Parser/pgen/__init__.py0
-rw-r--r--Parser/pgen/__main__.py34
-rw-r--r--Parser/pgen/grammar.py160
-rw-r--r--Parser/pgen/pgen.py408
-rw-r--r--Parser/pgen/token.py40
-rw-r--r--Parser/pgenmain.c188
-rw-r--r--Parser/printgrammar.c120
-rw-r--r--Parser/tokenizer.c56
-rw-r--r--Parser/tokenizer.h8
-rw-r--r--Parser/tokenizer_pgen.c2
17 files changed, 642 insertions, 1727 deletions
diff --git a/Parser/bitset.c b/Parser/bitset.c
deleted file mode 100644
index f5bfd41bd6..0000000000
--- a/Parser/bitset.c
+++ /dev/null
@@ -1,66 +0,0 @@
-
-/* Bitset primitives used by the parser generator */
-
-#include "pgenheaders.h"
-#include "bitset.h"
-
-bitset
-newbitset(int nbits)
-{
- int nbytes = NBYTES(nbits);
- bitset ss = (char *)PyObject_MALLOC(sizeof(BYTE) * nbytes);
-
- if (ss == NULL)
- Py_FatalError("no mem for bitset");
-
- ss += nbytes;
- while (--nbytes >= 0)
- *--ss = 0;
- return ss;
-}
-
-void
-delbitset(bitset ss)
-{
- PyObject_FREE(ss);
-}
-
-int
-addbit(bitset ss, int ibit)
-{
- int ibyte = BIT2BYTE(ibit);
- BYTE mask = BIT2MASK(ibit);
-
- if (ss[ibyte] & mask)
- return 0; /* Bit already set */
- ss[ibyte] |= mask;
- return 1;
-}
-
-#if 0 /* Now a macro */
-int
-testbit(bitset ss, int ibit)
-{
- return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0;
-}
-#endif
-
-int
-samebitset(bitset ss1, bitset ss2, int nbits)
-{
- int i;
-
- for (i = NBYTES(nbits); --i >= 0; )
- if (*ss1++ != *ss2++)
- return 0;
- return 1;
-}
-
-void
-mergebitset(bitset ss1, bitset ss2, int nbits)
-{
- int i;
-
- for (i = NBYTES(nbits); --i >= 0; )
- *ss1++ |= *ss2++;
-}
diff --git a/Parser/firstsets.c b/Parser/firstsets.c
deleted file mode 100644
index ee75d1bfe0..0000000000
--- a/Parser/firstsets.c
+++ /dev/null
@@ -1,113 +0,0 @@
-
-/* Computation of FIRST stets */
-
-#include "pgenheaders.h"
-#include "grammar.h"
-#include "token.h"
-
-extern int Py_DebugFlag;
-
-/* Forward */
-static void calcfirstset(grammar *, dfa *);
-
-void
-addfirstsets(grammar *g)
-{
- int i;
- dfa *d;
-
- if (Py_DebugFlag)
- printf("Adding FIRST sets ...\n");
- for (i = 0; i < g->g_ndfas; i++) {
- d = &g->g_dfa[i];
- if (d->d_first == NULL)
- calcfirstset(g, d);
- }
-}
-
-static void
-calcfirstset(grammar *g, dfa *d)
-{
- int i, j;
- state *s;
- arc *a;
- int nsyms;
- int *sym;
- int nbits;
- static bitset dummy;
- bitset result;
- int type;
- dfa *d1;
- label *l0;
-
- if (Py_DebugFlag)
- printf("Calculate FIRST set for '%s'\n", d->d_name);
-
- if (dummy == NULL)
- dummy = newbitset(1);
- if (d->d_first == dummy) {
- fprintf(stderr, "Left-recursion for '%s'\n", d->d_name);
- return;
- }
- if (d->d_first != NULL) {
- fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n",
- d->d_name);
- }
- d->d_first = dummy;
-
- l0 = g->g_ll.ll_label;
- nbits = g->g_ll.ll_nlabels;
- result = newbitset(nbits);
-
- sym = (int *)PyObject_MALLOC(sizeof(int));
- if (sym == NULL)
- Py_FatalError("no mem for new sym in calcfirstset");
- nsyms = 1;
- sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL);
-
- s = &d->d_state[d->d_initial];
- for (i = 0; i < s->s_narcs; i++) {
- a = &s->s_arc[i];
- for (j = 0; j < nsyms; j++) {
- if (sym[j] == a->a_lbl)
- break;
- }
- if (j >= nsyms) { /* New label */
- sym = (int *)PyObject_REALLOC(sym,
- sizeof(int) * (nsyms + 1));
- if (sym == NULL)
- Py_FatalError(
- "no mem to resize sym in calcfirstset");
- sym[nsyms++] = a->a_lbl;
- type = l0[a->a_lbl].lb_type;
- if (ISNONTERMINAL(type)) {
- d1 = PyGrammar_FindDFA(g, type);
- if (d1->d_first == dummy) {
- fprintf(stderr,
- "Left-recursion below '%s'\n",
- d->d_name);
- }
- else {
- if (d1->d_first == NULL)
- calcfirstset(g, d1);
- mergebitset(result,
- d1->d_first, nbits);
- }
- }
- else if (ISTERMINAL(type)) {
- addbit(result, a->a_lbl);
- }
- }
- }
- d->d_first = result;
- if (Py_DebugFlag) {
- printf("FIRST set for '%s': {", d->d_name);
- for (i = 0; i < nbits; i++) {
- if (testbit(result, i))
- printf(" %s", PyGrammar_LabelRepr(&l0[i]));
- }
- printf(" }\n");
- }
-
- PyObject_FREE(sym);
-}
diff --git a/Parser/grammar.c b/Parser/grammar.c
deleted file mode 100644
index 75fd5b9cde..0000000000
--- a/Parser/grammar.c
+++ /dev/null
@@ -1,273 +0,0 @@
-
-/* Grammar implementation */
-
-#include "Python.h"
-#include "pgenheaders.h"
-
-#include <ctype.h>
-
-#include "token.h"
-#include "grammar.h"
-
-extern int Py_DebugFlag;
-
-grammar *
-newgrammar(int start)
-{
- grammar *g;
-
- g = (grammar *)PyObject_MALLOC(sizeof(grammar));
- if (g == NULL)
- Py_FatalError("no mem for new grammar");
- g->g_ndfas = 0;
- g->g_dfa = NULL;
- g->g_start = start;
- g->g_ll.ll_nlabels = 0;
- g->g_ll.ll_label = NULL;
- g->g_accel = 0;
- return g;
-}
-
-void
-freegrammar(grammar *g)
-{
- int i;
- for (i = 0; i < g->g_ndfas; i++) {
- free(g->g_dfa[i].d_name);
- for (int j = 0; j < g->g_dfa[i].d_nstates; j++)
- PyObject_FREE(g->g_dfa[i].d_state[j].s_arc);
- PyObject_FREE(g->g_dfa[i].d_state);
- }
- PyObject_FREE(g->g_dfa);
- for (i = 0; i < g->g_ll.ll_nlabels; i++)
- free(g->g_ll.ll_label[i].lb_str);
- PyObject_FREE(g->g_ll.ll_label);
- PyObject_FREE(g);
-}
-
-dfa *
-adddfa(grammar *g, int type, const char *name)
-{
- dfa *d;
-
- g->g_dfa = (dfa *)PyObject_REALLOC(g->g_dfa,
- sizeof(dfa) * (g->g_ndfas + 1));
- if (g->g_dfa == NULL)
- Py_FatalError("no mem to resize dfa in adddfa");
- d = &g->g_dfa[g->g_ndfas++];
- d->d_type = type;
- d->d_name = strdup(name);
- d->d_nstates = 0;
- d->d_state = NULL;
- d->d_initial = -1;
- d->d_first = NULL;
- return d; /* Only use while fresh! */
-}
-
-int
-addstate(dfa *d)
-{
- state *s;
-
- d->d_state = (state *)PyObject_REALLOC(d->d_state,
- sizeof(state) * (d->d_nstates + 1));
- if (d->d_state == NULL)
- Py_FatalError("no mem to resize state in addstate");
- s = &d->d_state[d->d_nstates++];
- s->s_narcs = 0;
- s->s_arc = NULL;
- s->s_lower = 0;
- s->s_upper = 0;
- s->s_accel = NULL;
- s->s_accept = 0;
- return Py_SAFE_DOWNCAST(s - d->d_state, intptr_t, int);
-}
-
-void
-addarc(dfa *d, int from, int to, int lbl)
-{
- state *s;
- arc *a;
-
- assert(0 <= from && from < d->d_nstates);
- assert(0 <= to && to < d->d_nstates);
-
- s = &d->d_state[from];
- s->s_arc = (arc *)PyObject_REALLOC(s->s_arc, sizeof(arc) * (s->s_narcs + 1));
- if (s->s_arc == NULL)
- Py_FatalError("no mem to resize arc list in addarc");
- a = &s->s_arc[s->s_narcs++];
- a->a_lbl = lbl;
- a->a_arrow = to;
-}
-
-int
-addlabel(labellist *ll, int type, const char *str)
-{
- int i;
- label *lb;
-
- for (i = 0; i < ll->ll_nlabels; i++) {
- if (ll->ll_label[i].lb_type == type &&
- strcmp(ll->ll_label[i].lb_str, str) == 0)
- return i;
- }
- ll->ll_label = (label *)PyObject_REALLOC(ll->ll_label,
- sizeof(label) * (ll->ll_nlabels + 1));
- if (ll->ll_label == NULL)
- Py_FatalError("no mem to resize labellist in addlabel");
- lb = &ll->ll_label[ll->ll_nlabels++];
- lb->lb_type = type;
- lb->lb_str = strdup(str);
- if (Py_DebugFlag)
- printf("Label @ %8p, %d: %s\n", ll, ll->ll_nlabels,
- PyGrammar_LabelRepr(lb));
- return Py_SAFE_DOWNCAST(lb - ll->ll_label, intptr_t, int);
-}
-
-/* Same, but rather dies than adds */
-
-int
-findlabel(labellist *ll, int type, const char *str)
-{
- int i;
-
- for (i = 0; i < ll->ll_nlabels; i++) {
- if (ll->ll_label[i].lb_type == type /*&&
- strcmp(ll->ll_label[i].lb_str, str) == 0*/)
- return i;
- }
- fprintf(stderr, "Label %d/'%s' not found\n", type, str);
- Py_FatalError("grammar.c:findlabel()");
-
- /* Py_FatalError() is declared with __attribute__((__noreturn__)).
- GCC emits a warning without "return 0;" (compiler bug!), but Clang is
- smarter and emits a warning on the return... */
-#ifndef __clang__
- return 0; /* Make gcc -Wall happy */
-#endif
-}
-
-/* Forward */
-static void translabel(grammar *, label *);
-
-void
-translatelabels(grammar *g)
-{
- int i;
-
-#ifdef Py_DEBUG
- printf("Translating labels ...\n");
-#endif
- /* Don't translate EMPTY */
- for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++)
- translabel(g, &g->g_ll.ll_label[i]);
-}
-
-static void
-translabel(grammar *g, label *lb)
-{
- int i;
-
- if (Py_DebugFlag)
- printf("Translating label %s ...\n", PyGrammar_LabelRepr(lb));
-
- if (lb->lb_type == NAME) {
- for (i = 0; i < g->g_ndfas; i++) {
- if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) {
- if (Py_DebugFlag)
- printf(
- "Label %s is non-terminal %d.\n",
- lb->lb_str,
- g->g_dfa[i].d_type);
- lb->lb_type = g->g_dfa[i].d_type;
- free(lb->lb_str);
- lb->lb_str = NULL;
- return;
- }
- }
- for (i = 0; i < (int)N_TOKENS; i++) {
- if (strcmp(lb->lb_str, _PyParser_TokenNames[i]) == 0) {
- if (Py_DebugFlag)
- printf("Label %s is terminal %d.\n",
- lb->lb_str, i);
- lb->lb_type = i;
- free(lb->lb_str);
- lb->lb_str = NULL;
- return;
- }
- }
- printf("Can't translate NAME label '%s'\n", lb->lb_str);
- return;
- }
-
- if (lb->lb_type == STRING) {
- if (isalpha(Py_CHARMASK(lb->lb_str[1])) ||
- lb->lb_str[1] == '_') {
- char *p;
- char *src;
- char *dest;
- size_t name_len;
- if (Py_DebugFlag)
- printf("Label %s is a keyword\n", lb->lb_str);
- lb->lb_type = NAME;
- src = lb->lb_str + 1;
- p = strchr(src, '\'');
- if (p)
- name_len = p - src;
- else
- name_len = strlen(src);
- dest = (char *)malloc(name_len + 1);
- if (!dest) {
- printf("Can't alloc dest '%s'\n", src);
- return;
- }
- strncpy(dest, src, name_len);
- dest[name_len] = '\0';
- free(lb->lb_str);
- lb->lb_str = dest;
- }
- else if (lb->lb_str[2] == lb->lb_str[0]) {
- int type = (int) PyToken_OneChar(lb->lb_str[1]);
- if (type != OP) {
- lb->lb_type = type;
- free(lb->lb_str);
- lb->lb_str = NULL;
- }
- else
- printf("Unknown OP label %s\n",
- lb->lb_str);
- }
- else if (lb->lb_str[2] && lb->lb_str[3] == lb->lb_str[0]) {
- int type = (int) PyToken_TwoChars(lb->lb_str[1],
- lb->lb_str[2]);
- if (type != OP) {
- lb->lb_type = type;
- free(lb->lb_str);
- lb->lb_str = NULL;
- }
- else
- printf("Unknown OP label %s\n",
- lb->lb_str);
- }
- else if (lb->lb_str[2] && lb->lb_str[3] && lb->lb_str[4] == lb->lb_str[0]) {
- int type = (int) PyToken_ThreeChars(lb->lb_str[1],
- lb->lb_str[2],
- lb->lb_str[3]);
- if (type != OP) {
- lb->lb_type = type;
- free(lb->lb_str);
- lb->lb_str = NULL;
- }
- else
- printf("Unknown OP label %s\n",
- lb->lb_str);
- }
- else
- printf("Can't translate STRING label %s\n",
- lb->lb_str);
- }
- else
- printf("Can't translate label '%s'\n",
- PyGrammar_LabelRepr(lb));
-}
diff --git a/Parser/metagrammar.c b/Parser/metagrammar.c
deleted file mode 100644
index 53810b8125..0000000000
--- a/Parser/metagrammar.c
+++ /dev/null
@@ -1,159 +0,0 @@
-
-#include "pgenheaders.h"
-#include "metagrammar.h"
-#include "grammar.h"
-#include "pgen.h"
-static arc arcs_0_0[3] = {
- {2, 0},
- {3, 0},
- {4, 1},
-};
-static arc arcs_0_1[1] = {
- {0, 1},
-};
-static state states_0[2] = {
- {3, arcs_0_0},
- {1, arcs_0_1},
-};
-static arc arcs_1_0[1] = {
- {5, 1},
-};
-static arc arcs_1_1[1] = {
- {6, 2},
-};
-static arc arcs_1_2[1] = {
- {7, 3},
-};
-static arc arcs_1_3[1] = {
- {3, 4},
-};
-static arc arcs_1_4[1] = {
- {0, 4},
-};
-static state states_1[5] = {
- {1, arcs_1_0},
- {1, arcs_1_1},
- {1, arcs_1_2},
- {1, arcs_1_3},
- {1, arcs_1_4},
-};
-static arc arcs_2_0[1] = {
- {8, 1},
-};
-static arc arcs_2_1[2] = {
- {9, 0},
- {0, 1},
-};
-static state states_2[2] = {
- {1, arcs_2_0},
- {2, arcs_2_1},
-};
-static arc arcs_3_0[1] = {
- {10, 1},
-};
-static arc arcs_3_1[2] = {
- {10, 1},
- {0, 1},
-};
-static state states_3[2] = {
- {1, arcs_3_0},
- {2, arcs_3_1},
-};
-static arc arcs_4_0[2] = {
- {11, 1},
- {13, 2},
-};
-static arc arcs_4_1[1] = {
- {7, 3},
-};
-static arc arcs_4_2[3] = {
- {14, 4},
- {15, 4},
- {0, 2},
-};
-static arc arcs_4_3[1] = {
- {12, 4},
-};
-static arc arcs_4_4[1] = {
- {0, 4},
-};
-static state states_4[5] = {
- {2, arcs_4_0},
- {1, arcs_4_1},
- {3, arcs_4_2},
- {1, arcs_4_3},
- {1, arcs_4_4},
-};
-static arc arcs_5_0[3] = {
- {5, 1},
- {16, 1},
- {17, 2},
-};
-static arc arcs_5_1[1] = {
- {0, 1},
-};
-static arc arcs_5_2[1] = {
- {7, 3},
-};
-static arc arcs_5_3[1] = {
- {18, 1},
-};
-static state states_5[4] = {
- {3, arcs_5_0},
- {1, arcs_5_1},
- {1, arcs_5_2},
- {1, arcs_5_3},
-};
-static dfa dfas[6] = {
- {256, "MSTART", 0, 2, states_0,
- "\070\000\000"},
- {257, "RULE", 0, 5, states_1,
- "\040\000\000"},
- {258, "RHS", 0, 2, states_2,
- "\040\010\003"},
- {259, "ALT", 0, 2, states_3,
- "\040\010\003"},
- {260, "ITEM", 0, 5, states_4,
- "\040\010\003"},
- {261, "ATOM", 0, 4, states_5,
- "\040\000\003"},
-};
-static label labels[19] = {
- {0, "EMPTY"},
- {256, 0},
- {257, 0},
- {4, 0},
- {0, 0},
- {1, 0},
- {11, 0},
- {258, 0},
- {259, 0},
- {18, 0},
- {260, 0},
- {9, 0},
- {10, 0},
- {261, 0},
- {16, 0},
- {14, 0},
- {3, 0},
- {7, 0},
- {8, 0},
-};
-static grammar _PyParser_Grammar = {
- 6,
- dfas,
- {19, labels},
- 256
-};
-
-grammar *
-meta_grammar(void)
-{
- return &_PyParser_Grammar;
-}
-
-grammar *
-Py_meta_grammar(void)
-{
- return meta_grammar();
-}
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index 6a96f6bc5a..7a6c886519 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -99,10 +99,8 @@ PyParser_ParseStringObject(const char *s, PyObject *filename,
tok->type_comments = 1;
}
-#ifndef PGEN
Py_INCREF(err_ret->filename);
tok->filename = err_ret->filename;
-#endif
return parsetok(tok, g, start, err_ret, flags);
}
@@ -113,7 +111,6 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
{
node *n;
PyObject *filename = NULL;
-#ifndef PGEN
if (filename_str != NULL) {
filename = PyUnicode_DecodeFSDefault(filename_str);
if (filename == NULL) {
@@ -121,11 +118,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
return NULL;
}
}
-#endif
n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags);
-#ifndef PGEN
Py_XDECREF(filename);
-#endif
return n;
}
@@ -169,10 +163,8 @@ PyParser_ParseFileObject(FILE *fp, PyObject *filename,
if (*flags & PyPARSE_TYPE_COMMENTS) {
tok->type_comments = 1;
}
-#ifndef PGEN
Py_INCREF(err_ret->filename);
tok->filename = err_ret->filename;
-#endif
return parsetok(tok, g, start, err_ret, flags);
}
@@ -184,7 +176,6 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
{
node *n;
PyObject *fileobj = NULL;
-#ifndef PGEN
if (filename != NULL) {
fileobj = PyUnicode_DecodeFSDefault(filename);
if (fileobj == NULL) {
@@ -192,12 +183,9 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
return NULL;
}
}
-#endif
n = PyParser_ParseFileObject(fp, fileobj, enc, g,
start, ps1, ps2, err_ret, flags);
-#ifndef PGEN
Py_XDECREF(fileobj);
-#endif
return n;
}
@@ -371,7 +359,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
}
}
-#ifndef PGEN
/* Check that the source for a single input statement really
is a single statement by looking at what is left in the
buffer after parsing. Trailing whitespace and comments
@@ -399,7 +386,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
c = *++cur;
}
}
-#endif
}
else
n = NULL;
@@ -470,7 +456,6 @@ initerr(perrdetail *err_ret, PyObject *filename)
err_ret->text = NULL;
err_ret->token = -1;
err_ret->expected = -1;
-#ifndef PGEN
if (filename) {
Py_INCREF(filename);
err_ret->filename = filename;
@@ -482,6 +467,5 @@ initerr(perrdetail *err_ret, PyObject *filename)
return -1;
}
}
-#endif
return 0;
}
diff --git a/Parser/parsetok_pgen.c b/Parser/parsetok_pgen.c
deleted file mode 100644
index 97b92883f3..0000000000
--- a/Parser/parsetok_pgen.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define PGEN
-#include "parsetok.c"
diff --git a/Parser/pgen.c b/Parser/pgen.c
deleted file mode 100644
index 6451a1d998..0000000000
--- a/Parser/pgen.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/* Parser generator */
-
-/* For a description, see the comments at end of this file */
-
-#include "Python.h"
-#include "pgenheaders.h"
-#include "token.h"
-#include "node.h"
-#include "grammar.h"
-#include "metagrammar.h"
-#include "pgen.h"
-
-extern int Py_DebugFlag;
-extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */
-
-
-/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
-
-typedef struct _nfaarc {
- int ar_label;
- int ar_arrow;
-} nfaarc;
-
-typedef struct _nfastate {
- int st_narcs;
- nfaarc *st_arc;
-} nfastate;
-
-typedef struct _nfa {
- int nf_type;
- char *nf_name;
- int nf_nstates;
- nfastate *nf_state;
- int nf_start, nf_finish;
-} nfa;
-
-/* Forward */
-static void compile_rhs(labellist *ll,
- nfa *nf, node *n, int *pa, int *pb);
-static void compile_alt(labellist *ll,
- nfa *nf, node *n, int *pa, int *pb);
-static void compile_item(labellist *ll,
- nfa *nf, node *n, int *pa, int *pb);
-static void compile_atom(labellist *ll,
- nfa *nf, node *n, int *pa, int *pb);
-
-static int
-addnfastate(nfa *nf)
-{
- nfastate *st;
-
- nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state,
- sizeof(nfastate) * (nf->nf_nstates + 1));
- if (nf->nf_state == NULL)
- Py_FatalError("out of mem");
- st = &nf->nf_state[nf->nf_nstates++];
- st->st_narcs = 0;
- st->st_arc = NULL;
- return st - nf->nf_state;
-}
-
-static void
-addnfaarc(nfa *nf, int from, int to, int lbl)
-{
- nfastate *st;
- nfaarc *ar;
-
- st = &nf->nf_state[from];
- st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc,
- sizeof(nfaarc) * (st->st_narcs + 1));
- if (st->st_arc == NULL)
- Py_FatalError("out of mem");
- ar = &st->st_arc[st->st_narcs++];
- ar->ar_label = lbl;
- ar->ar_arrow = to;
-}
-
-static nfa *
-newnfa(char *name)
-{
- nfa *nf;
- static int type = NT_OFFSET; /* All types will be disjunct */
-
- nf = (nfa *)PyObject_MALLOC(sizeof(nfa));
- if (nf == NULL)
- Py_FatalError("no mem for new nfa");
- nf->nf_type = type++;
- nf->nf_name = name; /* XXX strdup(name) ??? */
- nf->nf_nstates = 0;
- nf->nf_state = NULL;
- nf->nf_start = nf->nf_finish = -1;
- return nf;
-}
-
-typedef struct _nfagrammar {
- int gr_nnfas;
- nfa **gr_nfa;
- labellist gr_ll;
-} nfagrammar;
-
-/* Forward */
-static void compile_rule(nfagrammar *gr, node *n);
-
-static nfagrammar *
-newnfagrammar(void)
-{
- nfagrammar *gr;
-
- gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar));
- if (gr == NULL)
- Py_FatalError("no mem for new nfa grammar");
- gr->gr_nnfas = 0;
- gr->gr_nfa = NULL;
- gr->gr_ll.ll_nlabels = 0;
- gr->gr_ll.ll_label = NULL;
- addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
- return gr;
-}
-
-static void
-freenfagrammar(nfagrammar *gr)
-{
- for (int i = 0; i < gr->gr_nnfas; i++) {
- PyObject_FREE(gr->gr_nfa[i]->nf_state);
- }
- PyObject_FREE(gr->gr_nfa);
- PyObject_FREE(gr);
-}
-
-static nfa *
-addnfa(nfagrammar *gr, char *name)
-{
- nfa *nf;
-
- nf = newnfa(name);
- gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa,
- sizeof(nfa*) * (gr->gr_nnfas + 1));
- if (gr->gr_nfa == NULL)
- Py_FatalError("out of mem");
- gr->gr_nfa[gr->gr_nnfas++] = nf;
- addlabel(&gr->gr_ll, NAME, nf->nf_name);
- return nf;
-}
-
-#ifdef Py_DEBUG
-
-static const char REQNFMT[] = "metacompile: less than %d children\n";
-
-#define REQN(i, count) do { \
- if (i < count) { \
- fprintf(stderr, REQNFMT, count); \
- Py_FatalError("REQN"); \
- } \
-} while (0)
-
-#else
-#define REQN(i, count) /* empty */
-#endif
-
-static nfagrammar *
-metacompile(node *n)
-{
- nfagrammar *gr;
- int i;
-
- if (Py_DebugFlag)
- printf("Compiling (meta-) parse tree into NFA grammar\n");
- gr = newnfagrammar();
- REQ(n, MSTART);
- i = n->n_nchildren - 1; /* Last child is ENDMARKER */
- n = n->n_child;
- for (; --i >= 0; n++) {
- if (n->n_type != NEWLINE)
- compile_rule(gr, n);
- }
- return gr;
-}
-
-static void
-compile_rule(nfagrammar *gr, node *n)
-{
- nfa *nf;
-
- REQ(n, RULE);
- REQN(n->n_nchildren, 4);
- n = n->n_child;
- REQ(n, NAME);
- nf = addnfa(gr, n->n_str);
- n++;
- REQ(n, COLON);
- n++;
- REQ(n, RHS);
- compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
- n++;
- REQ(n, NEWLINE);
-}
-
-static void
-compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
-{
- int i;
- int a, b;
-
- REQ(n, RHS);
- i = n->n_nchildren;
- REQN(i, 1);
- n = n->n_child;
- REQ(n, ALT);
- compile_alt(ll, nf, n, pa, pb);
- if (--i <= 0)
- return;
- n++;
- a = *pa;
- b = *pb;
- *pa = addnfastate(nf);
- *pb = addnfastate(nf);
- addnfaarc(nf, *pa, a, EMPTY);
- addnfaarc(nf, b, *pb, EMPTY);
- for (; --i >= 0; n++) {
- REQ(n, VBAR);
- REQN(i, 1);
- --i;
- n++;
- REQ(n, ALT);
- compile_alt(ll, nf, n, &a, &b);
- addnfaarc(nf, *pa, a, EMPTY);
- addnfaarc(nf, b, *pb, EMPTY);
- }
-}
-
-static void
-compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
-{
- int i;
- int a, b;
-
- REQ(n, ALT);
- i = n->n_nchildren;
- REQN(i, 1);
- n = n->n_child;
- REQ(n, ITEM);
- compile_item(ll, nf, n, pa, pb);
- --i;
- n++;
- for (; --i >= 0; n++) {
- REQ(n, ITEM);
- compile_item(ll, nf, n, &a, &b);
- addnfaarc(nf, *pb, a, EMPTY);
- *pb = b;
- }
-}
-
-static void
-compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
-{
- int i;
- int a, b;
-
- REQ(n, ITEM);
- i = n->n_nchildren;
- REQN(i, 1);
- n = n->n_child;
- if (n->n_type == LSQB) {
- REQN(i, 3);
- n++;
- REQ(n, RHS);
- *pa = addnfastate(nf);
- *pb = addnfastate(nf);
- addnfaarc(nf, *pa, *pb, EMPTY);
- compile_rhs(ll, nf, n, &a, &b);
- addnfaarc(nf, *pa, a, EMPTY);
- addnfaarc(nf, b, *pb, EMPTY);
- REQN(i, 1);
- n++;
- REQ(n, RSQB);
- }
- else {
- compile_atom(ll, nf, n, pa, pb);
- if (--i <= 0)
- return;
- n++;
- addnfaarc(nf, *pb, *pa, EMPTY);
- if (n->n_type == STAR)
- *pb = *pa;
- else
- REQ(n, PLUS);
- }
-}
-
-static void
-compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
-{
- int i;
-
- REQ(n, ATOM);
- i = n->n_nchildren;
- (void)i; /* Don't warn about set but unused */
- REQN(i, 1);
- n = n->n_child;
- if (n->n_type == LPAR) {
- REQN(i, 3);
- n++;
- REQ(n, RHS);
- compile_rhs(ll, nf, n, pa, pb);
- n++;
- REQ(n, RPAR);
- }
- else if (n->n_type == NAME || n->n_type == STRING) {
- *pa = addnfastate(nf);
- *pb = addnfastate(nf);
- addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
- }
- else
- REQ(n, NAME);
-}
-
-static void
-dumpstate(labellist *ll, nfa *nf, int istate)
-{
- nfastate *st;
- int i;
- nfaarc *ar;
-
- printf("%c%2d%c",
- istate == nf->nf_start ? '*' : ' ',
- istate,
- istate == nf->nf_finish ? '.' : ' ');
- st = &nf->nf_state[istate];
- ar = st->st_arc;
- for (i = 0; i < st->st_narcs; i++) {
- if (i > 0)
- printf("\n ");
- printf("-> %2d %s", ar->ar_arrow,
- PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
- ar++;
- }
- printf("\n");
-}
-
-static void
-dumpnfa(labellist *ll, nfa *nf)
-{
- int i;
-
- printf("NFA '%s' has %d states; start %d, finish %d\n",
- nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
- for (i = 0; i < nf->nf_nstates; i++)
- dumpstate(ll, nf, i);
-}
-
-
-/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
-
-static void
-addclosure(bitset ss, nfa *nf, int istate)
-{
- if (addbit(ss, istate)) {
- nfastate *st = &nf->nf_state[istate];
- nfaarc *ar = st->st_arc;
- int i;
-
- for (i = st->st_narcs; --i >= 0; ) {
- if (ar->ar_label == EMPTY)
- addclosure(ss, nf, ar->ar_arrow);
- ar++;
- }
- }
-}
-
-typedef struct _ss_arc {
- bitset sa_bitset;
- int sa_arrow;
- int sa_label;
-} ss_arc;
-
-typedef struct _ss_state {
- bitset ss_ss;
- int ss_narcs;
- struct _ss_arc *ss_arc;
- int ss_deleted;
- int ss_finish;
- int ss_rename;
-} ss_state;
-
-typedef struct _ss_dfa {
- int sd_nstates;
- ss_state *sd_state;
-} ss_dfa;
-
-/* Forward */
-static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
- labellist *ll, const char *msg);
-static void simplify(int xx_nstates, ss_state *xx_state);
-static void convert(dfa *d, int xx_nstates, ss_state *xx_state);
-
-static void
-makedfa(nfagrammar *gr, nfa *nf, dfa *d)
-{
- int nbits = nf->nf_nstates;
- bitset ss;
- int xx_nstates;
- ss_state *xx_state, *yy;
- ss_arc *zz;
- int istate, jstate, iarc, jarc, ibit;
- nfastate *st;
- nfaarc *ar;
-
- ss = newbitset(nbits);
- addclosure(ss, nf, nf->nf_start);
- xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state));
- if (xx_state == NULL)
- Py_FatalError("no mem for xx_state in makedfa");
- xx_nstates = 1;
- yy = &xx_state[0];
- yy->ss_ss = ss;
- yy->ss_narcs = 0;
- yy->ss_arc = NULL;
- yy->ss_deleted = 0;
- yy->ss_finish = testbit(ss, nf->nf_finish);
- if (yy->ss_finish)
- printf("Error: nonterminal '%s' may produce empty.\n",
- nf->nf_name);
-
- /* This algorithm is from a book written before
- the invention of structured programming... */
-
- /* For each unmarked state... */
- for (istate = 0; istate < xx_nstates; ++istate) {
- size_t size;
- yy = &xx_state[istate];
- ss = yy->ss_ss;
- /* For all its states... */
- for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
- if (!testbit(ss, ibit))
- continue;
- st = &nf->nf_state[ibit];
- /* For all non-empty arcs from this state... */
- for (iarc = 0; iarc < st->st_narcs; iarc++) {
- ar = &st->st_arc[iarc];
- if (ar->ar_label == EMPTY)
- continue;
- /* Look up in list of arcs from this state */
- for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
- zz = &yy->ss_arc[jarc];
- if (ar->ar_label == zz->sa_label)
- goto found;
- }
- /* Add new arc for this state */
- size = sizeof(ss_arc) * (yy->ss_narcs + 1);
- yy->ss_arc = (ss_arc *)PyObject_REALLOC(
- yy->ss_arc, size);
- if (yy->ss_arc == NULL)
- Py_FatalError("out of mem");
- zz = &yy->ss_arc[yy->ss_narcs++];
- zz->sa_label = ar->ar_label;
- zz->sa_bitset = newbitset(nbits);
- zz->sa_arrow = -1;
- found: ;
- /* Add destination */
- addclosure(zz->sa_bitset, nf, ar->ar_arrow);
- }
- }
- /* Now look up all the arrow states */
- for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
- zz = &xx_state[istate].ss_arc[jarc];
- for (jstate = 0; jstate < xx_nstates; jstate++) {
- if (samebitset(zz->sa_bitset,
- xx_state[jstate].ss_ss, nbits)) {
- zz->sa_arrow = jstate;
- goto done;
- }
- }
- size = sizeof(ss_state) * (xx_nstates + 1);
- xx_state = (ss_state *)PyObject_REALLOC(xx_state,
- size);
- if (xx_state == NULL)
- Py_FatalError("out of mem");
- zz->sa_arrow = xx_nstates;
- yy = &xx_state[xx_nstates++];
- yy->ss_ss = zz->sa_bitset;
- yy->ss_narcs = 0;
- yy->ss_arc = NULL;
- yy->ss_deleted = 0;
- yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
- done: ;
- }
- }
-
- if (Py_DebugFlag)
- printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
- "before minimizing");
-
- simplify(xx_nstates, xx_state);
-
- if (Py_DebugFlag)
- printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
- "after minimizing");
-
- convert(d, xx_nstates, xx_state);
-
- for (int i = 0; i < xx_nstates; i++) {
- for (int j = 0; j < xx_state[i].ss_narcs; j++)
- delbitset(xx_state[i].ss_arc[j].sa_bitset);
- PyObject_FREE(xx_state[i].ss_arc);
- }
- PyObject_FREE(xx_state);
-}
-
-static void
-printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
- labellist *ll, const char *msg)
-{
- int i, ibit, iarc;
- ss_state *yy;
- ss_arc *zz;
-
- printf("Subset DFA %s\n", msg);
- for (i = 0; i < xx_nstates; i++) {
- yy = &xx_state[i];
- if (yy->ss_deleted)
- continue;
- printf(" Subset %d", i);
- if (yy->ss_finish)
- printf(" (finish)");
- printf(" { ");
- for (ibit = 0; ibit < nbits; ibit++) {
- if (testbit(yy->ss_ss, ibit))
- printf("%d ", ibit);
- }
- printf("}\n");
- for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
- zz = &yy->ss_arc[iarc];
- printf(" Arc to state %d, label %s\n",
- zz->sa_arrow,
- PyGrammar_LabelRepr(
- &ll->ll_label[zz->sa_label]));
- }
- }
-}
-
-
-/* PART THREE -- SIMPLIFY DFA */
-
-/* Simplify the DFA by repeatedly eliminating states that are
- equivalent to another oner. This is NOT Algorithm 3.3 from
- [Aho&Ullman 77]. It does not always finds the minimal DFA,
- but it does usually make a much smaller one... (For an example
- of sub-optimal behavior, try S: x a b+ | y a b+.)
-*/
-
-static int
-samestate(ss_state *s1, ss_state *s2)
-{
- int i;
-
- if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
- return 0;
- for (i = 0; i < s1->ss_narcs; i++) {
- if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
- s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
- return 0;
- }
- return 1;
-}
-
-static void
-renamestates(int xx_nstates, ss_state *xx_state, int from, int to)
-{
- int i, j;
-
- if (Py_DebugFlag)
- printf("Rename state %d to %d.\n", from, to);
- for (i = 0; i < xx_nstates; i++) {
- if (xx_state[i].ss_deleted)
- continue;
- for (j = 0; j < xx_state[i].ss_narcs; j++) {
- if (xx_state[i].ss_arc[j].sa_arrow == from)
- xx_state[i].ss_arc[j].sa_arrow = to;
- }
- }
-}
-
-static void
-simplify(int xx_nstates, ss_state *xx_state)
-{
- int changes;
- int i, j;
-
- do {
- changes = 0;
- for (i = 1; i < xx_nstates; i++) {
- if (xx_state[i].ss_deleted)
- continue;
- for (j = 0; j < i; j++) {
- if (xx_state[j].ss_deleted)
- continue;
- if (samestate(&xx_state[i], &xx_state[j])) {
- xx_state[i].ss_deleted++;
- renamestates(xx_nstates, xx_state,
- i, j);
- changes++;
- break;
- }
- }
- }
- } while (changes);
-}
-
-
-/* PART FOUR -- GENERATE PARSING TABLES */
-
-/* Convert the DFA into a grammar that can be used by our parser */
-
-static void
-convert(dfa *d, int xx_nstates, ss_state *xx_state)
-{
- int i, j;
- ss_state *yy;
- ss_arc *zz;
-
- for (i = 0; i < xx_nstates; i++) {
- yy = &xx_state[i];
- if (yy->ss_deleted)
- continue;
- yy->ss_rename = addstate(d);
- }
-
- for (i = 0; i < xx_nstates; i++) {
- yy = &xx_state[i];
- if (yy->ss_deleted)
- continue;
- for (j = 0; j < yy->ss_narcs; j++) {
- zz = &yy->ss_arc[j];
- addarc(d, yy->ss_rename,
- xx_state[zz->sa_arrow].ss_rename,
- zz->sa_label);
- }
- if (yy->ss_finish)
- addarc(d, yy->ss_rename, yy->ss_rename, 0);
- }
-
- d->d_initial = 0;
-}
-
-
-/* PART FIVE -- GLUE IT ALL TOGETHER */
-
-static grammar *
-maketables(nfagrammar *gr)
-{
- int i;
- nfa *nf;
- dfa *d;
- grammar *g;
-
- if (gr->gr_nnfas == 0)
- return NULL;
- g = newgrammar(gr->gr_nfa[0]->nf_type);
- /* XXX first rule must be start rule */
- g->g_ll = gr->gr_ll;
-
- for (i = 0; i < gr->gr_nnfas; i++) {
- nf = gr->gr_nfa[i];
- if (Py_DebugFlag) {
- printf("Dump of NFA for '%s' ...\n", nf->nf_name);
- dumpnfa(&gr->gr_ll, nf);
- printf("Making DFA for '%s' ...\n", nf->nf_name);
- }
- d = adddfa(g, nf->nf_type, nf->nf_name);
- makedfa(gr, gr->gr_nfa[i], d);
- }
-
- return g;
-}
-
-grammar *
-pgen(node *n)
-{
- nfagrammar *gr;
- grammar *g;
-
- gr = metacompile(n);
- g = maketables(gr);
- translatelabels(g);
- addfirstsets(g);
- freenfagrammar(gr);
- return g;
-}
-
-grammar *
-Py_pgen(node *n)
-{
- return pgen(n);
-}
-
-/*
-
-Description
------------
-
-Input is a grammar in extended BNF (using * for repetition, + for
-at-least-once repetition, [] for optional parts, | for alternatives and
-() for grouping). This has already been parsed and turned into a parse
-tree.
-
-Each rule is considered as a regular expression in its own right.
-It is turned into a Non-deterministic Finite Automaton (NFA), which
-is then turned into a Deterministic Finite Automaton (DFA), which is then
-optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
-or similar compiler books (this technique is more often used for lexical
-analyzers).
-
-The DFA's are used by the parser as parsing tables in a special way
-that's probably unique. Before they are usable, the FIRST sets of all
-non-terminals are computed.
-
-Reference
----------
-
-[Aho&Ullman 77]
- Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
- (first edition)
-
-*/
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/Parser/pgen/__init__.py
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
new file mode 100644
index 0000000000..965b08fd23
--- /dev/null
+++ b/Parser/pgen/__main__.py
@@ -0,0 +1,34 @@
+import argparse
+
+from .pgen import ParserGenerator
+
+def main():
+ parser = argparse.ArgumentParser(description="Parser generator main program.")
+ parser.add_argument(
+ "grammar", type=str, help="The file with the grammar definition in EBNF format"
+ )
+ parser.add_argument(
+ "tokens", type=str, help="The file with the token definitions"
+ )
+ parser.add_argument(
+ "graminit_h",
+ type=argparse.FileType('w'),
+ help="The path to write the grammar's non-terminals as #defines",
+ )
+ parser.add_argument(
+ "graminit_c",
+ type=argparse.FileType('w'),
+ help="The path to write the grammar as initialized data",
+ )
+
+ parser.add_argument("--verbose", "-v", action="count")
+ args = parser.parse_args()
+
+ p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
+ grammar = p.make_grammar()
+ grammar.produce_graminit_h(args.graminit_h.write)
+ grammar.produce_graminit_c(args.graminit_c.write)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py
new file mode 100644
index 0000000000..bd405e674e
--- /dev/null
+++ b/Parser/pgen/grammar.py
@@ -0,0 +1,160 @@
+import collections
+
+class Grammar:
+ """Pgen parsing tables conversion class.
+
+ Once initialized, this class supplies the grammar tables for the
+ parsing engine implemented by parse.py. The parsing engine
+ accesses the instance variables directly. The class here does not
+ provide initialization of the tables; several subclasses exist to
+ do this (see the conv and pgen modules).
+
+ The load() method reads the tables from a pickle file, which is
+ much faster than the other ways offered by subclasses. The pickle
+ file is written by calling dump() (after loading the grammar
+ tables using a subclass). The report() method prints a readable
+ representation of the tables to stdout, for debugging.
+
+ The instance variables are as follows:
+
+ symbol2number -- a dict mapping symbol names to numbers. Symbol
+ numbers are always 256 or higher, to distinguish
+ them from token numbers, which are between 0 and
+ 255 (inclusive).
+
+ number2symbol -- a dict mapping numbers to symbol names;
+ these two are each other's inverse.
+
+ states -- a list of DFAs, where each DFA is a list of
+ states, each state is a list of arcs, and each
+ arc is a (i, j) pair where i is a label and j is
+ a state number. The DFA number is the index into
+ this list. (This name is slightly confusing.)
+ Final states are represented by a special arc of
+ the form (0, j) where j is its own state number.
+
+ dfas -- a dict mapping symbol numbers to (DFA, first)
+ pairs, where DFA is an item from the states list
+ above, and first is a set of tokens that can
+ begin this grammar rule (represented by a dict
+ whose values are always 1).
+
+ labels -- a list of (x, y) pairs where x is either a token
+ number or a symbol number, and y is either None
+ or a string; the strings are keywords. The label
+ number is the index in this list; label numbers
+ are used to mark state transitions (arcs) in the
+ DFAs.
+
+ start -- the number of the grammar's start symbol.
+
+ keywords -- a dict mapping keyword strings to arc labels.
+
+ tokens -- a dict mapping token numbers to arc labels.
+
+ """
+
+ def __init__(self):
+ self.symbol2number = collections.OrderedDict()
+ self.number2symbol = collections.OrderedDict()
+ self.states = []
+ self.dfas = collections.OrderedDict()
+ self.labels = [(0, "EMPTY")]
+ self.keywords = collections.OrderedDict()
+ self.tokens = collections.OrderedDict()
+ self.symbol2label = collections.OrderedDict()
+ self.start = 256
+
+ def produce_graminit_h(self, writer):
+ writer("/* Generated by Parser/pgen */\n\n")
+ for number, symbol in self.number2symbol.items():
+ writer("#define {} {}\n".format(symbol, number))
+
+ def produce_graminit_c(self, writer):
+ writer("/* Generated by Parser/pgen */\n\n")
+
+ writer('#include "pgenheaders.h"\n')
+ writer('#include "grammar.h"\n')
+ writer("grammar _PyParser_Grammar;\n")
+
+ self.print_dfas(writer)
+ self.print_labels(writer)
+
+ writer("grammar _PyParser_Grammar = {\n")
+ writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
+ writer(" dfas,\n")
+ writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
+ writer(" {start_number}\n".format(start_number=self.start))
+ writer("};\n")
+
+ def print_labels(self, writer):
+ writer(
+ "static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
+ )
+ for label, name in self.labels:
+ if name is None:
+ writer(" {{{label}, 0}},\n".format(label=label))
+ else:
+ writer(
+ ' {{{label}, "{label_name}"}},\n'.format(
+ label=label, label_name=name
+ )
+ )
+ writer("};\n")
+
+ def print_dfas(self, writer):
+ self.print_states(writer)
+ writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
+ for dfaindex, dfa_elem in enumerate(self.dfas.items()):
+ symbol, (dfa, first_sets) = dfa_elem
+ writer(
+ ' {{{dfa_symbol}, "{symbol_name}", '.format(
+ dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
+ )
+ + "0, {n_states}, states_{dfa_index},\n".format(
+ n_states=len(dfa), dfa_index=dfaindex
+ )
+ )
+ writer(' "')
+
+ k = [name for label, name in self.labels if label in first_sets]
+ bitset = bytearray((len(self.labels) >> 3) + 1)
+ for token in first_sets:
+ bitset[token >> 3] |= 1 << (token & 7)
+ for byte in bitset:
+ writer("\\%03o" % (byte & 0xFF))
+ writer('"},\n')
+ writer("};\n")
+
+ def print_states(self, write):
+ for dfaindex, dfa in enumerate(self.states):
+ self.print_arcs(write, dfaindex, dfa)
+ write(
+ "static state states_{dfa_index}[{n_states}] = {{\n".format(
+ dfa_index=dfaindex, n_states=len(dfa)
+ )
+ )
+ for stateindex, state in enumerate(dfa):
+ narcs = len(state)
+ write(
+ " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
+ n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
+ )
+ )
+ write("};\n")
+
+ def print_arcs(self, write, dfaindex, states):
+ for stateindex, state in enumerate(states):
+ narcs = len(state)
+ write(
+ "static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
+ dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
+ )
+ )
+ for a, b in state:
+ write(
+ " {{{from_label}, {to_state}}},\n".format(
+ from_label=a, to_state=b
+ )
+ )
+ write("};\n")
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
new file mode 100644
index 0000000000..c878919f2a
--- /dev/null
+++ b/Parser/pgen/pgen.py
@@ -0,0 +1,408 @@
+import collections
+import tokenize # from stdlib
+
+from . import grammar, token
+
+class ParserGenerator(object):
+
+ def __init__(self, grammar_file, token_file, stream=None, verbose=False):
+ close_stream = None
+ if stream is None:
+ stream = open(grammar_file)
+ close_stream = stream.close
+ with open(token_file) as tok_file:
+ token_lines = tok_file.readlines()
+ self.tokens = dict(token.generate_tokens(token_lines))
+ self.opmap = dict(token.generate_opmap(token_lines))
+ # Manually add <> so it does not collide with !=
+ self.opmap['<>'] = "NOTEQUAL"
+ self.verbose = verbose
+ self.filename = grammar_file
+ self.stream = stream
+ self.generator = tokenize.generate_tokens(stream.readline)
+ self.gettoken() # Initialize lookahead
+ self.dfas, self.startsymbol = self.parse()
+ if close_stream is not None:
+ close_stream()
+ self.first = {} # map from symbol name to set of tokens
+ self.addfirstsets()
+
+ def make_grammar(self):
+ c = grammar.Grammar()
+ names = list(self.dfas.keys())
+ names.remove(self.startsymbol)
+ names.insert(0, self.startsymbol)
+ for name in names:
+ i = 256 + len(c.symbol2number)
+ c.symbol2number[name] = i
+ c.number2symbol[i] = name
+ for name in names:
+ self.make_label(c, name)
+ dfa = self.dfas[name]
+ states = []
+ for state in dfa:
+ arcs = []
+ for label, next in sorted(state.arcs.items()):
+ arcs.append((self.make_label(c, label), dfa.index(next)))
+ if state.isfinal:
+ arcs.append((0, dfa.index(state)))
+ states.append(arcs)
+ c.states.append(states)
+ c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
+ c.start = c.symbol2number[self.startsymbol]
+
+ if self.verbose:
+ print("")
+ print("Grammar summary")
+ print("===============")
+
+ print("- {n_labels} labels".format(n_labels=len(c.labels)))
+ print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
+ print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
+ print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
+ print(
+ "- Start symbol: {start_symbol}".format(
+ start_symbol=c.number2symbol[c.start]
+ )
+ )
+ return c
+
+ def make_first(self, c, name):
+ rawfirst = self.first[name]
+ first = set()
+ for label in sorted(rawfirst):
+ ilabel = self.make_label(c, label)
+ ##assert ilabel not in first # XXX failed on <> ... !=
+ first.add(ilabel)
+ return first
+
+ def make_label(self, c, label):
+ # XXX Maybe this should be a method on a subclass of converter?
+ ilabel = len(c.labels)
+ if label[0].isalpha():
+ # Either a symbol name or a named token
+ if label in c.symbol2number:
+ # A symbol name (a non-terminal)
+ if label in c.symbol2label:
+ return c.symbol2label[label]
+ else:
+ c.labels.append((c.symbol2number[label], None))
+ c.symbol2label[label] = ilabel
+ return ilabel
+ else:
+ # A named token (NAME, NUMBER, STRING)
+ itoken = self.tokens.get(label, None)
+ assert isinstance(itoken, int), label
+ assert itoken in self.tokens.values(), label
+ if itoken in c.tokens:
+ return c.tokens[itoken]
+ else:
+ c.labels.append((itoken, None))
+ c.tokens[itoken] = ilabel
+ return ilabel
+ else:
+ # Either a keyword or an operator
+ assert label[0] in ('"', "'"), label
+ value = eval(label)
+ if value[0].isalpha():
+ # A keyword
+ if value in c.keywords:
+ return c.keywords[value]
+ else:
+ c.labels.append((self.tokens["NAME"], value))
+ c.keywords[value] = ilabel
+ return ilabel
+ else:
+ # An operator (any non-numeric token)
+ tok_name = self.opmap[value] # Fails if unknown token
+ itoken = self.tokens[tok_name]
+ if itoken in c.tokens:
+ return c.tokens[itoken]
+ else:
+ c.labels.append((itoken, None))
+ c.tokens[itoken] = ilabel
+ return ilabel
+
+ def addfirstsets(self):
+ names = list(self.dfas.keys())
+ for name in names:
+ if name not in self.first:
+ self.calcfirst(name)
+
+ if self.verbose:
+ print("First set for {dfa_name}".format(dfa_name=name))
+ for item in self.first[name]:
+ print(" - {terminal}".format(terminal=item))
+
+ def calcfirst(self, name):
+ dfa = self.dfas[name]
+ self.first[name] = None # dummy to detect left recursion
+ state = dfa[0]
+ totalset = set()
+ overlapcheck = {}
+ for label, next in state.arcs.items():
+ if label in self.dfas:
+ if label in self.first:
+ fset = self.first[label]
+ if fset is None:
+ raise ValueError("recursion for rule %r" % name)
+ else:
+ self.calcfirst(label)
+ fset = self.first[label]
+ totalset.update(fset)
+ overlapcheck[label] = fset
+ else:
+ totalset.add(label)
+ overlapcheck[label] = {label}
+ inverse = {}
+ for label, itsfirst in overlapcheck.items():
+ for symbol in itsfirst:
+ if symbol in inverse:
+ raise ValueError("rule %s is ambiguous; %s is in the"
+ " first sets of %s as well as %s" %
+ (name, symbol, label, inverse[symbol]))
+ inverse[symbol] = label
+ self.first[name] = totalset
+
+ def parse(self):
+ dfas = collections.OrderedDict()
+ startsymbol = None
+ # MSTART: (NEWLINE | RULE)* ENDMARKER
+ while self.type != tokenize.ENDMARKER:
+ while self.type == tokenize.NEWLINE:
+ self.gettoken()
+ # RULE: NAME ':' RHS NEWLINE
+ name = self.expect(tokenize.NAME)
+ if self.verbose:
+ print("Processing rule {dfa_name}".format(dfa_name=name))
+ self.expect(tokenize.OP, ":")
+ a, z = self.parse_rhs()
+ self.expect(tokenize.NEWLINE)
+ if self.verbose:
+ self.dump_nfa(name, a, z)
+ dfa = self.make_dfa(a, z)
+ if self.verbose:
+ self.dump_dfa(name, dfa)
+ oldlen = len(dfa)
+ self.simplify_dfa(dfa)
+ newlen = len(dfa)
+ dfas[name] = dfa
+ #print name, oldlen, newlen
+ if startsymbol is None:
+ startsymbol = name
+ return dfas, startsymbol
+
+ def make_dfa(self, start, finish):
+ # To turn an NFA into a DFA, we define the states of the DFA
+ # to correspond to *sets* of states of the NFA. Then do some
+ # state reduction. Let's represent sets as dicts with 1 for
+ # values.
+ assert isinstance(start, NFAState)
+ assert isinstance(finish, NFAState)
+ def closure(state):
+ base = set()
+ addclosure(state, base)
+ return base
+ def addclosure(state, base):
+ assert isinstance(state, NFAState)
+ if state in base:
+ return
+ base.add(state)
+ for label, next in state.arcs:
+ if label is None:
+ addclosure(next, base)
+ states = [DFAState(closure(start), finish)]
+ for state in states: # NB states grows while we're iterating
+ arcs = {}
+ for nfastate in state.nfaset:
+ for label, next in nfastate.arcs:
+ if label is not None:
+ addclosure(next, arcs.setdefault(label, set()))
+ for label, nfaset in sorted(arcs.items()):
+ for st in states:
+ if st.nfaset == nfaset:
+ break
+ else:
+ st = DFAState(nfaset, finish)
+ states.append(st)
+ state.addarc(st, label)
+ return states # List of DFAState instances; first one is start
+
+ def dump_nfa(self, name, start, finish):
+ print("Dump of NFA for", name)
+ todo = [start]
+ for i, state in enumerate(todo):
+ print(" State", i, state is finish and "(final)" or "")
+ for label, next in state.arcs:
+ if next in todo:
+ j = todo.index(next)
+ else:
+ j = len(todo)
+ todo.append(next)
+ if label is None:
+ print(" -> %d" % j)
+ else:
+ print(" %s -> %d" % (label, j))
+
+ def dump_dfa(self, name, dfa):
+ print("Dump of DFA for", name)
+ for i, state in enumerate(dfa):
+ print(" State", i, state.isfinal and "(final)" or "")
+ for label, next in sorted(state.arcs.items()):
+ print(" %s -> %d" % (label, dfa.index(next)))
+
+ def simplify_dfa(self, dfa):
+ # This is not theoretically optimal, but works well enough.
+ # Algorithm: repeatedly look for two states that have the same
+ # set of arcs (same labels pointing to the same nodes) and
+ # unify them, until things stop changing.
+
+ # dfa is a list of DFAState instances
+ changes = True
+ while changes:
+ changes = False
+ for i, state_i in enumerate(dfa):
+ for j in range(i+1, len(dfa)):
+ state_j = dfa[j]
+ if state_i == state_j:
+ #print " unify", i, j
+ del dfa[j]
+ for state in dfa:
+ state.unifystate(state_j, state_i)
+ changes = True
+ break
+
+ def parse_rhs(self):
+ # RHS: ALT ('|' ALT)*
+ a, z = self.parse_alt()
+ if self.value != "|":
+ return a, z
+ else:
+ aa = NFAState()
+ zz = NFAState()
+ aa.addarc(a)
+ z.addarc(zz)
+ while self.value == "|":
+ self.gettoken()
+ a, z = self.parse_alt()
+ aa.addarc(a)
+ z.addarc(zz)
+ return aa, zz
+
+ def parse_alt(self):
+ # ALT: ITEM+
+ a, b = self.parse_item()
+ while (self.value in ("(", "[") or
+ self.type in (tokenize.NAME, tokenize.STRING)):
+ c, d = self.parse_item()
+ b.addarc(c)
+ b = d
+ return a, b
+
+ def parse_item(self):
+ # ITEM: '[' RHS ']' | ATOM ['+' | '*']
+ if self.value == "[":
+ self.gettoken()
+ a, z = self.parse_rhs()
+ self.expect(tokenize.OP, "]")
+ a.addarc(z)
+ return a, z
+ else:
+ a, z = self.parse_atom()
+ value = self.value
+ if value not in ("+", "*"):
+ return a, z
+ self.gettoken()
+ z.addarc(a)
+ if value == "+":
+ return a, z
+ else:
+ return a, a
+
+ def parse_atom(self):
+ # ATOM: '(' RHS ')' | NAME | STRING
+ if self.value == "(":
+ self.gettoken()
+ a, z = self.parse_rhs()
+ self.expect(tokenize.OP, ")")
+ return a, z
+ elif self.type in (tokenize.NAME, tokenize.STRING):
+ a = NFAState()
+ z = NFAState()
+ a.addarc(z, self.value)
+ self.gettoken()
+ return a, z
+ else:
+ self.raise_error("expected (...) or NAME or STRING, got %s/%s",
+ self.type, self.value)
+
+ def expect(self, type, value=None):
+ if self.type != type or (value is not None and self.value != value):
+ self.raise_error("expected %s/%s, got %s/%s",
+ type, value, self.type, self.value)
+ value = self.value
+ self.gettoken()
+ return value
+
+ def gettoken(self):
+ tup = next(self.generator)
+ while tup[0] in (tokenize.COMMENT, tokenize.NL):
+ tup = next(self.generator)
+ self.type, self.value, self.begin, self.end, self.line = tup
+ # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))
+
+ def raise_error(self, msg, *args):
+ if args:
+ try:
+ msg = msg % args
+ except:
+ msg = " ".join([msg] + list(map(str, args)))
+ raise SyntaxError(msg, (self.filename, self.end[0],
+ self.end[1], self.line))
+
+class NFAState(object):
+
+ def __init__(self):
+ self.arcs = [] # list of (label, NFAState) pairs
+
+ def addarc(self, next, label=None):
+ assert label is None or isinstance(label, str)
+ assert isinstance(next, NFAState)
+ self.arcs.append((label, next))
+
+class DFAState(object):
+
+ def __init__(self, nfaset, final):
+ assert isinstance(nfaset, set)
+ assert isinstance(next(iter(nfaset)), NFAState)
+ assert isinstance(final, NFAState)
+ self.nfaset = nfaset
+ self.isfinal = final in nfaset
+ self.arcs = {} # map from label to DFAState
+
+ def addarc(self, next, label):
+ assert isinstance(label, str)
+ assert label not in self.arcs
+ assert isinstance(next, DFAState)
+ self.arcs[label] = next
+
+ def unifystate(self, old, new):
+ for label, next in self.arcs.items():
+ if next is old:
+ self.arcs[label] = new
+
+ def __eq__(self, other):
+ # Equality test -- ignore the nfaset instance variable
+ assert isinstance(other, DFAState)
+ if self.isfinal != other.isfinal:
+ return False
+ # Can't just return self.arcs == other.arcs, because that
+ # would invoke this method recursively, with cycles...
+ if len(self.arcs) != len(other.arcs):
+ return False
+ for label, next in self.arcs.items():
+ if next is not other.arcs.get(label):
+ return False
+ return True
+
+ __hash__ = None # For Py3 compatibility.
diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py
new file mode 100644
index 0000000000..f9d45c450c
--- /dev/null
+++ b/Parser/pgen/token.py
@@ -0,0 +1,40 @@
+import itertools
+
+def generate_tokens(tokens):
+ numbers = itertools.count(0)
+ for line in tokens:
+ line = line.strip()
+
+ if not line:
+ continue
+ if line.strip().startswith('#'):
+ continue
+
+ name = line.split()[0]
+ yield (name, next(numbers))
+
+ yield ('N_TOKENS', next(numbers))
+ yield ('NT_OFFSET', 256)
+
+def generate_opmap(tokens):
+ for line in tokens:
+ line = line.strip()
+
+ if not line:
+ continue
+ if line.strip().startswith('#'):
+ continue
+
+ pieces = line.split()
+
+ if len(pieces) != 2:
+ continue
+
+ name, op = pieces
+ yield (op.strip("'"), name)
+
+ # Yield independently <>. This is needed so it does not collide
+ # with the token generation in "generate_tokens" because if this
+ # symbol is included in Grammar/Tokens, it will collide with !=
+ # as it has the same name (NOTEQUAL).
+ yield ('<>', 'NOTEQUAL')
diff --git a/Parser/pgenmain.c b/Parser/pgenmain.c
deleted file mode 100644
index fa6c88253c..0000000000
--- a/Parser/pgenmain.c
+++ /dev/null
@@ -1,188 +0,0 @@
-
-/* Parser generator main program */
-
-/* This expects a filename containing the grammar as argv[1] (UNIX)
- or asks the console for such a file name (THINK C).
- It writes its output on two files in the current directory:
- - "graminit.c" gets the grammar as a bunch of initialized data
- - "graminit.h" gets the grammar's non-terminals as #defines.
- Error messages and status info during the generation process are
- written to stdout, or sometimes to stderr. */
-
-/* XXX TO DO:
- - check for duplicate definitions of names (instead of fatal err)
-*/
-
-#define PGEN
-
-#include "Python.h"
-#include "pycore_pymem.h"
-#include "pycore_pystate.h"
-#include "pgenheaders.h"
-#include "grammar.h"
-#include "node.h"
-#include "parsetok.h"
-#include "pgen.h"
-
-int Py_DebugFlag = 0;
-int Py_VerboseFlag = 0;
-int Py_IgnoreEnvironmentFlag = 0;
-_PyRuntimeState _PyRuntime = _PyRuntimeState_INIT;
-
-/* Forward */
-grammar *getgrammar(const char *filename);
-
-void
-Py_Exit(int sts)
-{
- exit(sts);
-}
-
-/* Needed by obmalloc.c */
-int PyGILState_Check(void)
-{ return 1; }
-
-void _PyMem_DumpTraceback(int fd, const void *ptr)
-{}
-
-int
-main(int argc, char **argv)
-{
- grammar *g;
- FILE *fp;
- char *filename, *graminit_h, *graminit_c;
-
- if (argc != 4) {
- fprintf(stderr,
- "usage: %s grammar graminit.h graminit.c\n", argv[0]);
- Py_Exit(2);
- }
- filename = argv[1];
- graminit_h = argv[2];
- graminit_c = argv[3];
- g = getgrammar(filename);
- fp = fopen(graminit_c, "w");
- if (fp == NULL) {
- perror(graminit_c);
- Py_Exit(1);
- }
- if (Py_DebugFlag)
- printf("Writing %s ...\n", graminit_c);
- printgrammar(g, fp);
- fclose(fp);
- fp = fopen(graminit_h, "w");
- if (fp == NULL) {
- perror(graminit_h);
- Py_Exit(1);
- }
- if (Py_DebugFlag)
- printf("Writing %s ...\n", graminit_h);
- printnonterminals(g, fp);
- fclose(fp);
- freegrammar(g);
- Py_Exit(0);
- return 0; /* Make gcc -Wall happy */
-}
-
-grammar *
-getgrammar(const char *filename)
-{
- FILE *fp;
- node *n;
- grammar *g0, *g;
- perrdetail err;
-
- fp = fopen(filename, "r");
- if (fp == NULL) {
- perror(filename);
- Py_Exit(1);
- }
- g0 = meta_grammar();
- n = PyParser_ParseFile(fp, filename, g0, g0->g_start,
- (char *)NULL, (char *)NULL, &err);
- fclose(fp);
- if (n == NULL) {
- fprintf(stderr, "Parsing error %d, line %d.\n",
- err.error, err.lineno);
- if (err.text != NULL) {
- size_t len;
- int i;
- fprintf(stderr, "%s", err.text);
- len = strlen(err.text);
- if (len == 0 || err.text[len-1] != '\n')
- fprintf(stderr, "\n");
- for (i = 0; i < err.offset; i++) {
- if (err.text[i] == '\t')
- putc('\t', stderr);
- else
- putc(' ', stderr);
- }
- fprintf(stderr, "^\n");
- PyObject_FREE(err.text);
- }
- Py_Exit(1);
- }
- g = pgen(n);
- PyNode_Free(n);
- if (g == NULL) {
- printf("Bad grammar.\n");
- Py_Exit(1);
- }
- return g;
-}
-
-/* Can't happen in pgen */
-PyObject*
-PyErr_Occurred()
-{
- return 0;
-}
-
-void
-Py_FatalError(const char *msg)
-{
- fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg);
- Py_Exit(1);
-}
-
-/* No-nonsense my_readline() for tokenizer.c */
-
-char *
-PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
-{
- size_t n = 1000;
- char *p = (char *)PyMem_MALLOC(n);
- char *q;
- if (p == NULL)
- return NULL;
- fprintf(stderr, "%s", prompt);
- q = fgets(p, n, sys_stdin);
- if (q == NULL) {
- *p = '\0';
- return p;
- }
- n = strlen(p);
- if (n > 0 && p[n-1] != '\n')
- p[n-1] = '\n';
- return (char *)PyMem_REALLOC(p, n+1);
-}
-
-/* No-nonsense fgets */
-char *
-Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj)
-{
- return fgets(buf, n, stream);
-}
-
-
-#include <stdarg.h>
-
-void
-PySys_WriteStderr(const char *format, ...)
-{
- va_list va;
-
- va_start(va, format);
- vfprintf(stderr, format, va);
- va_end(va);
-}
diff --git a/Parser/printgrammar.c b/Parser/printgrammar.c
deleted file mode 100644
index 1a8b0e176f..0000000000
--- a/Parser/printgrammar.c
+++ /dev/null
@@ -1,120 +0,0 @@
-
-/* Print a bunch of C initializers that represent a grammar */
-
-#define PGEN
-
-#include "pgenheaders.h"
-#include "grammar.h"
-
-/* Forward */
-static void printarcs(int, dfa *, FILE *);
-static void printstates(grammar *, FILE *);
-static void printdfas(grammar *, FILE *);
-static void printlabels(grammar *, FILE *);
-
-void
-printgrammar(grammar *g, FILE *fp)
-{
- fprintf(fp, "/* Generated by Parser/pgen */\n\n");
- fprintf(fp, "#include \"pgenheaders.h\"\n");
- fprintf(fp, "#include \"grammar.h\"\n");
- fprintf(fp, "grammar _PyParser_Grammar;\n");
- printdfas(g, fp);
- printlabels(g, fp);
- fprintf(fp, "grammar _PyParser_Grammar = {\n");
- fprintf(fp, " %d,\n", g->g_ndfas);
- fprintf(fp, " dfas,\n");
- fprintf(fp, " {%d, labels},\n", g->g_ll.ll_nlabels);
- fprintf(fp, " %d\n", g->g_start);
- fprintf(fp, "};\n");
-}
-
-void
-printnonterminals(grammar *g, FILE *fp)
-{
- dfa *d;
- int i;
-
- fprintf(fp, "/* Generated by Parser/pgen */\n\n");
-
- d = g->g_dfa;
- for (i = g->g_ndfas; --i >= 0; d++)
- fprintf(fp, "#define %s %d\n", d->d_name, d->d_type);
-}
-
-static void
-printarcs(int i, dfa *d, FILE *fp)
-{
- arc *a;
- state *s;
- int j, k;
-
- s = d->d_state;
- for (j = 0; j < d->d_nstates; j++, s++) {
- fprintf(fp, "static arc arcs_%d_%d[%d] = {\n",
- i, j, s->s_narcs);
- a = s->s_arc;
- for (k = 0; k < s->s_narcs; k++, a++)
- fprintf(fp, " {%d, %d},\n", a->a_lbl, a->a_arrow);
- fprintf(fp, "};\n");
- }
-}
-
-static void
-printstates(grammar *g, FILE *fp)
-{
- state *s;
- dfa *d;
- int i, j;
-
- d = g->g_dfa;
- for (i = 0; i < g->g_ndfas; i++, d++) {
- printarcs(i, d, fp);
- fprintf(fp, "static state states_%d[%d] = {\n",
- i, d->d_nstates);
- s = d->d_state;
- for (j = 0; j < d->d_nstates; j++, s++)
- fprintf(fp, " {%d, arcs_%d_%d},\n",
- s->s_narcs, i, j);
- fprintf(fp, "};\n");
- }
-}
-
-static void
-printdfas(grammar *g, FILE *fp)
-{
- dfa *d;
- int i, j, n;
-
- printstates(g, fp);
- fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas);
- d = g->g_dfa;
- for (i = 0; i < g->g_ndfas; i++, d++) {
- fprintf(fp, " {%d, \"%s\", %d, %d, states_%d,\n",
- d->d_type, d->d_name, d->d_initial, d->d_nstates, i);
- fprintf(fp, " \"");
- n = NBYTES(g->g_ll.ll_nlabels);
- for (j = 0; j < n; j++)
- fprintf(fp, "\\%03o", d->d_first[j] & 0xff);
- fprintf(fp, "\"},\n");
- }
- fprintf(fp, "};\n");
-}
-
-static void
-printlabels(grammar *g, FILE *fp)
-{
- label *l;
- int i;
-
- fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels);
- l = g->g_ll.ll_label;
- for (i = g->g_ll.ll_nlabels; --i >= 0; l++) {
- if (l->lb_str == NULL)
- fprintf(fp, " {%d, 0},\n", l->lb_type);
- else
- fprintf(fp, " {%d, \"%s\"},\n",
- l->lb_type, l->lb_str);
- }
- fprintf(fp, "};\n");
-}
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 1ded9ade37..44ec41512c 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -10,13 +10,11 @@
#include "tokenizer.h"
#include "errcode.h"
-#ifndef PGEN
#include "unicodeobject.h"
#include "bytesobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
-#endif /* PGEN */
/* Alternate tab spacing */
#define ALTTABSIZE 1
@@ -81,11 +79,9 @@ tok_new(void)
tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
-#ifndef PGEN
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
-#endif
tok->type_comments = 0;
return tok;
@@ -104,28 +100,6 @@ new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
return result;
}
-#ifdef PGEN
-
-static char *
-decoding_fgets(char *s, int size, struct tok_state *tok)
-{
- return fgets(s, size, tok->fp);
-}
-
-static int
-decoding_feof(struct tok_state *tok)
-{
- return feof(tok->fp);
-}
-
-static char *
-decode_str(const char *str, int exec_input, struct tok_state *tok)
-{
- return new_string(str, strlen(str), tok);
-}
-
-#else /* PGEN */
-
static char *
error_ret(struct tok_state *tok) /* XXX */
{
@@ -551,7 +525,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
return error_ret(tok);
}
}
-#ifndef PGEN
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (line && !tok->encoding) {
@@ -574,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
badchar, tok->filename, tok->lineno + 1);
return error_ret(tok);
}
-#endif
return line;
}
@@ -738,8 +710,6 @@ decode_str(const char *input, int single, struct tok_state *tok)
return str;
}
-#endif /* PGEN */
-
/* Set up tokenizer for string */
struct tok_state *
@@ -765,9 +735,7 @@ PyTokenizer_FromUTF8(const char *str, int exec_input)
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
-#ifndef PGEN
tok->input = str = translate_newlines(str, exec_input, tok);
-#endif
if (str == NULL) {
PyTokenizer_Free(tok);
return NULL;
@@ -828,11 +796,9 @@ PyTokenizer_Free(struct tok_state *tok)
{
if (tok->encoding != NULL)
PyMem_FREE(tok->encoding);
-#ifndef PGEN
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
Py_XDECREF(tok->filename);
-#endif
if (tok->fp != NULL && tok->buf != NULL)
PyMem_FREE(tok->buf);
if (tok->input)
@@ -871,7 +837,6 @@ tok_nextc(struct tok_state *tok)
}
if (tok->prompt != NULL) {
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
-#ifndef PGEN
if (newtok != NULL) {
char *translated = translate_newlines(newtok, 0, tok);
PyMem_FREE(newtok);
@@ -900,7 +865,6 @@ tok_nextc(struct tok_state *tok)
strcpy(newtok, buf);
Py_DECREF(u);
}
-#endif
if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt;
if (newtok == NULL)
@@ -1056,7 +1020,6 @@ tok_backup(struct tok_state *tok, int c)
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
-#ifndef PGEN
va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
@@ -1069,9 +1032,6 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
tok->lineno,
(int)(tok->cur - tok->line_start));
tok->done = E_ERROR;
-#else
- tok->done = E_TOKEN;
-#endif
return ERRORTOKEN;
}
@@ -1083,9 +1043,6 @@ indenterror(struct tok_state *tok)
return ERRORTOKEN;
}
-#ifdef PGEN
-#define verify_identifier(tok) 1
-#else
/* Verify that the identifier follows PEP 3131.
All identifier strings are guaranteed to be "ready" unicode objects.
*/
@@ -1112,7 +1069,6 @@ verify_identifier(struct tok_state *tok)
tok->done = E_IDENTIFIER;
return result;
}
-#endif
static int
tok_decimal_tail(struct tok_state *tok)
@@ -1667,25 +1623,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
case '(':
case '[':
case '{':
-#ifndef PGEN
if (tok->level >= MAXLEVEL) {
return syntaxerror(tok, "too many nested parentheses");
}
tok->parenstack[tok->level] = c;
tok->parenlinenostack[tok->level] = tok->lineno;
-#endif
tok->level++;
break;
case ')':
case ']':
case '}':
-#ifndef PGEN
if (!tok->level) {
return syntaxerror(tok, "unmatched '%c'", c);
}
-#endif
tok->level--;
-#ifndef PGEN
int opening = tok->parenstack[tok->level];
if (!((opening == '(' && c == ')') ||
(opening == '[' && c == ']') ||
@@ -1704,7 +1655,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c, opening);
}
}
-#endif
break;
}
@@ -1742,11 +1692,7 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
FILE *fp;
char *p_start =NULL , *p_end =NULL , *encoding = NULL;
-#ifndef PGEN
fd = _Py_dup(fd);
-#else
- fd = dup(fd);
-#endif
if (fd < 0) {
return NULL;
}
@@ -1760,7 +1706,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
fclose(fp);
return NULL;
}
-#ifndef PGEN
if (filename != NULL) {
Py_INCREF(filename);
tok->filename = filename;
@@ -1773,7 +1718,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
return encoding;
}
}
-#endif
while (tok->lineno < 2 && tok->done == E_OK) {
PyTokenizer_Get(tok, &p_start, &p_end);
}
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 9639c658b1..22e91f7dca 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -42,15 +42,9 @@ struct tok_state {
expression (cf. issue 16806) */
int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */
-#ifndef PGEN
char parenstack[MAXLEVEL];
int parenlinenostack[MAXLEVEL];
- /* pgen doesn't have access to Python codecs, it cannot decode the input
- filename. The bytes filename might be kept, but it is only used by
- indenterror() and it is not really needed: pgen only compiles one file
- (Grammar/Grammar). */
PyObject *filename;
-#endif
/* Stuff for checking on different tab sizes */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
@@ -63,10 +57,8 @@ struct tok_state {
const char* multi_line_start; /* pointer to start of first line of
a single line or multi line string
expression (cf. issue 16806) */
-#ifndef PGEN
PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer;
-#endif
const char* enc; /* Encoding for the current str. */
const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */
diff --git a/Parser/tokenizer_pgen.c b/Parser/tokenizer_pgen.c
deleted file mode 100644
index 9cb8492d6a..0000000000
--- a/Parser/tokenizer_pgen.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define PGEN
-#include "tokenizer.c"