summaryrefslogtreecommitdiff
path: root/src/compiler.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/compiler.cc')
-rw-r--r--src/compiler.cc1170
1 files changed, 1170 insertions, 0 deletions
diff --git a/src/compiler.cc b/src/compiler.cc
new file mode 100644
index 0000000..d60519e
--- /dev/null
+++ b/src/compiler.cc
@@ -0,0 +1,1170 @@
+/*
+ * Copyright 2006-2012 Adrian Thurston <thurston@complang.org>
+ */
+
+/* This file is part of Colm.
+ *
+ * Colm is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Colm is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Colm; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <iostream>
+#include <iomanip>
+#include <errno.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <sstream>
+
+#include "global.h"
+#include "avltree.h"
+#include "parsedata.h"
+#include "parser.h"
+#include "parsedata.h"
+#include "parsetree.h"
+#include "mergesort.h"
+#include "redbuild.h"
+#include "pdacodegen.h"
+#include "fsmcodegen.h"
+#include "pdarun.h"
+#include "colm.h"
+#include "pool.h"
+
+using std::ostringstream;
+using std::cout;
+using std::cerr;
+using std::endl;
+
+char machineMain[] = "main";
+exit_object endp;
+void operator<<( ostream &out, exit_object & )
+{
+ out << endl;
+ exit(1);
+}
+
+/* Perform minimization after an operation according
+ * to the command line args. */
+void afterOpMinimize( FsmGraph *fsm, bool lastInSeq )
+{
+ /* Switch on the prefered minimization algorithm. */
+ if ( lastInSeq ) {
+ /* First clean up the graph. FsmGraph operations may leave these
+ * lying around. There should be no dead end states. The subtract
+ * intersection operators are the only places where they may be
+ * created and those operators clean them up. */
+ fsm->removeUnreachableStates();
+ fsm->minimizePartition2();
+ }
+}
+
+/* Count the transitions in the fsm by walking the state list. */
+int countTransitions( FsmGraph *fsm )
+{
+ int numTrans = 0;
+ FsmState *state = fsm->stateList.head;
+ while ( state != 0 ) {
+ numTrans += state->outList.length();
+ state = state->next;
+ }
+ return numTrans;
+}
+
+Key makeFsmKeyHex( char *str, const InputLoc &loc, Compiler *pd )
+{
+ /* Reset errno so we can check for overflow or underflow. In the event of
+ * an error, sets the return val to the upper or lower bound being tested
+ * against. */
+ errno = 0;
+ unsigned int size = keyOps->alphType->size;
+ bool unusedBits = size < sizeof(unsigned long);
+
+ unsigned long ul = strtoul( str, 0, 16 );
+
+ if ( errno == ERANGE || (unusedBits && ul >> (size * 8)) ) {
+ error(loc) << "literal " << str << " overflows the alphabet type" << endl;
+ ul = 1 << (size * 8);
+ }
+
+ if ( unusedBits && ul >> (size * 8 - 1) )
+ ul |= (ULONG_MAX >> (size*8 ) ) << (size*8);
+
+ return Key( (long)ul );
+}
+
+Key makeFsmKeyDec( char *str, const InputLoc &loc, Compiler *pd )
+{
+ /* Convert the number to a decimal. First reset errno so we can check
+ * for overflow or underflow. */
+ errno = 0;
+ long long minVal = keyOps->alphType->minVal;
+ long long maxVal = keyOps->alphType->maxVal;
+
+ long long ll = strtoll( str, 0, 10 );
+
+ /* Check for underflow. */
+ if ( (errno == ERANGE && ll < 0) || ll < minVal) {
+ error(loc) << "literal " << str << " underflows the alphabet type" << endl;
+ ll = minVal;
+ }
+ /* Check for overflow. */
+ else if ( (errno == ERANGE && ll > 0) || ll > maxVal ) {
+ error(loc) << "literal " << str << " overflows the alphabet type" << endl;
+ ll = maxVal;
+ }
+
+ return Key( (long)ll );
+}
+
+/* Make an fsm key in int format (what the fsm graph uses) from an alphabet
+ * number returned by the parser. Validates that the number doesn't overflow
+ * the alphabet type. */
+Key makeFsmKeyNum( char *str, const InputLoc &loc, Compiler *pd )
+{
+ /* Switch on hex/decimal format. */
+ if ( str[0] == '0' && str[1] == 'x' )
+ return makeFsmKeyHex( str, loc, pd );
+ else
+ return makeFsmKeyDec( str, loc, pd );
+}
+
+/* Make an fsm int format (what the fsm graph uses) from a single character.
+ * Performs proper conversion depending on signed/unsigned property of the
+ * alphabet. */
+Key makeFsmKeyChar( char c, Compiler *pd )
+{
+ /* Copy from a char type. */
+ return Key( c );
+}
+
+/* Make an fsm key array in int format (what the fsm graph uses) from a string
+ * of characters. Performs proper conversion depending on signed/unsigned
+ * property of the alphabet. */
+void makeFsmKeyArray( Key *result, char *data, int len, Compiler *pd )
+{
+ /* Copy from a char star type. */
+ char *src = data;
+ for ( int i = 0; i < len; i++ )
+ result[i] = Key(src[i]);
+}
+
+/* Like makeFsmKeyArray except the result has only unique keys. They ordering
+ * will be changed. */
+void makeFsmUniqueKeyArray( KeySet &result, char *data, int len,
+ bool caseInsensitive, Compiler *pd )
+{
+ /* Copy from a char star type. */
+ char *src = data;
+ for ( int si = 0; si < len; si++ ) {
+ Key key( src[si] );
+ result.insert( key );
+ if ( caseInsensitive ) {
+ if ( key.isLower() )
+ result.insert( key.toUpper() );
+ else if ( key.isUpper() )
+ result.insert( key.toLower() );
+ }
+ }
+}
+
+FsmGraph *dotFsm( Compiler *pd )
+{
+ FsmGraph *retFsm = new FsmGraph();
+ retFsm->rangeFsm( keyOps->minKey, keyOps->maxKey );
+ return retFsm;
+}
+
+FsmGraph *dotStarFsm( Compiler *pd )
+{
+ FsmGraph *retFsm = new FsmGraph();
+ retFsm->rangeStarFsm( keyOps->minKey, keyOps->maxKey );
+ return retFsm;
+}
+
+/* Make a builtin type. Depends on the signed nature of the alphabet type. */
+FsmGraph *makeBuiltin( BuiltinMachine builtin, Compiler *pd )
+{
+ /* FsmGraph created to return. */
+ FsmGraph *retFsm = 0;
+
+ switch ( builtin ) {
+ case BT_Any: {
+ /* All characters. */
+ retFsm = dotFsm( pd );
+ break;
+ }
+ case BT_Ascii: {
+ /* Ascii characters 0 to 127. */
+ retFsm = new FsmGraph();
+ retFsm->rangeFsm( 0, 127 );
+ break;
+ }
+ case BT_Extend: {
+ /* Ascii extended characters. This is the full byte range. Dependent
+ * on signed, vs no signed. If the alphabet is one byte then just use
+ * dot fsm. */
+ retFsm = new FsmGraph();
+ retFsm->rangeFsm( -128, 127 );
+ break;
+ }
+ case BT_Alpha: {
+ /* Alpha [A-Za-z]. */
+ FsmGraph *upper = new FsmGraph(), *lower = new FsmGraph();
+ upper->rangeFsm( 'A', 'Z' );
+ lower->rangeFsm( 'a', 'z' );
+ upper->unionOp( lower );
+ upper->minimizePartition2();
+ retFsm = upper;
+ break;
+ }
+ case BT_Digit: {
+ /* Digits [0-9]. */
+ retFsm = new FsmGraph();
+ retFsm->rangeFsm( '0', '9' );
+ break;
+ }
+ case BT_Alnum: {
+ /* Alpha numerics [0-9A-Za-z]. */
+ FsmGraph *digit = new FsmGraph(), *lower = new FsmGraph();
+ FsmGraph *upper = new FsmGraph();
+ digit->rangeFsm( '0', '9' );
+ upper->rangeFsm( 'A', 'Z' );
+ lower->rangeFsm( 'a', 'z' );
+ digit->unionOp( upper );
+ digit->unionOp( lower );
+ digit->minimizePartition2();
+ retFsm = digit;
+ break;
+ }
+ case BT_Lower: {
+ /* Lower case characters. */
+ retFsm = new FsmGraph();
+ retFsm->rangeFsm( 'a', 'z' );
+ break;
+ }
+ case BT_Upper: {
+ /* Upper case characters. */
+ retFsm = new FsmGraph();
+ retFsm->rangeFsm( 'A', 'Z' );
+ break;
+ }
+ case BT_Cntrl: {
+ /* Control characters. */
+ FsmGraph *cntrl = new FsmGraph();
+ FsmGraph *highChar = new FsmGraph();
+ cntrl->rangeFsm( 0, 31 );
+ highChar->concatFsm( 127 );
+ cntrl->unionOp( highChar );
+ cntrl->minimizePartition2();
+ retFsm = cntrl;
+ break;
+ }
+ case BT_Graph: {
+ /* Graphical ascii characters [!-~]. */
+ retFsm = new FsmGraph();
+ retFsm->rangeFsm( '!', '~' );
+ break;
+ }
+ case BT_Print: {
+ /* Printable characters. Same as graph except includes space. */
+ retFsm = new FsmGraph();
+ retFsm->rangeFsm( ' ', '~' );
+ break;
+ }
+ case BT_Punct: {
+ /* Punctuation. */
+ FsmGraph *range1 = new FsmGraph();
+ FsmGraph *range2 = new FsmGraph();
+ FsmGraph *range3 = new FsmGraph();
+ FsmGraph *range4 = new FsmGraph();
+ range1->rangeFsm( '!', '/' );
+ range2->rangeFsm( ':', '@' );
+ range3->rangeFsm( '[', '`' );
+ range4->rangeFsm( '{', '~' );
+ range1->unionOp( range2 );
+ range1->unionOp( range3 );
+ range1->unionOp( range4 );
+ range1->minimizePartition2();
+ retFsm = range1;
+ break;
+ }
+ case BT_Space: {
+ /* Whitespace: [\t\v\f\n\r ]. */
+ FsmGraph *cntrl = new FsmGraph();
+ FsmGraph *space = new FsmGraph();
+ cntrl->rangeFsm( '\t', '\r' );
+ space->concatFsm( ' ' );
+ cntrl->unionOp( space );
+ cntrl->minimizePartition2();
+ retFsm = cntrl;
+ break;
+ }
+ case BT_Xdigit: {
+ /* Hex digits [0-9A-Fa-f]. */
+ FsmGraph *digit = new FsmGraph();
+ FsmGraph *upper = new FsmGraph();
+ FsmGraph *lower = new FsmGraph();
+ digit->rangeFsm( '0', '9' );
+ upper->rangeFsm( 'A', 'F' );
+ lower->rangeFsm( 'a', 'f' );
+ digit->unionOp( upper );
+ digit->unionOp( lower );
+ digit->minimizePartition2();
+ retFsm = digit;
+ break;
+ }
+ case BT_Lambda: {
+ retFsm = new FsmGraph();
+ retFsm->lambdaFsm();
+ break;
+ }
+ case BT_Empty: {
+ retFsm = new FsmGraph();
+ retFsm->emptyFsm();
+ break;
+ }}
+
+ return retFsm;
+}
+
+/*
+ * Compiler
+ */
+
+/* Initialize the structure that will collect info during the parse of a
+ * machine. */
+Compiler::Compiler( )
+:
+ nextPriorKey(0),
+ nextNameId(0),
+ alphTypeSet(false),
+ getKeyExpr(0),
+ accessExpr(0),
+ curStateExpr(0),
+ lowerNum(0),
+ upperNum(0),
+ errorCount(0),
+ curActionOrd(0),
+ curPriorOrd(0),
+ nextEpsilonResolvedLink(0),
+ nextTokenId(1),
+ rootCodeBlock(0),
+ mainReturnUT(0),
+ //access(0),
+ //tokenStruct(0),
+
+ ptrLangEl(0),
+ boolLangEl(0),
+ intLangEl(0),
+ strLangEl(0),
+ streamLangEl(0),
+ anyLangEl(0),
+ rootLangEl(0),
+ noTokenLangEl(0),
+ eofLangEl(0),
+ errorLangEl(0),
+ ignoreLangEl(0),
+
+ firstNonTermId(0),
+ prodIdIndex(0),
+
+ uniqueTypeNil(0),
+ uniqueTypePtr(0),
+ uniqueTypeBool(0),
+ uniqueTypeInt(0),
+ uniqueTypeStr(0),
+ uniqueTypeStream(0),
+ uniqueTypeIgnore(0),
+ uniqueTypeAny(0),
+ nextPatConsId(0),
+ nextGenericId(1),
+ nextFuncId(0),
+ loopCleanup(0),
+ nextObjectId(1), /* 0 is reserved for no object. */
+ nextFrameId(0),
+ nextParserId(0),
+ revertOn(true),
+ predValue(0),
+ nextMatchEndNum(0),
+ argvTypeRef(0),
+ inContiguous(false),
+ contiguousOffset(0),
+ contiguousStretch(0)
+{
+}
+
+/* Clean up the data collected during a parse. */
+Compiler::~Compiler()
+{
+ /* Delete all the nodes in the action list. Will cause all the
+ * string data that represents the actions to be deallocated. */
+ actionList.empty();
+}
+
+ostream &operator<<( ostream &out, const Token &token )
+{
+ out << token.data;
+ return out;
+}
+
+/* Write out a name reference. */
+ostream &operator<<( ostream &out, const NameRef &nameRef )
+{
+ int pos = 0;
+ if ( nameRef[pos] == 0 ) {
+ out << "::";
+ pos += 1;
+ }
+ out << nameRef[pos++];
+ for ( ; pos < nameRef.length(); pos++ )
+ out << "::" << nameRef[pos];
+ return out;
+}
+
+NameInst **Compiler::makeNameIndex()
+{
+ /* The number of nodes in the tree can now be given by nextNameId. Put a
+ * null pointer on the end of the list to terminate it. */
+ NameInst **nameIndex = new NameInst*[nextNameId+1];
+ memset( nameIndex, 0, sizeof(NameInst*)*(nextNameId+1) );
+
+ for ( NameInstList::Iter ni = nameInstList; ni.lte(); ni++ )
+ nameIndex[ni->id] = ni;
+
+ return nameIndex;
+}
+
+void Compiler::createBuiltin( const char *name, BuiltinMachine builtin )
+{
+ LexExpression *expression = LexExpression::cons( builtin );
+ LexJoin *join = LexJoin::cons( expression );
+ LexDefinition *varDef = new LexDefinition( name, join );
+ GraphDictEl *graphDictEl = new GraphDictEl( name, varDef );
+ rootNamespace->rlMap.insert( graphDictEl );
+}
+
+/* Initialize the graph dict with builtin types. */
+void Compiler::initGraphDict( )
+{
+ createBuiltin( "any", BT_Any );
+ createBuiltin( "ascii", BT_Ascii );
+ createBuiltin( "extend", BT_Extend );
+ createBuiltin( "alpha", BT_Alpha );
+ createBuiltin( "digit", BT_Digit );
+ createBuiltin( "alnum", BT_Alnum );
+ createBuiltin( "lower", BT_Lower );
+ createBuiltin( "upper", BT_Upper );
+ createBuiltin( "cntrl", BT_Cntrl );
+ createBuiltin( "graph", BT_Graph );
+ createBuiltin( "print", BT_Print );
+ createBuiltin( "punct", BT_Punct );
+ createBuiltin( "space", BT_Space );
+ createBuiltin( "xdigit", BT_Xdigit );
+ createBuiltin( "null", BT_Lambda );
+ createBuiltin( "zlen", BT_Lambda );
+ createBuiltin( "empty", BT_Empty );
+}
+
+/* Initialize the key operators object that will be referenced by all fsms
+ * created. */
+void Compiler::initKeyOps( )
+{
+ /* Signedness and bounds. */
+ HostType *alphType = alphTypeSet ? userAlphType : hostLang->defaultAlphType;
+ thisKeyOps.setAlphType( alphType );
+
+ if ( lowerNum != 0 ) {
+ /* If ranges are given then interpret the alphabet type. */
+ thisKeyOps.minKey = makeFsmKeyNum( lowerNum, rangeLowLoc, this );
+ thisKeyOps.maxKey = makeFsmKeyNum( upperNum, rangeHighLoc, this );
+ }
+}
+
+/* Remove duplicates of unique actions from an action table. */
+void Compiler::removeDups( ActionTable &table )
+{
+ /* Scan through the table looking for unique actions to
+ * remove duplicates of. */
+ for ( int i = 0; i < table.length(); i++ ) {
+ /* Remove any duplicates ahead of i. */
+ for ( int r = i+1; r < table.length(); ) {
+ if ( table[r].value == table[i].value )
+ table.vremove(r);
+ else
+ r += 1;
+ }
+ }
+}
+
+/* Remove duplicates from action lists. This operates only on transition and
+ * eof action lists and so should be called once all actions have been
+ * transfered to their final resting place. */
+void Compiler::removeActionDups( FsmGraph *graph )
+{
+ /* Loop all states. */
+ for ( StateList::Iter state = graph->stateList; state.lte(); state++ ) {
+ /* Loop all transitions. */
+ for ( TransList::Iter trans = state->outList; trans.lte(); trans++ )
+ removeDups( trans->actionTable );
+ removeDups( state->toStateActionTable );
+ removeDups( state->fromStateActionTable );
+ removeDups( state->eofActionTable );
+ }
+}
+
+Action *Compiler::newAction( const String &name, InlineList *inlineList )
+{
+ InputLoc loc;
+ loc.line = 1;
+ loc.col = 1;
+ loc.fileName = 0;
+
+ Action *action = Action::cons( loc, name, inlineList );
+ actionList.append( action );
+ return action;
+}
+
+void Compiler::initLongestMatchData()
+{
+ if ( regionSetList.length() > 0 ) {
+ /* The initActId action gives act a default value. */
+ InlineList *il4 = InlineList::cons();
+ il4->append( InlineItem::cons( InputLoc(), InlineItem::LmInitAct ) );
+ initActId = newAction( "initact", il4 );
+ initActId->isLmAction = true;
+
+ /* The setTokStart action sets tokstart. */
+ InlineList *il5 = InlineList::cons();
+ il5->append( InlineItem::cons( InputLoc(), InlineItem::LmSetTokStart ) );
+ setTokStart = newAction( "tokstart", il5 );
+ setTokStart->isLmAction = true;
+
+ /* The setTokEnd action sets tokend. */
+ InlineList *il3 = InlineList::cons();
+ il3->append( InlineItem::cons( InputLoc(), InlineItem::LmSetTokEnd ) );
+ setTokEnd = newAction( "tokend", il3 );
+ setTokEnd->isLmAction = true;
+
+ /* The action will also need an ordering: ahead of all user action
+ * embeddings. */
+ initActIdOrd = curActionOrd++;
+ setTokStartOrd = curActionOrd++;
+ setTokEndOrd = curActionOrd++;
+ }
+}
+
+void Compiler::finishGraphBuild( FsmGraph *graph )
+{
+ /* Resolve any labels that point to multiple states. Any labels that are
+ * still around are referenced only by gotos and calls and they need to be
+ * made into deterministic entry points. */
+ graph->deterministicEntry();
+
+ /*
+ * All state construction is now complete.
+ */
+
+ /* Transfer global error actions. */
+ for ( StateList::Iter state = graph->stateList; state.lte(); state++ )
+ graph->transferErrorActions( state, 0 );
+
+ removeActionDups( graph );
+
+ /* Remove unreachable states. There should be no dead end states. The
+ * subtract and intersection operators are the only places where they may
+ * be created and those operators clean them up. */
+ graph->removeUnreachableStates();
+
+ /* No more fsm operations are to be done. Action ordering numbers are
+ * no longer of use and will just hinder minimization. Clear them. */
+ graph->nullActionKeys();
+
+ /* Transition priorities are no longer of use. We can clear them
+ * because they will just hinder minimization as well. Clear them. */
+ graph->clearAllPriorities();
+
+ /* Minimize here even if we minimized at every op. Now that function
+ * keys have been cleared we may get a more minimal fsm. */
+ graph->minimizePartition2();
+ graph->compressTransitions();
+}
+
+/* Build the name tree and supporting data structures. */
+NameInst *Compiler::makeNameTree()
+{
+ /* Create the root name. */
+ nextNameId = 1;
+
+ /* First make the name tree. */
+ for ( RegionImplList::Iter rel = regionImplList; rel.lte(); rel++ ) {
+ /* Recurse on the instance. */
+ rel->makeNameTree( rel->loc, this );
+ }
+
+ return 0;
+}
+
+FsmGraph *Compiler::makeAllRegions()
+{
+ /* Build the name tree and supporting data structures. */
+ makeNameTree();
+ NameInst **nameIndex = makeNameIndex();
+
+ int numGraphs = 0;
+ FsmGraph **graphs = new FsmGraph*[regionImplList.length()];
+
+ /* Make all the instantiations, we know that main exists in this list. */
+ for ( RegionImplList::Iter rel = regionImplList; rel.lte(); rel++ ) {
+ /* Build the graph from a walk of the parse tree. */
+ FsmGraph *newGraph = rel->walk( this );
+
+ /* Wrap up the construction. */
+ finishGraphBuild( newGraph );
+
+ /* Save off the new graph. */
+ graphs[numGraphs++] = newGraph;
+ }
+
+ /* NOTE: If putting in minimization here we need to include eofTarget
+ * into the minimization algorithm. It is currently set by the longest
+ * match operator and not considered anywhere else. */
+
+ FsmGraph *all;
+ if ( numGraphs == 0 ) {
+ all = new FsmGraph;
+ all->lambdaFsm();
+ }
+ else {
+ /* Add all the other graphs into the first. */
+ all = graphs[0];
+ all->globOp( graphs+1, numGraphs-1 );
+ delete[] graphs;
+ }
+
+ /* Go through all the token regions and check for lmRequiresErrorState. */
+ for ( RegionImplList::Iter reg = regionImplList; reg.lte(); reg++ ) {
+ if ( reg->lmSwitchHandlesError )
+ all->lmRequiresErrorState = true;
+ }
+
+ all->nameIndex = nameIndex;
+
+ return all;
+}
+
+void Compiler::analyzeAction( Action *action, InlineList *inlineList )
+{
+ /* FIXME: Actions used as conditions should be very constrained. */
+ for ( InlineList::Iter item = *inlineList; item.lte(); item++ ) {
+ //if ( item->type == InlineItem::Call || item->type == InlineItem::CallExpr )
+ // action->anyCall = true;
+
+ /* Need to recurse into longest match items. */
+ if ( item->type == InlineItem::LmSwitch ) {
+ RegionImpl *lm = item->tokenRegion;
+ for ( TokenInstanceListReg::Iter lmi = lm->tokenInstanceList; lmi.lte(); lmi++ ) {
+ if ( lmi->action != 0 )
+ analyzeAction( action, lmi->action->inlineList );
+ }
+ }
+
+ if ( item->type == InlineItem::LmOnLast ||
+ item->type == InlineItem::LmOnNext ||
+ item->type == InlineItem::LmOnLagBehind )
+ {
+ TokenInstance *lmi = item->longestMatchPart;
+ if ( lmi->action != 0 )
+ analyzeAction( action, lmi->action->inlineList );
+ }
+
+ if ( item->children != 0 )
+ analyzeAction( action, item->children );
+ }
+}
+
+void Compiler::analyzeGraph( FsmGraph *graph )
+{
+ for ( ActionList::Iter act = actionList; act.lte(); act++ )
+ analyzeAction( act, act->inlineList );
+
+ for ( StateList::Iter st = graph->stateList; st.lte(); st++ ) {
+ /* The transition list. */
+ for ( TransList::Iter trans = st->outList; trans.lte(); trans++ ) {
+ for ( ActionTable::Iter at = trans->actionTable; at.lte(); at++ )
+ at->value->numTransRefs += 1;
+ }
+
+ for ( ActionTable::Iter at = st->toStateActionTable; at.lte(); at++ )
+ at->value->numToStateRefs += 1;
+
+ for ( ActionTable::Iter at = st->fromStateActionTable; at.lte(); at++ )
+ at->value->numFromStateRefs += 1;
+
+ for ( ActionTable::Iter at = st->eofActionTable; at.lte(); at++ )
+ at->value->numEofRefs += 1;
+ }
+}
+
+FsmGraph *Compiler::makeScanner()
+{
+ /* Make the graph, do minimization. */
+ FsmGraph *fsmGraph = makeAllRegions();
+
+ /* If any errors have occured in the input file then don't write anything. */
+ if ( gblErrorCount > 0 )
+ return 0;
+
+ analyzeGraph( fsmGraph );
+
+ /* Decide if an error state is necessary.
+ * 1. There is an error transition
+ * 2. There is a gap in the transitions
+ * 3. The longest match operator requires it. */
+ if ( fsmGraph->lmRequiresErrorState || fsmGraph->hasErrorTrans() )
+ fsmGraph->errState = fsmGraph->addState();
+
+ /* State numbers need to be assigned such that all final states have a
+ * larger state id number than all non-final states. This enables the
+ * first_final mechanism to function correctly. We also want states to be
+ * ordered in a predictable fashion. So we first apply a depth-first
+ * search, then do a stable sort by final state status, then assign
+ * numbers. */
+
+ fsmGraph->depthFirstOrdering();
+ fsmGraph->sortStatesByFinal();
+ fsmGraph->setStateNumbers( 0 );
+
+ return fsmGraph;
+}
+
+LangEl *Compiler::makeRepeatProd( const InputLoc &loc, Namespace *nspace,
+ const String &repeatName, UniqueType *ut )
+{
+ LangEl *prodName = addLangEl( this, nspace, repeatName, LangEl::NonTerm );
+ prodName->isRepeat = true;
+
+ ProdElList *prodElList1 = new ProdElList;
+
+ /* Build the first production of the repeat. */
+ TypeRef *typeRef1 = TypeRef::cons( loc, ut );
+ ProdEl *factor1 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef1, 0 );
+
+ UniqueType *prodNameUT = findUniqueType( TYPE_TREE, prodName );
+ TypeRef *typeRef2 = TypeRef::cons( loc, prodNameUT );
+ ProdEl *factor2 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef2, 0 );
+
+ prodElList1->append( factor1 );
+ prodElList1->append( factor2 );
+
+ Production *newDef1 = Production::cons( InputLoc(),
+ prodName, prodElList1, String(), false, 0,
+ prodList.length(), prodName->defList.length() );
+
+ prodName->defList.append( newDef1 );
+ prodList.append( newDef1 );
+
+ /* Build the second production of the repeat. */
+ ProdElList *prodElList2 = new ProdElList;
+
+ Production *newDef2 = Production::cons( InputLoc(),
+ prodName, prodElList2, String(), false, 0,
+ prodList.length(), prodName->defList.length() );
+
+ prodName->defList.append( newDef2 );
+ prodList.append( newDef2 );
+
+ return prodName;
+}
+
+LangEl *Compiler::makeListProd( const InputLoc &loc, Namespace *nspace,
+ const String &listName, UniqueType *ut )
+{
+ LangEl *prodName = addLangEl( this, nspace, listName, LangEl::NonTerm );
+ prodName->isList = true;
+
+ /* Build the first production of the list. */
+ TypeRef *typeRef1 = TypeRef::cons( loc, ut );
+ ProdEl *factor1 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef1, 0 );
+
+ UniqueType *prodNameUT = findUniqueType( TYPE_TREE, prodName );
+ TypeRef *typeRef2 = TypeRef::cons( loc, prodNameUT );
+ ProdEl *factor2 = new ProdEl( ProdEl::ReferenceType, loc, 0, false, typeRef2, 0 );
+
+ ProdElList *prodElList1 = new ProdElList;
+ prodElList1->append( factor1 );
+ prodElList1->append( factor2 );
+
+ Production *newDef1 = Production::cons( loc,
+ prodName, prodElList1, String(), false, 0,
+ prodList.length(), prodName->defList.length() );
+
+ prodName->defList.append( newDef1 );
+ prodList.append( newDef1 );
+
+ /* Build the second production of the list. */
+ TypeRef *typeRef3 = TypeRef::cons( loc, ut );
+ ProdEl *factor3 = new ProdEl( ProdEl::ReferenceType, loc, 0, false, typeRef3, 0 );
+
+ ProdElList *prodElList2 = new ProdElList;
+ prodElList2->append( factor3 );
+
+ Production *newDef2 = Production::cons( loc,
+ prodName, prodElList2, String(), false, 0,
+ prodList.length(), prodName->defList.length() );
+
+ prodName->defList.append( newDef2 );
+ prodList.append( newDef2 );
+
+ return prodName;
+}
+
+LangEl *Compiler::makeOptProd( const InputLoc &loc, Namespace *nspace,
+ const String &optName, UniqueType *ut )
+{
+ LangEl *prodName = addLangEl( this, nspace, optName, LangEl::NonTerm );
+ prodName->isOpt = true;
+
+ ProdElList *prodElList1 = new ProdElList;
+
+ /* Build the first production of the repeat. */
+ TypeRef *typeRef1 = TypeRef::cons( loc, ut );
+ ProdEl *factor1 = new ProdEl( ProdEl::ReferenceType, loc, 0, false, typeRef1, 0 );
+ prodElList1->append( factor1 );
+
+ Production *newDef1 = Production::cons( loc,
+ prodName, prodElList1, String(), false, 0,
+ prodList.length(), prodName->defList.length() );
+
+ prodName->defList.append( newDef1 );
+ prodList.append( newDef1 );
+
+ /* Build the second production of the repeat. */
+ ProdElList *prodElList2 = new ProdElList;
+
+ Production *newDef2 = Production::cons( loc,
+ prodName, prodElList2, String(), false, 0,
+ prodList.length(), prodName->defList.length() );
+
+ prodName->defList.append( newDef2 );
+ prodList.append( newDef2 );
+
+ return prodName;
+}
+
+Namespace *Namespace::findNamespace( const String &name )
+{
+ for ( NamespaceVect::Iter c = childNamespaces; c.lte(); c++ ) {
+ if ( strcmp( name, (*c)->name ) == 0 )
+ return *c;
+ }
+ return 0;
+}
+
+/* Search from a previously resolved qualification. (name 1+ in a qual list). */
+Namespace *NamespaceQual::searchFrom( Namespace *from, StringVect::Iter &qualPart )
+{
+ /* While there are still parts in the qualification. */
+ while ( qualPart.lte() ) {
+ Namespace *child = from->findNamespace( *qualPart );
+ if ( child == 0 )
+ return 0;
+
+ from = child;
+ qualPart.increment();
+ }
+
+ return from;
+}
+
+Namespace *NamespaceQual::getQual( Compiler *pd )
+{
+ /* Do the search only once. */
+ if ( cachedNspaceQual != 0 )
+ return cachedNspaceQual;
+
+ if ( qualNames.length() == 0 ) {
+ /* No qualification, use the region the qualification was
+ * declared in. */
+ cachedNspaceQual = declInNspace;
+ }
+ else if ( strcmp( qualNames[0], "root" ) == 0 ) {
+ /* First item is "root." Start the downward search from there. */
+ StringVect::Iter qualPart = qualNames;
+ qualPart.increment();
+ cachedNspaceQual = searchFrom( pd->rootNamespace, qualPart );
+ return cachedNspaceQual;
+ }
+ else {
+ /* Have a qualification. Move upwards through the declared
+ * regions looking for the first part. */
+ StringVect::Iter qualPart = qualNames;
+ Namespace *parentNamespace = declInNspace;
+ while ( parentNamespace != 0 ) {
+ /* Search for the first part underneath the current parent. */
+ Namespace *child = parentNamespace->findNamespace( *qualPart );
+
+ if ( child != 0 ) {
+ /* Found the first part. Start going below the result. */
+ qualPart.increment();
+ cachedNspaceQual = searchFrom( child, qualPart );
+ return cachedNspaceQual;
+ }
+
+ /* Not found, move up to the parent. */
+ parentNamespace = parentNamespace->parentNamespace;
+ }
+
+ /* Failed to find the place to start from. */
+ cachedNspaceQual = 0;
+ }
+
+ return cachedNspaceQual;
+}
+
+void Compiler::initEmptyScanner( RegionSet *regionSet, TokenRegion *reg )
+{
+ if ( reg != 0 && reg->impl->tokenInstanceList.length() == 0 ) {
+ reg->impl->wasEmpty = true;
+
+ static int def = 1;
+ String name( 64, "__%p_DEF_PAT_%d", reg, def++ );
+
+ LexJoin *join = LexJoin::cons( LexExpression::cons( BT_Any ) );
+
+ TokenDef *tokenDef = TokenDef::cons( name, String(), false, false,
+ join, 0, internal, nextTokenId++, rootNamespace,
+ regionSet, 0, 0 );
+
+ TokenInstance *tokenInstance = TokenInstance::cons( tokenDef,
+ join, internal, nextTokenId++,
+ rootNamespace, reg );
+
+ reg->impl->tokenInstanceList.append( tokenInstance );
+
+ /* These do not go in the namespace so so they cannot get declared
+ * in the declare pass. */
+ LangEl *lel = addLangEl( this, rootNamespace, name, LangEl::Term );
+
+ tokenInstance->tokenDef->tdLangEl = lel;
+ lel->tokenDef = tokenDef;
+ }
+}
+
+void Compiler::initEmptyScanners()
+{
+ for ( RegionSetList::Iter regionSet = regionSetList; regionSet.lte(); regionSet++ ) {
+ initEmptyScanner( regionSet, regionSet->tokenIgnore );
+ initEmptyScanner( regionSet, regionSet->tokenOnly );
+ initEmptyScanner( regionSet, regionSet->ignoreOnly );
+ initEmptyScanner( regionSet, regionSet->collectIgnore );
+ }
+}
+
+PdaRun *Compiler::parsePattern( Program *prg, Tree **sp, const InputLoc &loc,
+ int parserId, StreamImpl *sourceStream )
+{
+ StreamImpl *in = newSourceStreamGeneric( "<internal>" );
+
+ PdaRun *pdaRun = new PdaRun;
+ initPdaRun( prg, pdaRun, pdaTables, parserId, 0, false, 0 );
+
+ Stream *res = streamAllocate( prg );
+ res->id = LEL_ID_STREAM;
+ res->in = sourceStream;
+ in->funcs->appendStream( in, (Tree*)res );
+ in->funcs->setEof( in );
+
+ long pcr = parseLoop( prg, sp, pdaRun, in, PcrStart );
+ assert( pcr == PcrDone );
+ if ( pdaRun->parseError ) {
+ cerr << ( loc.fileName != 0 ? loc.fileName : "<input>" ) <<
+ ":" << loc.line << ":" << loc.col;
+
+ if ( pdaRun->parseErrorText != 0 ) {
+ cerr << ": relative error: " <<
+ pdaRun->parseErrorText->tokdata->data;
+ }
+ else {
+ cerr << ": parse error";
+ }
+
+ cerr << endl;
+ gblErrorCount += 1;
+ }
+
+ return pdaRun;
+}
+
+
+void Compiler::parsePatterns()
+{
+ Program *prg = colm_new_program( runtimeData );
+
+ /* Turn off context-dependent parsing. */
+ prg->ctxDepParsing = 0;
+
+ Tree **sp = prg->stackRoot;
+
+ for ( ConsList::Iter cons = replList; cons.lte(); cons++ ) {
+ StreamImpl *in = newSourceStreamCons( "<internal>", cons );
+ cons->pdaRun = parsePattern( prg, sp, cons->loc, cons->langEl->parserId, in );
+ }
+
+ for ( PatList::Iter pat = patternList; pat.lte(); pat++ ) {
+ StreamImpl *in = newSourceStreamPat( "<internal>", pat );
+ pat->pdaRun = parsePattern( prg, sp, pat->loc, pat->langEl->parserId, in );
+ }
+
+ /* Bail on above errors. */
+ if ( gblErrorCount > 0 )
+ exit(1);
+
+ fillInPatterns( prg );
+}
+
+void Compiler::collectParserEls( BstSet<LangEl*> &parserEls )
+{
+ for ( PatList::Iter pat = patternList; pat.lte(); pat++ ) {
+ /* We assume the reduction action compilation phase was run before
+ * pattern parsing and it decorated the pattern with the target type. */
+ assert( pat->langEl != 0 );
+ if ( pat->langEl->type != LangEl::NonTerm )
+ error(pat->loc) << "pattern type is not a non-terminal" << endp;
+
+ if ( pat->langEl->parserId < 0 ) {
+ /* Make a parser for the language element. */
+ parserEls.insert( pat->langEl );
+ pat->langEl->parserId = nextParserId++;
+ }
+ }
+
+ for ( ConsList::Iter repl = replList; repl.lte(); repl++ ) {
+ /* We assume the reduction action compilation phase was run before
+ * replacement parsing decorated the replacement with the target type. */
+ assert( repl->langEl != 0 );
+
+ if ( repl->langEl->parserId < 0 ) {
+ /* Make a parser for the language element. */
+ parserEls.insert( repl->langEl );
+ repl->langEl->parserId = nextParserId++;
+ }
+ }
+
+ /* Make parsers that we need. */
+ for ( LelList::Iter lel = langEls; lel.lte(); lel++ ) {
+ if ( lel->parserId >= 0 )
+ parserEls.insert( lel );
+ }
+}
+
+void Compiler::generateOutput( long activeRealm )
+{
+ FsmCodeGen *fsmGen = new FsmCodeGen( *outStream, redFsm, fsmTables );
+
+ PdaCodeGen *pdaGen = new PdaCodeGen( *outStream );
+
+ fsmGen->writeIncludes();
+ pdaGen->defineRuntime();
+ fsmGen->writeCode();
+
+ /* Make parsers that we need. */
+ pdaGen->writeParserData( 0, pdaTables );
+
+ /* Write the runtime data. */
+ pdaGen->writeRuntimeData( runtimeData, pdaTables );
+
+ if ( !gblLibrary )
+ fsmGen->writeMain( activeRealm );
+
+ outStream->flush();
+}
+
+
+void Compiler::prepGrammar()
+{
+ /* This will create language elements. */
+ wrapNonTerminals();
+
+ makeLangElIds();
+ makeLangElNames();
+ makeDefinitionNames();
+ noUndefindLangEls();
+
+ /* Put the language elements in an index by language element id. */
+ langElIndex = new LangEl*[nextSymbolId+1];
+ memset( langElIndex, 0, sizeof(LangEl*)*(nextSymbolId+1) );
+ for ( LelList::Iter lel = langEls; lel.lte(); lel++ )
+ langElIndex[lel->id] = lel;
+
+ makeProdFsms();
+
+ /* Allocate the Runtime data now. Every PdaTable that we make
+ * will reference it, but it will be filled in after all the tables are
+ * built. */
+ runtimeData = new RuntimeData;
+}
+
+void Compiler::compile()
+{
+ beginProcessing();
+ initKeyOps();
+
+ declarePass();
+
+ resolvePass();
+
+ makeTerminalWrappers();
+ makeEofElements();
+
+ /*
+ * Parsers
+ */
+
+ /* Init the longest match data */
+ initLongestMatchData();
+ FsmGraph *fsmGraph = makeScanner();
+
+ prepGrammar();
+
+ initAllLanguageObjects();
+ initAllFrameObjects();
+
+ for ( FunctionList::Iter f = functionList; f.lte(); f++ )
+ initUserFunctions( f, f->isUserIter );
+
+ /* Compile bytecode. */
+ compileByteCode();
+
+ /* Make the reduced fsm. */
+ RedFsmBuild reduce( this, fsmGraph );
+ redFsm = reduce.reduceMachine();
+
+ BstSet<LangEl*> parserEls;
+ collectParserEls( parserEls );
+
+ makeParser( parserEls );
+
+ /* Make the scanner tables. */
+ fsmTables = redFsm->makeFsmTables();
+
+ /* Now that all parsers are built, make the global runtimeData. */
+ makeRuntimeData();
+
+ /*
+ * All compilation is now complete.
+ */
+
+ /* Parse constructors and patterns. */
+ parsePatterns();
+}
+