diff options
Diffstat (limited to 'src/input.c')
-rw-r--r-- | src/input.c | 1156 |
1 files changed, 1156 insertions, 0 deletions
diff --git a/src/input.c b/src/input.c new file mode 100644 index 0000000..579fadd --- /dev/null +++ b/src/input.c @@ -0,0 +1,1156 @@ +/* GNU m4 -- A simple macro processor + + Copyright (C) 1989-1994, 2004-2013 Free Software Foundation, Inc. + + This file is part of GNU M4. + + GNU M4 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + GNU M4 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* Handling of different input sources, and lexical analysis. */ + +#include "m4.h" + +#include "memchr2.h" + +/* Unread input can be either files, that should be read (eg. included + files), strings, which should be rescanned (eg. macro expansion text), + or quoted macro definitions (as returned by the builtin "defn"). + Unread input are organised in a stack, implemented with an obstack. + Each input source is described by a "struct input_block". The obstack + is "current_input". The top of the input stack is "isp". + + The macro "m4wrap" places the text to be saved on another input + stack, on the obstack "wrapup_stack", whose top is "wsp". When EOF + is seen on normal input (eg, when "current_input" is empty), input is + switched over to "wrapup_stack", and the original "current_input" is + freed. A new stack is allocated for "wrapup_stack", which will + accept any text produced by calls to "m4wrap" from within the + wrapped text. This process of shuffling "wrapup_stack" to + "current_input" can continue indefinitely, even generating infinite + loops (e.g. "define(`f',`m4wrap(`f')')f"), without memory leaks. + + Pushing new input on the input stack is done by push_file (), + push_string (), push_wrapup () (for wrapup text), and push_macro () + (for macro definitions). Because macro expansion needs direct access + to the current input obstack (for optimisation), push_string () are + split in two functions, push_string_init (), which returns a pointer + to the current input stack, and push_string_finish (), which return a + pointer to the final text. The input_block *next is used to manage + the coordination between the different push routines. + + The current file and line number are stored in two global + variables, for use by the error handling functions in m4.c. Macro + expansion wants to report the line where a macro name was detected, + rather than where it finished collecting arguments. This also + applies to text resulting from macro expansions. So each input + block maintains its own notion of the current file and line, and + swapping between input blocks updates the global variables + accordingly. */ + +#ifdef ENABLE_CHANGEWORD +#include "regex.h" +#endif + +enum input_type +{ + INPUT_STRING, /* String resulting from macro expansion. */ + INPUT_FILE, /* File from command line or include. */ + INPUT_MACRO /* Builtin resulting from defn. */ +}; + +typedef enum input_type input_type; + +struct input_block +{ + struct input_block *prev; /* previous input_block on the input stack */ + input_type type; /* see enum values */ + const char *file; /* file where this input is from */ + int line; /* line where this input is from */ + union + { + struct + { + char *string; /* remaining string value */ + char *end; /* terminating NUL of string */ + } + u_s; /* INPUT_STRING */ + struct + { + FILE *fp; /* input file handle */ + bool_bitfield end : 1; /* true if peek has seen EOF */ + bool_bitfield close : 1; /* true if we should close file on pop */ + bool_bitfield advance : 1; /* track previous start_of_input_line */ + } + u_f; /* INPUT_FILE */ + builtin_func *func; /* pointer to macro's function */ + } + u; +}; + +typedef struct input_block input_block; + + +/* Current input file name. */ +const char *current_file; + +/* Current input line number. */ +int current_line; + +/* Obstack for storing individual tokens. */ +static struct obstack token_stack; + +/* Obstack for storing file names. */ +static struct obstack file_names; + +/* Wrapup input stack. */ +static struct obstack *wrapup_stack; + +/* Current stack, from input or wrapup. */ +static struct obstack *current_input; + +/* Bottom of token_stack, for obstack_free. */ +static void *token_bottom; + +/* Pointer to top of current_input. */ +static input_block *isp; + +/* Pointer to top of wrapup_stack. */ +static input_block *wsp; + +/* Aux. for handling split push_string (). */ +static input_block *next; + +/* Flag for next_char () to increment current_line. */ +static bool start_of_input_line; + +/* Flag for next_char () to recognize change in input block. */ +static bool input_change; + +#define CHAR_EOF 256 /* character return on EOF */ +#define CHAR_MACRO 257 /* character return for MACRO token */ + +/* Quote chars. */ +STRING rquote; +STRING lquote; + +/* Comment chars. */ +STRING bcomm; +STRING ecomm; + +#ifdef ENABLE_CHANGEWORD + +# define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*" + +static struct re_pattern_buffer word_regexp; +static int default_word_regexp; +static struct re_registers regs; + +#else /* ! ENABLE_CHANGEWORD */ +# define default_word_regexp 1 +#endif /* ! ENABLE_CHANGEWORD */ + +#ifdef DEBUG_INPUT +static const char *token_type_string (token_type); +#endif + + +/*-------------------------------------------------------------------. +| push_file () pushes an input file on the input stack, saving the | +| current file name and line number. If next is non-NULL, this push | +| invalidates a call to push_string_init (), whose storage is | +| consequently released. If CLOSE_WHEN_DONE, then close FP after | +| EOF is detected. | +`-------------------------------------------------------------------*/ + +void +push_file (FILE *fp, const char *title, bool close_when_done) +{ + input_block *i; + + if (next != NULL) + { + obstack_free (current_input, next); + next = NULL; + } + + if (debug_level & DEBUG_TRACE_INPUT) + DEBUG_MESSAGE1 ("input read from %s", title); + + i = (input_block *) obstack_alloc (current_input, + sizeof (struct input_block)); + i->type = INPUT_FILE; + i->file = (char *) obstack_copy0 (&file_names, title, strlen (title)); + i->line = 1; + input_change = true; + + i->u.u_f.fp = fp; + i->u.u_f.end = false; + i->u.u_f.close = close_when_done; + i->u.u_f.advance = start_of_input_line; + output_current_line = -1; + + i->prev = isp; + isp = i; +} + +/*---------------------------------------------------------------. +| push_macro () pushes a builtin macro's definition on the input | +| stack. If next is non-NULL, this push invalidates a call to | +| push_string_init (), whose storage is consequently released. | +`---------------------------------------------------------------*/ + +void +push_macro (builtin_func *func) +{ + input_block *i; + + if (next != NULL) + { + obstack_free (current_input, next); + next = NULL; + } + + i = (input_block *) obstack_alloc (current_input, + sizeof (struct input_block)); + i->type = INPUT_MACRO; + i->file = current_file; + i->line = current_line; + input_change = true; + + i->u.func = func; + i->prev = isp; + isp = i; +} + +/*------------------------------------------------------------------. +| First half of push_string (). The pointer next points to the new | +| input_block. | +`------------------------------------------------------------------*/ + +struct obstack * +push_string_init (void) +{ + if (next != NULL) + { + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: recursive push_string!")); + abort (); + } + + next = (input_block *) obstack_alloc (current_input, + sizeof (struct input_block)); + next->type = INPUT_STRING; + next->file = current_file; + next->line = current_line; + + return current_input; +} + +/*-------------------------------------------------------------------. +| Last half of push_string (). If next is now NULL, a call to | +| push_file () has invalidated the previous call to push_string_init | +| (), so we just give up. If the new object is void, we do not push | +| it. The function push_string_finish () returns a pointer to the | +| finished object. This pointer is only for temporary use, since | +| reading the next token might release the memory used for the | +| object. | +`-------------------------------------------------------------------*/ + +const char * +push_string_finish (void) +{ + const char *ret = NULL; + + if (next == NULL) + return NULL; + + if (obstack_object_size (current_input) > 0) + { + size_t len = obstack_object_size (current_input); + obstack_1grow (current_input, '\0'); + next->u.u_s.string = (char *) obstack_finish (current_input); + next->u.u_s.end = next->u.u_s.string + len; + next->prev = isp; + isp = next; + ret = isp->u.u_s.string; /* for immediate use only */ + input_change = true; + } + else + obstack_free (current_input, next); /* people might leave garbage on it. */ + next = NULL; + return ret; +} + +/*------------------------------------------------------------------. +| The function push_wrapup () pushes a string on the wrapup stack. | +| When the normal input stack gets empty, the wrapup stack will | +| become the input stack, and push_string () and push_file () will | +| operate on wrapup_stack. Push_wrapup should be done as | +| push_string (), but this will suffice, as long as arguments to | +| m4_m4wrap () are moderate in size. | +`------------------------------------------------------------------*/ + +void +push_wrapup (const char *s) +{ + size_t len = strlen (s); + input_block *i; + i = (input_block *) obstack_alloc (wrapup_stack, + sizeof (struct input_block)); + i->prev = wsp; + i->type = INPUT_STRING; + i->file = current_file; + i->line = current_line; + i->u.u_s.string = (char *) obstack_copy0 (wrapup_stack, s, len); + i->u.u_s.end = i->u.u_s.string + len; + wsp = i; +} + + +/*-------------------------------------------------------------------. +| The function pop_input () pops one level of input sources. If the | +| popped input_block is a file, current_file and current_line are | +| reset to the saved values before the memory for the input_block is | +| released. | +`-------------------------------------------------------------------*/ + +static void +pop_input (void) +{ + input_block *tmp = isp->prev; + + switch (isp->type) + { + case INPUT_STRING: + case INPUT_MACRO: + break; + + case INPUT_FILE: + if (debug_level & DEBUG_TRACE_INPUT) + { + if (tmp) + DEBUG_MESSAGE2 ("input reverted to %s, line %d", + tmp->file, tmp->line); + else + DEBUG_MESSAGE ("input exhausted"); + } + + if (ferror (isp->u.u_f.fp)) + { + M4ERROR ((warning_status, 0, "read error")); + if (isp->u.u_f.close) + fclose (isp->u.u_f.fp); + retcode = EXIT_FAILURE; + } + else if (isp->u.u_f.close && fclose (isp->u.u_f.fp) == EOF) + { + M4ERROR ((warning_status, errno, "error reading file")); + retcode = EXIT_FAILURE; + } + start_of_input_line = isp->u.u_f.advance; + output_current_line = -1; + break; + + default: + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: input stack botch in pop_input ()")); + abort (); + } + obstack_free (current_input, isp); + next = NULL; /* might be set in push_string_init () */ + + isp = tmp; + input_change = true; +} + +/*-------------------------------------------------------------------. +| To switch input over to the wrapup stack, main calls pop_wrapup | +| (). Since wrapup text can install new wrapup text, pop_wrapup () | +| returns false when there is no wrapup text on the stack, and true | +| otherwise. | +`-------------------------------------------------------------------*/ + +bool +pop_wrapup (void) +{ + next = NULL; + obstack_free (current_input, NULL); + free (current_input); + + if (wsp == NULL) + { + /* End of the program. Free all memory even though we are about + to exit, since it makes leak detection easier. */ + obstack_free (&token_stack, NULL); + obstack_free (&file_names, NULL); + obstack_free (wrapup_stack, NULL); + free (wrapup_stack); +#ifdef ENABLE_CHANGEWORD + regfree (&word_regexp); +#endif /* ENABLE_CHANGEWORD */ + return false; + } + + current_input = wrapup_stack; + wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack)); + obstack_init (wrapup_stack); + + isp = wsp; + wsp = NULL; + input_change = true; + + return true; +} + +/*-------------------------------------------------------------------. +| When a MACRO token is seen, next_token () uses init_macro_token () | +| to retrieve the value of the function pointer. | +`-------------------------------------------------------------------*/ + +static void +init_macro_token (token_data *td) +{ + if (isp->type != INPUT_MACRO) + { + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: bad call to init_macro_token ()")); + abort (); + } + + TOKEN_DATA_TYPE (td) = TOKEN_FUNC; + TOKEN_DATA_FUNC (td) = isp->u.func; +} + + +/*-----------------------------------------------------------------. +| Low level input is done a character at a time. The function | +| peek_input () is used to look at the next character in the input | +| stream. At any given time, it reads from the input_block on the | +| top of the current input stack. | +`-----------------------------------------------------------------*/ + +static int +peek_input (void) +{ + int ch; + input_block *block = isp; + + while (1) + { + if (block == NULL) + return CHAR_EOF; + + switch (block->type) + { + case INPUT_STRING: + ch = to_uchar (block->u.u_s.string[0]); + if (ch != '\0') + return ch; + break; + + case INPUT_FILE: + ch = getc (block->u.u_f.fp); + if (ch != EOF) + { + ungetc (ch, block->u.u_f.fp); + return ch; + } + block->u.u_f.end = true; + break; + + case INPUT_MACRO: + return CHAR_MACRO; + + default: + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: input stack botch in peek_input ()")); + abort (); + } + block = block->prev; + } +} + +/*-------------------------------------------------------------------. +| The function next_char () is used to read and advance the input to | +| the next character. It also manages line numbers for error | +| messages, so they do not get wrong, due to lookahead. The token | +| consisting of a newline alone is taken as belonging to the line it | +| ends, and the current line number is not incremented until the | +| next character is read. 99.9% of all calls will read from a | +| string, so factor that out into a macro for speed. | +`-------------------------------------------------------------------*/ + +#define next_char() \ + (isp && isp->type == INPUT_STRING && isp->u.u_s.string[0] \ + && !input_change \ + ? to_uchar (*isp->u.u_s.string++) \ + : next_char_1 ()) + +static int +next_char_1 (void) +{ + int ch; + + while (1) + { + if (isp == NULL) + { + current_file = ""; + current_line = 0; + return CHAR_EOF; + } + + if (input_change) + { + current_file = isp->file; + current_line = isp->line; + input_change = false; + } + + switch (isp->type) + { + case INPUT_STRING: + ch = to_uchar (*isp->u.u_s.string++); + if (ch != '\0') + return ch; + break; + + case INPUT_FILE: + if (start_of_input_line) + { + start_of_input_line = false; + current_line = ++isp->line; + } + + /* If stdin is a terminal, calling getc after peek_input + already called it would make the user have to hit ^D + twice to quit. */ + ch = isp->u.u_f.end ? EOF : getc (isp->u.u_f.fp); + if (ch != EOF) + { + if (ch == '\n') + start_of_input_line = true; + return ch; + } + break; + + case INPUT_MACRO: + pop_input (); /* INPUT_MACRO input sources has only one token */ + return CHAR_MACRO; + + default: + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: input stack botch in next_char ()")); + abort (); + } + + /* End of input source --- pop one level. */ + pop_input (); + } +} + +/*-------------------------------------------------------------------. +| skip_line () simply discards all immediately following characters, | +| upto the first newline. It is only used from m4_dnl (). | +`-------------------------------------------------------------------*/ + +void +skip_line (void) +{ + int ch; + const char *file = current_file; + int line = current_line; + + while ((ch = next_char ()) != CHAR_EOF && ch != '\n') + ; + if (ch == CHAR_EOF) + /* current_file changed to "" if we see CHAR_EOF, use the + previous value we stored earlier. */ + M4ERROR_AT_LINE ((warning_status, 0, file, line, + "Warning: end of file treated as newline")); + /* On the rare occasion that dnl crosses include file boundaries + (either the input file did not end in a newline, or changeword + was used), calling next_char can update current_file and + current_line, and that update will be undone as we return to + expand_macro. This informs next_char to fix things again. */ + if (file != current_file || line != current_line) + input_change = true; +} + + +/*------------------------------------------------------------------. +| This function is for matching a string against a prefix of the | +| input stream. If the string matches the input and consume is | +| true, the input is discarded; otherwise any characters read are | +| pushed back again. The function is used only when multicharacter | +| quotes or comment delimiters are used. | +`------------------------------------------------------------------*/ + +static bool +match_input (const char *s, bool consume) +{ + int n; /* number of characters matched */ + int ch; /* input character */ + const char *t; + bool result = false; + + ch = peek_input (); + if (ch != to_uchar (*s)) + return false; /* fail */ + + if (s[1] == '\0') + { + if (consume) + next_char (); + return true; /* short match */ + } + + next_char (); + for (n = 1, t = s++; peek_input () == to_uchar (*s++); ) + { + next_char (); + n++; + if (*s == '\0') /* long match */ + { + if (consume) + return true; + result = true; + break; + } + } + + /* Failed or shouldn't consume, push back input. */ + { + struct obstack *h = push_string_init (); + + /* `obstack_grow' may be macro evaluating its arg 1 several times. */ + obstack_grow (h, t, n); + } + push_string_finish (); + return result; +} + +/*--------------------------------------------------------------------. +| The macro MATCH() is used to match a string S against the input. | +| The first character is handled inline, for speed. Hopefully, this | +| will not hurt efficiency too much when single character quotes and | +| comment delimiters are used. If CONSUME, then CH is the result of | +| next_char, and a successful match will discard the matched string. | +| Otherwise, CH is the result of peek_char, and the input stream is | +| effectively unchanged. | +`--------------------------------------------------------------------*/ + +#define MATCH(ch, s, consume) \ + (to_uchar ((s)[0]) == (ch) \ + && (ch) != '\0' \ + && ((s)[1] == '\0' || (match_input ((s) + (consume), consume)))) + + +/*--------------------------------------------------------. +| Initialize input stacks, and quote/comment characters. | +`--------------------------------------------------------*/ + +void +input_init (void) +{ + current_file = ""; + current_line = 0; + + current_input = (struct obstack *) xmalloc (sizeof (struct obstack)); + obstack_init (current_input); + wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack)); + obstack_init (wrapup_stack); + + obstack_init (&file_names); + + /* Allocate an object in the current chunk, so that obstack_free + will always work even if the first token parsed spills to a new + chunk. */ + obstack_init (&token_stack); + obstack_alloc (&token_stack, 1); + token_bottom = obstack_base (&token_stack); + + isp = NULL; + wsp = NULL; + next = NULL; + + start_of_input_line = false; + + lquote.string = xstrdup (DEF_LQUOTE); + lquote.length = strlen (lquote.string); + rquote.string = xstrdup (DEF_RQUOTE); + rquote.length = strlen (rquote.string); + bcomm.string = xstrdup (DEF_BCOMM); + bcomm.length = strlen (bcomm.string); + ecomm.string = xstrdup (DEF_ECOMM); + ecomm.length = strlen (ecomm.string); + +#ifdef ENABLE_CHANGEWORD + set_word_regexp (user_word_regexp); +#endif +} + + +/*------------------------------------------------------------------. +| Functions for setting quotes and comment delimiters. Used by | +| m4_changecom () and m4_changequote (). Pass NULL if the argument | +| was not present, to distinguish from an explicit empty string. | +`------------------------------------------------------------------*/ + +void +set_quotes (const char *lq, const char *rq) +{ + free (lquote.string); + free (rquote.string); + + /* POSIX states that with 0 arguments, the default quotes are used. + POSIX XCU ERN 112 states that behavior is implementation-defined + if there was only one argument, or if there is an empty string in + either position when there are two arguments. We allow an empty + left quote to disable quoting, but a non-empty left quote will + always create a non-empty right quote. See the texinfo for what + some other implementations do. */ + if (!lq) + { + lq = DEF_LQUOTE; + rq = DEF_RQUOTE; + } + else if (!rq || (*lq && !*rq)) + rq = DEF_RQUOTE; + + lquote.string = xstrdup (lq); + lquote.length = strlen (lquote.string); + rquote.string = xstrdup (rq); + rquote.length = strlen (rquote.string); +} + +void +set_comment (const char *bc, const char *ec) +{ + free (bcomm.string); + free (ecomm.string); + + /* POSIX requires no arguments to disable comments. It requires + empty arguments to be used as-is, but this is counter to + traditional behavior, because a non-null begin and null end makes + it impossible to end a comment. An aardvark has been filed: + http://www.opengroup.org/austin/mailarchives/ag-review/msg02168.html + This implementation assumes the aardvark will be approved. See + the texinfo for what some other implementations do. */ + if (!bc) + bc = ec = ""; + else if (!ec || (*bc && !*ec)) + ec = DEF_ECOMM; + + bcomm.string = xstrdup (bc); + bcomm.length = strlen (bcomm.string); + ecomm.string = xstrdup (ec); + ecomm.length = strlen (ecomm.string); +} + +#ifdef ENABLE_CHANGEWORD + +void +set_word_regexp (const char *regexp) +{ + const char *msg; + struct re_pattern_buffer new_word_regexp; + + if (!*regexp || STREQ (regexp, DEFAULT_WORD_REGEXP)) + { + default_word_regexp = true; + return; + } + + /* Dry run to see whether the new expression is compilable. */ + init_pattern_buffer (&new_word_regexp, NULL); + msg = re_compile_pattern (regexp, strlen (regexp), &new_word_regexp); + regfree (&new_word_regexp); + + if (msg != NULL) + { + M4ERROR ((warning_status, 0, + "bad regular expression `%s': %s", regexp, msg)); + return; + } + + /* If compilation worked, retry using the word_regexp struct. We + can't rely on struct assigns working, so redo the compilation. + The fastmap can be reused between compilations, and will be freed + by the final regfree. */ + if (!word_regexp.fastmap) + word_regexp.fastmap = xcharalloc (UCHAR_MAX + 1); + msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp); + assert (!msg); + re_set_registers (&word_regexp, ®s, regs.num_regs, regs.start, regs.end); + if (re_compile_fastmap (&word_regexp)) + assert (false); + + default_word_regexp = false; +} + +#endif /* ENABLE_CHANGEWORD */ + + +/*--------------------------------------------------------------------. +| Parse and return a single token from the input stream. A token | +| can either be TOKEN_EOF, if the input_stack is empty; it can be | +| TOKEN_STRING for a quoted string; TOKEN_WORD for something that is | +| a potential macro name; and TOKEN_SIMPLE for any single character | +| that is not a part of any of the previous types. If LINE is not | +| NULL, set *LINE to the line where the token starts. | +| | +| Next_token () return the token type, and passes back a pointer to | +| the token data through TD. The token text is collected on the | +| obstack token_stack, which never contains more than one token text | +| at a time. The storage pointed to by the fields in TD is | +| therefore subject to change the next time next_token () is called. | +`--------------------------------------------------------------------*/ + +token_type +next_token (token_data *td, int *line) +{ + int ch; + int quote_level; + token_type type; +#ifdef ENABLE_CHANGEWORD + int startpos; + char *orig_text = NULL; +#endif + const char *file; + int dummy; + + obstack_free (&token_stack, token_bottom); + if (!line) + line = &dummy; + + /* Can't consume character until after CHAR_MACRO is handled. */ + ch = peek_input (); + if (ch == CHAR_EOF) + { +#ifdef DEBUG_INPUT + xfprintf (stderr, "next_token -> EOF\n"); +#endif + next_char (); + return TOKEN_EOF; + } + if (ch == CHAR_MACRO) + { + init_macro_token (td); + next_char (); +#ifdef DEBUG_INPUT + xfprintf (stderr, "next_token -> MACDEF (%s)\n", + find_builtin_by_addr (TOKEN_DATA_FUNC (td))->name); +#endif + return TOKEN_MACDEF; + } + + next_char (); /* Consume character we already peeked at. */ + file = current_file; + *line = current_line; + if (MATCH (ch, bcomm.string, true)) + { + obstack_grow (&token_stack, bcomm.string, bcomm.length); + while ((ch = next_char ()) != CHAR_EOF + && !MATCH (ch, ecomm.string, true)) + obstack_1grow (&token_stack, ch); + if (ch != CHAR_EOF) + obstack_grow (&token_stack, ecomm.string, ecomm.length); + else + /* current_file changed to "" if we see CHAR_EOF, use the + previous value we stored earlier. */ + M4ERROR_AT_LINE ((EXIT_FAILURE, 0, file, *line, + "ERROR: end of file in comment")); + + type = TOKEN_STRING; + } + else if (default_word_regexp && (isalpha (ch) || ch == '_')) + { + obstack_1grow (&token_stack, ch); + while ((ch = peek_input ()) != CHAR_EOF && (isalnum (ch) || ch == '_')) + { + obstack_1grow (&token_stack, ch); + next_char (); + } + type = TOKEN_WORD; + } + +#ifdef ENABLE_CHANGEWORD + + else if (!default_word_regexp && word_regexp.fastmap[ch]) + { + obstack_1grow (&token_stack, ch); + while (1) + { + ch = peek_input (); + if (ch == CHAR_EOF) + break; + obstack_1grow (&token_stack, ch); + startpos = re_search (&word_regexp, + (char *) obstack_base (&token_stack), + obstack_object_size (&token_stack), 0, 0, + ®s); + if (startpos || + regs.end [0] != (regoff_t) obstack_object_size (&token_stack)) + { + *(((char *) obstack_base (&token_stack) + + obstack_object_size (&token_stack)) - 1) = '\0'; + break; + } + next_char (); + } + + obstack_1grow (&token_stack, '\0'); + orig_text = (char *) obstack_finish (&token_stack); + + if (regs.start[1] != -1) + obstack_grow (&token_stack,orig_text + regs.start[1], + regs.end[1] - regs.start[1]); + else + obstack_grow (&token_stack, orig_text,regs.end[0]); + + type = TOKEN_WORD; + } + +#endif /* ENABLE_CHANGEWORD */ + + else if (!MATCH (ch, lquote.string, true)) + { + switch (ch) + { + case '(': + type = TOKEN_OPEN; + break; + case ',': + type = TOKEN_COMMA; + break; + case ')': + type = TOKEN_CLOSE; + break; + default: + type = TOKEN_SIMPLE; + break; + } + obstack_1grow (&token_stack, ch); + } + else + { + bool fast = lquote.length == 1 && rquote.length == 1; + quote_level = 1; + while (1) + { + /* Try scanning a buffer first. */ + const char *buffer = (isp && isp->type == INPUT_STRING + ? isp->u.u_s.string : NULL); + if (buffer && *buffer) + { + size_t len = isp->u.u_s.end - buffer; + const char *p = buffer; + do + { + p = (char *) memchr2 (p, *lquote.string, *rquote.string, + buffer + len - p); + } + while (p && fast && (*p++ == *rquote.string + ? --quote_level : ++quote_level)); + if (p) + { + if (fast) + { + assert (!quote_level); + obstack_grow (&token_stack, buffer, p - buffer - 1); + isp->u.u_s.string += p - buffer; + break; + } + obstack_grow (&token_stack, buffer, p - buffer); + ch = to_uchar (*p); + isp->u.u_s.string += p - buffer + 1; + } + else + { + obstack_grow (&token_stack, buffer, len); + isp->u.u_s.string += len; + continue; + } + } + /* Fall back to a byte. */ + else + ch = next_char (); + if (ch == CHAR_EOF) + /* current_file changed to "" if we see CHAR_EOF, use + the previous value we stored earlier. */ + M4ERROR_AT_LINE ((EXIT_FAILURE, 0, file, *line, + "ERROR: end of file in string")); + + if (MATCH (ch, rquote.string, true)) + { + if (--quote_level == 0) + break; + obstack_grow (&token_stack, rquote.string, rquote.length); + } + else if (MATCH (ch, lquote.string, true)) + { + quote_level++; + obstack_grow (&token_stack, lquote.string, lquote.length); + } + else + obstack_1grow (&token_stack, ch); + } + type = TOKEN_STRING; + } + + obstack_1grow (&token_stack, '\0'); + + TOKEN_DATA_TYPE (td) = TOKEN_TEXT; + TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack); +#ifdef ENABLE_CHANGEWORD + if (orig_text == NULL) + orig_text = TOKEN_DATA_TEXT (td); + TOKEN_DATA_ORIG_TEXT (td) = orig_text; +#endif +#ifdef DEBUG_INPUT + xfprintf (stderr, "next_token -> %s (%s)\n", + token_type_string (type), TOKEN_DATA_TEXT (td)); +#endif + return type; +} + +/*-----------------------------------------------. +| Peek at the next token from the input stream. | +`-----------------------------------------------*/ + +token_type +peek_token (void) +{ + token_type result; + int ch = peek_input (); + + if (ch == CHAR_EOF) + { + result = TOKEN_EOF; + } + else if (ch == CHAR_MACRO) + { + result = TOKEN_MACDEF; + } + else if (MATCH (ch, bcomm.string, false)) + { + result = TOKEN_STRING; + } + else if ((default_word_regexp && (isalpha (ch) || ch == '_')) +#ifdef ENABLE_CHANGEWORD + || (! default_word_regexp && word_regexp.fastmap[ch]) +#endif /* ENABLE_CHANGEWORD */ + ) + { + result = TOKEN_WORD; + } + else if (MATCH (ch, lquote.string, false)) + { + result = TOKEN_STRING; + } + else + switch (ch) + { + case '(': + result = TOKEN_OPEN; + break; + case ',': + result = TOKEN_COMMA; + break; + case ')': + result = TOKEN_CLOSE; + break; + default: + result = TOKEN_SIMPLE; + } + +#ifdef DEBUG_INPUT + xfprintf (stderr, "peek_token -> %s\n", token_type_string (result)); +#endif /* DEBUG_INPUT */ + return result; +} + + +#ifdef DEBUG_INPUT + +static const char * +token_type_string (token_type t) +{ + switch (t) + { /* TOKSW */ + case TOKEN_EOF: + return "EOF"; + case TOKEN_STRING: + return "STRING"; + case TOKEN_WORD: + return "WORD"; + case TOKEN_OPEN: + return "OPEN"; + case TOKEN_COMMA: + return "COMMA"; + case TOKEN_CLOSE: + return "CLOSE"; + case TOKEN_SIMPLE: + return "SIMPLE"; + case TOKEN_MACDEF: + return "MACDEF"; + default: + abort (); + } + } + +static void +print_token (const char *s, token_type t, token_data *td) +{ + xfprintf (stderr, "%s: ", s); + switch (t) + { /* TOKSW */ + case TOKEN_OPEN: + case TOKEN_COMMA: + case TOKEN_CLOSE: + case TOKEN_SIMPLE: + xfprintf (stderr, "char:"); + break; + + case TOKEN_WORD: + xfprintf (stderr, "word:"); + break; + + case TOKEN_STRING: + xfprintf (stderr, "string:"); + break; + + case TOKEN_MACDEF: + xfprintf (stderr, "macro: %p\n", TOKEN_DATA_FUNC (td)); + break; + + case TOKEN_EOF: + xfprintf (stderr, "eof\n"); + break; + } + xfprintf (stderr, "\t\"%s\"\n", TOKEN_DATA_TEXT (td)); +} + +static void M4_GNUC_UNUSED +lex_debug (void) +{ + token_type t; + token_data td; + + while ((t = next_token (&td, NULL)) != TOKEN_EOF) + print_token ("lex", t, &td); +} +#endif /* DEBUG_INPUT */ |