From 429cd7f8dd08c81d4fcfae59196fadd82d2fc7d6 Mon Sep 17 00:00:00 2001 From: xi Date: Tue, 6 Jun 2006 19:43:34 +0000 Subject: Add scanner definitions. Add a basic description of all tokens produced by the scanner. git-svn-id: http://svn.pyyaml.org/libyaml/trunk@184 18f92427-320e-0410-9341-c67f048884a3 --- include/yaml/yaml.h | 117 ++++++++- src/Makefile.am | 2 +- src/scanner.c | 667 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 782 insertions(+), 4 deletions(-) create mode 100644 src/scanner.c diff --git a/include/yaml/yaml.h b/include/yaml/yaml.h index 6faf7df..7ee6b28 100644 --- a/include/yaml/yaml.h +++ b/include/yaml/yaml.h @@ -20,7 +20,7 @@ extern "C" { #include /** - * @defgroup Export Definitions + * @defgroup export Export Definitions * @{ */ @@ -114,7 +114,7 @@ typedef struct { /** @} */ /** - * @defgroup Node Styles + * @defgroup styles Node Styles * @{ */ @@ -151,7 +151,7 @@ typedef enum { /** @} */ /** - * @defgroup Tokens + * @defgroup tokens Tokens * @{ */ @@ -532,6 +532,33 @@ typedef struct { unsigned char *current; } yaml_string_input_t; +/** + * This structure holds information about a potential simple key. + */ + +typedef struct { + /** Is a simple key possible? */ + int possible; + + /** Is a simple key required? */ + int required; + + /** The number of the token. */ + size_t token_number; + + /** The position index. */ + size_t index; + + /** The position line. */ + size_t line; + + /** The position column. */ + size_t column; + + /** The position mark. */ + yaml_mark_t mark; +} yaml_simple_key_t; + /** * The parser structure. * @@ -619,6 +646,60 @@ typedef struct { * @} */ + /** + * @name Scanner stuff + * @{ + */ + + /** Have we started to scan the input stream? */ + int stream_start_produced; + + /** Have we reached the end of the input stream? */ + int stream_end_produced; + + /** The number of unclosed '[' and '{' indicators. */ + int flow_level; + + /** The tokens queue, which contains the current produced tokens. */ + yaml_token_t *tokens; + + /** The size of the tokens queue. */ + size_t tokens_size; + + /** The head of the tokens queue. */ + size_t tokens_head; + + /** The tail of the tokens queue. */ + size_t tokens_tail; + + /** The number of tokens fetched from the tokens queue. */ + size_t tokens_parsed; + + /** The stack of indentation levels. */ + int *indents; + + /** The size of the indents stack. */ + size_t indents_size; + + /** The number of items in the indents stack. */ + size_t indents_length; + + /** The current indentation level. */ + int indent; + + /** May a simple key occur at the current position? */ + int simple_key_allowed; + + /** The stack of potential simple keys. */ + yaml_simple_key_t *simple_keys; + + /** The size of the simple keys stack. */ + size_t simple_keys_size; + + /** + * @} + */ + } yaml_parser_t; /** @@ -694,6 +775,36 @@ yaml_parser_set_input(yaml_parser_t *parser, YAML_DECLARE(void) yaml_parser_set_encoding(yaml_parser_t *parser, yaml_encoding_t encoding); +/** + * Get the next token. + * + * The token is removed from the internal token queue and the application is + * responsible for destroing the token object. + * + * @param[in] parser A parser object. + * + * @returns A token object, or @c NULL on error. + */ + +YAML_DECLARE(yaml_token_t *) +yaml_parser_get_token(yaml_parser_t *parser); + +/** + * Peek the next token. + * + * The token is not removed from the internal token queue and will be returned + * again on a subsequent call of @c yaml_parser_get_token or + * @c yaml_parser_peek_token. The application should not destroy the token + * object. + * + * @param[in] parser A parser object. + * + * @returns A token object, or @c NULL on error. + */ + +YAML_DECLARE(yaml_token_t *) +yaml_parser_peek_token(yaml_parser_t *parser); + /** @} */ /* diff --git a/src/Makefile.am b/src/Makefile.am index 6816d81..d30f3dd 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,4 +1,4 @@ AM_CPPFLAGS = -I$(top_srcdir)/include lib_LTLIBRARIES = libyaml.la -libyaml_la_SOURCES = version.c api.c reader.c +libyaml_la_SOURCES = version.c api.c reader.c scanner.c libyaml_la_LDFLAGS = -release $(YAML_LT_RELEASE) -version-info $(YAML_LT_CURRENT):$(YAML_LT_REVISION):$(YAML_LT_AGE) diff --git a/src/scanner.c b/src/scanner.c new file mode 100644 index 0000000..530bd91 --- /dev/null +++ b/src/scanner.c @@ -0,0 +1,667 @@ + +/* + * Introduction + * ************ + * + * The following notes assume that you are familiar with the YAML specification + * (http://yaml.org/spec/cvs/current.html). We mostly follow it, although in + * some cases we are less restrictive that it requires. + * + * The process of transforming a YAML stream into a sequence of events is + * divided on two steps: Scanning and Parsing. + * + * The Scanner transforms the input stream into a sequence of tokens, while the + * parser transform the sequence of tokens produced by the Scanner into a + * sequence of parsing events. + * + * The Scanner is rather clever and complicated. The Parser, on the contrary, + * is a straightforward implementation of a recursive-descendant parser (or, + * LL(1) parser, as it is usually called). + * + * Actually there are two issues of Scanning that might be called "clever", the + * rest is quite straightforward. The issues are "block collection start" and + * "simple keys". Both issues are explained below in details. + * + * Here the Scanning step is explained and implemented. We start with the list + * of all the tokens produced by the Scanner together with short descriptions. + * + * Now, tokens: + * + * STREAM-START(encoding) # The stream start. + * STREAM-END # The stream end. + * VERSION-DIRECTIVE(major,minor) # The '%YAML' directive. + * TAG-DIRECTIVE(handle,prefix) # The '%TAG' directive. + * DOCUMENT-START # '---' + * DOCUMENT-END # '...' + * BLOCK-SEQUENCE-START # Indentation increase denoting a block + * BLOCK-MAPPING-START # sequence or a block mapping. + * BLOCK-END # Indentation decrease. + * FLOW-SEQUENCE-START # '[' + * FLOW-SEQUENCE-END # ']' + * BLOCK-SEQUENCE-START # '{' + * BLOCK-SEQUENCE-END # '}' + * BLOCK-ENTRY # '-' + * FLOW-ENTRY # ',' + * KEY # '?' or nothing (simple keys). + * VALUE # ':' + * ALIAS(anchor) # '*anchor' + * ANCHOR(anchor) # '&anchor' + * TAG(handle,suffix) # '!handle!suffix' + * SCALAR(value,style) # A scalar. + * + * The following two tokens are "virtual" tokens denoting the beginning and the + * end of the stream: + * + * STREAM-START(encoding) + * STREAM-END + * + * We pass the information about the input stream encoding with the + * STREAM-START token. + * + * The next two tokens are responsible for tags: + * + * VERSION-DIRECTIVE(major,minor) + * TAG-DIRECTIVE(handle,prefix) + * + * Example: + * + * %YAML 1.1 + * %TAG ! !foo + * %TAG !yaml! tag:yaml.org,2002: + * --- + * + * The correspoding sequence of tokens: + * + * STREAM-START(utf-8) + * VERSION-DIRECTIVE(1,1) + * TAG-DIRECTIVE("!","!foo") + * TAG-DIRECTIVE("!yaml","tag:yaml.org,2002:") + * DOCUMENT-START + * STREAM-END + * + * Note that the VERSION-DIRECTIVE and TAG-DIRECTIVE tokens occupy a whole + * line. + * + * The document start and end indicators are represented by: + * + * DOCUMENT-START + * DOCUMENT-END + * + * Note that if a YAML stream contains an implicit document (without '---' + * and '...' indicators), no DOCUMENT-START and DOCUMENT-END tokens will be + * produced. + * + * In the following examples, we present whole documents together with the + * produced tokens. + * + * 1. An implicit document: + * + * 'a scalar' + * + * Tokens: + * + * STREAM-START(utf-8) + * SCALAR("a scalar",single-quoted) + * STREAM-END + * + * 2. An explicit document: + * + * --- + * 'a scalar' + * ... + * + * Tokens: + * + * STREAM-START(utf-8) + * DOCUMENT-START + * SCALAR("a scalar",single-quoted) + * DOCUMENT-END + * STREAM-END + * + * 3. Several documents in a stream: + * + * 'a scalar' + * --- + * 'another scalar' + * --- + * 'yet another scalar' + * + * Tokens: + * + * STREAM-START(utf-8) + * SCALAR("a scalar",single-quoted) + * DOCUMENT-START + * SCALAR("another scalar",single-quoted) + * DOCUMENT-START + * SCALAR("yet another scalar",single-quoted) + * STREAM-END + * + * We have already introduced the SCALAR token above. The following tokens are + * used to describe aliases, anchors, tag, and scalars: + * + * ALIAS(anchor) + * ANCHOR(anchor) + * TAG(handle,suffix) + * SCALAR(value,style) + * + * The following series of examples illustrate the usage of these tokens: + * + * 1. A recursive sequence: + * + * &A [ *A ] + * + * Tokens: + * + * STREAM-START(utf-8) + * ANCHOR("A") + * FLOW-SEQUENCE-START + * ALIAS("A") + * FLOW-SEQUENCE-END + * STREAM-END + * + * 2. A tagged scalar: + * + * !!float "3.14" # A good approximation. + * + * Tokens: + * + * STREAM-START(utf-8) + * TAG("!!","float") + * SCALAR("3.14",double-quoted) + * STREAM-END + * + * 3. Various scalar styles: + * + * --- # Implicit empty plain scalars do not produce tokens. + * --- a plain scalar + * --- 'a single-quoted scalar' + * --- "a double-quoted scalar" + * --- |- + * a literal scalar + * --- >- + * a folded + * scalar + * + * Tokens: + * + * STREAM-START(utf-8) + * DOCUMENT-START + * DOCUMENT-START + * SCALAR("a plain scalar",plain) + * DOCUMENT-START + * SCALAR("a single-quoted scalar",single-quoted) + * DOCUMENT-START + * SCALAR("a double-quoted scalar",double-quoted) + * DOCUMENT-START + * SCALAR("a literal scalar",literal) + * DOCUMENT-START + * SCALAR("a folded scalar",folded) + * STREAM-END + * + * Now it's time to review collection-related tokens. We will start with + * flow collections: + * + * FLOW-SEQUENCE-START + * FLOW-SEQUENCE-END + * FLOW-MAPPING-START + * FLOW-MAPPING-END + * FLOW-ENTRY + * KEY + * VALUE + * + * The tokens FLOW-SEQUENCE-START, FLOW-SEQUENCE-END, FLOW-MAPPING-START, and + * FLOW-MAPPING-END represent the indicators '[', ']', '{', and '}' + * correspondingly. FLOW-ENTRY represent the ',' indicator. Finally the + * indicators '?' and ':', which are used for denoting mapping keys and values, + * are represented by the KEY and VALUE tokens. + * + * The following examples show flow collections: + * + * 1. A flow sequence: + * + * [item 1, item 2, item 3] + * + * Tokens: + * + * STREAM-START(utf-8) + * FLOW-SEQUENCE-START + * SCALAR("item 1",plain) + * FLOW-ENTRY + * SCALAR("item 2",plain) + * FLOW-ENTRY + * SCALAR("item 3",plain) + * FLOW-SEQUENCE-END + * STREAM-END + * + * 2. A flow mapping: + * + * { + * a simple key: a value, # Note that the KEY token is produced. + * ? a complex key: another value, + * } + * + * Tokens: + * + * STREAM-START(utf-8) + * FLOW-MAPPING-START + * KEY + * SCALAR("a simple key",plain) + * VALUE + * SCALAR("a value",plain) + * FLOW-ENTRY + * KEY + * SCALAR("a complex key",plain) + * VALUE + * SCALAR("another value",plain) + * FLOW-ENTRY + * FLOW-MAPPING-END + * STREAM-END + * + * A simple key is a key which is not denoted by the '?' indicator. Note that + * the Scanner still produce the KEY token whenever it encounters a simple key. + * + * For scanning block collections, the following tokens are used (note that we + * repeat KEY and VALUE here): + * + * BLOCK-SEQUENCE-START + * BLOCK-MAPPING-START + * BLOCK-END + * BLOCK-ENTRY + * KEY + * VALUE + * + * The tokens BLOCK-SEQUENCE-START and BLOCK-MAPPING-START denote indentation + * increase that precedes a block collection (cf. the INDENT token in Python). + * The token BLOCK-END denote indentation decrease that ends a block collection + * (cf. the DEDENT token in Python). However YAML has some syntax pecularities + * that makes detections of these tokens more complex. + * + * The tokens BLOCK-ENTRY, KEY, and VALUE are used to represent the indicators + * '-', '?', and ':' correspondingly. + * + * The following examples show how the tokens BLOCK-SEQUENCE-START, + * BLOCK-MAPPING-START, and BLOCK-END are emitted by the Scanner: + * + * 1. Block sequences: + * + * - item 1 + * - item 2 + * - + * - item 3.1 + * - item 3.2 + * - + * key 1: value 1 + * key 2: value 2 + * + * Tokens: + * + * STREAM-START(utf-8) + * BLOCK-SEQUENCE-START + * BLOCK-ENTRY + * SCALAR("item 1",plain) + * BLOCK-ENTRY + * SCALAR("item 2",plain) + * BLOCK-ENTRY + * BLOCK-SEQUENCE-START + * BLOCK-ENTRY + * SCALAR("item 3.1",plain) + * BLOCK-ENTRY + * SCALAR("item 3.2",plain) + * BLOCK-END + * BLOCK-ENTRY + * BLOCK-MAPPING-START + * KEY + * SCALAR("key 1",plain) + * VALUE + * SCALAR("value 1",plain) + * KEY + * SCALAR("key 2",plain) + * VALUE + * SCALAR("value 2",plain) + * BLOCK-END + * BLOCK-END + * STREAM-END + * + * 2. Block mappings: + * + * a simple key: a value # The KEY token is produced here. + * ? a complex key + * : another value + * a mapping: + * key 1: value 1 + * key 2: value 2 + * a sequence: + * - item 1 + * - item 2 + * + * Tokens: + * + * STREAM-START(utf-8) + * BLOCK-MAPPING-START + * KEY + * SCALAR("a simple key",plain) + * VALUE + * SCALAR("a value",plain) + * KEY + * SCALAR("a complex key",plain) + * VALUE + * SCALAR("another value",plain) + * KEY + * SCALAR("a mapping",plain) + * BLOCK-MAPPING-START + * KEY + * SCALAR("key 1",plain) + * VALUE + * SCALAR("value 1",plain) + * KEY + * SCALAR("key 2",plain) + * VALUE + * SCALAR("value 2",plain) + * BLOCK-END + * KEY + * SCALAR("a sequence",plain) + * VALUE + * BLOCK-SEQUENCE-START + * BLOCK-ENTRY + * SCALAR("item 1",plain) + * BLOCK-ENTRY + * SCALAR("item 2",plain) + * BLOCK-END + * BLOCK-END + * STREAM-END + * + * YAML does not always require to start a new block collection from a new + * line. If the current line contains only '-', '?', and ':' indicators, a new + * block collection may start at the current line. The following examples + * illustrate this case: + * + * 1. Collections in a sequence: + * + * - - item 1 + * - item 2 + * - key 1: value 1 + * key 2: value 2 + * - ? complex key + * : complex value + * + * Tokens: + * + * STREAM-START(utf-8) + * BLOCK-SEQUENCE-START + * BLOCK-ENTRY + * BLOCK-SEQUENCE-START + * BLOCK-ENTRY + * SCALAR("item 1",plain) + * BLOCK-ENTRY + * SCALAR("item 2",plain) + * BLOCK-END + * BLOCK-ENTRY + * BLOCK-MAPPING-START + * KEY + * SCALAR("key 1",plain) + * VALUE + * SCALAR("value 1",plain) + * KEY + * SCALAR("key 2",plain) + * VALUE + * SCALAR("value 2",plain) + * BLOCK-END + * BLOCK-ENTRY + * BLOCK-MAPPING-START + * KEY + * SCALAR("complex key") + * VALUE + * SCALAR("complex value") + * BLOCK-END + * BLOCK-END + * STREAM-END + * + * 2. Collections in a mapping: + * + * ? a sequence + * : - item 1 + * - item 2 + * ? a mapping + * : key 1: value 1 + * key 2: value 2 + * + * Tokens: + * + * STREAM-START(utf-8) + * BLOCK-MAPPING-START + * KEY + * SCALAR("a sequence",plain) + * VALUE + * BLOCK-SEQUENCE-START + * BLOCK-ENTRY + * SCALAR("item 1",plain) + * BLOCK-ENTRY + * SCALAR("item 2",plain) + * BLOCK-END + * KEY + * SCALAR("a mapping",plain) + * VALUE + * BLOCK-MAPPING-START + * KEY + * SCALAR("key 1",plain) + * VALUE + * SCALAR("value 1",plain) + * KEY + * SCALAR("key 2",plain) + * VALUE + * SCALAR("value 2",plain) + * BLOCK-END + * BLOCK-END + * STREAM-END + * + * YAML also permits non-indented sequences if they are included into a block + * mapping. In this case, the token BLOCK-SEQUENCE-START is not produced: + * + * key: + * - item 1 # BLOCK-SEQUENCE-START is NOT produced here. + * - item 2 + * + * Tokens: + * + * STREAM-START(utf-8) + * BLOCK-MAPPING-START + * KEY + * SCALAR("key",plain) + * VALUE + * BLOCK-ENTRY + * SCALAR("item 1",plain) + * BLOCK-ENTRY + * SCALAR("item 2",plain) + * BLOCK-END + */ + +#if HAVE_CONFIG_H +#include +#endif + +#include + +#include + +/* + * Public API declarations. + */ + +YAML_DECLARE(yaml_token_t *) +yaml_parser_get_token(yaml_parser_t *parser); + +YAML_DECLARE(yaml_token_t *) +yaml_parser_peek_token(yaml_parser_t *parser); + +/* + * High-level token API. + */ + +static int +yaml_parser_fetch_more_tokens(yaml_parser_t *parser); + +static int +yaml_parser_fetch_next_token(yaml_parser_t *parser); + +/* + * Potential simple keys. + */ + +static int +yaml_parser_stale_simple_keys(yaml_parser_t *parser); + +static int +yaml_parser_save_simple_key(yaml_parser_t *parser); + +static int +yaml_parser_remove_simple_key(yaml_parser_t *parser); + +/* + * Indentation treatment. + */ + +static int +yaml_parser_add_indent(yaml_parser_t *parser); + +static int +yaml_parser_remove_indent(yaml_parser_t *parser); + +/* + * Token fetchers. + */ + +static int +yaml_parser_fetch_stream_start(yaml_parser_t *parser); + +static int +yaml_parser_fetch_stream_end(yaml_parser_t *parser); + +static int +yaml_parser_fetch_directive(yaml_parser_t *parser); + +static int +yaml_parser_fetch_document_start(yaml_parser_t *parser); + +static int +yaml_parser_fetch_document_end(yaml_parser_t *parser); + +static int +yaml_parser_fetch_document_indicator(yaml_parser_t *parser, + yaml_token_type_t type); + +static int +yaml_parser_fetch_flow_sequence_start(yaml_parser_t *parser); + +static int +yaml_parser_fetch_flow_mapping_start(yaml_parser_t *parser); + +static int +yaml_parser_fetch_flow_collection_start(yaml_parser_t *parser, + yaml_token_type_t type); + +static int +yaml_parser_fetch_flow_sequence_end(yaml_parser_t *parser); + +static int +yaml_parser_fetch_flow_mapping_end(yaml_parser_t *parser); + +static int +yaml_parser_fetch_flow_collection_end(yaml_parser_t *parser, + yaml_token_type_t type); + +static int +yaml_parser_fetch_flow_entry(yaml_parser_t *parser); + +static int +yaml_parser_fetch_block_entry(yaml_parser_t *parser); + +static int +yaml_parser_fetch_key(yaml_parser_t *parser); + +static int +yaml_parser_fetch_value(yaml_parser_t *parser); + +static int +yaml_parser_fetch_alias(yaml_parser_t *parser); + +static int +yaml_parser_fetch_anchor(yaml_parser_t *parser); + +static int +yaml_parser_fetch_tag(yaml_parser_t *parser); + +static int +yaml_parser_fetch_literal_scalar(yaml_parser_t *parser); + +static int +yaml_parser_fetch_folded_scalar(yaml_parser_t *parser); + +static int +yaml_parser_fetch_block_scalar(yaml_parser_t *parser, int literal); + +static int +yaml_parser_fetch_single_quoted_scalar(yaml_parser_t *parser); + +static int +yaml_parser_fetch_double_quoted_scalar(yaml_parser_t *parser); + +static int +yaml_parser_fetch_flow_scalar(yaml_parser_t *parser, int single); + +static int +yaml_parser_fetch_plain_scalar(yaml_parser_t *parser); + +/* + * Token scanners. + */ + +static int +yaml_parser_scan_to_next_token(yaml_parser_t *parser); + +static yaml_token_t * +yaml_parser_scan_directive(yaml_parser_t *parser); + +static int +yaml_parser_scan_directive_name(yaml_parser_t *parser, + yaml_mark_t start_mark, yaml_char_t **name); + +static int +yaml_parser_scan_yaml_directive_value(yaml_parser_t *parser, + yaml_mark_t start_mark, int *major, int *minor); + +static int +yaml_parser_scan_yaml_directive_number(yaml_parser_t *parser, + yaml_mark_t start_mark, int *number); + +static int +yaml_parser_scan_tag_directive_value(yaml_parser_t *parser, + yaml_char_t **handle, yaml_char_t **prefix); + +static yaml_token_t * +yaml_parser_scan_anchor(yaml_parser_t *parser, + yaml_token_type_t type); + +static yaml_token_t * +yaml_parser_scan_tag(yaml_parser_t *parser); + +static int +yaml_parser_scan_tag_handle(yaml_parser_t *parser, int directive, + yaml_mark_t start_mark, yaml_char_t **handle); + +static int +yaml_parser_scan_tag_uri(yaml_parser_t *parser, int directive, + yaml_mark_t start_mark, yaml_char_t **url); + +static yaml_token_t * +yaml_parser_scan_block_scalar(yaml_parser_t *parser, int literal); + +static int +yaml_parser_scan_block_scalar_indicators(yaml_parser_t *parser, + yaml_mark_t start_mark, int *chomping, int *increment); + +static yaml_token_t * +yaml_parser_scan_flow_scalar(yaml_parser_t *parser, int single); + +static yaml_token_t * +yaml_parser_scan_plain_scalar(yaml_parser_t *parser); + -- cgit v1.2.1