From 0e7b41b89d48956e214a842020a8293ef3ac394b Mon Sep 17 00:00:00 2001 From: xi Date: Mon, 29 May 2006 20:08:09 +0000 Subject: Working on the decoding code. git-svn-id: http://svn.pyyaml.org/libyaml/trunk@180 18f92427-320e-0410-9341-c67f048884a3 --- include/yaml/yaml.h | 53 +++++++--- src/api.c | 67 +++++++++--- src/reader.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 365 insertions(+), 52 deletions(-) diff --git a/include/yaml/yaml.h b/include/yaml/yaml.h index c4bd0ba..9ec2c7a 100644 --- a/include/yaml/yaml.h +++ b/include/yaml/yaml.h @@ -252,7 +252,7 @@ typedef struct { * source. The handler should write not more than @a size bytes to the @a * buffer. The number of written bytes should be set to the @a length variable. * - * @param[in] ext A pointer to an application data specified by + * @param[in] data A pointer to an application data specified by * @c yaml_parser_set_read_handler. * @param[out] buffer The buffer to write the data from the source. * @param[in] size The size of the buffer. @@ -262,9 +262,21 @@ typedef struct { * the returned value should be @c 0. On EOF, the handler should set the * @a length to @c 0 and return @c 1. */ -typedef int yaml_read_handler_t(void *ext, unsigned char *buffer, size_t size, + +typedef int yaml_read_handler_t(void *data, unsigned char *buffer, size_t size, size_t *size_read); +/** + * This structure holds a string input specified by + * @c yaml_parser_set_input_string. + */ + +typedef struct { + unsigned char *start; + unsigned char *end; + unsigned char *current; +} yaml_string_input_t; + /** * The parser structure. * @@ -279,7 +291,7 @@ typedef struct { * @{ */ - error_type_t error; + yaml_error_type_t error; /** * @} @@ -302,23 +314,23 @@ typedef struct { /** The pointer to the beginning of the working buffer. */ yaml_char_t *buffer; - /** The size of the buffer (in bytes). */ - size_t buffer_size; + /** The pointer to the end of the working buffer. */ + yaml_char_t *buffer_end; /** The pointer to the current character in the working buffer. */ - yaml_char_t *buffer_pointer; + yaml_char_t *pointer; - /** The number of unread characters in the buffer (in characters). */ - size_t buffer_length; + /** The number of unread characters in the working buffer. */ + size_t unread; - /** The remaining undecoded characters. */ + /** The pointer to the beginning of the raw buffer. */ unsigned char *raw_buffer; - /** The size of the raw buffer (in bytes). */ - size_t raw_buffer_size; + /** The pointer to the current character in the raw buffer. */ + unsigned char *raw_pointer; - /** Is the application responsible for freeing the raw buffer? */ - int raw_buffer_foreign; + /** The number of unread bytes in the raw buffer. */ + size_t raw_unread; /** The input encoding. */ yaml_encoding_t encoding; @@ -335,6 +347,9 @@ typedef struct { /** The column of the current position (starting from @c 0). */ size_t column; + /* String input structure. */ + yaml_string_input_t string_input; + /** * @} */ @@ -461,6 +476,18 @@ yaml_realloc(void *ptr, size_t size); void yaml_free(void *ptr); +/** The size of the raw buffer. */ + +#define YAML_RAW_BUFFER_SIZE 16384 + +/** + * The size of the buffer. + * + * We allocate enough space for decoding the whole raw buffer. + */ + +#define YAML_BUFFER_SIZE (YAML_RAW_BUFFER_SIZE*3) + /** @} */ diff --git a/src/api.c b/src/api.c index 0594727..aa183af 100644 --- a/src/api.c +++ b/src/api.c @@ -46,11 +46,35 @@ yaml_parser_new(void) { yaml_parser_t *parser; + /* Allocate the parser structure. */ + parser = yaml_malloc(sizeof(yaml_parser_t)); if (!parser) return NULL; memset(parser, 0, sizeof(yaml_parser_t)); + /* Allocate the raw buffer. */ + + parser->raw_buffer = yaml_malloc(YAML_RAW_BUFFER_SIZE); + if (!parser->raw_buffer) { + yaml_free(parser); + return NULL; + } + parser->raw_pointer = parser->raw_buffer; + parser->raw_unread = 0; + + /* Allocate the character buffer. */ + + parser->buffer = yaml_malloc(YAML_BUFFER_SIZE); + if (!parser->buffer) { + yaml_free(parser->raw_buffer); + yaml_free(parser); + return NULL; + } + parser->buffer_end = parser->buffer; + parser->pointer = parser->buffer; + parser->unread = 0; + return parser; } @@ -64,8 +88,7 @@ yaml_parser_delete(yaml_parser_t *parser) assert(parser); /* Non-NULL parser object expected. */ yaml_free(parser->buffer); - if (!parser->raw_buffer_foreign) - yaml_free(parser->raw_buffer); + yaml_free(parser->raw_buffer); memset(parser, 0, sizeof(yaml_parser_t)); @@ -73,14 +96,27 @@ yaml_parser_delete(yaml_parser_t *parser) } /* - * String read handler (always returns error). + * String read handler. */ static int yaml_string_read_handler(void *data, unsigned char *buffer, size_t size, size_t *size_read) { - *size_read = 0; + yaml_string_input_t *input = data; + + if (input->current == input->end) { + *size_read = 0; + return 1; + } + + if (size > (input->end - input->current)) { + size = input->end - input->current; + } + + memcpy(buffer, input->current, size); + input->current += size; + *size_read = size; return 1; } @@ -92,8 +128,8 @@ static int yaml_file_read_handler(void *data, unsigned char *buffer, size_t size, size_t *size_read) { - *size_read = fread(buffer, 1, size, (FILE *)ext); - return !ferror((FILE *)ext); + *size_read = fread(buffer, 1, size, (FILE *)data); + return !ferror((FILE *)data); } /* @@ -105,16 +141,15 @@ yaml_parser_set_input_string(yaml_parser_t *parser, unsigned char *input, size_t size) { assert(parser); /* Non-NULL parser object expected. */ - assert(!parser->reader); /* You can set the source only once. */ + assert(!parser->read_handler); /* You can set the source only once. */ assert(input); /* Non-NULL input string expected. */ - parser->read_handler = yaml_string_read_handler; - parser->read_handler_data = NULL; + parser->string_input.start = input; + parser->string_input.current = input; + parser->string_input.end = input+size; - /* We use the input string as a raw (undecoded) buffer. */ - parser->raw_buffer = input; - parser->raw_buffer_size = size; - parser->raw_buffer_foreign = 1; + parser->read_handler = yaml_string_read_handler; + parser->read_handler_data = &parser->string_input; } /* @@ -125,7 +160,7 @@ void yaml_parser_set_input_file(yaml_parser_t *parser, FILE *file) { assert(parser); /* Non-NULL parser object expected. */ - assert(!parser->reader); /* You can set the source only once. */ + assert(!parser->read_handler); /* You can set the source only once. */ assert(file); /* Non-NULL file object expected. */ parser->read_handler = yaml_file_read_handler; @@ -141,11 +176,11 @@ yaml_parser_set_input(yaml_parser_t *parser, yaml_read_handler_t *handler, void *data) { assert(parser); /* Non-NULL parser object expected. */ - assert(!parser->reader); /* You can set the source only once. */ + assert(!parser->read_handler); /* You can set the source only once. */ assert(handler); /* Non-NULL read handler expected. */ parser->read_handler = handler; - parser->read_handler_data = data + parser->read_handler_data = data; } /* diff --git a/src/reader.c b/src/reader.c index 787f785..440a88f 100644 --- a/src/reader.c +++ b/src/reader.c @@ -1,53 +1,304 @@ -#define RAW_BUFFER_SIZE 16384 -#define BUFFER_SIZE (RAW_BUFFER_SIZE*2) /* Should be enough for decoding - the whole raw buffer. */ +#if HAVE_CONFIG_H +#include +#endif + +#include + +#include + +/* Check for the UTF-16-BE BOM. */ +#define IS_UTF16BE_BOM(pointer) ((pointer)[0] == 0xFE && (pointer)[1] == 0xFF) + +/* Check for the UTF-16-LE BOM. */ +#define IS_UTF16LE_BOM(pointer) ((pointer)[0] == 0xFF && (pointer)[1] == 0xFE) + +/* Get a UTF-16-BE character. */ +#define UTF16BE_CHAR(pointer) ((pointer)[0] << 8 + (pointer)[1]) + +/* Get a UTF-16-LE character. */ +#define UTF16LE_CHAR(pointer) ((pointer)[0] + (pointer)[1] << 8) + +/* + * From http://www.ietf.org/rfc/rfc3629.txt: + * + * Char. number range | UTF-8 octet sequence + * (hexadecimal) | (binary) + * --------------------+--------------------------------------------- + * 0000 0000-0000 007F | 0xxxxxxx + * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + +/* Get the length of a UTF-8 character (0 on error). */ +#define UTF8_LENGTH(pointer) \ + ((pointer)[0] < 0x80 ? 1 : \ + (pointer)[0] < 0xC0 ? 0 : \ + (pointer)[0] < 0xE0 ? 2 : \ + (pointer)[0] < 0xF0 ? 3 : \ + (pointer)[0] < 0xF8 ? 4 : 0) + +/* Get the value of the first byte of a UTF-8 sequence (0xFF on error). */ +#define UTF8_FIRST_CHUNK(pointer) \ + ((pointer)[0] < 0x80 ? (pointer)[0] & 0x7F : \ + (pointer)[0] < 0xC0 ? 0xFF : \ + (pointer)[0] < 0xE0 ? (pointer)[0] & 0x1F : \ + (pointer)[0] < 0xF0 ? (pointer)[0] & 0x0F : \ + (pointer)[0] < 0xF8 ? (pointer)[0] & 0x07 : 0xFF) + +/* Get the value of a non-first byte of a UTF-8 sequence (0xFF on error). */ +#define UTF8_NEXT_CHUNK(pointer) \ + ((pointer)[0] >= 0x80 && (pointer)[0] < 0xC0 ? (pointer)[0] & 0x3F : 0xFF) + +/* Determine the length of a UTF-8 character. */ /* * Ensure that the buffer contains at least length characters. * Return 1 on success, 0 on failure. + * + * The length is supposed to be significantly less that the buffer size. */ int -yaml_parser_update_reader(yaml_parser_t *parser, size_t length) +yaml_parser_update_buffer(yaml_parser_t *parser, size_t length) { /* If the EOF flag is set, do nothing. */ if (parser->eof) return 1; - /* First, let us check that the buffers are allocated. */ + /* Return if the buffer contains enough characters. */ + + if (parser->unread >= length) + return 1; + + /* Determine the input encoding if it is not known yet. */ - if (!parser->buffer) { - parser->buffer = yaml_malloc(BUFFER_SIZE); - if (!parser->buffer) { - parser->error = YAML_MEMORY_ERROR; + if (!parser->encoding) { + if (!yaml_parser_determine_encoding(parser)) return 0; - } - parser->buffer_size = BUFFER_SIZE; - parser->buffer_pointer = parser->buffer; - parser->buffer_length = 0; } - if (!parser->raw_buffer) { - parser->raw_buffer = yaml_malloc(RAW_BUFFER_SIZE); - if (!parser->raw_buffer) { - parser->error = YAML_MEMORY_ERROR; - return 0; + /* Move the unread characters to the beginning of the buffer. */ + + if (parser->buffer < parser->pointer + && parser->pointer < parser->buffer_end) { + size_t size = parser->buffer_end - parser->pointer; + memmove(parser->buffer, parser->pointer, size); + parser->pointer = parser->buffer; + parser->buffer_end -= size; + } + else if (parser->pointer == parser->buffer_end) { + parser->pointer = parser->buffer; + parser->buffer_end = parser->buffer; + } + + /* Fill the buffer until it has enough characters. */ + + while (parser->unread < length) + { + /* Fill the raw buffer. */ + + if (!yaml_parser_update_raw_buffer(parser)) return 0; + + /* Decode the raw buffer. */ + + while (parser->raw_unread) + { + unsigned int ch; + int incomplete = 0; + + /* Decode the next character. */ + + switch (parser->encoding) + { + case YAML_UTF8_ENCODING: + + unsigned int utf8_length = UTF8_LENGTH(parser->raw_pointer); + unsigned int utf8_chunk; + + /* Check if the raw buffer contains an incomplete character. */ + + if (utf8_length > parser->raw_unread) { + if (parser->eof) { + parser->error = YAML_READER_ERROR; + return 0; + } + incomplete = 1; + } + + /* Get the character checking it for validity. */ + + utf8_chunk = UTF8_FIRST_CHUNK(parser->raw_pointer ++); + if (utf8_chunk == 0xFF) { + parser->error = YAML_READER_ERROR; + return 0; + } + ch = utf8_chunk; + parser->raw_unread --; + while (-- utf8_length) { + utf8_chunk = UTF8_NEXT_CHUNK(parser->raw_pointer ++); + if (utf8_chunk == 0xFF) { + parser->error = YAML_READER_ERROR; + return 0; + } + ch = ch << 6 + utf8_chunk; + parser->raw_unread --; + } + + break; + + case YAML_UTF16LE_ENCODING: + + /* Check if the raw buffer contains an incomplete character. */ + + if (parser->raw_unread < 2) { + if (parser->eof) { + parser->error = YAML_READER_ERROR; + return 0; + } + incomplete = 1; + } + + /* Get the current character. */ + + ch = UTF16LE_CHAR(parser->raw_pointer); + parser->raw_pointer += 2; + parser->raw_unread -= 2; + + break; + + case YAML_UTF16BE_ENCODING: + + /* Check if the raw buffer contains an incomplete character. */ + + if (parser->raw_unread < 2) { + if (parser->eof) { + parser->error = YAML_READER_ERROR; + return 0; + } + incomplete = 1; + } + + /* Get the current character. */ + + ch = UTF16BE_CHAR(parser->raw_pointer); + parser->raw_pointer += 2; + parser->raw_unread -= 2; + + break; + } + + /* + * Check if the character is in the allowed range: + * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) + * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) + * | [#x10000-#x10FFFF] (32 bit) + */ + + if (! (ch == 0x09 || ch == 0x0A || ch == 0x0D + || (ch >= 0x20 && ch <= 0x7E) + || (ch == 0x85) || (ch >= 0xA0 && ch <= 0xD7FF) + || (ch >= 0xE000 && ch <= 0xFFFD) + || (ch >= 0x10000 && ch <= 0x10FFFF))) { + parser->error = YAML_READER_ERROR; + return 0; + } + + /* Finally put the character into the buffer. */ + + /* 0000 0000-0000 007F -> 0xxxxxxx */ + if (ch <= 0x7F) { + *(parser->buffer_end++) = ch; + } + /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ + else if (ch <= 0x7FF) { + *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F; + *(parser->buffer_end++) = 0x80 + ch & 0x3F; + } + /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ + else if (ch <= 0xFFFF) { + *(parser->buffer_end++) = 0x80 + ch & 0x3F; + *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F; + + } + /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + else { + } } - parser->raw_buffer_size = RAW_BUFFER_SIZE; + } - /* Next, determine the input encoding. */ +} - if (!parser->encoding) { - if (!yaml_parser_determine_encoding(parser)) +/* + * Determine the input stream encoding by checking the BOM symbol. If no BOM is + * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. + */ + +int +yaml_parser_determine_encoding(yaml_parser_t *parser) +{ + /* Ensure that we had enough bytes in the raw buffer. */ + + while (!parser->eof && parser->raw_unread < 2) { + if (!yaml_parser_update_raw_buffer(parser)) { return 0; + } } - /* more... */ + /* Determine the encoding. */ + if (parser->raw_unread >= 2 && IS_UTF16BE_BOM(parser->raw_pointer)) { + parser->encoding = YAML_UTF16BE_ENCODING; + } + else if (parser->raw_unread >= 2 && IS_UTF16LE_BOM(parser->raw_pointer)) { + parser->encoding = YAML_UTF16LE_ENCODING; + } + else { + parser->encoding = YAML_UTF8_ENCODING; + } } +/* + * Update the raw buffer. + */ + +int +yaml_parser_update_raw_buffer(yaml_parser_t *parser) +{ + size_t size_read = 0; + + /* Return if the raw buffer is full. */ + + if (parser->raw_unread == YAML_RAW_BUFFER_SIZE) return 1; + /* Return on EOF. */ + + if (parser->eof) return 1; + + /* Move the remaining bytes in the raw buffer to the beginning. */ + + if (parser->raw_unread && parser->raw_buffer < parser->raw_pointer) { + memmove(parser->raw_buffer, parser->raw_pointer, parser->raw_unread); + } + parser->raw_pointer = parser->raw_buffer; + + /* Call the read handler to fill the buffer. */ + + if (!parser->read_handler(parser->read_handler_data, + parser->raw_buffer + parser->raw_unread, + YAML_RAW_BUFFER_SIZE - parser->raw_unread, + &size_read)) { + parser->error = YAML_READER_ERROR; + return 0; + } + parser->raw_unread += size_read; + if (!size_read) { + parser->eof = 1; + } + + return 1; +} -- cgit v1.2.1