summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author xi <xi@18f92427-320e-0410-9341-c67f048884a3> 2006-05-29 20:08:09 +0000
committer xi <xi@18f92427-320e-0410-9341-c67f048884a3> 2006-05-29 20:08:09 +0000
commit 0e7b41b89d48956e214a842020a8293ef3ac394b (patch)
tree 54f6731b7262f38cff0c3b7f29b9371556f1346e
parent 9f2aadeadd713e7d041616d84c3033a822e559bf (diff)
download libyaml-0e7b41b89d48956e214a842020a8293ef3ac394b.tar.gz
Working on the decoding code.
git-svn-id: http://svn.pyyaml.org/libyaml/trunk@180 18f92427-320e-0410-9341-c67f048884a3
-rw-r--r-- include/yaml/yaml.h 53
-rw-r--r-- src/api.c 67
-rw-r--r-- src/reader.c 297
3 files changed, 365 insertions, 52 deletions
diff --git a/include/yaml/yaml.h b/include/yaml/yaml.h
index c4bd0ba..9ec2c7a 100644
--- a/include/yaml/yaml.h
+++ b/include/yaml/yaml.h
@@ -252,7 +252,7 @@ typedef struct {
* source. The handler should write not more than @a size bytes to the @a
* buffer. The number of written bytes should be set to the @a length variable.
*
- * @param[in] ext A pointer to an application data specified by
+ * @param[in] data A pointer to application data specified by
* @c yaml_parser_set_read_handler.
* @param[out] buffer The buffer to write the data from the source.
* @param[in] size The size of the buffer.
@@ -262,10 +262,22 @@ typedef struct {
* the returned value should be @c 0. On EOF, the handler should set the
* @a length to @c 0 and return @c 1.
*/
-typedef int yaml_read_handler_t(void *ext, unsigned char *buffer, size_t size,
+
+typedef int yaml_read_handler_t(void *data, unsigned char *buffer, size_t size,
size_t *size_read);
/**
+ * This structure holds a string input specified by
+ * @c yaml_parser_set_input_string.
+ */
+
+typedef struct {
+ unsigned char *start;
+ unsigned char *end;
+ unsigned char *current;
+} yaml_string_input_t;
+
+/**
* The parser structure.
*
* All members are internal. Manage the structure using the @c yaml_parser_
@@ -279,7 +291,7 @@ typedef struct {
* @{
*/
- error_type_t error;
+ yaml_error_type_t error;
/**
* @}
@@ -302,23 +314,23 @@ typedef struct {
/** The pointer to the beginning of the working buffer. */
yaml_char_t *buffer;
- /** The size of the buffer (in bytes). */
- size_t buffer_size;
+ /** The pointer to the end of the working buffer. */
+ yaml_char_t *buffer_end;
/** The pointer to the current character in the working buffer. */
- yaml_char_t *buffer_pointer;
+ yaml_char_t *pointer;
- /** The number of unread characters in the buffer (in characters). */
- size_t buffer_length;
+ /** The number of unread characters in the working buffer. */
+ size_t unread;
- /** The remaining undecoded characters. */
+ /** The pointer to the beginning of the raw buffer. */
unsigned char *raw_buffer;
- /** The size of the raw buffer (in bytes). */
- size_t raw_buffer_size;
+ /** The pointer to the current character in the raw buffer. */
+ unsigned char *raw_pointer;
- /** Is the application responsible for freeing the raw buffer? */
- int raw_buffer_foreign;
+ /** The number of unread bytes in the raw buffer. */
+ size_t raw_unread;
/** The input encoding. */
yaml_encoding_t encoding;
@@ -335,6 +347,9 @@ typedef struct {
/** The column of the current position (starting from @c 0). */
size_t column;
+ /* String input structure. */
+ yaml_string_input_t string_input;
+
/**
* @}
*/
@@ -461,6 +476,18 @@ yaml_realloc(void *ptr, size_t size);
void
yaml_free(void *ptr);
+/** The size of the raw buffer. */
+
+#define YAML_RAW_BUFFER_SIZE 16384
+
+/**
+ * The size of the buffer.
+ *
+ * We allocate enough space for decoding the whole raw buffer.
+ */
+
+#define YAML_BUFFER_SIZE (YAML_RAW_BUFFER_SIZE*3)
+
/** @} */
diff --git a/src/api.c b/src/api.c
index 0594727..aa183af 100644
--- a/src/api.c
+++ b/src/api.c
@@ -46,11 +46,35 @@ yaml_parser_new(void)
{
yaml_parser_t *parser;
+ /* Allocate the parser structure. */
+
parser = yaml_malloc(sizeof(yaml_parser_t));
if (!parser) return NULL;
memset(parser, 0, sizeof(yaml_parser_t));
+ /* Allocate the raw buffer. */
+
+ parser->raw_buffer = yaml_malloc(YAML_RAW_BUFFER_SIZE);
+ if (!parser->raw_buffer) {
+ yaml_free(parser);
+ return NULL;
+ }
+ parser->raw_pointer = parser->raw_buffer;
+ parser->raw_unread = 0;
+
+ /* Allocate the character buffer. */
+
+ parser->buffer = yaml_malloc(YAML_BUFFER_SIZE);
+ if (!parser->buffer) {
+ yaml_free(parser->raw_buffer);
+ yaml_free(parser);
+ return NULL;
+ }
+ parser->buffer_end = parser->buffer;
+ parser->pointer = parser->buffer;
+ parser->unread = 0;
+
return parser;
}
@@ -64,8 +88,7 @@ yaml_parser_delete(yaml_parser_t *parser)
assert(parser); /* Non-NULL parser object expected. */
yaml_free(parser->buffer);
- if (!parser->raw_buffer_foreign)
- yaml_free(parser->raw_buffer);
+ yaml_free(parser->raw_buffer);
memset(parser, 0, sizeof(yaml_parser_t));
@@ -73,14 +96,27 @@ yaml_parser_delete(yaml_parser_t *parser)
}
/*
- * String read handler (always returns error).
+ * String read handler.
*/
static int
yaml_string_read_handler(void *data, unsigned char *buffer, size_t size,
size_t *size_read)
{
- *size_read = 0;
+ yaml_string_input_t *input = data;
+
+ if (input->current == input->end) {
+ *size_read = 0;
+ return 1;
+ }
+
+ if (size > (input->end - input->current)) {
+ size = input->end - input->current;
+ }
+
+ memcpy(buffer, input->current, size);
+ input->current += size;
+ *size_read = size;
return 1;
}
@@ -92,8 +128,8 @@ static int
yaml_file_read_handler(void *data, unsigned char *buffer, size_t size,
size_t *size_read)
{
- *size_read = fread(buffer, 1, size, (FILE *)ext);
- return !ferror((FILE *)ext);
+ *size_read = fread(buffer, 1, size, (FILE *)data);
+ return !ferror((FILE *)data);
}
/*
@@ -105,16 +141,15 @@ yaml_parser_set_input_string(yaml_parser_t *parser,
unsigned char *input, size_t size)
{
assert(parser); /* Non-NULL parser object expected. */
- assert(!parser->reader); /* You can set the source only once. */
+ assert(!parser->read_handler); /* You can set the source only once. */
assert(input); /* Non-NULL input string expected. */
- parser->read_handler = yaml_string_read_handler;
- parser->read_handler_data = NULL;
+ parser->string_input.start = input;
+ parser->string_input.current = input;
+ parser->string_input.end = input+size;
- /* We use the input string as a raw (undecoded) buffer. */
- parser->raw_buffer = input;
- parser->raw_buffer_size = size;
- parser->raw_buffer_foreign = 1;
+ parser->read_handler = yaml_string_read_handler;
+ parser->read_handler_data = &parser->string_input;
}
/*
@@ -125,7 +160,7 @@ void
yaml_parser_set_input_file(yaml_parser_t *parser, FILE *file)
{
assert(parser); /* Non-NULL parser object expected. */
- assert(!parser->reader); /* You can set the source only once. */
+ assert(!parser->read_handler); /* You can set the source only once. */
assert(file); /* Non-NULL file object expected. */
parser->read_handler = yaml_file_read_handler;
@@ -141,11 +176,11 @@ yaml_parser_set_input(yaml_parser_t *parser,
yaml_read_handler_t *handler, void *data)
{
assert(parser); /* Non-NULL parser object expected. */
- assert(!parser->reader); /* You can set the source only once. */
+ assert(!parser->read_handler); /* You can set the source only once. */
assert(handler); /* Non-NULL read handler expected. */
parser->read_handler = handler;
- parser->read_handler_data = data
+ parser->read_handler_data = data;
}
/*
diff --git a/src/reader.c b/src/reader.c
index 787f785..440a88f 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -1,53 +1,304 @@
-#define RAW_BUFFER_SIZE 16384
-#define BUFFER_SIZE (RAW_BUFFER_SIZE*2) /* Should be enough for decoding
- the whole raw buffer. */
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <yaml/yaml.h>
+
+#include <assert.h>
+
+/* Check for the UTF-16-BE BOM. */
+#define IS_UTF16BE_BOM(pointer) ((pointer)[0] == 0xFE && (pointer)[1] == 0xFF)
+
+/* Check for the UTF-16-LE BOM. */
+#define IS_UTF16LE_BOM(pointer) ((pointer)[0] == 0xFF && (pointer)[1] == 0xFE)
+
+/* Get a UTF-16-BE character. */
+#define UTF16BE_CHAR(pointer) ((pointer)[0] << 8 + (pointer)[1])
+
+/* Get a UTF-16-LE character. */
+#define UTF16LE_CHAR(pointer) ((pointer)[0] + (pointer)[1] << 8)
+
+/*
+ * From http://www.ietf.org/rfc/rfc3629.txt:
+ *
+ * Char. number range | UTF-8 octet sequence
+ * (hexadecimal) | (binary)
+ * --------------------+---------------------------------------------
+ * 0000 0000-0000 007F | 0xxxxxxx
+ * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+/* Get the length of a UTF-8 character (0 on error). */
+#define UTF8_LENGTH(pointer) \
+ ((pointer)[0] < 0x80 ? 1 : \
+ (pointer)[0] < 0xC0 ? 0 : \
+ (pointer)[0] < 0xE0 ? 2 : \
+ (pointer)[0] < 0xF0 ? 3 : \
+ (pointer)[0] < 0xF8 ? 4 : 0)
+
+/* Get the value of the first byte of a UTF-8 sequence (0xFF on error). */
+#define UTF8_FIRST_CHUNK(pointer) \
+ ((pointer)[0] < 0x80 ? (pointer)[0] & 0x7F : \
+ (pointer)[0] < 0xC0 ? 0xFF : \
+ (pointer)[0] < 0xE0 ? (pointer)[0] & 0x1F : \
+ (pointer)[0] < 0xF0 ? (pointer)[0] & 0x0F : \
+ (pointer)[0] < 0xF8 ? (pointer)[0] & 0x07 : 0xFF)
+
+/* Get the value of a non-first byte of a UTF-8 sequence (0xFF on error). */
+#define UTF8_NEXT_CHUNK(pointer) \
+ ((pointer)[0] >= 0x80 && (pointer)[0] < 0xC0 ? (pointer)[0] & 0x3F : 0xFF)
+
+/* Determine the length of a UTF-8 character. */
/*
* Ensure that the buffer contains at least length characters.
* Return 1 on success, 0 on failure.
+ *
+ * The length is supposed to be significantly less than the buffer size.
*/
int
-yaml_parser_update_reader(yaml_parser_t *parser, size_t length)
+yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
{
/* If the EOF flag is set, do nothing. */
if (parser->eof)
return 1;
- /* First, let us check that the buffers are allocated. */
+ /* Return if the buffer contains enough characters. */
+
+ if (parser->unread >= length)
+ return 1;
+
+ /* Determine the input encoding if it is not known yet. */
- if (!parser->buffer) {
- parser->buffer = yaml_malloc(BUFFER_SIZE);
- if (!parser->buffer) {
- parser->error = YAML_MEMORY_ERROR;
+ if (!parser->encoding) {
+ if (!yaml_parser_determine_encoding(parser))
return 0;
- }
- parser->buffer_size = BUFFER_SIZE;
- parser->buffer_pointer = parser->buffer;
- parser->buffer_length = 0;
}
- if (!parser->raw_buffer) {
- parser->raw_buffer = yaml_malloc(RAW_BUFFER_SIZE);
- if (!parser->raw_buffer) {
- parser->error = YAML_MEMORY_ERROR;
- return 0;
+ /* Move the unread characters to the beginning of the buffer. */
+
+ if (parser->buffer < parser->pointer
+ && parser->pointer < parser->buffer_end) {
+ size_t size = parser->buffer_end - parser->pointer;
+ memmove(parser->buffer, parser->pointer, size);
+ parser->pointer = parser->buffer;
+ parser->buffer_end -= size;
+ }
+ else if (parser->pointer == parser->buffer_end) {
+ parser->pointer = parser->buffer;
+ parser->buffer_end = parser->buffer;
+ }
+
+ /* Fill the buffer until it has enough characters. */
+
+ while (parser->unread < length)
+ {
+ /* Fill the raw buffer. */
+
+ if (!yaml_parser_update_raw_buffer(parser)) return 0;
+
+ /* Decode the raw buffer. */
+
+ while (parser->raw_unread)
+ {
+ unsigned int ch;
+ int incomplete = 0;
+
+ /* Decode the next character. */
+
+ switch (parser->encoding)
+ {
+ case YAML_UTF8_ENCODING:
+
+ unsigned int utf8_length = UTF8_LENGTH(parser->raw_pointer);
+ unsigned int utf8_chunk;
+
+ /* Check if the raw buffer contains an incomplete character. */
+
+ if (utf8_length > parser->raw_unread) {
+ if (parser->eof) {
+ parser->error = YAML_READER_ERROR;
+ return 0;
+ }
+ incomplete = 1;
+ }
+
+ /* Get the character checking it for validity. */
+
+ utf8_chunk = UTF8_FIRST_CHUNK(parser->raw_pointer ++);
+ if (utf8_chunk == 0xFF) {
+ parser->error = YAML_READER_ERROR;
+ return 0;
+ }
+ ch = utf8_chunk;
+ parser->raw_unread --;
+ while (-- utf8_length) {
+ utf8_chunk = UTF8_NEXT_CHUNK(parser->raw_pointer ++);
+ if (utf8_chunk == 0xFF) {
+ parser->error = YAML_READER_ERROR;
+ return 0;
+ }
+ ch = ch << 6 + utf8_chunk;
+ parser->raw_unread --;
+ }
+
+ break;
+
+ case YAML_UTF16LE_ENCODING:
+
+ /* Check if the raw buffer contains an incomplete character. */
+
+ if (parser->raw_unread < 2) {
+ if (parser->eof) {
+ parser->error = YAML_READER_ERROR;
+ return 0;
+ }
+ incomplete = 1;
+ }
+
+ /* Get the current character. */
+
+ ch = UTF16LE_CHAR(parser->raw_pointer);
+ parser->raw_pointer += 2;
+ parser->raw_unread -= 2;
+
+ break;
+
+ case YAML_UTF16BE_ENCODING:
+
+ /* Check if the raw buffer contains an incomplete character. */
+
+ if (parser->raw_unread < 2) {
+ if (parser->eof) {
+ parser->error = YAML_READER_ERROR;
+ return 0;
+ }
+ incomplete = 1;
+ }
+
+ /* Get the current character. */
+
+ ch = UTF16BE_CHAR(parser->raw_pointer);
+ parser->raw_pointer += 2;
+ parser->raw_unread -= 2;
+
+ break;
+ }
+
+ /*
+ * Check if the character is in the allowed range:
+ * #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
+ * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
+ * | [#x10000-#x10FFFF] (32 bit)
+ */
+
+ if (! (ch == 0x09 || ch == 0x0A || ch == 0x0D
+ || (ch >= 0x20 && ch <= 0x7E)
+ || (ch == 0x85) || (ch >= 0xA0 && ch <= 0xD7FF)
+ || (ch >= 0xE000 && ch <= 0xFFFD)
+ || (ch >= 0x10000 && ch <= 0x10FFFF))) {
+ parser->error = YAML_READER_ERROR;
+ return 0;
+ }
+
+ /* Finally put the character into the buffer. */
+
+ /* 0000 0000-0000 007F -> 0xxxxxxx */
+ if (ch <= 0x7F) {
+ *(parser->buffer_end++) = ch;
+ }
+ /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
+ else if (ch <= 0x7FF) {
+ *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F;
+ *(parser->buffer_end++) = 0x80 + ch & 0x3F;
+ }
+ /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
+ else if (ch <= 0xFFFF) {
+ *(parser->buffer_end++) = 0x80 + ch & 0x3F;
+ *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F;
+
+ }
+ /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ else {
+ }
}
- parser->raw_buffer_size = RAW_BUFFER_SIZE;
+
}
- /* Next, determine the input encoding. */
+}
- if (!parser->encoding) {
- if (!yaml_parser_determine_encoding(parser))
+/*
+ * Determine the input stream encoding by checking the BOM symbol. If no BOM is
+ * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
+ */
+
+int
+yaml_parser_determine_encoding(yaml_parser_t *parser)
+{
+ /* Ensure that we have enough bytes in the raw buffer. */
+
+ while (!parser->eof && parser->raw_unread < 2) {
+ if (!yaml_parser_update_raw_buffer(parser)) {
return 0;
+ }
}
- /* more... */
+ /* Determine the encoding. */
+ if (parser->raw_unread >= 2 && IS_UTF16BE_BOM(parser->raw_pointer)) {
+ parser->encoding = YAML_UTF16BE_ENCODING;
+ }
+ else if (parser->raw_unread >= 2 && IS_UTF16LE_BOM(parser->raw_pointer)) {
+ parser->encoding = YAML_UTF16LE_ENCODING;
+ }
+ else {
+ parser->encoding = YAML_UTF8_ENCODING;
+ }
}
+/*
+ * Update the raw buffer.
+ */
+
+int
+yaml_parser_update_raw_buffer(yaml_parser_t *parser)
+{
+ size_t size_read = 0;
+
+ /* Return if the raw buffer is full. */
+
+ if (parser->raw_unread == YAML_RAW_BUFFER_SIZE) return 1;
+ /* Return on EOF. */
+
+ if (parser->eof) return 1;
+
+ /* Move the remaining bytes in the raw buffer to the beginning. */
+
+ if (parser->raw_unread && parser->raw_buffer < parser->raw_pointer) {
+ memmove(parser->raw_buffer, parser->raw_pointer, parser->raw_unread);
+ }
+ parser->raw_pointer = parser->raw_buffer;
+
+ /* Call the read handler to fill the buffer. */
+
+ if (!parser->read_handler(parser->read_handler_data,
+ parser->raw_buffer + parser->raw_unread,
+ YAML_RAW_BUFFER_SIZE - parser->raw_unread,
+ &size_read)) {
+ parser->error = YAML_READER_ERROR;
+ return 0;
+ }
+ parser->raw_unread += size_read;
+ if (!size_read) {
+ parser->eof = 1;
+ }
+
+ return 1;
+}