/**
* Copyright 2008-2010 Digital Bazaar, Inc.
*
* This file is part of librdfa.
*
* librdfa is Free Software, and can be licensed under any of the
* following three licenses:
*
* 1. GNU Lesser General Public License (LGPL) V2.1 or any
* newer version
* 2. GNU General Public License (GPL) V2 or any newer version
* 3. Apache License, V2.0 or any newer version
*
* You may not use this file except in compliance with at least one of
* the above three licenses.
*
* See LICENSE-* at the top of this software distribution for more
* information regarding the details of each license.
*
* You should have received a copy of the GNU Lesser General Public
* License along with librdfa. If not, see .
*
* The librdfa library is the Fastest RDFa Parser in the Universe. It is
* a stream parser, meaning that it takes an XML data as input and spits
* out RDF triples as it comes across them in the stream. Due to this
* processing approach, librdfa has a very, very small memory footprint.
* It is also very fast and can operate on hundreds of gigabytes of XML
* data without breaking a sweat.
*
* Usage:
*
* rdfacontext* context = rdfa_create_context(base_uri);
* context->callback_data = your_user_data;
* rdfa_set_triple_handler(context, triple_function);
* rdfa_set_buffer_filler(context, buffer_filler_function);
* rdfa_parse(context);
* rdfa_free_context(context);
*
* If you would like to get warnings/error triples from the processor graph:
*
* rdfa_set_issue_handler(context, triple_function);
*
* Usage if you need more control over when to fill rdfa's buffer:
*
* rdfacontext* context = rdfa_create_context(base_uri);
* context->callback_data = your_user_data;
* rdfa_set_triple_handler(context, triple_function);
* int rval = rdfa_parse_start(context);
* if(rval == RDFA_PARSE_SUCCESS)
* {
* FILE* myfile = fopen("myfilename");
* size_t buf_len = 0;
* size_t read = 0;
* do
* {
* char* buf = rdfa_get_buffer(context, &buf_len);
* if(buf_len > 0)
* {
* // fill buffer here up to buf_len bytes from your input stream
* read = fread(buf, sizeof(char), buf_len, myfile);
* }
*
* // parse the read data
* rdfa_parse_buffer(context, read);
* }
* while(read > 0);
* fclose(myfile);
*
* rdfa_parse_end(context);
* }
* rdfa_free_context(context);
*
*/
#ifndef _LIBRDFA_RDFA_H_
#define _LIBRDFA_RDFA_H_
#include
#include
/* Activate the stupid Windows DLL exporting mechanism if we're building for Windows */
#ifdef WIN32
#define DLLEXPORT __declspec(dllexport)
#else
#define DLLEXPORT
#endif
#ifdef LIBRDFA_IN_RAPTOR
#include "raptor2.h"
#include "raptor_internal.h"
#endif /* LIBRDFA_IN_RAPTOR */
#ifdef __cplusplus
extern "C"
{
#endif
#define DEBUG 0
/* RDFa version numbers */
#define RDFA_VERSION_1_0 1
#define RDFA_VERSION_1_1 2
/* parse process return types */
#define RDFA_PARSE_WARNING -2
#define RDFA_PARSE_FAILED -1
#define RDFA_PARSE_UNKNOWN 0
#define RDFA_PARSE_SUCCESS 1
/* maximum list lengths */
#define MAX_LOCAL_LIST_MAPPINGS 32
#define MAX_LIST_MAPPINGS 48
#define MAX_LIST_ITEMS 16
#define MAX_TERM_MAPPINGS 64
#define MAX_URI_MAPPINGS 128
#define MAX_INCOMPLETE_TRIPLES 128
/* host language definitions */
#define HOST_LANGUAGE_NONE 0
#define HOST_LANGUAGE_XML1 1
#define HOST_LANGUAGE_XHTML1 2
#define HOST_LANGUAGE_HTML 3
/* default mapping key for xmlns */
#define XMLNS_DEFAULT_MAPPING "XMLNS_DEFAULT"
/* whitespace characters for RDFa Core 1.1 */
#define RDFA_WHITESPACE " \t\n\v\f\r"
/**
* An RDF resource type is used to denote the content of a triple's
* object value.
*/
typedef enum
{
RDF_TYPE_NAMESPACE_PREFIX,
RDF_TYPE_IRI,
RDF_TYPE_PLAIN_LITERAL,
RDF_TYPE_XML_LITERAL,
RDF_TYPE_TYPED_LITERAL,
RDF_TYPE_UNKNOWN
} rdfresource_t;
/**
* An RDF triple is the result of an RDFa statement that contains, at
* the very least, a subject, a predicate and an object. It is the
* smallest, complete statement one can make in RDF.
*/
typedef struct rdftriple
{
char* subject;
char* predicate;
char* object;
rdfresource_t object_type;
char* datatype;
char* language;
} rdftriple;
/**
* The specification for a callback that is capable of handling
* triples. Produces a triple that must be freed once the application
* is done with the object.
*/
typedef void (*triple_handler_fp)(rdftriple*, void*);
/**
* The specification for a callback that is used to fill the input buffer
* with data to parse.
*/
typedef size_t (*buffer_filler_fp)(char*, size_t, void*);
/**
* An RDFA list item is used to hold each datum in an rdfa list. It
* contains a list of flags as well as the data for the list member.
*/
typedef struct rdfalistitem
{
unsigned char flags;
void* data;
} rdfalistitem;
/**
* An RDFa list is used to store multiple text strings that have a set
* of attributes associated with them. These can be lists of CURIEs,
* or lists of incomplete triples. The structure grows with use, but
* cannot be shrunk.
*/
typedef struct rdfalist
{
rdfalistitem** items;
size_t num_items;
size_t max_items;
unsigned int user_data;
} rdfalist;
/**
* The RDFa Parser structure is responsible for keeping track of the state of
* the current RDFa parser. Things such as the default namespace,
* CURIE mappings, and other context-specific
*/
typedef struct rdfacontext
{
unsigned char rdfa_version;
char* base;
char* parent_subject;
char* parent_object;
char* default_vocabulary;
#ifndef LIBRDFA_IN_RAPTOR
void** uri_mappings;
#endif
void** term_mappings;
void** list_mappings;
void** local_list_mappings;
rdfalist* incomplete_triples;
rdfalist* local_incomplete_triples;
char* language;
unsigned char host_language;
triple_handler_fp default_graph_triple_callback;
buffer_filler_fp buffer_filler_callback;
triple_handler_fp processor_graph_triple_callback;
unsigned char recurse;
unsigned char skip_element;
char* new_subject;
char* current_object_resource;
char* about;
char* typed_resource;
char* resource;
char* href;
char* src;
char* content;
char* datatype;
rdfalist* property;
unsigned char inlist_present;
unsigned char rel_present;
unsigned char rev_present;
char* plain_literal;
size_t plain_literal_size;
char* xml_literal;
size_t xml_literal_size;
void* callback_data;
/* parse state */
size_t bnode_count;
char* underscore_colon_bnode_name;
unsigned char xml_literal_namespaces_defined;
unsigned char xml_literal_xml_lang_defined;
size_t wb_allocated;
char* working_buffer;
size_t wb_position;
#ifdef LIBRDFA_IN_RAPTOR
raptor_world *world;
raptor_locator *locator;
/* a pointer (in every context) to the error_handlers structure
* held in the raptor_parser object */
raptor_uri* base_uri;
raptor_sax2* sax2;
raptor_namespace_handler namespace_handler;
void* namespace_handler_user_data;
int raptor_rdfa_version; /* 10 or 11 or otherwise default */
#else
xmlParserCtxtPtr parser;
#endif
int done;
rdfalist* context_stack;
size_t wb_preread;
int preread;
int depth;
} rdfacontext;
/**
* Creates an initial context for RDFa.
*
* @param base The base URI that should be used for the parser.
*
* @return a pointer to the base RDFa context, or NULL if memory
* allocation failed.
*/
DLLEXPORT rdfacontext* rdfa_create_context(const char* base);
/**
* Sets the default graph triple handler for the application.
*
* @param context the base rdfa context for the application.
* @param th the triple handler function.
*/
DLLEXPORT void rdfa_set_default_graph_triple_handler(
rdfacontext* context, triple_handler_fp th);
/**
* Sets the processor graph triple handler for the application.
*
* @param context the base rdfa context for the application.
* @param th the triple handler function.
*/
DLLEXPORT void rdfa_set_processor_graph_triple_handler(
rdfacontext* context, triple_handler_fp th);
/**
* Sets the buffer filler for the application.
*
* @param context the base rdfa context for the application.
* @param bf the buffer filler function.
*/
DLLEXPORT void rdfa_set_buffer_filler(
rdfacontext* context, buffer_filler_fp bf);
/**
* Starts processing given the base rdfa context.
*
* @param context the base rdfa context.
*
* @return RDFA_PARSE_SUCCESS if everything went well. RDFA_PARSE_FAILED
* if there was a fatal error and RDFA_PARSE_WARNING if there
* was a non-fatal error.
*/
DLLEXPORT int rdfa_parse(rdfacontext* context);
DLLEXPORT int rdfa_parse_start(rdfacontext* context);
DLLEXPORT int rdfa_parse_chunk(
rdfacontext* context, char* data, size_t wblen, int done);
/**
* Gets the input buffer for the given context so it can be filled with data.
* A pointer to the buffer will be returned and the maximum number of bytes
* that can be written to that buffer will be set to the blen parameter. Once
* data has been written to the buffer, rdfa_parse_buffer() should be called.
*
* @param context the base rdfa context.
* @param blen the variable to set to the buffer length.
*
* @return a pointer to the context's input buffer.
*/
DLLEXPORT char* rdfa_get_buffer(rdfacontext* context, size_t* blen);
/**
* Informs the parser to attempt to parse more of the given context's input
* buffer. To fill the input buffer with data, call rdfa_get_buffer().
*
* If any of the input buffer can be parsed, it will be. It is possible
* that none of the data will be parsed, in which case this function will
* still return RDFA_PARSE_SUCCESS. More data should be written to the input
* buffer using rdfa_get_buffer() as it is made available to the application.
* Once there is no more data to write, rdfa_parse_end() should be called.
*
* @param context the base rdfa context.
* @param bytes the number of bytes written to the input buffer via the last
* call to rdfa_get_buffer(), a value of 0 will indicate that there
* is no more data to parse.
*
* @return RDFA_PARSE_SUCCESS if everything went well. RDFA_PARSE_FAILED
* if there was a fatal error and RDFA_PARSE_WARNING if there
* was a non-fatal error.
*/
DLLEXPORT int rdfa_parse_buffer(rdfacontext* context, size_t bytes);
DLLEXPORT void rdfa_parse_end(rdfacontext* context);
DLLEXPORT void rdfa_init_context(rdfacontext* context);
DLLEXPORT char* rdfa_iri_get_base(const char* iri);
/**
* Destroys the given rdfa context by freeing all memory associated
* with the context.
*
* @param context the rdfa context.
*/
DLLEXPORT void rdfa_free_context(rdfacontext* context);
#ifdef __cplusplus
}
#endif
#endif