From 11e5784ed2616b51a464d5e7c0b487cf575d08bf Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Wed, 1 Jul 2015 18:29:22 +0100 Subject: schema: Add prototype validation code Current sticking points: there's a bug in rdflib that means the Resource() api doesn't work how I'd expect ... and we need the RDF, RDFS, OWL and DC schemas available too in order to do a 'validation' stage. Change-Id: I1d64fbb02af05724bcf967457c287600bb494d5a --- schema/surf-test.py | 32 +++++++++++++++++++----- schema/validate.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 6 deletions(-) create mode 100644 schema/validate.py diff --git a/schema/surf-test.py b/schema/surf-test.py index 0f0fab87..6e60bfe5 100644 --- a/schema/surf-test.py +++ b/schema/surf-test.py @@ -10,6 +10,7 @@ import os import warnings import parse +import validate DATABASE = 'memory' @@ -40,15 +41,34 @@ elif DATABASE == 'virtuoso': session = surf.Session(store) + parse.load_all_morphologies(session, store) +schema = rdflib.Graph() +schema.parse("baserock-owl-schema.turtle", format="turtle") + +# Only works for 'memory' database, but I don't really care any more. +all_data = store.reader.graph + +validate.check_data_against_schema( + data=all_data, + schema=schema) + + +def serialize_to_json_ld(surflib_resource): + rdflib_graph = surflib_resource.graph() + + context = { + "@vocab": "http://baserock.org/definitions/example-schema#", + "@language": "en" + } + # requires rdflib-jsonld Python module. 
+ return rdflib_graph.serialize(format='json-ld', indent=4, context=context) + + Cluster = session.get_class(surf.ns.BASEROCK.Cluster) cluster = Cluster.all() for s in cluster: s.load() - # hack - text = s.serialize('json') - import json - data = json.loads(text) - print json.dumps(data, indent=4) - break + print serialize_to_json_ld(s) + diff --git a/schema/validate.py b/schema/validate.py new file mode 100644 index 00000000..56eb7230 --- /dev/null +++ b/schema/validate.py @@ -0,0 +1,71 @@ +import rdflib + + +class ValidationError(Exception): + pass + +class UnknownClass(ValidationError): + def __init__(self, owl_class): + super(UnknownClass, self).__init__( + "Class %s is not defined by the given schema." % owl_class) + + +def check_data_against_schema(data, schema): + '''Validate data against a schema. + + The data is assumed to be an RDFLib.Graph instance containing a set of + arbitrary triples. + + The schema is assumed to be an RDFLib.Graph instance containing one or more + OWL ontologies. + + This function assumes 'data' should be completely authoritative according to + the schema (i.e. everything that must be known about a resource is known). + You can see this as assuming a "closed world" rather than an "open world". + It also assumes there should be nothing in 'data' that the schema does not + describe. + + ''' + for uriref in data.subjects(): + resource = rdflib.resource.Resource(data, uriref) + validate_resource(resource, data, schema) + + +def validate_resource(resource, data, schema): + RDF = rdflib.RDF + OWL = rdflib.OWL + + # Are all the types of 'resource' classes defined in the schema? 
+ classes = schema[:RDF.type:OWL.Class] + resource_types = resource.value(RDF.type) + + for resource_type in resource_types: + if resource_type.identifier not in classes: + raise UnknownClass(resource_type) + + resource_property_value_pairs = resource.predicate_objects() + for prop, value in resource_property_value_pairs: + validate_resource_property(resource, prop, value, data, + schema) + + +def validate_resource_property(resource, prop, value, data, schema): + RDF = rdflib.RDF + OWL = rdflib.OWL + + resource_types = resource.value(RDF.type) + + prop_uriref = prop.identifier + + # FIXME: this gives no results at all ... + prop = rdflib.resource.Resource(schema, prop) + print list(prop.items()) + + # But this gives the expected results. :( + print list(schema[prop_uriref:]) + + # FIXME: some properties are defined in rdfs, dc, owl, .... need those + # ontologies available to validate too. + + import pdb + pdb.set_trace() -- cgit v1.2.1