diff options
-rw-r--r-- | rdflib/plugins/parsers/hext.py | 13 | ||||
-rw-r--r-- | rdflib/plugins/serializers/hext.py | 14 | ||||
-rw-r--r-- | test/test_parser_hext.py | 39 | ||||
-rw-r--r-- | test/test_roundtrip.py | 25 | ||||
-rw-r--r-- | test/test_serializer_hext.py | 74 |
5 files changed, 153 insertions, 12 deletions
diff --git a/rdflib/plugins/parsers/hext.py b/rdflib/plugins/parsers/hext.py index 59e045cf..fab70c78 100644 --- a/rdflib/plugins/parsers/hext.py +++ b/rdflib/plugins/parsers/hext.py @@ -24,14 +24,23 @@ class HextuplesParser(Parser): pass def _load_json_line(self, line: str): - return [x if x != "" else None for x in json.loads(line)] + # this complex handing is because the 'value' component is + # allowed to be "" but not None + # all other "" values are treated as None + ret1 = json.loads(line) + ret2 = [x if x != "" else None for x in ret1] + if ret1[2] == "": + ret2[2] = "" + return ret2 def _parse_hextuple(self, cg: ConjunctiveGraph, tup: List[Union[str, None]]): # all values check # subject, predicate, value, datatype cannot be None # language and graph may be None if tup[0] is None or tup[1] is None or tup[2] is None or tup[3] is None: - raise ValueError("subject, predicate, value, datatype cannot be None") + raise ValueError( + "subject, predicate, value, datatype cannot be None. Given: " + f"{tup}") # 1 - subject s: Union[URIRef, BNode] diff --git a/rdflib/plugins/serializers/hext.py b/rdflib/plugins/serializers/hext.py index c86882a2..3fdf3684 100644 --- a/rdflib/plugins/serializers/hext.py +++ b/rdflib/plugins/serializers/hext.py @@ -2,7 +2,8 @@ HextuplesSerializer RDF graph serializer for RDFLib. See <https://github.com/ontola/hextuples> for details about the format. """ -from typing import IO, Optional, Union +from typing import IO, Optional, Type, Union +import json from rdflib.graph import Graph, ConjunctiveGraph from rdflib.term import Literal, URIRef, Node, BNode from rdflib.serializer import Serializer @@ -19,7 +20,9 @@ class HextuplesSerializer(Serializer): def __init__(self, store: Union[Graph, ConjunctiveGraph]): self.default_context: Optional[Node] + self.graph_type: Type[Graph] if isinstance(store, ConjunctiveGraph): + self.graph_type = ConjunctiveGraph self.contexts = list(store.contexts()) if store.default_context: self.default_context = store.default_context @@ -27,6 +30,7 @@ class HextuplesSerializer(Serializer): else: self.default_context = None else: + self.graph_type = Graph self.contexts = [store] self.default_context = None @@ -101,14 +105,14 @@ class HextuplesSerializer(Serializer): else: language = "" - return '["%s", "%s", "%s", "%s", "%s", "%s"]\n' % ( + return json.dumps([ self._iri_or_bn(triple[0]), triple[1], value, datatype, language, - self._context(context), - ) + self._context(context) + ]) + "\n" else: # do not return anything for non-IRIs or BNs, e.g. QuotedGraph, Subjects return None @@ -121,7 +125,7 @@ class HextuplesSerializer(Serializer): return None def _context(self, context): - if self.default_context is None: + if self.graph_type == Graph: return "" if context.identifier == "urn:x-rdflib:default": return "" diff --git a/test/test_parser_hext.py b/test/test_parser_hext.py index 27d00838..fdf41911 100644 --- a/test/test_parser_hext.py +++ b/test/test_parser_hext.py @@ -22,13 +22,30 @@ def test_small_string(): assert len(d) == 10 +def test_small_string_cg(): + s = """ + ["http://example.com/s01", "http://example.com/a", "http://example.com/Type1", "globalId", "", ""] + ["http://example.com/s01", "http://example.com/label", "This is a Label", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", ""] + ["http://example.com/s01", "http://example.com/comment", "This is a comment", "http://www.w3.org/2001/XMLSchema#string", "", ""] + ["http://example.com/s01", "http://example.com/creationDate", "2021-12-01", "http://www.w3.org/2001/XMLSchema#date", "", ""] + ["http://example.com/s01", "http://example.com/creationTime", "2021-12-01T12:13:00", "http://www.w3.org/2001/XMLSchema#dateTime", "", ""] + ["http://example.com/s01", "http://example.com/age", "42", "http://www.w3.org/2001/XMLSchema#integer", "", ""] + ["http://example.com/s01", "http://example.com/trueFalse", "false", ",http://www.w3.org/2001/XMLSchema#boolean", "", ""] + ["http://example.com/s01", "http://example.com/op1", "http://example.com/o1", "globalId", "", ""] + ["http://example.com/s01", "http://example.com/op1", "http://example.com/o2", "globalId", "", ""] + ["http://example.com/s01", "http://example.com/op2", "http://example.com/o3", "globalId", "", ""] + """ + d = ConjunctiveGraph().parse(data=s, format="hext") + assert len(d) == 10 + + def test_small_file_singlegraph(): d = Dataset().parse(Path(__file__).parent / "test_parser_hext_singlegraph.ndjson", format="hext") assert len(d) == 10 def test_small_file_multigraph(): - d = ConjunctiveGraph() + d = Dataset() assert len(d) == 0 d.parse( Path(__file__).parent / "test_parser_hext_multigraph.ndjson", @@ -47,6 +64,26 @@ def test_small_file_multigraph(): assert total_triples == 18 +def test_small_file_multigraph_cg(): + d = ConjunctiveGraph() + assert len(d) == 0 + d.parse( + Path(__file__).parent / "test_parser_hext_multigraph.ndjson", + format="hext", + publicID=d.default_context.identifier + ) + + """There are 22 lines in the file test_parser_hext_multigraph.ndjson. When loaded + into a CG, we get only 18 quads since the the CG can contextualise + the triples and thus deduplicate 4.""" + total_triples = 0 + # count all the triples in the Dataset + for context in d.contexts(): + for triple in context.triples((None, None, None)): + total_triples += 1 + assert total_triples == 18 + + def test_roundtrip(): # these are some RDF files that HexT can round-trip since the have no # literals with no datatype declared: diff --git a/test/test_roundtrip.py b/test/test_roundtrip.py index 3b083cdd..4a7b7acd 100644 --- a/test/test_roundtrip.py +++ b/test/test_roundtrip.py @@ -11,6 +11,7 @@ from _pytest.mark.structures import Mark, MarkDecorator, ParameterSet import rdflib import rdflib.compare from rdflib.util import guess_format +from rdflib.namespace import XSD """ Test round-tripping by all serializers/parser that are registered. @@ -116,6 +117,14 @@ XFAILS = { reason="rdflib.compare.isomorphic does not work for quoted graphs.", raises=AssertionError, ), + ("hext", "n3-writer-test-22.n3"): pytest.mark.xfail( + reason='HexTuples conflates "" and ""^^xsd:string strings', + raises=AssertionError, + ), + ("hext", "rdf-test-21.n3"): pytest.mark.xfail( + reason='HexTuples conflates "" and ""^^xsd:string strings', + raises=AssertionError, + ), } # This is for files which can only be represented properly in one format @@ -155,6 +164,18 @@ def roundtrip(infmt: str, testfmt: str, source: Path, verbose: bool = False) -> g2 = rdflib.ConjunctiveGraph() g2.parse(data=s, format=testfmt) + if testfmt == "hext": + # HexTuples always sets Literal("abc") -> Literal("abc", datatype=XSD.string) + # and this prevents roundtripping since most other formats don't equate "" with + # ""^^xsd:string, at least not in these tests + # + # So we have to scrub the literals' string datatype declarations... + for c in g2.contexts(): + for s, p, o in c.triples((None, None, None)): + if type(o) == rdflib.Literal and o.datatype == XSD.string: + c.remove((s, p, o)) + c.add((s, p, rdflib.Literal(str(o)))) + if verbose: both, first, second = rdflib.compare.graph_diff(g1, g2) print("Diff:") @@ -193,8 +214,8 @@ def get_formats() -> Set[str]: def make_cases(files: Collection[Tuple[Path, str]]) -> Iterable[ParameterSet]: formats = get_formats() for testfmt in formats: - if testfmt == "hext": - continue + # if testfmt == "hext": + # continue logging.debug("testfmt = %s", testfmt) for f, infmt in files: constrained_formats = CONSTRAINED_FORMAT_MAP.get(f.name, None) diff --git a/test/test_serializer_hext.py b/test/test_serializer_hext.py index c322a211..7231338f 100644 --- a/test/test_serializer_hext.py +++ b/test/test_serializer_hext.py @@ -1,7 +1,7 @@ import sys from pathlib import Path sys.path.append(str(Path(__file__).parent.parent.absolute())) -from rdflib import Dataset, Graph +from rdflib import Dataset, Graph, ConjunctiveGraph import json @@ -31,7 +31,7 @@ def test_hext_graph(): g.parse(data=turtle_data, format="turtle") out = g.serialize(format="hext") - # note: cant' test for BNs in result as they will be different ever time + # note: can't test for BNs in result as they will be different every time testing_lines = [ [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'], [False, '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", ""]'], @@ -54,6 +54,76 @@ def test_hext_graph(): assert all([x[0] for x in testing_lines]) +def test_hext_cg(): + """Tests ConjunctiveGraph data""" + d = ConjunctiveGraph() + trig_data = """ + PREFIX ex: <http://example.com/> + PREFIX owl: <http://www.w3.org/2002/07/owl#> + PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> + + ex:g1 { + ex:s1 + ex:p1 ex:o1 , ex:o2 ; + ex:p2 [ + a owl:Thing ; + rdf:value "thingy" ; + ] ; + ex:p3 "Object 3" , "Object 4 - English"@en ; + ex:p4 "2021-12-03"^^xsd:date ; + ex:p5 42 ; + ex:p6 "42" ; + . + } + + ex:g2 { + ex:s1 + ex:p1 ex:o1 , ex:o2 ; + . + ex:s11 ex:p11 ex:o11 , ex:o12 . + } + + # default graph triples + ex:s1 ex:p1 ex:o1 , ex:o2 . + ex:s21 ex:p21 ex:o21 , ex:o22 . + + # other default graph triples + { + ex:s1 ex:p1 ex:o1 , ex:o2 . + } + """ + d.parse(data=trig_data, format="trig", publicID=d.default_context.identifier) + out = d.serialize(format="hext") + # note: cant' test for BNs in result as they will be different ever time + testing_lines = [ + [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o21", "globalId", "", ""]'], + [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o22", "globalId", "", ""]'], + [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'], + [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", ""]'], + [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o12", "globalId", "", "http://example.com/g2"]'], + [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g2"]'], + [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o11", "globalId", "", "http://example.com/g2"]'], + [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g2"]'], + [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g1"]'], + [False, '["http://example.com/s1", "http://example.com/p2"'], + [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#value", "thingy", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'], + [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2002/07/owl#Thing", "globalId", "", "http://example.com/g1"]'], + [False, '["http://example.com/s1", "http://example.com/p3", "Object 4 - English", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", "http://example.com/g1"]'], + [False, '["http://example.com/s1", "http://example.com/p6", "42", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'], + [False, '["http://example.com/s1", "http://example.com/p4", "2021-12-03", "http://www.w3.org/2001/XMLSchema#date", "", "http://example.com/g1"]'], + [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g1"]'], + [False, '["http://example.com/s1", "http://example.com/p5", "42", "http://www.w3.org/2001/XMLSchema#integer", "", "http://example.com/g1"]'], + [False, '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'], + ] + for line in out.splitlines(): + for test in testing_lines: + if test[1] in line: + test[0] = True + + assert all([x[0] for x in testing_lines]) + + def test_hext_dataset(): """Tests context-aware (multigraph) data""" d = Dataset() |