diff options
author | Sam Thursfield <sam@afuera.me.uk> | 2016-07-07 00:59:15 +0100 |
---|---|---|
committer | Sam Thursfield <sam@afuera.me.uk> | 2016-07-07 00:59:15 +0100 |
commit | 661f2e377309b523f7406ae86d8f21b378f886a7 (patch) | |
tree | 09aafde0d7e64c5a8b8711c9ec24cee83e6e08a7 | |
parent | eb8873a67090adc57202b4b74c693fe31c9252aa (diff) | |
download | tracker-wip/sam/resource-jsonld.tar.gz |
WORK IN PROGRESS converting 400 extractor tests (branch: wip/sam/resource-jsonld)
Ignore this commit (or finish it off for me ... :-)
4 files changed, 47 insertions, 258 deletions
diff --git a/tests/functional-tests/400-extractor.py b/tests/functional-tests/400-extractor.py index 140b5d527..4fd3b8b5e 100755 --- a/tests/functional-tests/400-extractor.py +++ b/tests/functional-tests/400-extractor.py @@ -1,6 +1,7 @@ #!/usr/bin/python # # Copyright (C) 2010, Nokia <ivan.frade@nokia.com> +# Copyright (C) 2016, Sam Thursfield <sam@afuera.me.uk> # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -24,7 +25,7 @@ directory (containing xxx.expected files) """ from common.utils import configuration as cfg -from common.utils.extractor import get_tracker_extract_output +from common.utils.extractor import get_tracker_extract_jsonld_output import unittest2 as ut import os import sys @@ -112,7 +113,7 @@ class ExtractionTestCase (ut.TestCase): filename_to_extract = self.configParser.get ("TestFile", "Filename") self.file_to_extract = os.path.join (desc_root, filename_to_extract) - result = get_tracker_extract_output(self.file_to_extract) + result = get_tracker_extract_jsonld_output(self.file_to_extract) self.__assert_extraction_ok (result) def assertDictHasKey (self, d, key, msg=None): @@ -156,6 +157,8 @@ class ExtractionTestCase (ut.TestCase): unexpected_pairs.append ( (k[1:].replace ("_", ":"), v) ) elif k.startswith ("@"): expected_keys.append ( k[1:].replace ("_", ":") ) + elif k == 'a': + expected_keys.append ( '@type' ) else: expected_pairs.append ( (k.replace ("_", ":"), v) ) diff --git a/tests/functional-tests/common/utils/extractor.py b/tests/functional-tests/common/utils/extractor.py index 8dd05604e..7ca54701c 100644 --- a/tests/functional-tests/common/utils/extractor.py +++ b/tests/functional-tests/common/utils/extractor.py @@ -1,6 +1,7 @@ #!/usr/bin/python # # Copyright (C) 2010, Nokia <ivan.frade@nokia.com> +# Copyright (C) 2016, Sam Thursfield <sam@afuera.me.uk> # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU 
General Public License @@ -20,246 +21,19 @@ from common.utils import configuration as cfg from common.utils.helpers import log + +import json import os -import re import subprocess -class ExtractorParser(object): - def parse_tracker_extract_output(self, text): - """ - Parse stdout of `tracker-extract --file` to get SPARQL statements. - - Calls the extractor a returns a dictionary of property, value. - - Example: - { 'nie:filename': 'a.jpeg' , - 'tracker:added': '2008-12-12T12:23:34Z' - } - """ - - metadata = {} - parts = self.get_statements_from_stdout_output(text) - extras = self.__process_where_part(parts['where']) - for attribute_value in self.__process_lines(parts['item']): - att, value = attribute_value.split(" ", 1) - if value.startswith("?") and extras.has_key(value): - value = extras[value] - - if metadata.has_key(att): - metadata [att].append(value) - else: - metadata [att] = [value] - - return metadata - - def get_statements_from_stdout_output(self, text): - lines = text.split('\n') - parts = {} - - current_part = None - part_start = None - - i = 0 - for i in range(0, len(lines)): - if lines[i] == 'SPARQL pre-update:': - current_part = 'preupdate' - elif lines[i] == 'SPARQL item:': - current_part = 'item' - elif lines[i] == 'SPARQL where clause:': - current_part = 'where' - elif lines[i] == 'SPARQL post-update:': - current_part = 'postupdate' - - if lines[i] == '--': - if part_start is None: - part_start = i + 1 - else: - part_lines = lines[part_start:i] - parts[current_part] = '\n'.join(part_lines) - current_part = None - part_start = None - - if current_part is not None: - raise Exception("End of text while parsing %s in tracker-extract " - "output" % current_part) - - if len(parts) == 0: - raise Exception("No metadata was found by tracker-extract") - - return parts - - def __process_lines(self, embedded): - """ - Translate each line in a "prop value" string, handling anonymous nodes. 
- - Example: - nfo:width 699 ; -> 'nfo:width 699' - or - nao:hasTag [ a nao:Tag ; - nao:prefLabel "tracker"] ; -> nao:hasTag:prefLabel 'tracker' - - Would be so cool to implement this with yield and generators... :) - """ - grouped_lines = [] - current_line = "" - anon_node_open = False - for l in embedded.split ("\n\t"): - if "[" in l: - current_line = current_line + l - anon_node_open = True - continue - - if "]" in l: - anon_node_open = False - current_line += l - final_lines = self.__handle_anon_nodes (current_line.strip ()) - grouped_lines = grouped_lines + final_lines - current_line = "" - continue - - if anon_node_open: - current_line += l - else: - if (len (l.strip ()) == 0): - continue - - final_lines = self.__handle_multivalues (l.strip ()) - grouped_lines = grouped_lines + final_lines - - return map (self.__clean_value, grouped_lines) - - def __process_where_part(self, where): - gettags = re.compile ("(\?\w+)\ a\ nao:Tag\ ;\ nao:prefLabel\ \"([\w\ -]+)\"") - tags = {} - for l in where.split ("\n"): - if len (l) == 0: - continue - match = gettags.search (l) - if (match): - tags [match.group(1)] = match.group (2) - else: - print "This line is not a tag:", l - - return tags - - def __handle_multivalues(self, line): - """ - Split multivalues like: - a nfo:Image, nmm:Photo ; - -> a nfo:Image ; - -> a nmm:Photo ; - """ - hasEscapedComma = re.compile ("\".+,.+\"") - - if "," in line and not hasEscapedComma.search (line): - prop, multival = line.split (" ", 1) - results = [] - for value in multival.split (","): - results.append ("%s %s" % (prop, value.strip ())) - return results - else: - return [line] - - def __handle_anon_nodes(self, line): - """ - Traslates anonymous nodes in 'flat' properties: - - nao:hasTag [a nao:Tag; nao:prefLabel "xxx"] - -> nao:hasTag:prefLabel "xxx" - - slo:location [a slo:GeoLocation; slo:postalAddress <urn:uuid:1231-123> .] 
- -> slo:location <urn:uuid:1231-123> - - nfo:hasMediaFileListEntry [ a nfo:MediaFileListEntry ; nfo:entryUrl "file://x.mp3"; nfo:listPosition 1] - -> nfo:hasMediaFileListEntry:entryUrl "file://x.mp3" - - """ - - # hasTag case - if line.startswith ("nao:hasTag"): - getlabel = re.compile ("nao:prefLabel\ \"([\w\ -]+)\"") - match = getlabel.search (line) - if (match): - line = 'nao:hasTag:prefLabel "%s" ;' % (match.group(1)) - return [line] - else: - print "Whats wrong on line", line, "?" - return [line] - - # location case - elif line.startswith ("slo:location"): - results = [] - - # Can have country AND/OR city - getpa = re.compile ("slo:postalAddress\ \<([\w:-]+)\>") - pa_match = getpa.search (line) - - if (pa_match): - results.append ('slo:location:postalAddress "%s" ;' % (pa_match.group(1))) - else: - print "FIXME another location subproperty in ", line - - return results - elif line.startswith ("nco:creator"): - getcreator = re.compile ("nco:fullname\ \"([\w\ ]+)\"") - creator_match = getcreator.search (line) - - if (creator_match): - new_line = 'nco:creator:fullname "%s" ;' % (creator_match.group (1)) - return [new_line] - else: - print "Something special in this line '%s'" % (line) - - elif line.startswith ("nfo:hasMediaFileListEntry"): - return self.__handle_playlist_entries (line) - - else: - return [line] - - def __handle_playlist_entries(self, line): - """ - Playlist entries come in one big line: - nfo:hMFLE [ a nfo:MFLE; nfo:entryUrl '...'; nfo:listPosition X] , [ ... ], [ ... ] - -> nfo:hMFLE:entryUrl '...' - -> nfo:hMFLE:entryUrl '...' - ... 
- """ - geturl = re.compile ("nfo:entryUrl \"([\w\.\:\/]+)\"") - entries = line.strip () [len ("nfo:hasMediaFileListEntry"):] - results = [] - for entry in entries.split (","): - url_match = geturl.search (entry) - if (url_match): - new_line = 'nfo:hasMediaFileListEntry:entryUrl "%s" ;' % (url_match.group (1)) - results.append (new_line) - else: - print " *** Something special in this line '%s'" % (entry) - return results - - def __clean_value(self, value): - """ - the value comes with a ';' or a '.' at the end - """ - if (len (value) < 2): - return value.strip () - - clean = value.strip () - if value[-1] in [';', '.']: - clean = value [:-1] - - clean = clean.replace ("\"", "") - - return clean.strip () - - -def get_tracker_extract_output(filename, mime_type=None): +def get_tracker_extract_jsonld_output(filename, mime_type=None): """ Runs `tracker-extract --file` to extract metadata from a file. """ - tracker_extract = os.path.join (cfg.EXEC_PREFIX, 'tracker-extract') - command = [tracker_extract, '--file', filename] + tracker = os.path.join (cfg.BINDIR, 'tracker') + command = [tracker, 'extract', '--verbosity=errors', '--output-format=json-ld', filename] if mime_type is not None: command.extend(['--mime', mime_type]) @@ -267,8 +41,13 @@ def get_tracker_extract_output(filename, mime_type=None): log ('Running: %s' % ' '.join(command)) output = subprocess.check_output (command) except subprocess.CalledProcessError as e: - raise Exception("Error %i from tracker-extract, output: %s" % - (e.returncode, e.output)) + raise RuntimeError("Error %i from tracker-extract, see stderr for " + "details" % e.returncode) + + try: + data = json.loads (output) + except ValueError as e: + raise RuntimeError("Invalid JSON returned by tracker-extract: " + "%s.\nOutput was: %s" % (e, output)) - parser = ExtractorParser() - return parser.parse_tracker_extract_output(output) + return data diff --git a/tests/functional-tests/test-extraction-data/office/pdf-doc.expected 
b/tests/functional-tests/test-extraction-data/office/pdf-doc.expected index 93de5b724..6f16a1762 100644 --- a/tests/functional-tests/test-extraction-data/office/pdf-doc.expected +++ b/tests/functional-tests/test-extraction-data/office/pdf-doc.expected @@ -1,9 +1,11 @@ -[TestFile] -Filename=pdf-doc.pdf -Comment=PDF document from the office tools +{ + "test": { + "Filename": "pdf-doc.pdf", + "Comment": "PDF document from the office tools" + }, -[Metadata] - -[Meego] -a=nfo:PaginatedTextDocument -nfo_pageCount=22 + "metadata": { + "@type": "nfo:PaginatedTextDocument", + "nfo:pageCount": "22" + } +} diff --git a/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected b/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected index fd822f84d..6a260518a 100644 --- a/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected +++ b/tests/functional-tests/test-extraction-data/playlists/playlist-test-1.expected @@ -1,13 +1,18 @@ -[TestFile] -Filename=playlist-test-1.m3u -Bugzilla= -Comment=Regular m3u playlist file +{ + "test": { + "Filename": "playlist-test-1.m3u", + "Comment": "Regular m3u playlist file" + } -[Metadata] -a=nmm:Playlist -nfo_entryCounter=5 -nfo_hasMediaFileListEntry_entryUrl=http://www.apnaradio.com/live/ApnaRadio.mp3 -nfo_hasMediaFileListEntry_entryUrl=http://live.apnaradio.com:6464 -nfo_hasMediaFileListEntry_entryUrl=http://live.apnaradio.com:2424 -nfo_hasMediaFileListEntry_entryUrl=http://www.apnaradio.com/live/MaintenanceE.mp3 -nfo_hasMediaFileListEntry_entryUrl=http://www.apnaradio.com/live/MaintenanceP.mp3 + "metadata": { + "@type": "nmm:Playlist", + "nfo:entryCounter": "5", + "nfo:hasMediaFileListEntry": [ + { "nfo:entryUrl": "http://www.apnaradio.com/live/ApnaRadio.mp3" }, + { "nfo:entryUrl": "http://live.apnaradio.com:6464" }, + { "nfo:entryUrl": "http://live.apnaradio.com:2424" }, + { "nfo:entryUrl": "http://www.apnaradio.com/live/MaintenanceE.mp3" }, + { "nfo:entryUrl": 
"http://www.apnaradio.com/live/MaintenanceP.mp3" } + ] + } +} |