1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
#!/usr/bin/env python
'''
This benchmark will produce graph digests for all of the
downloadable ontologies available in Bioportal.
'''
from rdflib import *
from rdflib.compare import to_isomorphic
import sys, csv
from urllib import *
from io import StringIO
from collections import defaultdict
from threading import *
from Queue import Queue, Empty
# SPARQL query run against the parsed Bioportal ontology listing.  For each
# resource typed metadata:Ontology it selects the ontology node, its
# metadata:omvname value (?title) and the value reached through
# metadata:links (?download), keeping only values matching "/download".
bioportal_query = '''
PREFIX metadata: <http://data.bioontology.org/metadata/>
select distinct ?ontology ?title ?download where {
?ontology a metadata:Ontology;
metadata:omvname ?title;
metadata:links ?links.
?links metadata:Ontology ?download.
filter(regex(?download, "/download"))
}
'''
# Field names, in output order, of the CSV report written by
# bioportal_benchmark (handed to csv.DictWriter as its header).
stat_cols = [
    # Assigned by bioportal_benchmark when a task is queued.
    'id', 'ontology', 'download_url',
    # Not assigned anywhere in this file; presumably filled in by
    # graph_digest(stats) -- confirm against rdflib.compare.
    'tree_depth', 'color_count', 'individuations', 'prunings',
    'initial_color_count', 'adjacent_nodes', 'initial_coloring_runtime',
    'triple_count', 'graph_digest', 'to_hash_runtime',
    'canonicalize_triples_runtime',
    # Assigned by the worker when processing an ontology raises.
    'error',
]
def bioportal_benchmark(apikey, output_file, threads):
    '''
    Compute graph digests for the downloadable Bioportal ontologies and
    write one CSV row of statistics per ontology.

    The ontology listing is fetched from data.bioontology.org, parsed as
    JSON-LD and queried with ``bioportal_query``.  Every result row becomes
    a task; a pool of daemon worker threads loads each ontology, converts it
    with ``to_isomorphic`` and calls ``graph_digest`` on it, while the
    calling thread writes finished rows to ``output_file`` as they arrive.
    Failures while processing an ontology are recorded in that row's
    ``error`` column instead of aborting the run.

    :param apikey: Bioportal API key, appended to every request URL.
    :param output_file: path of the CSV file to (over)write; its columns
        are ``stat_cols``.
    :param threads: number of worker threads (any value ``int()`` accepts,
        e.g. a command-line string).
    '''
    url = 'http://data.bioontology.org/ontologies?apikey=%s' % apikey
    print(url)
    ontology_list_json = urlopen(url).read()

    ontology_graph = Graph()
    # Decode explicitly as UTF-8: a bare unicode(...) uses the ASCII codec
    # and raises as soon as the listing contains a non-ASCII character.
    ontology_graph.parse(StringIO(ontology_list_json.decode('utf-8')),
                         format="json-ld")
    ontologies = ontology_graph.query(bioportal_query)
    task_count = len(ontologies)

    tasks = Queue()
    finished_tasks = Queue()

    def process_tasks():
        # Worker loop.  It never returns; the threads are daemons, so they
        # do not keep the process alive once the writer loop is done.
        while True:
            # Queue.get() does its own locking, so no extra lock is needed.
            stats = tasks.get()
            print("%s %s" % (stats['ontology'], stats['download_url']))
            try:
                og = Graph()
                og.load(stats['download_url'] + "?apikey=%s" % apikey)
                # graph_digest() is handed the stats dict (presumably to
                # record its measurements there); its return value is not
                # needed here.
                to_isomorphic(og).graph_digest(stats)
            except Exception as e:
                # Best effort: note the failure in the row and keep going.
                print(e)
                stats['error'] = str(e)
            # Hand the row over on success and on failure alike, otherwise
            # the writer loop below would wait forever for the missing rows.
            finished_tasks.put(stats)
            tasks.task_done()

    # The context manager closes the report even if writing a row fails.
    with open(output_file, 'w') as w:
        writer = csv.DictWriter(w, stat_cols)
        writer.writeheader()

        for i in range(int(threads)):
            print("Starting worker %d" % i)
            t = Thread(target=process_tasks)
            t.daemon = True
            t.start()

        for ontology, title, download in ontologies:
            # defaultdict(str) so columns that never get a value read as ''.
            stats = defaultdict(str)
            stats.update({
                "id": ontology,
                "ontology": title,
                "download_url": download,
            })
            tasks.put(stats)

        # Exactly one finished row is expected per queued task.
        for _ in range(task_count):
            stats = finished_tasks.get()
            print("Writing %s" % stats['ontology'])
            writer.writerow(stats)
            # Flush per row so partial results survive an interrupted run.
            w.flush()
if __name__ == '__main__':
    # Command line: <apikey> <output_file> <threads>.  Fail with a usage
    # message rather than an IndexError traceback when arguments are
    # missing; extra arguments are ignored.
    if len(sys.argv) < 4:
        sys.exit("usage: %s APIKEY OUTPUT_CSV THREADS" % sys.argv[0])
    bioportal_benchmark(sys.argv[1], sys.argv[2], sys.argv[3])
|