Skip to content

Commit 2a60318

Browse files
Merge pull request #92 from GenomicDataInfrastructure/use-ldp-to-navigate-trough-fdp
refactor(domain): restructure FDP processing to use LDP hierarchy
2 parents 6930b67 + 710cbc5 commit 2a60318

9 files changed

+245
-227
lines changed

ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py

Lines changed: 30 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@
66

77
import logging
88
from typing import Dict, Iterable, Union
9+
from collections import deque
910

1011
import requests
1112
from rdflib import DCAT, DCTERMS, RDF, BNode, Graph, Literal, Namespace, URIRef
1213
from rdflib.term import Node
1314
from requests import HTTPError, JSONDecodeError
1415

1516
from ckanext.fairdatapoint.harvesters.domain.fair_data_point import FairDataPoint
17+
from ckanext.fairdatapoint.harvesters.domain.graph_to_fdp_record_mapper import GraphToFdpRecordMapper
1618
from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier
1719

1820
# Namespace of the W3C Linked Data Platform (LDP) vocabulary.
LDP = Namespace("http://www.w3.org/ns/ldp#")
@@ -28,51 +30,18 @@ def __init__(self, fdp_end_point: str, harvest_catalogs: bool = False):
2830
self.harvest_catalogs = harvest_catalogs
2931

3032
def get_record_ids(self) -> Dict.keys:
    """Return the GUIDs of the FDP records that should become CKAN packages.

    Every record reachable from the FDP end point is visited through
    ``_breath_first_search_records``.  For each record:

    * if ``self.harvest_catalogs`` is set and the record is a catalog, it is
      registered under a ``"catalog"`` identifier;
    * otherwise, if the record is a dataset, it is registered under a
      ``"dataset"`` identifier;
    * any other record is skipped.

    Returns:
        A view over the keys of a dict that maps each record GUID to the
        URL of the record it was built from.
    """
    fdp_end_point = self.fair_data_point.fdp_end_point
    # Lazy %-style arguments: the message is only rendered when DEBUG is on.
    log.debug("FAIR Data Point get_records from %s", fdp_end_point)

    result = dict()
    for fdp_record in self._breath_first_search_records(fdp_end_point):
        # Decide which kind of record this is; the catalog check wins when
        # both apply, exactly as in the if/elif it replaces.
        if self.harvest_catalogs and fdp_record.is_catalog():
            record_kind = "catalog"
        elif fdp_record.is_dataset():
            record_kind = "dataset"
        else:
            continue

        identifier = Identifier("")
        identifier.add(record_kind, str(fdp_record.url))
        result[identifier.guid] = fdp_record.url
    return result.keys()
7645

7746
def get_record_by_id(self, guid: str) -> str:
7847
"""
@@ -158,6 +127,24 @@ def get_values(
158127
for value in graph.objects(subject=subject_uri, predicate=predicate_uri):
159128
yield value
160129

130+
def _map_record(self, url: str):
    """Fetch the RDF graph served at ``url`` and map it to a record.

    The graph comes from ``self.fair_data_point.get_graph`` and is handed to
    a ``GraphToFdpRecordMapper`` created for the same URL; whatever the
    mapper's ``map`` returns (or raises) is passed through unchanged.
    """
    rdf_graph = self.fair_data_point.get_graph(url)
    return GraphToFdpRecordMapper(url).map(rdf_graph)
134+
135+
def _breath_first_search_records(self, start_url: str):
    """Yield every record reachable from ``start_url``, breadth first.

    Each URL taken from the queue is mapped to a record with
    ``_map_record``; the record is yielded and the URLs returned by its
    ``children()`` are queued for a later visit.  A URL is mapped at most
    once, so a resource reachable through several parents (or through a
    cycle) is yielded a single time.

    Children are queued in sorted order.  ``children()`` hands back a set of
    URL strings, whose iteration order is not stable between interpreter
    runs; sorting keeps the traversal and the order of the yielded records
    reproducible.

    NOTE: the name keeps its historical spelling ("breath" rather than
    "breadth") because ``get_record_ids`` calls it by that name.

    Args:
        start_url: URL at which the traversal begins.

    Yields:
        The truthy records returned by ``_map_record``, in visiting order.
    """
    queue = deque([start_url])
    visited = set()
    while queue:
        url = queue.popleft()
        if url in visited:
            # The same child may have been queued by more than one parent.
            continue
        visited.add(url)
        record = self._map_record(url)
        if record:
            yield record
            # Children that were already mapped would only be dequeued and
            # discarded again, so they are not queued at all.
            queue.extend(
                child for child in sorted(record.children()) if child not in visited
            )
147+
161148
@staticmethod
162149
def _remove_fdp_defaults(g, subject_uri):
163150
for s, p, o in g.triples((subject_uri, DCTERMS.accessRights, None)):
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# SPDX-FileCopyrightText: 2023 Civity
2+
# SPDX-FileContributor: 2024 Stichting Health-RI
3+
#
4+
# SPDX-License-Identifier: AGPL-3.0-only
5+
6+
from rdflib import DCAT, RDF, URIRef
7+
8+
9+
class FdpRecord:
    """One FAIR Data Point resource: its URL, its RDF graph and its children.

    The child URLs are collected through ``add_children`` and read back with
    ``children``.  ``is_catalog`` and ``is_dataset`` look the record's own URL
    up in the graph it was created with.
    """

    def __init__(self, url, graph):
        self._graph = graph
        self._children = set()
        self.url = url

    def _has_rdf_type(self, rdf_class):
        """Tell whether the graph states ``<url> rdf:type <rdf_class>``."""
        type_triple = (URIRef(self.url), RDF.type, rdf_class)
        return type_triple in self._graph

    def add_children(self, child_url):
        """Register ``child_url`` as a child; a repeated URL is stored once."""
        self._children.add(child_url)

    def children(self):
        """Return the set of child URLs registered so far."""
        return self._children

    def is_catalog(self):
        """Return True when the graph types this record as ``dcat:Catalog``."""
        return self._has_rdf_type(DCAT.Catalog)

    def is_dataset(self):
        """Return True when the graph types this record as ``dcat:Dataset``."""
        return self._has_rdf_type(DCAT.Dataset)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# SPDX-FileCopyrightText: 2023 Civity
2+
# SPDX-FileContributor: 2024 Stichting Health-RI
3+
#
4+
# SPDX-License-Identifier: AGPL-3.0-only
5+
6+
from ckanext.fairdatapoint.harvesters.domain.fdp_record import FdpRecord
7+
from rdflib import Namespace
8+
9+
# W3C Linked Data Platform (LDP) vocabulary; ``LDP.contains`` is the predicate
# the mapper below matches to find a record's children.
LDP = Namespace("http://www.w3.org/ns/ldp#")
10+
11+
12+
class GraphToFdpRecordMapper:
    """Turn the RDF graph fetched for one URL into an ``FdpRecord``."""

    def __init__(self, url):
        self.url = url

    def map(self, rdf_graph):
        """Build the record for ``self.url`` from ``rdf_graph``.

        The object of every ``ldp:contains`` triple in the graph, whatever
        its subject, is added to the record as a child URL string.

        Args:
            rdf_graph: iterable of ``(subject, predicate, object)`` triples.

        Returns:
            The ``FdpRecord`` wrapping ``self.url`` and ``rdf_graph``.

        Raises:
            ValueError: if ``rdf_graph`` is ``None``.
        """
        if rdf_graph is None:
            raise ValueError("rdf_graph cannot be None")

        fdp_record = FdpRecord(self.url, rdf_graph)
        contained = (
            target for _, relation, target in rdf_graph if relation == LDP.contains
        )
        for child in contained:
            fdp_record.add_children(str(child))
        return fdp_record
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-FileCopyrightText: 2024 Stichting Health-RI
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-only
4+
5+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
6+
@prefix dct: <http://purl.org/dc/terms/> .
7+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
8+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
9+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
10+
@prefix ex: <http://example.org/> .
11+
@prefix dcatap: <http://data.europa.eu/r5r/> .
12+
13+
# Study as an LDP DirectContainer
14+
ex:Study1 a ldp:DirectContainer;
15+
dct:title "Study on Genomic Data"@en ;
16+
ldp:membershipResource ex:Study1 ;
17+
ldp:hasMemberRelation dcat:dataset ;
18+
ldp:contains ex:Population1 .
19+
20+
# Population as an LDP DirectContainer under Study
21+
ex:Population1 a ldp:DirectContainer ;
22+
dct:title "Population Data"@en ;
23+
ldp:membershipResource ex:Population1 ;
24+
ldp:hasMemberRelation dcat:datasetSeries ;
25+
ldp:contains ex:Dataseries1 .
26+
27+
ex:Dataseries1 a ldp:DirectContainer, dcat:DatasetSeries ;
28+
dct:title "Dataseries of Population Study"@en ;
29+
ldp:membershipResource ex:Dataseries1 ;
30+
ldp:hasMemberRelation dcat:dataset ;
31+
ldp:contains ex:Dataset1 .
32+
33+
# Dataset under Dataseries
34+
ex:Dataset1 a dcat:Dataset ;
35+
dct:title "Genomic Variation Dataset"@en ;
36+
dct:description "Genomic data collected from multiple populations."@en .
37+
38+
ex:Catalog1 a ldp:DirectContainer, dcat:Catalog ;
39+
dct:title "Catalog of Datasets"@en ;
40+
ldp:membershipResource ex:Catalog1 ;
41+
ldp:hasMemberRelation dcat:dataset ;
42+
ldp:contains ex:Dataset1 .
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# SPDX-FileCopyrightText: 2024 Stichting Health-RI
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-only
4+
5+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
6+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
7+
8+
<http://example.com>
9+
a ldp:Container ;
10+
ldp:contains <http://example.com/catalog1> .
11+
12+
<http://example.com/catalog1>
13+
a dcat:Catalog, ldp:Container ;
14+
ldp:contains <http://example.com/dataset1> ;
15+
dcat:dataset <http://example.com/dataset1> .
16+
17+
<http://example.com/dataset1>
18+
a dcat:Dataset, ldp:Resource .

ckanext/fairdatapoint/tests/test_data/process_catalogs.ttl

Lines changed: 0 additions & 10 deletions
This file was deleted.

ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl

Lines changed: 11 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3,59 +3,19 @@
33
# SPDX-License-Identifier: AGPL-3.0-only
44

55
@prefix dcat: <http://www.w3.org/ns/dcat#> .
6-
@prefix dcterms: <http://purl.org/dc/terms/> .
7-
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
6+
@prefix dct: <http://purl.org/dc/terms/> .
87
@prefix ldp: <http://www.w3.org/ns/ldp#> .
9-
@prefix ns1: <http://semanticscience.org/resource/> .
10-
@prefix ns2: <https://w3id.org/fdp/fdp-o#> .
11-
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
12-
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
8+
@prefix ex: <http://example.org/> .
139

14-
<https://fair.healthinformationportal.eu/catalog/> a ldp:DirectContainer ;
15-
dcterms:title "Catalogs" ;
16-
ldp:contains <https://fair.healthinformationportal.eu/catalog/113b6f3a-07b5-484b-9278-4b58d2b247ed>,
17-
<https://fair.healthinformationportal.eu/catalog/14225c50-00b0-4fba-8300-a677ab0c86f4>,
18-
<https://fair.healthinformationportal.eu/catalog/17412bc2-daf1-491e-94fb-6680f7a67b1e> ;
19-
ldp:hasMemberRelation ns2:metadataCatalog ;
20-
ldp:membershipResource <https://fair.healthinformationportal.eu> .
10+
ex:Catalog1 a ldp:DirectContainer, dcat:Catalog ;
11+
dct:title "Catalog of Datasets"@en ;
12+
ldp:membershipResource ex:Catalog1 ;
13+
ldp:hasMemberRelation dcat:dataset ;
14+
ldp:contains ex:Dataset1 .
2115

22-
<https://fair.healthinformationportal.eu#identifier> a <http://purl.org/spar/datacite/Identifier> ;
23-
dcterms:identifier "https://fair.healthinformationportal.eu" .
16+
# Dataset contained in Catalog1
17+
ex:Dataset1 a dcat:Dataset ;
18+
dct:title "Genomic Variation Dataset"@en ;
19+
dct:description "Genomic data collected from multiple populations."@en .
2420

25-
<https://fair.healthinformationportal.eu#publisher> a foaf:Agent ;
26-
foaf:name "HealthInformationPortal.eu" .
27-
28-
<https://fair.healthinformationportal.eu/metrics/445c0a70d1e214e545b261559e2842f4> ns1:SIO_000332 <https://www.ietf.org/rfc/rfc3986.txt> ;
29-
ns1:SIO_000628 <https://www.ietf.org/rfc/rfc3986.txt> .
30-
31-
<https://fair.healthinformationportal.eu/metrics/5d27e854a9e78eb3f663331cd47cdc13> ns1:SIO_000332 <https://www.wikidata.org/wiki/Q8777> ;
32-
ns1:SIO_000628 <https://www.wikidata.org/wiki/Q8777> .
33-
34-
<https://fair.healthinformationportal.eu/profile/77aaad6a-0136-4c6e-88b9-07ffccd0ee4c> rdfs:label "FAIR Data Point Profile" .
35-
36-
<https://fair.healthinformationportal.eu> a dcat:DataService,
37-
dcat:Resource,
38-
ns2:FAIRDataPoint,
39-
ns2:MetadataService ;
40-
rdfs:label "European Health Information portal endpoint" ;
41-
dcterms:accessRights <https://fair.healthinformationportal.eu#accessRights> ;
42-
dcterms:conformsTo <https://fair.healthinformationportal.eu/profile/77aaad6a-0136-4c6e-88b9-07ffccd0ee4c> ;
43-
dcterms:description "Welcome to the one-stop shop that facilitates access to population health and health care data, information and expertise across Europe." ;
44-
dcterms:hasVersion "1.0" ;
45-
dcterms:language <http://id.loc.gov/vocabulary/iso639-1/en> ;
46-
dcterms:license <https://creativecommons.org/licenses/by-nc/4.0> ;
47-
dcterms:publisher <https://fair.healthinformationportal.eu#publisher> ;
48-
dcterms:rights <http://publications.europa.eu/resource/authority/access-right/PUBLIC> ;
49-
dcterms:title "European Health Information portal endpoint" ;
50-
ns1:SIO_000628 <https://fair.healthinformationportal.eu/metrics/445c0a70d1e214e545b261559e2842f4>,
51-
<https://fair.healthinformationportal.eu/metrics/5d27e854a9e78eb3f663331cd47cdc13> ;
52-
dcat:endpointURL <https://fair.healthinformationportal.eu> ;
53-
ns2:fdpSoftwareVersion "FDP:v1.16.2~51911d6" ;
54-
ns2:metadataCatalog <https://fair.healthinformationportal.eu/catalog/113b6f3a-07b5-484b-9278-4b58d2b247ed>,
55-
<https://fair.healthinformationportal.eu/catalog/14225c50-00b0-4fba-8300-a677ab0c86f4>,
56-
<https://fair.healthinformationportal.eu/catalog/17412bc2-daf1-491e-94fb-6680f7a67b1e> ;
57-
ns2:metadataIdentifier <https://fair.healthinformationportal.eu#identifier> ;
58-
ns2:metadataIssued "2023-10-06T09:56:33.262000+00:00"^^xsd:dateTime ;
59-
ns2:metadataModified "2023-10-25T14:04:46.441000+00:00"^^xsd:dateTime ;
60-
ns2:uiLanguage <http://id.loc.gov/vocabulary/iso639-1/en> .
6121

Lines changed: 20 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,27 @@
11
# SPDX-FileCopyrightText: 2024 Stichting Health-RI
22
#
33
# SPDX-License-Identifier: AGPL-3.0-only
4-
5-
@prefix dcat: <http://www.w3.org/ns/dcat#> .
64
@prefix dcterms: <http://purl.org/dc/terms/> .
7-
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
8-
@prefix ldp: <http://www.w3.org/ns/ldp#> .
9-
@prefix ns1: <http://semanticscience.org/resource/> .
10-
@prefix ns2: <https://w3id.org/fdp/fdp-o#> .
115
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
6+
@prefix fdp: <https://w3id.org/fdp/fdp-o#> .
7+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
128
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
139

14-
<https://fair.healthinformationportal.eu/catalog/> a ldp:DirectContainer ;
15-
dcterms:title "Catalogs" ;
16-
ldp:hasMemberRelation ns2:metadataCatalog ;
17-
ldp:membershipResource <https://fair.healthinformationportal.eu> .
18-
19-
<https://fair.healthinformationportal.eu#identifier> a <http://purl.org/spar/datacite/Identifier> ;
20-
dcterms:identifier "https://fair.healthinformationportal.eu" .
21-
22-
<https://fair.healthinformationportal.eu#publisher> a foaf:Agent ;
23-
foaf:name "HealthInformationPortal.eu" .
24-
25-
<https://fair.healthinformationportal.eu/metrics/445c0a70d1e214e545b261559e2842f4> ns1:SIO_000332 <https://www.ietf.org/rfc/rfc3986.txt> ;
26-
ns1:SIO_000628 <https://www.ietf.org/rfc/rfc3986.txt> .
27-
28-
<https://fair.healthinformationportal.eu/metrics/5d27e854a9e78eb3f663331cd47cdc13> ns1:SIO_000332 <https://www.wikidata.org/wiki/Q8777> ;
29-
ns1:SIO_000628 <https://www.wikidata.org/wiki/Q8777> .
30-
31-
<https://fair.healthinformationportal.eu/profile/77aaad6a-0136-4c6e-88b9-07ffccd0ee4c> rdfs:label "FAIR Data Point Profile" .
32-
33-
<https://fair.healthinformationportal.eu> a dcat:DataService,
34-
dcat:Resource,
35-
ns2:FAIRDataPoint,
36-
ns2:MetadataService ;
37-
rdfs:label "European Health Information portal endpoint" ;
38-
dcterms:accessRights <https://fair.healthinformationportal.eu#accessRights> ;
39-
dcterms:conformsTo <https://fair.healthinformationportal.eu/profile/77aaad6a-0136-4c6e-88b9-07ffccd0ee4c> ;
40-
dcterms:description "Welcome to the one-stop shop that facilitates access to population health and health care data, information and expertise across Europe." ;
41-
dcterms:hasVersion "1.0" ;
42-
dcterms:language <http://id.loc.gov/vocabulary/iso639-1/en> ;
43-
dcterms:license <https://creativecommons.org/licenses/by-nc/4.0> ;
44-
dcterms:publisher <https://fair.healthinformationportal.eu#publisher> ;
45-
dcterms:rights <http://publications.europa.eu/resource/authority/access-right/PUBLIC> ;
46-
dcterms:title "European Health Information portal endpoint" ;
47-
ns1:SIO_000628 <https://fair.healthinformationportal.eu/metrics/445c0a70d1e214e545b261559e2842f4>,
48-
<https://fair.healthinformationportal.eu/metrics/5d27e854a9e78eb3f663331cd47cdc13> ;
49-
dcat:endpointURL <https://fair.healthinformationportal.eu> ;
50-
ns2:fdpSoftwareVersion "FDP:v1.16.2~51911d6" ;
51-
ns2:metadataIdentifier <https://fair.healthinformationportal.eu#identifier> ;
52-
ns2:metadataIssued "2023-10-06T09:56:33.262000+00:00"^^xsd:dateTime ;
53-
ns2:metadataModified "2023-10-25T14:04:46.441000+00:00"^^xsd:dateTime ;
54-
ns2:uiLanguage <http://id.loc.gov/vocabulary/iso639-1/en> .
55-
10+
<https://example.com/fdp>
11+
a fdp:FairDataPoint ;
12+
dcterms:title "Example FDP" ;
13+
dcterms:description "This is an example FAIR Data Point without catalogs." ;
14+
dcterms:issued "2025-03-07T00:00:00Z"^^xsd:dateTime ;
15+
dcterms:modified "2025-03-07T00:00:00Z"^^xsd:dateTime ;
16+
dcterms:license <https://creativecommons.org/publicdomain/zero/1.0/> ;
17+
dcterms:publisher <https://example.com/publisher> ;
18+
dcterms:contactPoint <https://example.com/contact> .
19+
20+
<https://example.com/publisher>
21+
a foaf:Organization ;
22+
foaf:name "Example Organization" .
23+
24+
<https://example.com/contact>
25+
a foaf:Person ;
26+
foaf:name "FAIR Data Support" ;
27+
foaf:mbox "mailto:[email protected]" .

0 commit comments

Comments
 (0)