Skip to content

Commit 9b1f888

Browse files
Merge pull request #73 from GenomicDataInfrastructure/72-user-story-as-user-i-want-to-configure-per-harvest-source-i-have-want-to-harvest-the-catalog-as-well
72 user story as user i want to configure per harvest source i have want to harvest the catalog as well
2 parents cb9b75a + e781fa0 commit 9b1f888

File tree

6 files changed

+218
-64
lines changed

6 files changed

+218
-64
lines changed

README.md

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,11 @@ To install gdi-userportal-ckanext-fairdatapoint:
8686

8787
## Config settings
8888

89-
None at present
89+
There is a setting `ckanext.fairdatapoint.harvest_catalogs`. Default is `false`. If set to `true`,
90+
CKAN will harvest catalogs as datasets.
9091

91-
**TODO:** Document any optional config settings here. For example:
92-
93-
# The minimum number of hours to wait before re-checking a resource
94-
# (optional, default: 24).
95-
ckanext.fairdatapoint.some_setting = some_default_value
92+
The setting can be overridden in the harvester profile by setting `"harvest_catalogs": "true"` or
93+
`"harvest_catalogs": "false"` in the harvester configuration JSON.
9694

9795

9896
## Developer installation

ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,28 @@
1818
from typing import Dict, Iterable, Union
1919

2020

21-
LDP = Namespace('http://www.w3.org/ns/ldp#')
22-
VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
21+
LDP = Namespace("http://www.w3.org/ns/ldp#")
22+
VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")
2323

2424
log = logging.getLogger(__name__)
2525

2626

2727
class FairDataPointRecordProvider:
2828

29-
def __init__(self, fdp_end_point: str):
29+
def __init__(self, fdp_end_point: str, harvest_catalogs: bool = False):
3030
self.fair_data_point = FairDataPoint(fdp_end_point)
31+
self.harvest_catalogs = harvest_catalogs
3132

3233
def get_record_ids(self) -> Dict.keys:
3334
"""
3435
Returns all the FDP records which should end up as packages in CKAN to populate the "guids_in_harvest" list
3536
https://rdflib.readthedocs.io/en/stable/intro_to_parsing.html
3637
"""
37-
log.debug('FAIR Data Point get_records from {}'.format(self.fair_data_point.fdp_end_point))
38+
log.debug(
39+
"FAIR Data Point get_records from {}".format(
40+
self.fair_data_point.fdp_end_point
41+
)
42+
)
3843

3944
result = dict()
4045

@@ -52,20 +57,21 @@ def _process_catalog(self, path: Union[str, URIRef]) -> Dict:
5257
catalogs_graph = self.fair_data_point.get_graph(path)
5358

5459
for catalog_subject in catalogs_graph.subjects(RDF.type, DCAT.Catalog):
55-
identifier = Identifier('')
60+
identifier = Identifier("")
5661

57-
identifier.add('catalog', str(catalog_subject))
62+
identifier.add("catalog", str(catalog_subject))
5863

59-
result[identifier.guid] = catalog_subject
64+
if self.harvest_catalogs:
65+
result[identifier.guid] = catalog_subject
6066

6167
catalog_graph = self.fair_data_point.get_graph(catalog_subject)
6268

6369
for dataset_subject in catalog_graph.objects(predicate=DCAT.dataset):
64-
identifier = Identifier('')
70+
identifier = Identifier("")
6571

66-
identifier.add('catalog', str(catalog_subject))
72+
identifier.add("catalog", str(catalog_subject))
6773

68-
identifier.add('dataset', str(dataset_subject))
74+
identifier.add("dataset", str(dataset_subject))
6975

7076
result[identifier.guid] = dataset_subject
7177

@@ -76,7 +82,10 @@ def get_record_by_id(self, guid: str) -> str:
7682
Get additional information for FDP record.
7783
"""
7884
log.debug(
79-
'FAIR data point get_record_by_id from {} for {}'.format(self.fair_data_point.fdp_end_point, guid))
85+
"FAIR data point get_record_by_id from {} for {}".format(
86+
self.fair_data_point.fdp_end_point, guid
87+
)
88+
)
8089

8190
identifier = Identifier(guid)
8291

@@ -89,7 +98,9 @@ def get_record_by_id(self, guid: str) -> str:
8998
self._remove_fdp_defaults(g, subject_uri)
9099

91100
# Add information from distribution to graph
92-
for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution):
101+
for distribution_uri in g.objects(
102+
subject=subject_uri, predicate=DCAT.distribution
103+
):
93104
distribution_g = self.fair_data_point.get_graph(distribution_uri)
94105

95106
self._remove_fdp_defaults(g, distribution_uri)
@@ -99,17 +110,21 @@ def get_record_by_id(self, guid: str) -> str:
99110
DCTERMS.format,
100111
DCTERMS.license,
101112
DCTERMS.title,
102-
DCAT.accessURL
113+
DCAT.accessURL,
103114
]:
104-
for distr_attribute_value in self.get_values(distribution_g, distribution_uri, predicate):
115+
for distr_attribute_value in self.get_values(
116+
distribution_g, distribution_uri, predicate
117+
):
105118
g.add((distribution_uri, predicate, distr_attribute_value))
106119

107120
# Look-up contact information
108121
for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint):
109122
if isinstance(contact_point_uri, URIRef):
110-
self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri)
123+
self._parse_contact_point(
124+
g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri
125+
)
111126

112-
result = g.serialize(format='ttl')
127+
result = g.serialize(format="ttl")
113128

114129
return result
115130

@@ -123,19 +138,23 @@ def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRe
123138
g.add((subject_uri, DCAT.contactPoint, vcard_node))
124139
g.add((vcard_node, RDF.type, VCARD.Kind))
125140
g.add((vcard_node, VCARD.hasUID, contact_point_uri))
126-
if 'orcid' in str(contact_point_uri):
141+
if "orcid" in str(contact_point_uri):
127142
try:
128-
orcid_response = requests.get(str(contact_point_uri).rstrip('/') + '/public-record.json')
143+
orcid_response = requests.get(
144+
str(contact_point_uri).rstrip("/") + "/public-record.json"
145+
)
129146
json_orcid_response = orcid_response.json()
130-
name = json_orcid_response['displayName']
147+
name = json_orcid_response["displayName"]
131148
g.add((vcard_node, VCARD.fn, Literal(name)))
132149
except (JSONDecodeError, HTTPError) as e:
133-
log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}')
150+
log.error(f"Failed to get data from ORCID for {contact_point_uri}: {e}")
134151

135152
@staticmethod
136-
def get_values(graph: Graph,
137-
subject: Union[str, URIRef, Node],
138-
predicate: Union[str, URIRef, Node]) -> Iterable[Node]:
153+
def get_values(
154+
graph: Graph,
155+
subject: Union[str, URIRef, Node],
156+
predicate: Union[str, URIRef, Node],
157+
) -> Iterable[Node]:
139158
subject_uri = URIRef(subject)
140159
predicate_uri = URIRef(predicate)
141160

@@ -144,8 +163,8 @@ def get_values(graph: Graph,
144163

145164
@staticmethod
146165
def _remove_fdp_defaults(g, subject_uri):
147-
for (s, p, o) in g.triples((subject_uri, DCTERMS.accessRights, None)):
148-
access_rights_default = URIRef(f'{subject_uri}#accessRights')
166+
for s, p, o in g.triples((subject_uri, DCTERMS.accessRights, None)):
167+
access_rights_default = URIRef(f"{subject_uri}#accessRights")
149168
if o == access_rights_default:
150169
g.remove((subject_uri, DCTERMS.accessRights, o))
151170
g.remove((access_rights_default, None, None))

ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,60 @@
22
# SPDX-FileContributor: 2024 Stichting Health-RI
33
#
44
# SPDX-License-Identifier: AGPL-3.0-only
5-
5+
import logging
66

77
from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
8-
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import FairDataPointRecordProvider
9-
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import \
10-
FairDataPointRecordToPackageConverter
8+
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
9+
FairDataPointRecordProvider,
10+
)
11+
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
12+
FairDataPointRecordToPackageConverter,
13+
)
14+
from ckan.plugins import toolkit
1115

12-
PROFILE = 'profile'
16+
PROFILE = "profile"
17+
HARVEST_CATALOG = "harvest_catalogs"
18+
HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"
1319

20+
log = logging.getLogger(__name__)
1421

15-
class FairDataPointCivityHarvester(CivityHarvester):
1622

23+
class FairDataPointCivityHarvester(CivityHarvester):
1724
def setup_record_provider(self, harvest_url, harvest_config_dict):
18-
self.record_provider = FairDataPointRecordProvider(harvest_url)
25+
# The harvest-catalogs setting can be set at the global CKAN level, but can be overridden by the harvest config
26+
harvest_catalogs = self._get_harvest_catalog_setting(harvest_config_dict)
27+
28+
self.record_provider = FairDataPointRecordProvider(
29+
harvest_url, harvest_catalogs
30+
)
1931

2032
def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
2133
if PROFILE in harvest_config_dict:
22-
self.record_to_package_converter = FairDataPointRecordToPackageConverter(harvest_config_dict.get(PROFILE))
34+
self.record_to_package_converter = FairDataPointRecordToPackageConverter(
35+
harvest_config_dict.get(PROFILE)
36+
)
2337
else:
24-
raise Exception('[{0}] not found in harvester config JSON'.format(PROFILE))
38+
raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))
2539

26-
def info(self):
40+
@staticmethod
41+
def info():
2742
return {
28-
'name': 'fair_data_point_harvester',
29-
'title': 'FAIR data point harvester',
30-
'description': 'Harvester for end points implementing the FAIR data point protocol'
43+
"name": "fair_data_point_harvester",
44+
"title": "FAIR data point harvester",
45+
"description": "Harvester for end points implementing the FAIR data point protocol",
3146
}
47+
48+
@staticmethod
49+
def _get_harvest_catalog_setting(harvest_config_dict):
50+
if HARVEST_CATALOG in harvest_config_dict:
51+
log.debug("Using harvest_catalogs from harvest_config_dict")
52+
harvest_catalog_setting = toolkit.asbool(
53+
harvest_config_dict[HARVEST_CATALOG]
54+
)
55+
else:
56+
log.debug("Using harvest_catalogs from global CKAN config")
57+
harvest_catalog_setting = toolkit.asbool(
58+
toolkit.config.get(HARVEST_CATALOG_CONFIG, False)
59+
)
60+
log.debug("Harvesting catalogs is set to %s", harvest_catalog_setting)
61+
return harvest_catalog_setting
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-FileCopyrightText: 2024 Stichting Health-RI
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-only
4+
5+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
6+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
7+
<http://example.com> ldp:contains <http://example.com/catalog1> .
8+
<http://example.com/catalog1> a dcat:Catalog .
9+
<http://example.com/catalog1> dcat:dataset <http://example.com/dataset1> .
10+
<http://example.com/dataset1> a dcat:Dataset .
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# SPDX-FileCopyrightText: 2024 Stichting Health-RI
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-only
4+
5+
import unittest
6+
import ckanext.fairdatapoint.plugin as plugin
7+
from unittest.mock import patch, MagicMock
8+
from ckanext.fairdatapoint.harvesters import FairDataPointCivityHarvester
9+
from ckanext.fairdatapoint.harvesters import fair_data_point_civity_harvester
10+
11+
12+
class TestFairDataPointCivityHarvester(unittest.TestCase):
13+
14+
def setUp(self):
15+
plugin.toolkit = MagicMock()
16+
17+
def test_get_harvest_catalog_setting_from_dict(self):
18+
harvester = FairDataPointCivityHarvester()
19+
harvest_config_dict = {fair_data_point_civity_harvester.HARVEST_CATALOG: 'true'}
20+
result = harvester._get_harvest_catalog_setting(harvest_config_dict)
21+
self.assertTrue(result)
22+
23+
@patch('ckan.plugins.toolkit.config')
24+
def test_get_harvest_catalog_setting_from_global_config(self, mock_config):
25+
mock_config.get.return_value = 'false'
26+
harvester = FairDataPointCivityHarvester()
27+
28+
harvest_config_dict = {}
29+
result = harvester._get_harvest_catalog_setting(harvest_config_dict)
30+
31+
self.assertFalse(result)
32+
mock_config.get.assert_called_once_with(fair_data_point_civity_harvester.HARVEST_CATALOG_CONFIG, False)
33+
34+
@patch('ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider.FairDataPointRecordProvider'
35+
'.__init__')
36+
def test_setup_record_provider(self, mock_record_provider):
37+
mock_record_provider.return_value = None
38+
harvester = FairDataPointCivityHarvester()
39+
harvester._get_harvest_catalog_setting = MagicMock(return_value=True)
40+
harvest_url = 'http://example.com'
41+
harvest_config_dict = {fair_data_point_civity_harvester.HARVEST_CATALOG: 'true'}
42+
harvester.setup_record_provider(harvest_url, harvest_config_dict)
43+
harvester._get_harvest_catalog_setting.assert_called_once_with(harvest_config_dict)
44+
mock_record_provider.assert_called_once_with(harvest_url, True)
45+
46+
@patch(
47+
'ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter'
48+
'.FairDataPointRecordToPackageConverter'
49+
'.__init__')
50+
def test_setup_record_to_package_converter_with_profile(self, mock_converter):
51+
mock_converter.return_value = None
52+
harvester = FairDataPointCivityHarvester()
53+
harvest_url = 'http://example.com'
54+
harvest_config_dict = {fair_data_point_civity_harvester.PROFILE: 'test_profile'}
55+
harvester.setup_record_to_package_converter(harvest_url, harvest_config_dict)
56+
mock_converter.assert_called_once_with('test_profile')
57+
58+
def test_setup_record_to_package_converter_raises_exception(self):
59+
# Instantiate the harvester
60+
harvester = FairDataPointCivityHarvester()
61+
62+
# Test data without PROFILE in the dictionary
63+
harvest_url = 'http://example.com'
64+
harvest_config_dict = {} # No PROFILE key
65+
66+
# Verify that an exception is raised when PROFILE is missing
67+
with self.assertRaises(Exception) as context:
68+
harvester.setup_record_to_package_converter(harvest_url, harvest_config_dict)
69+
70+
# Check the exception message
71+
self.assertEqual(
72+
str(context.exception),
73+
"[profile] not found in harvester config JSON"
74+
)

0 commit comments

Comments
 (0)