Skip to content

Commit 727ac17

Browse files
authored
Merge pull request #107 from GenomicDataInfrastructure/dataseries-support
feat(dataseries): Be able to store dataseries via FDP route
2 parents b026317 + 0366a0b commit 727ac17

File tree

10 files changed

+156
-45
lines changed

10 files changed

+156
-45
lines changed

.github/workflows/test.yml

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,28 @@ jobs:
3838

3939
steps:
4040
- uses: actions/checkout@v4
41-
- name: Install requirements
41+
- name: Install requirements (common)
4242
run: |
4343
pip install -r requirements.txt
4444
pip install -r dev-requirements.txt
45-
pip install --upgrade pytest-rerunfailures
46-
pip install -e 'git+https://github.com/CivityNL/[email protected]#egg=ckanext-scheming[requirements]'
47-
pip install -e 'git+https://github.com/ckan/[email protected]#egg=ckanext-harvest[requirements]'
48-
pip install -r https://raw.githubusercontent.com/ckan/ckanext-harvest/v1.6.1/requirements.txt
49-
pip install ckanext-dcat==2.3.0
50-
python3 setup.py develop
45+
pip install -e .
46+
- name: Setup CKAN extensions (harvest, scheming, dcat)
47+
run: |
48+
# Harvest v1.6.1 from GitHub
49+
git clone https://github.com/ckan/ckanext-harvest
50+
cd ckanext-harvest
51+
git checkout tags/v1.6.1
52+
pip install -e .
53+
pip install -r requirements.txt
54+
55+
# Scheming (Civity fork)
56+
pip install -e 'git+https://github.com/CivityNL/[email protected]#egg=ckanext-scheming[requirements]'
57+
58+
git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat
59+
cd gdi-userportal-ckanext-dcat
60+
git checkout master
61+
pip install -e .
62+
pip install -r requirements.txt
5163
- name: Setup extension
5264
run: |
5365
sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini

ckanext/fairdatapoint/harvesters/civity_harvester.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,12 @@
1515
from ckan import model
1616

1717
from ckanext.fairdatapoint.harvesters.config import get_harvester_setting
18+
from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier
1819
from ckanext.fairdatapoint.labels import resolve_labels
1920
from ckanext.harvest.harvesters import HarvesterBase
2021
from ckanext.harvest.model import HarvestObject
2122
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
2223

23-
24-
2524
ID = "id"
2625

2726
log = logging.getLogger(__name__)
@@ -97,11 +96,18 @@ def gather_stage(self, harvest_job):
9796
guids_in_db = set(guids_to_package_ids.keys())
9897

9998
guids_in_harvest = self._get_guids_in_harvest(harvest_job)
100-
10199
if guids_in_harvest:
102-
new = guids_in_harvest - guids_in_db
103-
delete = guids_in_db - guids_in_harvest
104-
change = guids_in_db & guids_in_harvest
100+
# Sort so that dataseries are processed before datasets
101+
guids_in_harvest = sorted(
102+
guids_in_harvest,
103+
key=lambda g: 0 if "dataseries=" in g else 1
104+
)
105+
106+
guids_in_harvest_set = set(guids_in_harvest)
107+
108+
new = guids_in_harvest_set - guids_in_db
109+
delete = guids_in_db - guids_in_harvest_set
110+
change = guids_in_db & guids_in_harvest_set
105111

106112
for guid in new:
107113
existing = (
@@ -302,9 +308,27 @@ def import_stage(self, harvest_object):
302308
return False
303309

304310
try:
305-
package_dict = self.record_to_package_converter.record_to_package(
306-
harvest_object.guid, str(harvest_object.content)
307-
)
311+
# Determine datatype
312+
identifier_harvest_object = Identifier(harvest_object.guid)
313+
datatype = identifier_harvest_object.get_id_type()
314+
315+
if datatype == "dataset":
316+
# Build mapping from dataseries GUID to package ID for all active dataset_series in the database
317+
series_results = model.Session.query(model.PackageExtra.value, model.Package.id) \
318+
.join(model.Package) \
319+
.filter(model.PackageExtra.key == 'guid') \
320+
.filter(model.Package.type == 'dataset_series') \
321+
.filter(model.Package.state == 'active') \
322+
.all()
323+
series_mapping = {guid: {"id": series_id} for guid, series_id in series_results}
324+
package_dict = self.record_to_package_converter.record_to_package(
325+
harvest_object.guid, str(harvest_object.content), series_mapping=series_mapping
326+
)
327+
else:
328+
package_dict = self.record_to_package_converter.record_to_package(
329+
harvest_object.guid, str(harvest_object.content)
330+
)
331+
package_dict.setdefault("extras", []).append({"key": "guid", "value": identifier_harvest_object.get_id_value()})
308332
except Exception as e:
309333
logger.error(
310334
"Error converting record to package for identifier [%s] [%r]"
@@ -352,6 +376,7 @@ def import_stage(self, harvest_object):
352376
self._save_object_error(str(e), harvest_object)
353377
return False
354378

379+
355380
# Fallback: ensure name is always set
356381
if "name" not in package_dict:
357382
package_dict["name"] = self._gen_new_name(package_dict["title"])
@@ -435,6 +460,7 @@ def import_stage(self, harvest_object):
435460
logger.debug("Finished import stage for harvest_object [%s]", harvest_object.id)
436461
return True
437462

463+
438464
def _create_or_update_package(
439465
self, package_dict, create_or_update, context, harvest_object
440466
):

ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ def get_record_ids(self) -> Dict.keys:
4141
identifier = Identifier("")
4242
identifier.add("dataset", str(fdp_record.url))
4343
result[identifier.guid] = fdp_record.url
44+
elif fdp_record.is_dataseries():
45+
identifier = Identifier("")
46+
identifier.add("dataseries", str(fdp_record.url))
47+
result[identifier.guid] = fdp_record.url
4448
return result.keys()
4549

4650
def get_record_by_id(self, guid: str) -> str:

ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_to_package_converter.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
# SPDX-License-Identifier: AGPL-3.0-only
55

66
import logging
7+
from typing import Any, Dict, Optional
78

8-
from ckanext.dcat.processors import RDFParser, RDFParserException
9+
from ckanext.dcat.processors import RDFParserException
910
from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier
1011
from ckanext.fairdatapoint.processors import FairDataPointRDFParser
1112

@@ -17,20 +18,27 @@ class FairDataPointRecordToPackageConverter:
1718
def __init__(self, profile: str):
1819
self.profile = profile
1920

20-
def record_to_package(self, guid: str, record: str):
21+
def record_to_package(self, guid: str, record: str, series_mapping=None) -> Optional[Dict[str, Any]]:
2122
parser = FairDataPointRDFParser(profiles=[self.profile])
2223

2324
try:
2425
parser.parse(record, _format="ttl")
2526

2627
identifier = Identifier(guid)
27-
if identifier.get_id_type() == "catalog":
28-
for catalog in parser.catalogs():
29-
return catalog
28+
datatype = identifier.get_id_type()
29+
if datatype == "catalog":
30+
items = list(parser.catalogs())
31+
elif datatype == "dataseries":
32+
items = list(parser.dataset_series())
3033
else:
31-
for dataset in parser.datasets():
32-
return dataset
34+
items = list(parser.datasets(series_mapping=series_mapping))
35+
36+
if not items or len(items) < 1:
37+
log.warning("No %s found in RDF", datatype)
38+
return None # Returning None instead of False for clarity
39+
40+
return items[0] # Assuming single item per record
3341
except RDFParserException as e:
3442
raise Exception(
35-
"Error parsing the RDF content [{0}]: {1}".format(record, e)
36-
)
43+
f"Error parsing the RDF content [{record}]: {e}"
44+
) from e

ckanext/fairdatapoint/harvesters/domain/fdp_record.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,7 @@ def is_catalog(self):
2323

2424
def is_dataset(self):
2525
return (URIRef(self.url), RDF.type, DCAT.Dataset) in self._graph
26+
27+
def is_dataseries(self):
28+
return (URIRef(self.url), RDF.type, DCAT.DatasetSeries) in self._graph
29+

ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,23 @@
77
@prefix ldp: <http://www.w3.org/ns/ldp#> .
88
@prefix ex: <http://example.org/> .
99

10+
########### Catalog ###########
1011
ex:Catalog1 a ldp:DirectContainer, dcat:Catalog ;
1112
dct:title "Catalog of Datasets"@en ;
1213
ldp:membershipResource ex:Catalog1 ;
1314
ldp:hasMemberRelation dcat:dataset ;
15+
ldp:contains ex:DatasetSeries1 .
16+
17+
########### Dataset Series as LDP Container ###########
18+
ex:DatasetSeries1 a ldp:DirectContainer, dcat:DatasetSeries ;
19+
dct:title "Genomic Studies over Time"@en ;
20+
dct:description "A series of genomic datasets collected across multiple years and studies."@en ;
21+
ldp:membershipResource ex:DatasetSeries1 ;
22+
ldp:hasMemberRelation dcat:hasPart ;
1423
ldp:contains ex:Dataset1 .
1524

16-
# Dataset under Dataseries
25+
########### Dataset belonging to the series ###########
1726
ex:Dataset1 a dcat:Dataset ;
18-
dct:title "Genomic Variation Dataset"@en ;
19-
dct:description "Genomic data collected from multiple populations."@en .
20-
21-
27+
dct:title "Genomic Variation Dataset 2023"@en ;
28+
dct:description "Genomic data collected from multiple populations in 2023."@en ;
29+
dcat:inSeries ex:DatasetSeries1 .

ckanext/fairdatapoint/tests/test_harvester.py

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,15 @@ def _get_user_name():
6060

6161
@pytest.fixture
6262
def configurable_harvester():
63-
def _create(record_return_value):
63+
def _create(record_return_value, package_return_value):
6464
class DummyFDPHarvester(CivityHarvester):
6565
def setup_record_provider(self, url, config):
6666
self.record_provider = MagicMock()
6767
self.record_provider.get_record_by_id = MagicMock(return_value=record_return_value)
6868

6969
def setup_record_to_package_converter(self, url, config):
70-
pass
70+
self.record_to_package_converter = MagicMock()
71+
self.record_to_package_converter.record_to_package = MagicMock(return_value=package_return_value)
7172
return DummyFDPHarvester()
7273
return _create
7374

@@ -192,15 +193,15 @@ def test_fetch_stage_status_delete(dummy_harvester, harvest_object):
192193
assert result is True
193194

194195
def test_fetch_stage_successful_fetch(configurable_harvester, harvest_object):
195-
harvester = configurable_harvester("<rdf>dummy content</rdf>")
196+
harvester = configurable_harvester("<rdf>dummy content</rdf>", configurable_harvester)
196197
result = harvester.fetch_stage(harvest_object)
197198

198199
assert result is True
199200
assert harvest_object.content == "<rdf>dummy content</rdf>"
200201
harvest_object.save.assert_called_once()
201202

202203
def test_fetch_stage_empty_record(configurable_harvester, harvest_object):
203-
harvester = configurable_harvester(None)
204+
harvester = configurable_harvester(None, None)
204205
harvester._save_object_error = MagicMock()
205206
harvest_object.extras = [HOExtra(key="status", value="change")]
206207

@@ -251,19 +252,24 @@ def test_import_stage_conversion_error(dummy_harvester, harvest_object):
251252

252253

253254
def test_import_stage_success_new_package(dummy_harvester, harvest_object):
254-
dummy_harvester.setup_record_to_package_converter(harvest_object.source.url, {})
255-
dummy_harvester.record_to_package_converter.record_to_package.return_value = {
255+
harvester = dummy_harvester
256+
harvester.setup_record_to_package_converter(harvest_object.source.url, {})
257+
harvester.record_to_package_converter.record_to_package.return_value = {
256258
"title": "My Dataset",
259+
"name": "my-dataset",
257260
"resources": []
258261
}
262+
259263
harvest_object.content = "<rdf>dummy content</rdf>"
264+
# Ensure _create_or_update_package and _create_resources are mocked for isolation
265+
harvester._create_or_update_package = MagicMock(return_value="pkg-123")
266+
harvester._create_resources = MagicMock(return_value=True)
260267

261268
with patch("ckanext.fairdatapoint.harvesters.civity_harvester.model.Session") as mock_session:
262-
result = dummy_harvester.import_stage(harvest_object)
263-
269+
result = harvester.import_stage(harvest_object)
264270
assert result is True
265-
dummy_harvester._create_or_update_package.assert_called_once()
266-
dummy_harvester._create_resources.assert_called_once()
271+
harvester._create_or_update_package.assert_called_once()
272+
harvester._create_resources.assert_called_once()
267273
assert harvest_object.current is True
268274
harvest_object.add.assert_called()
269275
mock_session.commit.assert_called()
@@ -287,3 +293,44 @@ def test_import_stage_success_update(dummy_harvester, harvest_object):
287293
dummy_harvester._create_or_update_package.assert_called_once()
288294
dummy_harvester._create_resources.assert_called_once()
289295
mock_session.commit.assert_called()
296+
297+
298+
def test_import_stage_dataset_links_to_existing_series(configurable_harvester, harvest_object):
299+
# This test verifies that a dataset can be updated to include a link to an existing dataseries.
300+
# Note: The reverse (adding datasets to a dataseries) is not handled here.
301+
dataset_guid = "dataset=https://fdp.example.org/dataset/abc"
302+
dataseries_guid = "dataseries=https://fdp.example.org/datasetseries/xyz"
303+
harvest_object.guid = dataset_guid
304+
harvest_object.extras = [HOExtra(key="status", value="change")]
305+
harvest_object.package_id = "existing-dataset"
306+
harvest_object.content = "<rdf>dummy dataset referencing series</rdf>"
307+
308+
package_to_return = {
309+
"title": "Updated Dataset",
310+
"name": "updated-dataset",
311+
"resources": [],
312+
"in_series": [dataseries_guid],
313+
"owner_org": harvest_object.owner_org,
314+
}
315+
316+
harvester = configurable_harvester(harvest_object.content, package_to_return)
317+
harvester._create_or_update_package = MagicMock(return_value="pkg-123")
318+
319+
with patch("ckanext.fairdatapoint.harvesters.civity_harvester.model.Session") as mock_session, \
320+
patch("ckanext.fairdatapoint.harvesters.civity_harvester.toolkit.get_action") as mock_get_action:
321+
mock_query = MagicMock()
322+
mock_query.join.return_value.filter.return_value.filter.return_value.filter.return_value.all.return_value = [
323+
(dataseries_guid, "series-xyz")
324+
]
325+
mock_session.query.return_value = mock_query
326+
327+
result = harvester.import_stage(harvest_object)
328+
329+
assert result is True
330+
assert harvest_object.current is True
331+
harvester._create_or_update_package.assert_called_once()
332+
updated_pkg = harvester._create_or_update_package.call_args[0][0]
333+
assert updated_pkg["id"] == "existing-dataset"
334+
# Ensure that only the dataset receives the in_series update
335+
assert dataseries_guid in updated_pkg["in_series"]
336+
mock_session.commit.assert_called()

ckanext/fairdatapoint/tests/test_processors.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def test_fdp_record_converter_catalog(self, parser_catalogs):
3636
data = Graph().parse(Path(TEST_DATA_DIRECTORY, "fdp_catalog.ttl")).serialize()
3737
fdp_record_to_package.record_to_package(
3838
guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
39-
record=data)
39+
record=data, series_mapping=None)
4040
assert parser_catalogs.called
4141

4242
def test_fdp_record_converter_dataset_dict(self):
@@ -46,7 +46,7 @@ def test_fdp_record_converter_dataset_dict(self):
4646
guid="catalog=https://covid19initiatives.health-ri.nl/p/ProjectOverview?focusarea="
4747
"http://purl.org/zonmw/generic/10006;"
4848
"dataset=https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
49-
record=data)
49+
record=data, series_mapping=None)
5050
expected_dataset = dict(extras=[], uri="https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
5151
resources=[], title="COVID-NL cohort MUMC+",
5252
notes="Clinical data of MUMC COVID-NL cohort", tags=[],
@@ -64,7 +64,8 @@ def test_fdp_record_converter_catalog_dict(self):
6464
data = Graph().parse(Path(TEST_DATA_DIRECTORY, "fdp_catalog.ttl")).serialize()
6565
actual = fdp_record_to_package.record_to_package(
6666
guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
67-
record=data)
67+
record=data, series_mapping=None)
68+
6869

6970
expected = {
7071
"uri": "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",

ckanext/fairdatapoint/tests/test_profiles.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def test_parse_dataset():
3838
actual = fdp_record_to_package.record_to_package(
3939
guid="catalog=https://health-ri.sandbox.semlab-leiden.nl/catalog/5c85cb9f-be4a-406c-ab0a-287fa787caa0;"
4040
"dataset=https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5",
41-
record=data)
41+
record=data, series_mapping=None)
4242
expected = {
4343
'extras': [],
4444
'resources': [

ckanext/fairdatapoint/tests/test_record_provider.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class TestRecordProvider:
3333
(
3434
Path(TEST_DATA_DIRECTORY, "root_fdp_response.ttl"),
3535
{
36-
'dataset=http://example.org/Dataset1',
36+
'dataseries=http://example.org/DatasetSeries1', 'dataset=http://example.org/Dataset1'
3737
}
3838
),
3939
(
@@ -62,6 +62,7 @@ def test_get_record_ids(self, mocker, fdp_response_file, expected):
6262
(
6363
Path(TEST_DATA_DIRECTORY, "fdp_multiple_parents.ttl"),
6464
{
65+
'dataseries=http://example.org/Dataseries1',
6566
'dataset=http://example.org/Dataset1'
6667
},
6768
)

0 commit comments

Comments
 (0)