Skip to content

Commit 727ac17

Browse files
authored
Merge pull request #107 from GenomicDataInfrastructure/dataseries-support
feat(dataseries): Be able to store dataseries via FDP route
2 parents b026317 + 0366a0b commit 727ac17

File tree

10 files changed

+156
-45
lines changed

10 files changed

+156
-45
lines changed

.github/workflows/test.yml

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,28 @@ jobs:
3838

3939
steps:
4040
- uses: actions/checkout@v4
41-
- name: Install requirements
41+
- name: Install requirements (common)
4242
run: |
4343
pip install -r requirements.txt
4444
pip install -r dev-requirements.txt
45-
pip install --upgrade pytest-rerunfailures
46-
pip install -e 'git+https://github.com/CivityNL/[email protected]#egg=ckanext-scheming[requirements]'
47-
pip install -e 'git+https://github.com/ckan/[email protected]#egg=ckanext-harvest[requirements]'
48-
pip install -r https://raw.githubusercontent.com/ckan/ckanext-harvest/v1.6.1/requirements.txt
49-
pip install ckanext-dcat==2.3.0
50-
python3 setup.py develop
45+
pip install -e .
46+
- name: Setup CKAN extensions (harvest, scheming, dcat)
47+
run: |
48+
# Harvest v1.6.1 from GitHub
49+
git clone https://github.com/ckan/ckanext-harvest
50+
cd ckanext-harvest
51+
git checkout tags/v1.6.1
52+
pip install -e .
53+
pip install -r requirements.txt
54+
55+
# Scheming (Civity fork)
56+
pip install -e 'git+https://github.com/CivityNL/[email protected]#egg=ckanext-scheming[requirements]'
57+
58+
git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat
59+
cd gdi-userportal-ckanext-dcat
60+
git checkout master
61+
pip install -e .
62+
pip install -r requirements.txt
5163
- name: Setup extension
5264
run: |
5365
sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini

ckanext/fairdatapoint/harvesters/civity_harvester.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,12 @@
1515
from ckan import model
1616

1717
from ckanext.fairdatapoint.harvesters.config import get_harvester_setting
18+
from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier
1819
from ckanext.fairdatapoint.labels import resolve_labels
1920
from ckanext.harvest.harvesters import HarvesterBase
2021
from ckanext.harvest.model import HarvestObject
2122
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
2223

23-
24-
2524
ID = "id"
2625

2726
log = logging.getLogger(__name__)
@@ -97,11 +96,18 @@ def gather_stage(self, harvest_job):
9796
guids_in_db = set(guids_to_package_ids.keys())
9897

9998
guids_in_harvest = self._get_guids_in_harvest(harvest_job)
100-
10199
if guids_in_harvest:
102-
new = guids_in_harvest - guids_in_db
103-
delete = guids_in_db - guids_in_harvest
104-
change = guids_in_db & guids_in_harvest
100+
# Sort so that dataseries are processed before datasets
101+
guids_in_harvest = sorted(
102+
guids_in_harvest,
103+
key=lambda g: 0 if "dataseries=" in g else 1
104+
)
105+
106+
guids_in_harvest_set = set(guids_in_harvest)
107+
108+
new = guids_in_harvest_set - guids_in_db
109+
delete = guids_in_db - guids_in_harvest_set
110+
change = guids_in_db & guids_in_harvest_set
105111

106112
for guid in new:
107113
existing = (
@@ -302,9 +308,27 @@ def import_stage(self, harvest_object):
302308
return False
303309

304310
try:
305-
package_dict = self.record_to_package_converter.record_to_package(
306-
harvest_object.guid, str(harvest_object.content)
307-
)
311+
# Determine datatype
312+
identifier_harvest_object = Identifier(harvest_object.guid)
313+
datatype = identifier_harvest_object.get_id_type()
314+
315+
if datatype == "dataset":
316+
# Build mapping from dataseries GUID to package ID for all active dataset_series in the database
317+
series_results = model.Session.query(model.PackageExtra.value, model.Package.id) \
318+
.join(model.Package) \
319+
.filter(model.PackageExtra.key == 'guid') \
320+
.filter(model.Package.type == 'dataset_series') \
321+
.filter(model.Package.state == 'active') \
322+
.all()
323+
series_mapping = {guid: {"id": series_id} for guid, series_id in series_results}
324+
package_dict = self.record_to_package_converter.record_to_package(
325+
harvest_object.guid, str(harvest_object.content), series_mapping=series_mapping
326+
)
327+
else:
328+
package_dict = self.record_to_package_converter.record_to_package(
329+
harvest_object.guid, str(harvest_object.content)
330+
)
331+
package_dict.setdefault("extras", []).append({"key": "guid", "value": identifier_harvest_object.get_id_value()})
308332
except Exception as e:
309333
logger.error(
310334
"Error converting record to package for identifier [%s] [%r]"
@@ -352,6 +376,7 @@ def import_stage(self, harvest_object):
352376
self._save_object_error(str(e), harvest_object)
353377
return False
354378

379+
355380
# Fallback: ensure name is always set
356381
if "name" not in package_dict:
357382
package_dict["name"] = self._gen_new_name(package_dict["title"])
@@ -435,6 +460,7 @@ def import_stage(self, harvest_object):
435460
logger.debug("Finished import stage for harvest_object [%s]", harvest_object.id)
436461
return True
437462

463+
438464
def _create_or_update_package(
439465
self, package_dict, create_or_update, context, harvest_object
440466
):

ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ def get_record_ids(self) -> Dict.keys:
4141
identifier = Identifier("")
4242
identifier.add("dataset", str(fdp_record.url))
4343
result[identifier.guid] = fdp_record.url
44+
elif fdp_record.is_dataseries():
45+
identifier = Identifier("")
46+
identifier.add("dataseries", str(fdp_record.url))
47+
result[identifier.guid] = fdp_record.url
4448
return result.keys()
4549

4650
def get_record_by_id(self, guid: str) -> str:

ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_to_package_converter.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
# SPDX-License-Identifier: AGPL-3.0-only
55

66
import logging
7+
from typing import Any, Dict, Optional
78

8-
from ckanext.dcat.processors import RDFParser, RDFParserException
9+
from ckanext.dcat.processors import RDFParserException
910
from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier
1011
from ckanext.fairdatapoint.processors import FairDataPointRDFParser
1112

@@ -17,20 +18,27 @@ class FairDataPointRecordToPackageConverter:
1718
def __init__(self, profile: str):
1819
self.profile = profile
1920

20-
def record_to_package(self, guid: str, record: str):
21+
def record_to_package(self, guid: str, record: str, series_mapping=None) -> Optional[Dict[str, Any]]:
2122
parser = FairDataPointRDFParser(profiles=[self.profile])
2223

2324
try:
2425
parser.parse(record, _format="ttl")
2526

2627
identifier = Identifier(guid)
27-
if identifier.get_id_type() == "catalog":
28-
for catalog in parser.catalogs():
29-
return catalog
28+
datatype = identifier.get_id_type()
29+
if datatype == "catalog":
30+
items = list(parser.catalogs())
31+
elif datatype == "dataseries":
32+
items = list(parser.dataset_series())
3033
else:
31-
for dataset in parser.datasets():
32-
return dataset
34+
items = list(parser.datasets(series_mapping=series_mapping))
35+
36+
if not items or len(items) < 1:
37+
log.warning("No %s found in RDF", datatype)
38+
return None # Returning None instead of False for clarity
39+
40+
return items[0] # Assuming single item per record
3341
except RDFParserException as e:
3442
raise Exception(
35-
"Error parsing the RDF content [{0}]: {1}".format(record, e)
36-
)
43+
f"Error parsing the RDF content [{record}]: {e}"
44+
) from e

ckanext/fairdatapoint/harvesters/domain/fdp_record.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,7 @@ def is_catalog(self):
2323

2424
def is_dataset(self):
2525
return (URIRef(self.url), RDF.type, DCAT.Dataset) in self._graph
26+
27+
def is_dataseries(self):
28+
return (URIRef(self.url), RDF.type, DCAT.DatasetSeries) in self._graph
29+

ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,23 @@
77
@prefix ldp: <http://www.w3.org/ns/ldp#> .
88
@prefix ex: <http://example.org/> .
99

10+
########### Catalog ###########
1011
ex:Catalog1 a ldp:DirectContainer, dcat:Catalog ;
1112
dct:title "Catalog of Datasets"@en ;
1213
ldp:membershipResource ex:Catalog1 ;
1314
ldp:hasMemberRelation dcat:dataset ;
15+
ldp:contains ex:DatasetSeries1 .
16+
17+
########### Dataset Series as LDP Container ###########
18+
ex:DatasetSeries1 a ldp:DirectContainer, dcat:DatasetSeries ;
19+
dct:title "Genomic Studies over Time"@en ;
20+
dct:description "A series of genomic datasets collected across multiple years and studies."@en ;
21+
ldp:membershipResource ex:DatasetSeries1 ;
22+
ldp:hasMemberRelation dcat:hasPart ;
1423
ldp:contains ex:Dataset1 .
1524

16-
# Dataset under Dataseries
25+
########### Dataset belonging to the series ###########
1726
ex:Dataset1 a dcat:Dataset ;
18-
dct:title "Genomic Variation Dataset"@en ;
19-
dct:description "Genomic data collected from multiple populations."@en .
20-
21-
27+
dct:title "Genomic Variation Dataset 2023"@en ;
28+
dct:description "Genomic data collected from multiple populations in 2023."@en ;
29+
dcat:inSeries ex:DatasetSeries1 .

ckanext/fairdatapoint/tests/test_harvester.py

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,15 @@ def _get_user_name():
6060

6161
@pytest.fixture
6262
def configurable_harvester():
63-
def _create(record_return_value):
63+
def _create(record_return_value, package_return_value):
6464
class DummyFDPHarvester(CivityHarvester):
6565
def setup_record_provider(self, url, config):
6666
self.record_provider = MagicMock()
6767
self.record_provider.get_record_by_id = MagicMock(return_value=record_return_value)
6868

6969
def setup_record_to_package_converter(self, url, config):
70-
pass
70+
self.record_to_package_converter = MagicMock()
71+
self.record_to_package_converter.record_to_package = MagicMock(return_value=package_return_value)
7172
return DummyFDPHarvester()
7273
return _create
7374

@@ -192,15 +193,15 @@ def test_fetch_stage_status_delete(dummy_harvester, harvest_object):
192193
assert result is True
193194

194195
def test_fetch_stage_successful_fetch(configurable_harvester, harvest_object):
195-
harvester = configurable_harvester("<rdf>dummy content</rdf>")
196+
harvester = configurable_harvester("<rdf>dummy content</rdf>", configurable_harvester)
196197
result = harvester.fetch_stage(harvest_object)
197198

198199
assert result is True
199200
assert harvest_object.content == "<rdf>dummy content</rdf>"
200201
harvest_object.save.assert_called_once()
201202

202203
def test_fetch_stage_empty_record(configurable_harvester, harvest_object):
203-
harvester = configurable_harvester(None)
204+
harvester = configurable_harvester(None, None)
204205
harvester._save_object_error = MagicMock()
205206
harvest_object.extras = [HOExtra(key="status", value="change")]
206207

@@ -251,19 +252,24 @@ def test_import_stage_conversion_error(dummy_harvester, harvest_object):
251252

252253

253254
def test_import_stage_success_new_package(dummy_harvester, harvest_object):
254-
dummy_harvester.setup_record_to_package_converter(harvest_object.source.url, {})
255-
dummy_harvester.record_to_package_converter.record_to_package.return_value = {
255+
harvester = dummy_harvester
256+
harvester.setup_record_to_package_converter(harvest_object.source.url, {})
257+
harvester.record_to_package_converter.record_to_package.return_value = {
256258
"title": "My Dataset",
259+
"name": "my-dataset",
257260
"resources": []
258261
}
262+
259263
harvest_object.content = "<rdf>dummy content</rdf>"
264+
# Ensure _create_or_update_package and _create_resources are mocked for isolation
265+
harvester._create_or_update_package = MagicMock(return_value="pkg-123")
266+
harvester._create_resources = MagicMock(return_value=True)
260267

261268
with patch("ckanext.fairdatapoint.harvesters.civity_harvester.model.Session") as mock_session:
262-
result = dummy_harvester.import_stage(harvest_object)
263-
269+
result = harvester.import_stage(harvest_object)
264270
assert result is True
265-
dummy_harvester._create_or_update_package.assert_called_once()
266-
dummy_harvester._create_resources.assert_called_once()
271+
harvester._create_or_update_package.assert_called_once()
272+
harvester._create_resources.assert_called_once()
267273
assert harvest_object.current is True
268274
harvest_object.add.assert_called()
269275
mock_session.commit.assert_called()
@@ -287,3 +293,44 @@ def test_import_stage_success_update(dummy_harvester, harvest_object):
287293
dummy_harvester._create_or_update_package.assert_called_once()
288294
dummy_harvester._create_resources.assert_called_once()
289295
mock_session.commit.assert_called()
296+
297+
298+
def test_import_stage_dataset_links_to_existing_series(configurable_harvester, harvest_object):
299+
# This test verifies that a dataset can be updated to include a link to an existing dataseries.
300+
# Note: The reverse (adding datasets to a dataseries) is not handled here.
301+
dataset_guid = "dataset=https://fdp.example.org/dataset/abc"
302+
dataseries_guid = "dataseries=https://fdp.example.org/datasetseries/xyz"
303+
harvest_object.guid = dataset_guid
304+
harvest_object.extras = [HOExtra(key="status", value="change")]
305+
harvest_object.package_id = "existing-dataset"
306+
harvest_object.content = "<rdf>dummy dataset referencing series</rdf>"
307+
308+
package_to_return = {
309+
"title": "Updated Dataset",
310+
"name": "updated-dataset",
311+
"resources": [],
312+
"in_series": [dataseries_guid],
313+
"owner_org": harvest_object.owner_org,
314+
}
315+
316+
harvester = configurable_harvester(harvest_object.content, package_to_return)
317+
harvester._create_or_update_package = MagicMock(return_value="pkg-123")
318+
319+
with patch("ckanext.fairdatapoint.harvesters.civity_harvester.model.Session") as mock_session, \
320+
patch("ckanext.fairdatapoint.harvesters.civity_harvester.toolkit.get_action") as mock_get_action:
321+
mock_query = MagicMock()
322+
mock_query.join.return_value.filter.return_value.filter.return_value.filter.return_value.all.return_value = [
323+
(dataseries_guid, "series-xyz")
324+
]
325+
mock_session.query.return_value = mock_query
326+
327+
result = harvester.import_stage(harvest_object)
328+
329+
assert result is True
330+
assert harvest_object.current is True
331+
harvester._create_or_update_package.assert_called_once()
332+
updated_pkg = harvester._create_or_update_package.call_args[0][0]
333+
assert updated_pkg["id"] == "existing-dataset"
334+
# Ensure that only the dataset receives the in_series update
335+
assert dataseries_guid in updated_pkg["in_series"]
336+
mock_session.commit.assert_called()

ckanext/fairdatapoint/tests/test_processors.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def test_fdp_record_converter_catalog(self, parser_catalogs):
3636
data = Graph().parse(Path(TEST_DATA_DIRECTORY, "fdp_catalog.ttl")).serialize()
3737
fdp_record_to_package.record_to_package(
3838
guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
39-
record=data)
39+
record=data, series_mapping=None)
4040
assert parser_catalogs.called
4141

4242
def test_fdp_record_converter_dataset_dict(self):
@@ -46,7 +46,7 @@ def test_fdp_record_converter_dataset_dict(self):
4646
guid="catalog=https://covid19initiatives.health-ri.nl/p/ProjectOverview?focusarea="
4747
"http://purl.org/zonmw/generic/10006;"
4848
"dataset=https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
49-
record=data)
49+
record=data, series_mapping=None)
5050
expected_dataset = dict(extras=[], uri="https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
5151
resources=[], title="COVID-NL cohort MUMC+",
5252
notes="Clinical data of MUMC COVID-NL cohort", tags=[],
@@ -64,7 +64,8 @@ def test_fdp_record_converter_catalog_dict(self):
6464
data = Graph().parse(Path(TEST_DATA_DIRECTORY, "fdp_catalog.ttl")).serialize()
6565
actual = fdp_record_to_package.record_to_package(
6666
guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
67-
record=data)
67+
record=data, series_mapping=None)
68+
6869

6970
expected = {
7071
"uri": "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",

ckanext/fairdatapoint/tests/test_profiles.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def test_parse_dataset():
3838
actual = fdp_record_to_package.record_to_package(
3939
guid="catalog=https://health-ri.sandbox.semlab-leiden.nl/catalog/5c85cb9f-be4a-406c-ab0a-287fa787caa0;"
4040
"dataset=https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5",
41-
record=data)
41+
record=data, series_mapping=None)
4242
expected = {
4343
'extras': [],
4444
'resources': [

ckanext/fairdatapoint/tests/test_record_provider.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class TestRecordProvider:
3333
(
3434
Path(TEST_DATA_DIRECTORY, "root_fdp_response.ttl"),
3535
{
36-
'dataset=http://example.org/Dataset1',
36+
'dataseries=http://example.org/DatasetSeries1', 'dataset=http://example.org/Dataset1'
3737
}
3838
),
3939
(
@@ -62,6 +62,7 @@ def test_get_record_ids(self, mocker, fdp_response_file, expected):
6262
(
6363
Path(TEST_DATA_DIRECTORY, "fdp_multiple_parents.ttl"),
6464
{
65+
'dataseries=http://example.org/Dataseries1',
6566
'dataset=http://example.org/Dataset1'
6667
},
6768
)

0 commit comments

Comments
 (0)