GenomicDataInfrastructure
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 55 additions & 43 deletions
diff --git a/ckanext/fairdatapoint/profiles.py b/ckanext/fairdatapoint/profiles.py
Lines changed: 33 additions & 2 deletions
diff --git a/ckanext/fairdatapoint/tests/test_processors.py b/ckanext/fairdatapoint/tests/test_processors.py
Lines changed: 99 additions & 43 deletions
@@ -27,7 +27,7 @@ jobs:
           POSTGRES_DB: postgres
         options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
       redis:
-          image: redis:8
+        image: redis:8
 
     env:
       CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test
@@ -36,48 +36,60 @@ jobs:
       CKAN_SOLR_URL: http://solr:8983/solr/ckan
       CKAN_REDIS_URL: redis://redis:6379/1
 
+    permissions:
+      contents: read
+      packages: write
+
     steps:
-    - uses: actions/checkout@v5
-    - name: Install requirements (common)
-      run: |
-        pip install -r requirements.txt
-        pip install -r dev-requirements.txt
-        pip install -e .
-    - name: Setup CKAN extensions (harvest, scheming, dcat)
-      run: |
-        # Harvest v1.6.1 from GitHub
-        git clone https://github.com/ckan/ckanext-harvest
-        cd ckanext-harvest
-        git checkout tags/v1.6.1
-        pip install -e .
-        pip install -r requirements.txt
+      - uses: actions/checkout@v5
+      - name: REUSE Compliance Check
+        uses: fsfe/reuse-action@v5
+
+      - name: Install requirements (common)
+        run: |
+          pip install -r requirements.txt
+          pip install -r dev-requirements.txt
+          pip install -e .
+      - name: Setup CKAN extensions (harvest, scheming, dcat, fluent)
+        run: |
+          # Harvest v1.6.1 from GitHub
+          git clone https://github.com/ckan/ckanext-harvest
+          cd ckanext-harvest
+          git checkout tags/v1.6.1
+          pip install -e .
+          pip install -r requirements.txt
+          cd ..
 
-        # Scheming (Civity fork)
-        pip install -e 'git+https://github.com/CivityNL/ckanext-scheming.git@3.0.0-civity-1#egg=ckanext-scheming[requirements]'
+          # Scheming release 3.1.0
+          pip install -e 'git+https://github.com/ckan/ckanext-scheming.git@release-3.1.0#egg=ckanext-scheming[requirements]'
 
-        git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat
-        cd gdi-userportal-ckanext-dcat
-        git checkout master
-        pip install -e .
-        pip install -r requirements.txt
-    - name: Setup extension
-      run: |
-        sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
-        ckan -c test.ini db init
-        ckan -c test.ini db pending-migrations --apply
-    - name: Run tests
-      run: |
-        pytest --ckan-ini=test.ini --cov=ckanext.fairdatapoint --disable-warnings ckanext/fairdatapoint 
-    - name: Generate coverage report
-      run: |
-        coverage xml -o coverage.xml
-    - name: Install unzip
-      run: apt-get update && apt-get install -y unzip
-    - name: SonarCloud Scan
-      uses: sonarsource/sonarcloud-github-action@v5
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # Needed to get PR information, if any
-        SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
-    - uses: actions/checkout@v5
-    - name: REUSE Compliance Check
-      uses: fsfe/reuse-action@v5
+          # DCAT extension for FAIR Data Point
+          git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat
+          cd gdi-userportal-ckanext-dcat
+          git checkout v2.3.3
+          pip install -e .
+          if [ -f requirements.txt ]; then
+            pip install -r requirements.txt
+          fi
+          cd ..
+      - name: Setup extension
+        run: |
+          sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
+          ckan -c test.ini db init
+          ckan -c test.ini db pending-migrations --apply
+      - name: Run tests
+        run: |
+          pytest --ckan-ini=test.ini --cov=ckanext.fairdatapoint --disable-warnings ckanext/fairdatapoint
+      - name: Set SONAR_TOKEN env
+        # Hand the secret to the script through the step environment instead of
+        # expanding it inside the script text, so characters such as `$`, quotes
+        # or backticks in the token are not interpreted by the shell.
+        # NOTE(review): the "Sonar scan" step also receives SONAR_TOKEN through
+        # its own `env`, so this export may be redundant - confirm before keeping.
+        env:
+          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
+        run: echo "SONAR_TOKEN=$SONAR_TOKEN" >> "$GITHUB_ENV"
+      - name: Generate coverage report
+        run: |
+          coverage xml -o coverage.xml
+      - name: Install unzip
+        run: apt-get update && apt-get install -y unzip
+      - name: Sonar scan
+        uses: SonarSource/sonarqube-scan-action@v6
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
+          SONAR_HOST_URL: https://sonarcloud.io
@@ -64,12 +64,43 @@ class FAIRDataPointDCATAPProfile(EuropeanHealthDCATAPProfile):
     def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
+        """Parse a dataset with the parent profile, then clean tags and Wikidata URIs.
+
+        After the parent ``parse_dataset`` call, a dict-valued
+        ``tags_translated`` entry is sanitized per language and used to rebuild
+        the flat ``tags`` list. ``tags`` is then passed through
+        ``validate_tags`` and the fields in ``PACKAGE_REPLACE_FIELDS`` go
+        through ``_fix_wikidata_uris``.
+
+        :param dataset_dict: dataset dictionary being populated
+        :param dataset_ref: reference of the dataset, forwarded to the parent profile
+        :returns: the dictionary returned by ``_fix_wikidata_uris``
+        """
+        # The parent's return value is not used here.
+        # NOTE(review): presumably the parent fills dataset_dict in place - confirm.
         super(FAIRDataPointDCATAPProfile, self).parse_dataset(dataset_dict, dataset_ref)
 
-        dataset_dict['tags'] = validate_tags(dataset_dict['tags'])
+        # When multilingual tags are present as a dict, sanitize them per language
+        # and derive the flat 'tags' list from a single language.
+        tags_translated = dataset_dict.get('tags_translated')
+        if isinstance(tags_translated, dict):
+            dataset_dict['tags_translated'] = self._sanitize_tags_translated(tags_translated)
+
+            # Take the default language's tags; if that list is missing or empty,
+            # fall back to the first language with a non-empty list, else no tags.
+            # NOTE(review): self._default_lang is not defined in this hunk -
+            # presumably inherited from the parent profile; verify.
+            default_lang_tags = dataset_dict['tags_translated'].get(self._default_lang) or next(
+                (values for values in dataset_dict['tags_translated'].values() if values),
+                []
+            )
+            dataset_dict['tags'] = [{'name': tag} for tag in default_lang_tags]
+
+        # .get() tolerates a dataset without a 'tags' key.
+        dataset_dict['tags'] = validate_tags(dataset_dict.get('tags', []))
 
         dataset_dict = self._fix_wikidata_uris(dataset_dict, PACKAGE_REPLACE_FIELDS)
 
         return dataset_dict
 
+    def _sanitize_tags_translated(self, tags_translated: Dict[str, List[str]]) -> Dict[str, List[str]]:
+        """Remove invalid multilingual tags to satisfy CKAN length rules.
+
+        For every language the tag names are wrapped as ``{'name': ...}`` dicts,
+        passed through ``validate_tags`` and unwrapped again. Falsy names are
+        skipped before validation. A language mapped to ``None`` is treated as
+        having no tags instead of raising ``TypeError``.
+
+        :param tags_translated: mapping of language key to tag names
+        :returns: a new mapping with the same keys, each holding only the names
+            that ``validate_tags`` returned
+        """
+
+        sanitized: Dict[str, List[str]] = {}
+
+        for lang, raw_values in tags_translated.items():
+            # Guard against a language key whose value is None (or otherwise
+            # empty): iterate and measure an empty list instead of crashing.
+            values = raw_values or []
+            tag_dicts = [{'name': value} for value in values if value]
+            cleaned = validate_tags(tag_dicts)
+            sanitized[lang] = [tag['name'] for tag in cleaned]
+
+            # A length mismatch means at least one entry was dropped, either as
+            # falsy before validation or by validate_tags itself.
+            if len(values) != len(sanitized[lang]):
+                removed_tags = [v for v in values if v not in sanitized[lang]]
+                log.warning(
+                    'Removed invalid tags for language %s during multilingual sanitation. Original: %r, Removed: %r',
+                    lang,
+                    values,
+                    removed_tags
+                )
+
+        return sanitized
+
     @staticmethod
     def _rewrite_wikidata_url(uri: str) -> str:
         """This function fixes Wikidata URIs to use references instead of web URI
@@ -110,4 +141,4 @@ def _fix_wikidata_uris(self, dataset_dict: dict, fields_list: list[str]):
                 else:
                     new_value = self._rewrite_wikidata_url(value)
                 dataset_dict[field] = new_value
-        return dataset_dict
+        return dataset_dict
@@ -2,13 +2,11 @@
 #
 # SPDX-License-Identifier: AGPL-3.0-only
 
-import pytest
-from datetime import datetime
-from dateutil.tz import tzutc
+import json
 from pathlib import Path
 from unittest.mock import patch
 
-from docopt import extras
+import pytest
 from rdflib import Graph
 from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
     FairDataPointRecordToPackageConverter)
@@ -39,6 +37,10 @@ def test_fdp_record_converter_catalog(self, parser_catalogs):
             record=data, series_mapping=None)
         assert parser_catalogs.called
 
+    @staticmethod
+    def _extras_to_dict(extras_list):
+        """Flatten a list of ``{"key": ..., "value": ...}`` items into one dict.
+
+        When a key occurs more than once, the last item wins.
+        """
+        flattened = {}
+        for entry in extras_list:
+            # Read the key before the value, matching the lookup order of a
+            # dict comprehension.
+            name = entry["key"]
+            flattened[name] = entry["value"]
+        return flattened
+
     def test_fdp_record_converter_dataset_dict(self):
         fdp_record_to_package = FairDataPointRecordToPackageConverter(profile="fairdatapoint_dcat_ap")
         data = Graph().parse(Path(TEST_DATA_DIRECTORY, "Project_27866022694497978_out.ttl")).serialize()
@@ -47,17 +49,64 @@ def test_fdp_record_converter_dataset_dict(self):
                  "http://purl.org/zonmw/generic/10006;"
                  "dataset=https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
             record=data, series_mapping=None)
-        expected_dataset = dict(extras=[], uri="https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
-                                resources=[], title="COVID-NL cohort MUMC+",
-                                notes="Clinical data of MUMC COVID-NL cohort", tags=[],
-                                license_id="", identifier="27866022694497978",
-                                has_version=[
-                                    "https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"],
-                                contact=[{'email': '', 'identifier': 'https://orcid.org/0000-0002-4348-707X', 'name': 'N.K. De Vries','uri': '', 'url': ''}
-                                ], creator=[{'email': '', 'identifier': '', 'name': '', 'type': '', 'uri': 'https://orcid.org/0000-0002-0180-3636', 'url': ''}],
-                                publisher=[{'email': '','identifier': '','name': '','type': '','uri': 'https://opal.health-ri.nl/pub', 'url': ''}],
-                                temporal_start='2020-01-01', temporal_end='2025-12-31')
-        assert actual_dataset == expected_dataset
+        extras_dict = self._extras_to_dict(actual_dataset["extras"])
+
+        assert actual_dataset["resources"] == []
+        assert actual_dataset["title"] == "COVID-NL cohort MUMC+"
+        assert actual_dataset["notes"] == "Clinical data of MUMC COVID-NL cohort"
+        assert actual_dataset["tags"] == []
+        assert actual_dataset["license_id"] == ""
+        assert actual_dataset["has_version"] == [
+            "https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"
+        ]
+        assert actual_dataset["contact"] == [
+            {
+                "email": "",
+                "identifier": "https://orcid.org/0000-0002-4348-707X",
+                "name": "N.K. De Vries",
+                "uri": "",
+                "url": "",
+            }
+        ]
+        assert actual_dataset["creator"] == [
+            {
+                "email": "",
+                "identifier": "",
+                "name": "",
+                "type": "",
+                "uri": "https://orcid.org/0000-0002-0180-3636",
+                "url": "",
+            }
+        ]
+        assert actual_dataset["publisher"] == [
+            {
+                "email": "",
+                "identifier": "",
+                "name": "",
+                "type": "",
+                "uri": "https://opal.health-ri.nl/pub",
+                "url": "",
+            }
+        ]
+        assert actual_dataset["temporal_start"] == "2020-01-01"
+        assert actual_dataset["temporal_end"] == "2025-12-31"
+        assert actual_dataset["retention_period"] == []
+
+        assert extras_dict["identifier"] == "27866022694497978"
+        assert (
+            extras_dict["uri"]
+            == "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"
+        )
+        assert extras_dict["contact_name"] == "N.K. De Vries"
+        assert (
+            extras_dict["contact_identifier"]
+            == "https://orcid.org/0000-0002-4348-707X"
+        )
+        assert (
+            extras_dict["publisher_uri"] == "https://opal.health-ri.nl/pub"
+        )
+        assert extras_dict["creator_uri"] == "https://orcid.org/0000-0002-0180-3636"
+        assert extras_dict["homepage"] == "http://localhost:5000"
 
     def test_fdp_record_converter_catalog_dict(self):
         fdp_record_to_package = FairDataPointRecordToPackageConverter(profile="fairdatapoint_dcat_ap")
@@ -66,33 +115,40 @@ def test_fdp_record_converter_catalog_dict(self):
             guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
             record=data, series_mapping=None)
 
+        extras_dict = self._extras_to_dict(actual["extras"])
 
-        expected = {
-            "uri": "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
-            "access_rights": "https://fair.healthinformationportal.eu/catalog/"
-                             "1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
-            "conforms_to": ["https://fair.healthinformationportal.eu/profile/"
-                            "a0949e72-4466-4d53-8900-9436d1049a4b"],
-            "extras": [],
-            "has_version": ["1.0"],
-            "issued": '2023-10-06T10:12:55.614000+00:00',
-            "language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
-            "license_id": "",
-            "modified": '2023-10-06T10:12:55.614000+00:00',
-            'publisher': [
-                {
-                    'email': '',
-                    'identifier': '',
-                    "name": "Automatic",
-                    'type': '',
-                    'uri': '',
-                    'url': '',
-                },
-            ],
-
-            "resources": [],
-            "tags": [],
-            "title": "Slovenia National Node"
-        }
+        assert actual["has_version"] == ["1.0"]
+        assert actual["issued"] == "2023-10-06T10:12:55.614000+00:00"
+        assert actual["modified"] == "2023-10-06T10:12:55.614000+00:00"
+        assert actual["license_id"] == ""
+        assert actual["publisher"] == [
+            {
+                "email": "",
+                "identifier": "",
+                "name": "Automatic",
+                "type": "",
+                "uri": "",
+                "url": "",
+            }
+        ]
+        assert actual["resources"] == []
+        assert actual["tags"] == []
+        assert actual["title"] == "Slovenia National Node"
+        assert actual["retention_period"] == []
 
-        assert actual == expected
+        assert (
+            extras_dict["uri"]
+            == "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"
+        )
+        assert (
+            extras_dict["access_rights"]
+            == "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights"
+        )
+        assert json.loads(extras_dict["conforms_to"]) == [
+            "https://fair.healthinformationportal.eu/profile/a0949e72-4466-4d53-8900-9436d1049a4b"
+        ]
+        assert json.loads(extras_dict["language"]) == [
+            "http://id.loc.gov/vocabulary/iso639-1/en"
+        ]
+        assert extras_dict["publisher_name"] == "Automatic"
+        assert extras_dict["homepage"] == "http://localhost:5000"