diff --git a/.github/workflows/generate_changelog.yml b/.github/workflows/generate_changelog.yml index 385d01a..2b466f2 100644 --- a/.github/workflows/generate_changelog.yml +++ b/.github/workflows/generate_changelog.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: token: '${{ secrets.GITHUB_TOKEN }}' fetch-depth: 0 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c8e27d2..8b55501 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,7 +18,7 @@ jobs: permissions: contents: read steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: oss-review-toolkit/ort-ci-github-action@v1 with: allow-dynamic-versions: "true" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5c5796c..3d93cf5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: oss-review-toolkit/ort-ci-github-action@v1 with: allow-dynamic-versions: "true" @@ -33,7 +33,7 @@ jobs: new_tag: ${{ steps.tagging.outputs.new_tag }} steps: - name: Checkout Repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 @@ -73,7 +73,7 @@ jobs: needs: versioning steps: - name: Checkout Repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install GitHub CLI run: sudo apt-get install -y gh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 560c787..93fe8b4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ jobs: POSTGRES_DB: postgres options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 redis: - image: redis:8 + image: redis:8 env: CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test @@ -36,48 +36,59 @@ jobs: CKAN_SOLR_URL: http://solr:8983/solr/ckan CKAN_REDIS_URL: redis://redis:6379/1 + permissions: + contents: read + packages: write + steps: - - uses: actions/checkout@v4 - - name: Install requirements (common) - run: | - pip install -r requirements.txt - pip install -r dev-requirements.txt - pip install -e . - - name: Setup CKAN extensions (harvest, scheming, dcat) - run: | - # Harvest v1.6.1 from GitHub - git clone https://github.com/ckan/ckanext-harvest - cd ckanext-harvest - git checkout tags/v1.6.1 - pip install -e . - pip install -r requirements.txt + - uses: actions/checkout@v5 + - name: REUSE Compliance Check + uses: fsfe/reuse-action@v5 + + - name: Install requirements (common) + run: | + pip install -r requirements.txt + pip install -r dev-requirements.txt + pip install -e . + - name: Setup CKAN extensions (harvest, scheming, dcat, fluent) + run: | + # Harvest v1.6.1 from GitHub + git clone https://github.com/ckan/ckanext-harvest + cd ckanext-harvest + git checkout tags/v1.6.1 + pip install -e . + pip install -r requirements.txt + cd .. - # Scheming (Civity fork) - pip install -e 'git+https://github.com/CivityNL/ckanext-scheming.git@3.0.0-civity-1#egg=ckanext-scheming[requirements]' + # Scheming release 3.1.0 + pip install -e 'git+https://github.com/ckan/ckanext-scheming.git@release-3.1.0#egg=ckanext-scheming[requirements]' - git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat - cd gdi-userportal-ckanext-dcat - git checkout master - pip install -e . - pip install -r requirements.txt - - name: Setup extension - run: | - sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - ckan -c test.ini db init - ckan -c test.ini db pending-migrations --apply - - name: Run tests - run: | - pytest --ckan-ini=test.ini --cov=ckanext.fairdatapoint --disable-warnings ckanext/fairdatapoint - - name: Generate coverage report - run: | - coverage xml -o coverage.xml - - name: Install unzip - run: apt-get update && apt-get install -y unzip - - name: SonarCloud Scan - uses: sonarsource/sonarcloud-github-action@v5 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any - SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} - - uses: actions/checkout@v4 - - name: REUSE Compliance Check - uses: fsfe/reuse-action@v5 + # DCAT extension for FAIR Data Point + git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat + cd gdi-userportal-ckanext-dcat + git checkout v2.3.3 + pip install -e . + if [ -f requirements.txt ]; then + pip install -r requirements.txt + fi + cd .. + + - name: Setup extension + run: | + sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini + ckan -c test.ini db init + ckan -c test.ini db pending-migrations --apply + - name: Run tests + run: | + pytest --ckan-ini=test.ini --cov=ckanext.fairdatapoint --disable-warnings ckanext/fairdatapoint + - name: Generate coverage report + run: | + coverage xml -o coverage.xml + - name: Install unzip + run: apt-get update && apt-get install -y unzip + - name: Sonar scan + uses: SonarSource/sonarqube-scan-action@v6 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} + SONAR_HOST_URL: https://sonarcloud.io diff --git a/ckanext/fairdatapoint/profiles.py b/ckanext/fairdatapoint/profiles.py index 9d6b3fc..82d07ea 100644 --- a/ckanext/fairdatapoint/profiles.py +++ b/ckanext/fairdatapoint/profiles.py @@ -64,12 +64,43 @@ class FAIRDataPointDCATAPProfile(EuropeanHealthDCATAPProfile): def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict: super(FAIRDataPointDCATAPProfile, self).parse_dataset(dataset_dict, dataset_ref) - dataset_dict['tags'] = validate_tags(dataset_dict['tags']) + tags_translated = dataset_dict.get('tags_translated') + if isinstance(tags_translated, dict): + dataset_dict['tags_translated'] = self._sanitize_tags_translated(tags_translated) + + default_lang_tags = dataset_dict['tags_translated'].get(self._default_lang) or next( + (values for values in dataset_dict['tags_translated'].values() if values), + [] + ) + dataset_dict['tags'] = [{'name': tag} for tag in default_lang_tags] + + dataset_dict['tags'] = validate_tags(dataset_dict.get('tags', [])) dataset_dict = self._fix_wikidata_uris(dataset_dict, PACKAGE_REPLACE_FIELDS) return dataset_dict + def _sanitize_tags_translated(self, tags_translated: Dict[str, List[str]]) -> Dict[str, List[str]]: + """Remove invalid multilingual tags to satisfy CKAN length rules.""" + + sanitized: Dict[str, List[str]] = {} + + for lang, values in tags_translated.items(): + tag_dicts = [{'name': value} for value in values if value] + cleaned = validate_tags(tag_dicts) + sanitized[lang] = [tag['name'] for tag in cleaned] + + if len(values) != len(sanitized[lang]): + removed_tags = [v for v in values if v not in sanitized[lang]] + log.warning( + 'Removed invalid tags for language %s during multilingual sanitation. Original: %r, Removed: %r', + lang, + values, + removed_tags + ) + + return sanitized + @staticmethod def _rewrite_wikidata_url(uri: str) -> str: """This function fixes Wikidata URIs to use references instead of web URI @@ -110,4 +141,4 @@ def _fix_wikidata_uris(self, dataset_dict: dict, fields_list: list[str]): else: new_value = self._rewrite_wikidata_url(value) dataset_dict[field] = new_value - return dataset_dict \ No newline at end of file + return dataset_dict diff --git a/ckanext/fairdatapoint/tests/test_processors.py b/ckanext/fairdatapoint/tests/test_processors.py index 10d8747..3833866 100644 --- a/ckanext/fairdatapoint/tests/test_processors.py +++ b/ckanext/fairdatapoint/tests/test_processors.py @@ -2,13 +2,11 @@ # # SPDX-License-Identifier: AGPL-3.0-only -import pytest -from datetime import datetime -from dateutil.tz import tzutc +import json from pathlib import Path from unittest.mock import patch -from docopt import extras +import pytest from rdflib import Graph from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import ( FairDataPointRecordToPackageConverter) @@ -39,6 +37,10 @@ def test_fdp_record_converter_catalog(self, parser_catalogs): record=data, series_mapping=None) assert parser_catalogs.called + @staticmethod + def _extras_to_dict(extras_list): + return {item["key"]: item["value"] for item in extras_list} + def test_fdp_record_converter_dataset_dict(self): fdp_record_to_package = FairDataPointRecordToPackageConverter(profile="fairdatapoint_dcat_ap") data = Graph().parse(Path(TEST_DATA_DIRECTORY, "Project_27866022694497978_out.ttl")).serialize() @@ -47,17 +49,64 @@ def test_fdp_record_converter_dataset_dict(self): "http://purl.org/zonmw/generic/10006;" "dataset=https://covid19initiatives.health-ri.nl/p/Project/27866022694497978", record=data, series_mapping=None) - expected_dataset = dict(extras=[], uri="https://covid19initiatives.health-ri.nl/p/Project/27866022694497978", - resources=[], title="COVID-NL cohort MUMC+", - notes="Clinical data of MUMC COVID-NL cohort", tags=[], - license_id="", identifier="27866022694497978", - has_version=[ - "https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"], - contact=[{'email': '', 'identifier': 'https://orcid.org/0000-0002-4348-707X', 'name': 'N.K. De Vries','uri': '', 'url': ''} - ], creator=[{'email': '', 'identifier': '', 'name': '', 'type': '', 'uri': 'https://orcid.org/0000-0002-0180-3636', 'url': ''}], - publisher=[{'email': '','identifier': '','name': '','type': '','uri': 'https://opal.health-ri.nl/pub', 'url': ''}], - temporal_start='2020-01-01', temporal_end='2025-12-31') - assert actual_dataset == expected_dataset + extras_dict = self._extras_to_dict(actual_dataset["extras"]) + + assert actual_dataset["resources"] == [] + assert actual_dataset["title"] == "COVID-NL cohort MUMC+" + assert actual_dataset["notes"] == "Clinical data of MUMC COVID-NL cohort" + assert actual_dataset["tags"] == [] + assert actual_dataset["license_id"] == "" + assert actual_dataset["has_version"] == [ + "https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a" + ] + assert actual_dataset["contact"] == [ + { + "email": "", + "identifier": "https://orcid.org/0000-0002-4348-707X", + "name": "N.K. De Vries", + "uri": "", + "url": "", + } + ] + assert actual_dataset["creator"] == [ + { + "email": "", + "identifier": "", + "name": "", + "type": "", + "uri": "https://orcid.org/0000-0002-0180-3636", + "url": "", + } + ] + assert actual_dataset["publisher"] == [ + { + "email": "", + "identifier": "", + "name": "", + "type": "", + "uri": "https://opal.health-ri.nl/pub", + "url": "", + } + ] + assert actual_dataset["temporal_start"] == "2020-01-01" + assert actual_dataset["temporal_end"] == "2025-12-31" + assert actual_dataset["retention_period"] == [] + + assert extras_dict["identifier"] == "27866022694497978" + assert ( + extras_dict["uri"] + == "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978" + ) + assert extras_dict["contact_name"] == "N.K. De Vries" + assert ( + extras_dict["contact_identifier"] + == "https://orcid.org/0000-0002-4348-707X" + ) + assert ( + extras_dict["publisher_uri"] == "https://opal.health-ri.nl/pub" + ) + assert extras_dict["creator_uri"] == "https://orcid.org/0000-0002-0180-3636" + assert extras_dict["homepage"] == "http://localhost:5000" def test_fdp_record_converter_catalog_dict(self): fdp_record_to_package = FairDataPointRecordToPackageConverter(profile="fairdatapoint_dcat_ap") @@ -66,33 +115,40 @@ def test_fdp_record_converter_catalog_dict(self): guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d", record=data, series_mapping=None) + extras_dict = self._extras_to_dict(actual["extras"]) - expected = { - "uri": "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d", - "access_rights": "https://fair.healthinformationportal.eu/catalog/" - "1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights", - "conforms_to": ["https://fair.healthinformationportal.eu/profile/" - "a0949e72-4466-4d53-8900-9436d1049a4b"], - "extras": [], - "has_version": ["1.0"], - "issued": '2023-10-06T10:12:55.614000+00:00', - "language": ["http://id.loc.gov/vocabulary/iso639-1/en"], - "license_id": "", - "modified": '2023-10-06T10:12:55.614000+00:00', - 'publisher': [ - { - 'email': '', - 'identifier': '', - "name": "Automatic", - 'type': '', - 'uri': '', - 'url': '', - }, - ], - - "resources": [], - "tags": [], - "title": "Slovenia National Node" - } + assert actual["has_version"] == ["1.0"] + assert actual["issued"] == "2023-10-06T10:12:55.614000+00:00" + assert actual["modified"] == "2023-10-06T10:12:55.614000+00:00" + assert actual["license_id"] == "" + assert actual["publisher"] == [ + { + "email": "", + "identifier": "", + "name": "Automatic", + "type": "", + "uri": "", + "url": "", + } + ] + assert actual["resources"] == [] + assert actual["tags"] == [] + assert actual["title"] == "Slovenia National Node" + assert actual["retention_period"] == [] - assert actual == expected + assert ( + extras_dict["uri"] + == "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d" + ) + assert ( + extras_dict["access_rights"] + == "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights" + ) + assert json.loads(extras_dict["conforms_to"]) == [ + "https://fair.healthinformationportal.eu/profile/a0949e72-4466-4d53-8900-9436d1049a4b" + ] + assert json.loads(extras_dict["language"]) == [ + "http://id.loc.gov/vocabulary/iso639-1/en" + ] + assert extras_dict["publisher_name"] == "Automatic" + assert extras_dict["homepage"] == "http://localhost:5000" diff --git a/ckanext/fairdatapoint/tests/test_profiles.py b/ckanext/fairdatapoint/tests/test_profiles.py index 3ec28fe..d69554a 100644 --- a/ckanext/fairdatapoint/tests/test_profiles.py +++ b/ckanext/fairdatapoint/tests/test_profiles.py @@ -2,15 +2,16 @@ # # SPDX-License-Identifier: AGPL-3.0-only -import pytest -from datetime import datetime -from dateutil.tz import tzutc +import json from pathlib import Path + +import pytest from rdflib import Graph -from ckanext.fairdatapoint.profiles import validate_tags + from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import ( - FairDataPointRecordToPackageConverter + FairDataPointRecordToPackageConverter, ) +from ckanext.fairdatapoint.profiles import validate_tags TEST_DATA_DIRECTORY = Path(Path(__file__).parent.resolve(), "test_data") @@ -39,38 +40,71 @@ def test_parse_dataset(): guid="catalog=https://health-ri.sandbox.semlab-leiden.nl/catalog/5c85cb9f-be4a-406c-ab0a-287fa787caa0;" "dataset=https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5", record=data, series_mapping=None) - expected = { - 'extras': [], - 'resources': [ - {'name': 'Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)', - 'description': 'Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)', - 'access_url': 'https://cbioportal.health-ri.nl/study/clinicalData?id=lgg_ucsf_2014', - 'license': 'http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0', - 'url': 'https://cbioportal.health-ri.nl/study/clinicalData?id=lgg_ucsf_2014', - 'uri': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/931ed9c4-ad23-47ff-b121-2eb428e57423', - 'distribution_ref': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/931ed9c4-ad23-47ff-b121-2eb428e57423'}, - {'name': 'Mutations', - 'description': 'Mutation data from whole exome sequencing of 23 grade II glioma tumor/normal pairs. (MAF)', - 'access_url': 'https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014', - 'license': 'http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0', - 'url': 'https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014', - 'uri': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/ad00299f-6efb-42aa-823d-5ff2337f38f7', - 'distribution_ref': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/ad00299f-6efb-42aa-823d-5ff2337f38f7'} - ], - 'title': '[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)', - 'notes': 'Whole exome sequencing of 23 grade II glioma tumor/normal pairs.', - 'url': 'https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014', - 'tags': [{'name': 'CNS Brain'}, {'name': 'Diffuse Glioma'}, {'name': 'Glioma'}], - 'license_id': '', - 'issued': '2019-10-30 23:00:00', - 'modified': '2019-10-30 23:00:00', - 'identifier': 'lgg_ucsf_2014', - 'language': ['http://id.loc.gov/vocabulary/iso639-1/en'], - 'conforms_to': ['https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604'], - 'publisher': [ - {'email': '', 'identifier': '', 'name': '', 'type': '', 'uri': 'https://www.health-ri.nl', 'url': ''}], - 'uri': 'https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5', - 'is_referenced_by': ['https://pubmed.ncbi.nlm.nih.gov/24336570'] # Make this a list to match 'actual' - } + extras_dict = {item["key"]: item["value"] for item in actual["extras"]} + + expected_resources = [ + { + "name": "Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)", + "description": "Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)", + "access_url": "https://cbioportal.health-ri.nl/study/clinicalData?id=lgg_ucsf_2014", + "license": "http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0", + "url": "https://cbioportal.health-ri.nl/study/clinicalData?id=lgg_ucsf_2014", + "uri": "https://health-ri.sandbox.semlab-leiden.nl/distribution/931ed9c4-ad23-47ff-b121-2eb428e57423", + "distribution_ref": "https://health-ri.sandbox.semlab-leiden.nl/distribution/931ed9c4-ad23-47ff-b121-2eb428e57423", + }, + { + "name": "Mutations", + "description": "Mutation data from whole exome sequencing of 23 grade II glioma tumor/normal pairs. (MAF)", + "access_url": "https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014", + "license": "http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0", + "url": "https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014", + "uri": "https://health-ri.sandbox.semlab-leiden.nl/distribution/ad00299f-6efb-42aa-823d-5ff2337f38f7", + "distribution_ref": "https://health-ri.sandbox.semlab-leiden.nl/distribution/ad00299f-6efb-42aa-823d-5ff2337f38f7", + }, + ] + + assert len(actual["resources"]) == len(expected_resources) + for actual_resource, expected_resource in zip(actual["resources"], expected_resources): + assert actual_resource["retention_period"] == [] + for field, value in expected_resource.items(): + assert actual_resource[field] == value + + assert actual["title"] == "[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" + assert actual["notes"] == "Whole exome sequencing of 23 grade II glioma tumor/normal pairs." + assert actual["url"] == "https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014" + assert actual["tags"] == [ + {"name": "CNS Brain"}, + {"name": "Diffuse Glioma"}, + {"name": "Glioma"}, + ] + assert actual["license_id"] == "" + assert actual["issued"] == "2019-10-30 23:00:00" + assert actual["modified"] == "2019-10-30 23:00:00" + assert actual["publisher"] == [ + { + "email": "", + "identifier": "", + "name": "", + "type": "", + "uri": "https://www.health-ri.nl", + "url": "", + } + ] + assert actual["retention_period"] == [] - assert actual == expected + assert extras_dict["identifier"] == "lgg_ucsf_2014" + assert json.loads(extras_dict["language"]) == [ + "http://id.loc.gov/vocabulary/iso639-1/en" + ] + assert json.loads(extras_dict["conforms_to"]) == [ + "https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604" + ] + assert extras_dict["publisher_uri"] == "https://www.health-ri.nl" + assert ( + extras_dict["uri"] + == "https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5" + ) + assert extras_dict["homepage"] == "http://localhost:5000" + assert json.loads(extras_dict["is_referenced_by"]) == [ + "https://pubmed.ncbi.nlm.nih.gov/24336570" + ] diff --git a/requirements.txt b/requirements.txt index 1c08861..39b2b08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: AGPL-3.0-only -rdflib~=7.1.0 +rdflib~=7.2.1 setuptools~=80.9.0 nose~=1.3.7 requests~=2.32.3 \ No newline at end of file