Skip to content

Commit 76aa614

Browse files
authored
Merge branch 'multilingual-support' into renovate/actions-checkout-5.x
2 parents 9fa1f01 + d3213f7 commit 76aa614

File tree

5 files changed

+261
-128
lines changed

5 files changed

+261
-128
lines changed

.github/workflows/test.yml

Lines changed: 55 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ jobs:
2727
POSTGRES_DB: postgres
2828
options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
2929
redis:
30-
image: redis:8
30+
image: redis:8
3131

3232
env:
3333
CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test
@@ -36,48 +36,60 @@ jobs:
3636
CKAN_SOLR_URL: http://solr:8983/solr/ckan
3737
CKAN_REDIS_URL: redis://redis:6379/1
3838

39+
permissions:
40+
contents: read
41+
packages: write
42+
3943
steps:
40-
- uses: actions/checkout@v5
41-
- name: Install requirements (common)
42-
run: |
43-
pip install -r requirements.txt
44-
pip install -r dev-requirements.txt
45-
pip install -e .
46-
- name: Setup CKAN extensions (harvest, scheming, dcat)
47-
run: |
48-
# Harvest v1.6.1 from GitHub
49-
git clone https://github.com/ckan/ckanext-harvest
50-
cd ckanext-harvest
51-
git checkout tags/v1.6.1
52-
pip install -e .
53-
pip install -r requirements.txt
44+
- uses: actions/checkout@v5
45+
- name: REUSE Compliance Check
46+
uses: fsfe/reuse-action@v5
47+
48+
- name: Install requirements (common)
49+
run: |
50+
pip install -r requirements.txt
51+
pip install -r dev-requirements.txt
52+
pip install -e .
53+
- name: Setup CKAN extensions (harvest, scheming, dcat, fluent)
54+
run: |
55+
# Harvest v1.6.1 from GitHub
56+
git clone https://github.com/ckan/ckanext-harvest
57+
cd ckanext-harvest
58+
git checkout tags/v1.6.1
59+
pip install -e .
60+
pip install -r requirements.txt
61+
cd ..
5462
55-
# Scheming (Civity fork)
56-
pip install -e 'git+https://github.com/CivityNL/ckanext-scheming.git@3.0.0-civity-1#egg=ckanext-scheming[requirements]'
63+
# Scheming release 3.1.0
64+
pip install -e 'git+https://github.com/ckan/ckanext-scheming.git@release-3.1.0#egg=ckanext-scheming[requirements]'
5765
58-
git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat
59-
cd gdi-userportal-ckanext-dcat
60-
git checkout master
61-
pip install -e .
62-
pip install -r requirements.txt
63-
- name: Setup extension
64-
run: |
65-
sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
66-
ckan -c test.ini db init
67-
ckan -c test.ini db pending-migrations --apply
68-
- name: Run tests
69-
run: |
70-
pytest --ckan-ini=test.ini --cov=ckanext.fairdatapoint --disable-warnings ckanext/fairdatapoint
71-
- name: Generate coverage report
72-
run: |
73-
coverage xml -o coverage.xml
74-
- name: Install unzip
75-
run: apt-get update && apt-get install -y unzip
76-
- name: SonarCloud Scan
77-
uses: sonarsource/sonarcloud-github-action@v5
78-
env:
79-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any
80-
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
81-
- uses: actions/checkout@v5
82-
- name: REUSE Compliance Check
83-
uses: fsfe/reuse-action@v5
66+
# DCAT extension for FAIR Data Point
67+
git clone https://github.com/GenomicDataInfrastructure/gdi-userportal-ckanext-dcat
68+
cd gdi-userportal-ckanext-dcat
69+
git checkout v2.3.3
70+
pip install -e .
71+
if [ -f requirements.txt ]; then
72+
pip install -r requirements.txt
73+
fi
74+
cd ..
75+
- name: Setup extension
76+
run: |
77+
sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
78+
ckan -c test.ini db init
79+
ckan -c test.ini db pending-migrations --apply
80+
- name: Run tests
81+
run: |
82+
pytest --ckan-ini=test.ini --cov=ckanext.fairdatapoint --disable-warnings ckanext/fairdatapoint
83+
- name: Set SONAR_TOKEN env
84+
run: echo "SONAR_TOKEN=${{ secrets.SONAR_TOKEN }}" >> $GITHUB_ENV
85+
- name: Generate coverage report
86+
run: |
87+
coverage xml -o coverage.xml
88+
- name: Install unzip
89+
run: apt-get update && apt-get install -y unzip
90+
- name: Sonar scan
91+
uses: SonarSource/sonarqube-scan-action@v6
92+
env:
93+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
94+
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
95+
SONAR_HOST_URL: https://sonarcloud.io

ckanext/fairdatapoint/profiles.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,43 @@ class FAIRDataPointDCATAPProfile(EuropeanHealthDCATAPProfile):
6464
def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
6565
super(FAIRDataPointDCATAPProfile, self).parse_dataset(dataset_dict, dataset_ref)
6666

67-
dataset_dict['tags'] = validate_tags(dataset_dict['tags'])
67+
tags_translated = dataset_dict.get('tags_translated')
68+
if isinstance(tags_translated, dict):
69+
dataset_dict['tags_translated'] = self._sanitize_tags_translated(tags_translated)
70+
71+
default_lang_tags = dataset_dict['tags_translated'].get(self._default_lang) or next(
72+
(values for values in dataset_dict['tags_translated'].values() if values),
73+
[]
74+
)
75+
dataset_dict['tags'] = [{'name': tag} for tag in default_lang_tags]
76+
77+
dataset_dict['tags'] = validate_tags(dataset_dict.get('tags', []))
6878

6979
dataset_dict = self._fix_wikidata_uris(dataset_dict, PACKAGE_REPLACE_FIELDS)
7080

7181
return dataset_dict
7282

83+
def _sanitize_tags_translated(self, tags_translated: Dict[str, List[str]]) -> Dict[str, List[str]]:
84+
"""Remove invalid multilingual tags to satisfy CKAN length rules."""
85+
86+
sanitized: Dict[str, List[str]] = {}
87+
88+
for lang, values in tags_translated.items():
89+
tag_dicts = [{'name': value} for value in values if value]
90+
cleaned = validate_tags(tag_dicts)
91+
sanitized[lang] = [tag['name'] for tag in cleaned]
92+
93+
if len(values) != len(sanitized[lang]):
94+
removed_tags = [v for v in values if v not in sanitized[lang]]
95+
log.warning(
96+
'Removed invalid tags for language %s during multilingual sanitation. Original: %r, Removed: %r',
97+
lang,
98+
values,
99+
removed_tags
100+
)
101+
102+
return sanitized
103+
73104
@staticmethod
74105
def _rewrite_wikidata_url(uri: str) -> str:
75106
"""This function fixes Wikidata URIs to use references instead of web URI
@@ -110,4 +141,4 @@ def _fix_wikidata_uris(self, dataset_dict: dict, fields_list: list[str]):
110141
else:
111142
new_value = self._rewrite_wikidata_url(value)
112143
dataset_dict[field] = new_value
113-
return dataset_dict
144+
return dataset_dict

ckanext/fairdatapoint/tests/test_processors.py

Lines changed: 99 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,11 @@
22
#
33
# SPDX-License-Identifier: AGPL-3.0-only
44

5-
import pytest
6-
from datetime import datetime
7-
from dateutil.tz import tzutc
5+
import json
86
from pathlib import Path
97
from unittest.mock import patch
108

11-
from docopt import extras
9+
import pytest
1210
from rdflib import Graph
1311
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
1412
FairDataPointRecordToPackageConverter)
@@ -39,6 +37,10 @@ def test_fdp_record_converter_catalog(self, parser_catalogs):
3937
record=data, series_mapping=None)
4038
assert parser_catalogs.called
4139

40+
@staticmethod
41+
def _extras_to_dict(extras_list):
42+
return {item["key"]: item["value"] for item in extras_list}
43+
4244
def test_fdp_record_converter_dataset_dict(self):
4345
fdp_record_to_package = FairDataPointRecordToPackageConverter(profile="fairdatapoint_dcat_ap")
4446
data = Graph().parse(Path(TEST_DATA_DIRECTORY, "Project_27866022694497978_out.ttl")).serialize()
@@ -47,17 +49,64 @@ def test_fdp_record_converter_dataset_dict(self):
4749
"http://purl.org/zonmw/generic/10006;"
4850
"dataset=https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
4951
record=data, series_mapping=None)
50-
expected_dataset = dict(extras=[], uri="https://covid19initiatives.health-ri.nl/p/Project/27866022694497978",
51-
resources=[], title="COVID-NL cohort MUMC+",
52-
notes="Clinical data of MUMC COVID-NL cohort", tags=[],
53-
license_id="", identifier="27866022694497978",
54-
has_version=[
55-
"https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"],
56-
contact=[{'email': '', 'identifier': 'https://orcid.org/0000-0002-4348-707X', 'name': 'N.K. De Vries','uri': '', 'url': ''}
57-
], creator=[{'email': '', 'identifier': '', 'name': '', 'type': '', 'uri': 'https://orcid.org/0000-0002-0180-3636', 'url': ''}],
58-
publisher=[{'email': '','identifier': '','name': '','type': '','uri': 'https://opal.health-ri.nl/pub', 'url': ''}],
59-
temporal_start='2020-01-01', temporal_end='2025-12-31')
60-
assert actual_dataset == expected_dataset
52+
extras_dict = self._extras_to_dict(actual_dataset["extras"])
53+
54+
assert actual_dataset["resources"] == []
55+
assert actual_dataset["title"] == "COVID-NL cohort MUMC+"
56+
assert actual_dataset["notes"] == "Clinical data of MUMC COVID-NL cohort"
57+
assert actual_dataset["tags"] == []
58+
assert actual_dataset["license_id"] == ""
59+
assert actual_dataset["has_version"] == [
60+
"https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"
61+
]
62+
assert actual_dataset["contact"] == [
63+
{
64+
"email": "",
65+
"identifier": "https://orcid.org/0000-0002-4348-707X",
66+
"name": "N.K. De Vries",
67+
"uri": "",
68+
"url": "",
69+
}
70+
]
71+
assert actual_dataset["creator"] == [
72+
{
73+
"email": "",
74+
"identifier": "",
75+
"name": "",
76+
"type": "",
77+
"uri": "https://orcid.org/0000-0002-0180-3636",
78+
"url": "",
79+
}
80+
]
81+
assert actual_dataset["publisher"] == [
82+
{
83+
"email": "",
84+
"identifier": "",
85+
"name": "",
86+
"type": "",
87+
"uri": "https://opal.health-ri.nl/pub",
88+
"url": "",
89+
}
90+
]
91+
assert actual_dataset["temporal_start"] == "2020-01-01"
92+
assert actual_dataset["temporal_end"] == "2025-12-31"
93+
assert actual_dataset["retention_period"] == []
94+
95+
assert extras_dict["identifier"] == "27866022694497978"
96+
assert (
97+
extras_dict["uri"]
98+
== "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"
99+
)
100+
assert extras_dict["contact_name"] == "N.K. De Vries"
101+
assert (
102+
extras_dict["contact_identifier"]
103+
== "https://orcid.org/0000-0002-4348-707X"
104+
)
105+
assert (
106+
extras_dict["publisher_uri"] == "https://opal.health-ri.nl/pub"
107+
)
108+
assert extras_dict["creator_uri"] == "https://orcid.org/0000-0002-0180-3636"
109+
assert extras_dict["homepage"] == "http://localhost:5000"
61110

62111
def test_fdp_record_converter_catalog_dict(self):
63112
fdp_record_to_package = FairDataPointRecordToPackageConverter(profile="fairdatapoint_dcat_ap")
@@ -66,33 +115,40 @@ def test_fdp_record_converter_catalog_dict(self):
66115
guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
67116
record=data, series_mapping=None)
68117

118+
extras_dict = self._extras_to_dict(actual["extras"])
69119

70-
expected = {
71-
"uri": "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
72-
"access_rights": "https://fair.healthinformationportal.eu/catalog/"
73-
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
74-
"conforms_to": ["https://fair.healthinformationportal.eu/profile/"
75-
"a0949e72-4466-4d53-8900-9436d1049a4b"],
76-
"extras": [],
77-
"has_version": ["1.0"],
78-
"issued": '2023-10-06T10:12:55.614000+00:00',
79-
"language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
80-
"license_id": "",
81-
"modified": '2023-10-06T10:12:55.614000+00:00',
82-
'publisher': [
83-
{
84-
'email': '',
85-
'identifier': '',
86-
"name": "Automatic",
87-
'type': '',
88-
'uri': '',
89-
'url': '',
90-
},
91-
],
92-
93-
"resources": [],
94-
"tags": [],
95-
"title": "Slovenia National Node"
96-
}
120+
assert actual["has_version"] == ["1.0"]
121+
assert actual["issued"] == "2023-10-06T10:12:55.614000+00:00"
122+
assert actual["modified"] == "2023-10-06T10:12:55.614000+00:00"
123+
assert actual["license_id"] == ""
124+
assert actual["publisher"] == [
125+
{
126+
"email": "",
127+
"identifier": "",
128+
"name": "Automatic",
129+
"type": "",
130+
"uri": "",
131+
"url": "",
132+
}
133+
]
134+
assert actual["resources"] == []
135+
assert actual["tags"] == []
136+
assert actual["title"] == "Slovenia National Node"
137+
assert actual["retention_period"] == []
97138

98-
assert actual == expected
139+
assert (
140+
extras_dict["uri"]
141+
== "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"
142+
)
143+
assert (
144+
extras_dict["access_rights"]
145+
== "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights"
146+
)
147+
assert json.loads(extras_dict["conforms_to"]) == [
148+
"https://fair.healthinformationportal.eu/profile/a0949e72-4466-4d53-8900-9436d1049a4b"
149+
]
150+
assert json.loads(extras_dict["language"]) == [
151+
"http://id.loc.gov/vocabulary/iso639-1/en"
152+
]
153+
assert extras_dict["publisher_name"] == "Automatic"
154+
assert extras_dict["homepage"] == "http://localhost:5000"

0 commit comments

Comments
 (0)