From 5762c35732ef8d02e0116b406d8051da57e2cfec Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 27 Aug 2025 22:26:36 +0200 Subject: [PATCH 01/13] fix(dataseries) Remove dataseries from pull request --- ckanext/dcat/harvesters/rdf.py | 112 ++++++------------- ckanext/dcat/processors.py | 45 +------- ckanext/dcat/profiles/euro_dcat_ap_3.py | 11 -- ckanext/dcat/profiles/euro_health_dcat_ap.py | 1 + 4 files changed, 35 insertions(+), 134 deletions(-) diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index 00a7f91a..a22e0b97 100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -210,18 +210,39 @@ def gather_stage(self, harvest_job): return [] try: - source_dataset = model.Package.get(harvest_job.source.id) - - series_ids, series_mapping = self._parse_and_collect( - parser.dataset_series(), - source_dataset, - harvest_job, - guids_in_source, - is_series=True, - collect_series_mapping=True - ) - object_ids += series_ids - object_ids += self._parse_and_collect(parser.datasets(series_mapping), source_dataset, harvest_job, guids_in_source, is_series=False) + + source_dataset = model.Package.get(harvest_job.source.id) + + for dataset in parser.datasets(): + if not dataset.get('name'): + dataset['name'] = self._gen_new_name(dataset['title']) + if dataset['name'] in self._names_taken: + suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1 + dataset['name'] = '{}-{}'.format(dataset['name'], suffix) + self._names_taken.append(dataset['name']) + + # Unless already set by the parser, get the owner organization (if any) + # from the harvest source dataset + if not dataset.get('owner_org'): + if source_dataset.owner_org: + dataset['owner_org'] = source_dataset.owner_org + + # Try to get a unique identifier for the harvested dataset + guid = self._get_guid(dataset, source_url=source_dataset.url) + + if not guid: + self._save_gather_error('Could not get a unique identifier for dataset: 
{0}'.format(dataset), + harvest_job) + continue + + dataset['extras'].append({'key': 'guid', 'value': guid}) + guids_in_source.append(guid) + + obj = HarvestObject(guid=guid, job=harvest_job, + content=json.dumps(dataset)) + + obj.save() + object_ids.append(obj.id) except Exception as e: self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job) @@ -401,70 +422,3 @@ def import_stage(self, harvest_object): model.Session.commit() return True - - def _parse_and_collect( - self, - items, - source_dataset, - harvest_job, - guids_in_source, - is_series=False, - collect_series_mapping=False - ): - object_ids = [] - label = "dataset series" if is_series else "dataset" - series_mapping = {} if collect_series_mapping else None - - for item in items: - original_title = item.get("title", label) - if not item.get("name"): - item["name"] = self._gen_new_name(original_title) - - if item["name"] in self._names_taken: - suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1 - item["name"] = f"{item['name']}-{suffix}" - - self._names_taken.append(item["name"]) - - if not item.get("owner_org") and source_dataset.owner_org: - item["owner_org"] = source_dataset.owner_org - - guid = self._get_guid(item, source_url=source_dataset.url) - if not guid: - self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job) - continue - - item.setdefault("extras", []).append({"key": "guid", "value": guid}) - guids_in_source.append(guid) - - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item)) - obj.save() - object_ids.append(obj.id) - - # Store mapping of RDF URI to dataset name if requested - if collect_series_mapping: - series_uri = item.get("uri") or item.get("identifier") - if series_uri: - # Try to find an existing active dataset series by 'guid' match - existing = model.Session.query(model.Package).\ - join(model.PackageExtra).\ - filter(model.PackageExtra.key 
== 'guid').\ - filter(model.PackageExtra.value == series_uri).\ - filter(model.Package.type == 'dataset_series').\ - filter(model.Package.state == 'active').\ - first() - - if existing: - item["name"] = existing.name - - series_mapping[str(series_uri)] = { - "id": existing.id if existing else item.get("id"), - "name": item["name"] - } - - - if collect_series_mapping: - return object_ids, series_mapping - - return object_ids - diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index d255d582..79f35821 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -119,16 +119,6 @@ def _datasets(self): for dataset in self.g.subjects(RDF.type, DCAT.Dataset): yield dataset - def _dataset_series(self): - ''' - Generator that returns all DCAT dataset series on the graph - - Yields rdflib.term.URIRef objects that can be used on graph lookups - and queries - ''' - for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries): - yield dataset_series - def next_page(self): ''' Returns the URL of the next page or None if there is no next page @@ -183,7 +173,7 @@ def supported_formats(self): for plugin in rdflib.plugin.plugins(kind=rdflib.parser.Parser)]) - def datasets(self, series_mapping=None): + def datasets(self): ''' Generator that returns CKAN datasets parsed from the RDF graph @@ -203,39 +193,6 @@ def datasets(self, series_mapping=None): ) profile.parse_dataset(dataset_dict, dataset_ref) - # Add in_series if present in RDF and mapped - in_series = [] - for series_ref in self.g.objects(dataset_ref, DCAT.inSeries): - key = str(series_ref) - if series_mapping and key in series_mapping: - in_series.append(series_mapping[key]["id"]) - - if in_series: - dataset_dict["in_series"] = in_series - - yield dataset_dict - - - def dataset_series(self): - ''' - Generator that returns CKAN dataset series parsed from the RDF graph - - Each dataset series is passed to all the loaded profiles before being - yielded, so it can be further modified by 
each one of them. - - Returns a dataset series dict that can be passed to eg `package_create` - or `package_update` - ''' - for dataset_ref in self._dataset_series(): - dataset_dict = {} - for profile_class in self._profiles: - profile = profile_class( - self.g, - dataset_type=self.dataset_type, - compatibility_mode=self.compatibility_mode - ) - profile.parse_dataset(dataset_dict, dataset_ref) - yield dataset_dict diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 64220430..a99cadfe 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -30,17 +30,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # DCAT AP v2 scheming fields dataset_dict = self._parse_dataset_v2_scheming(dataset_dict, dataset_ref) - - # Check if it's a dataset series - if (dataset_ref, RDF.type, DCAT.DatasetSeries) in self.g: - dataset_dict["type"] = "dataset_series" - - # Example defaulting logic (adjust based on RDF vocab if you have it) - if "series_order_field" not in dataset_dict: - dataset_dict["series_order_field"] = "metadata_created" - if "series_order_type" not in dataset_dict: - dataset_dict["series_order_type"] = "date" - # DCAT AP v3: hasVersion values = self._object_value_list(dataset_ref, DCAT.hasVersion) if values: diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 253f4ee9..e461e5af 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -162,6 +162,7 @@ def _parse_retention_period(self, subject_ref): return [retention_dict] if retention_dict else [] + def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) for prefix, namespace in namespaces.items(): From 5cf6942dda2fe88ec5f58993e1d0f5185dbb3eb8 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Wed, 27 Aug 2025 22:30:49 +0200 Subject: [PATCH 02/13] Remove 
fluent extension tag --- .github/workflows/test.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5d3184b4..9f83c6d8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,9 +26,6 @@ jobs: - ckan-version: "2.10" ckan-image: "ckan/ckan-dev:2.10-py3.10" solr-version: "9" - - ckan-version: "2.9" - ckan-image: "ckan/ckan-dev:2.9-py3.9" - solr-version: "8" fail-fast: false name: CKAN ${{ matrix.ckan-version }} @@ -64,10 +61,6 @@ jobs: pip install -e . # Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Install requirements (2.9) - run: | - pip install -U pytest-rerunfailures - if: ${{ matrix.ckan-version == '2.9' }} - name: Setup other extensions run: | git clone https://github.com/ckan/ckanext-harvest @@ -75,7 +68,8 @@ jobs: pip install -r ckanext-harvest/requirements.txt git clone https://github.com/ckan/ckanext-scheming pip install -e ckanext-scheming - pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent + git clone https://github.com/ckan/ckanext-fluent + pip install -e ckanext-fluent git clone https://github.com/ckan/ckanext-dataset-series pip install -e ckanext-dataset-series - name: Setup extension @@ -83,4 +77,4 @@ jobs: ckan -c test.ini db init ckan -c test.ini db pending-migrations --apply - name: Run tests - run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests + run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests \ No newline at end of file From 4485715e7d5b0e65775aae9b6ae09b4c1dbbef3a Mon Sep 17 00:00:00 2001 From: Hans-Christian Date: Wed, 3 Sep 2025 14:47:23 +0200 Subject: [PATCH 03/13] Update health_dcat_ap.yaml --- 
ckanext/dcat/schemas/health_dcat_ap.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index bfeb5791..5181df94 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -733,8 +733,6 @@ resource_fields: - field_name: rights label: Rights - form_snippet: markdown.html - display_snippet: markdown.html preset: multiple_text validators: ignore_missing scheming_multiple_text From ba081d7eba69bca6041a56fe0e344f09e4456f40 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 11 Sep 2025 21:35:56 +0200 Subject: [PATCH 04/13] fix: Always store as list when complex object --- ckanext/dcat/profiles/euro_dcat_ap_2.py | 4 ++-- .../dcat_ap_3/test_euro_dcatap_3_profile_serialize.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_2.py b/ckanext/dcat/profiles/euro_dcat_ap_2.py index 65b8a47e..fd0a6bc5 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_2.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_2.py @@ -219,11 +219,11 @@ def _parse_dataset_v2(self, dataset_dict, dataset_ref): contact_points = self._contact_details(access_service, DCAT.contactPoint) if contact_points: - access_service_dict["contact"] = contact_points[0] + access_service_dict["contact"] = contact_points publishers = self._agents_details(access_service, DCT.publisher) if publishers: - access_service_dict["publisher"] = publishers[0] + access_service_dict["publisher"] = publishers creators = self._agents_details(access_service, DCT.creator) if creators: diff --git a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py index 17ad472f..7c06fa11 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py @@ -31,13 
+31,13 @@ @pytest.mark.usefixtures("with_plugins", "clean_db") -@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets dataset_series") @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) @pytest.mark.ckan_config( "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.dataset_series.schemas:presets.yaml " ) @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_3") class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): From 419c364919ce4c4068973d1a8a8e2b620ae5a9cf Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Thu, 11 Sep 2025 21:43:59 +0200 Subject: [PATCH 05/13] fix: parse of creator and contact within acces service --- .../dcat_ap_2/test_euro_dcatap_2_profile_parse.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index 6db5400d..94a1e541 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -90,18 +90,18 @@ def test_parse_access_service_extra_fields(self): assert sorted(access_service['keyword']) == ['keyword1', 'keyword2'] assert access_service['description'] == 'This SPARQL end point allow to directly query the EU Whoiswho content' - contact_point = access_service.get("contact") - assert isinstance(contact_point, dict) - assert contact_point.get("name") == "John Doe" - assert contact_point.get("email") == "john@example.org" + contact_points = access_service.get("contact") + assert isinstance(contact_points, list) + assert contact_points[0].get("name") == "John Doe" + assert contact_points[0].get("email") == 
"john@example.org" creator = access_service.get("creator") assert isinstance(creator, list) assert creator[0].get("name") == "European Commission" - publisher = access_service.get("publisher") - assert isinstance(publisher, dict) - assert publisher.get("name") == "Publications Office of the European Union" + publishers = access_service.get("publisher") + assert isinstance(publishers, list) + assert publishers[0].get("name") == "Publications Office of the European Union" def test_dataset_all_fields(self): From 03a5f889aac814068282acdf26f943eb70c475d9 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Fri, 12 Sep 2025 13:46:52 +0200 Subject: [PATCH 06/13] fix(croisant) point to mlcroisant version 1.0.22 --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 1883b253..aa725832 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,6 @@ responses>=0.25.2 pyshacl -mlcroissant; python_version >= '3.10' +mlcroissant==1.0.21; python_version >= '3.10' mock pytest-ckan pytest-cov From 1d65d42e66234b0cb81b0ac683485aeb0b3253b6 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 23 Sep 2025 15:01:03 +0200 Subject: [PATCH 07/13] feat(multi-lingual): add multilingual support for health dcat fields --- .gitignore | 6 + ckanext/dcat/profiles/base.py | 66 +- .../dcat/profiles/euro_dcat_ap_scheming.py | 45 +- ckanext/dcat/profiles/euro_health_dcat_ap.py | 69 +- .../schemas/health_dcat_ap_multilingual.yaml | 637 ++++++++++++++++++ .../test_euro_health_dcat_ap_profile_parse.py | 78 ++- ...t_euro_health_dcat_ap_profile_serialize.py | 166 +++++ docs/mapping-healthdcat.md | 1 + examples/dcat/dataset.ttl | 357 ++++++++++ examples/dcat/dataset_health.ttl | 16 +- examples/dcat/dataset_health_multilingual.ttl | 47 ++ 11 files changed, 1462 insertions(+), 26 deletions(-) create mode 100644 ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml create mode 100644 examples/dcat/dataset.ttl 
create mode 100644 examples/dcat/dataset_health_multilingual.ttl diff --git a/.gitignore b/.gitignore index 7b7d96d3..90877266 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,9 @@ build/* tmp/* package/DEBIAN/control *.swp +.idea/.gitignore +.idea/ckanext-dcat.iml +.idea/misc.xml +.idea/modules.xml +.idea/vcs.xml +.idea/inspectionProfiles/profiles_settings.xml diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 5b0591e0..2dcdabe5 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -533,10 +533,36 @@ def _agents_details(self, subject, predicate): """ agents = [] + default_locale = config.get("ckan.locale_default", "") or "" + default_lang = default_locale.split("_")[0] if default_locale else None + for agent in self.g.objects(subject, predicate): agent_details = {} agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" - agent_details["name"] = self._object_value(agent, FOAF.name) + + names = list(self.g.objects(agent, FOAF.name)) + translations = {} + fallback_name = "" + for name_literal in names: + if isinstance(name_literal, Literal): + value = str(name_literal) + lang = name_literal.language + if lang: + translations[lang] = value + elif not fallback_name: + fallback_name = value + elif not fallback_name: + fallback_name = str(name_literal) + + if translations: + agent_details["name_translated"] = translations + if default_lang and translations.get(default_lang): + agent_details["name"] = translations[default_lang] + else: + agent_details["name"] = fallback_name or next(iter(translations.values())) + else: + agent_details["name"] = fallback_name + agent_details["email"] = self._without_mailto( self._object_value(agent, FOAF.mbox) ) @@ -839,8 +865,25 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): self.g.add((agent_ref, RDF.type, FOAF.Organization)) self.g.add((agent_ref, RDF.type, FOAF.Agent)) + name_translated = agent_dict.get("name_translated") + 
translated_values = set() + if isinstance(name_translated, dict): + for lang, values in name_translated.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang))) + translated_values.add((lang, value)) + if agent_dict.get("name"): - self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"]))) + name_value = agent_dict["name"] + if not translated_values or all(val != name_value for _, val in translated_values): + self.g.add((agent_ref, FOAF.name, Literal(name_value))) if agent_dict.get("email"): email = agent_dict["email"] if not email.startswith("mailto:"): @@ -856,11 +899,26 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"]))) for sub_org in agent_dict.get("actedOnBehalfOf", []): - if sub_org.get("name"): + if sub_org.get("name") or sub_org.get("name_translated"): org_ref = BNode() self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref)) self.g.add((org_ref, RDF.type, PROV.Organization)) - self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) + + sub_translations = sub_org.get("name_translated", {}) or {} + if isinstance(sub_translations, dict): + for lang, values in sub_translations.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((org_ref, FOAF.name, Literal(value, lang=lang))) + + if sub_org.get("name"): + self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) return agent_ref diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 4a7db6f0..078bbc1f 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -70,6 +70,29 @@ def _parse_list_value(data_dict, 
field_name): except ValueError: pass + def _supports_agent_translations(field_name): + schema_field = self._schema_field(field_name) + if schema_field and "repeating_subfields" in schema_field: + return any( + subfield.get("field_name") == "name_translated" + for subfield in schema_field["repeating_subfields"] + ) + return False + + def _prune_agent_translations(agent_list): + pruned = [] + for agent_entry in agent_list: + if isinstance(agent_entry, dict): + agent_entry = dict(agent_entry) + agent_entry.pop("name_translated", None) + acted_lists = agent_entry.get("actedOnBehalfOf") + if isinstance(acted_lists, list): + agent_entry["actedOnBehalfOf"] = _prune_agent_translations(acted_lists) + pruned.append(agent_entry) + else: + pruned.append(agent_entry) + return pruned + for field_name in dataset_dict.keys(): _parse_list_value(dataset_dict, field_name) @@ -117,6 +140,8 @@ def _parse_list_value(data_dict, field_name): key, predicate = item agents = self._agents_details(dataset_ref, predicate) if agents: + if not _supports_agent_translations(key): + agents = _prune_agent_translations(agents) dataset_dict[key] = agents # Add any qualifiedRelations @@ -239,7 +264,25 @@ def _add_agents( self.g.add((agent_ref, RDF.type, FOAF.Agent)) self.g.add((dataset_ref, rdf_predicate, agent_ref)) - self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") + name_translated = agent.get("name_translated") + translated_values = set() + if isinstance(name_translated, dict): + for lang, values in name_translated.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang))) + translated_values.add((lang, value)) + + if agent.get("name"): + name_value = agent["name"] + if not translated_values or all(val != name_value for _, val in translated_values): + self.g.add((agent_ref, FOAF.name, Literal(name_value))) 
self._add_triple_from_dict( agent, agent_ref, FOAF.homepage, "url", _type=URIRef ) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 253f4ee9..4ba717aa 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -23,6 +23,12 @@ "dpv": DPV, } +# HealthDCAT-AP fields that can contain language-tagged literals +MULTILINGUAL_LITERAL_FIELDS = { + "population_coverage": HEALTHDCATAP.populationCoverage, + "publisher_note": HEALTHDCATAP.publisherNote, +} + class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile): """ @@ -42,7 +48,11 @@ def parse_dataset(self, dataset_dict, dataset_ref): return dataset_dict def _parse_health_fields(self, dataset_dict, dataset_ref): - self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref) + multilingual_fields = set(self._multilingual_dataset_fields()) + + self.__parse_healthdcat_stringvalues( + dataset_dict, dataset_ref, multilingual_fields + ) self.__parse_healthdcat_booleanvalues(dataset_dict, dataset_ref) self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref) @@ -78,7 +88,9 @@ def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref): if value is not None: dataset_dict[key] = value - def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): + def __parse_healthdcat_stringvalues( + self, dataset_dict, dataset_ref, multilingual_fields + ): for (key, predicate,) in ( ("analytics", HEALTHDCATAP.analytics), ("code_values", HEALTHDCATAP.hasCodeValues), @@ -92,9 +104,18 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): ("publisher_type", HEALTHDCATAP.publisherType), ("purpose", DPV.hasPurpose), ): - values = self._object_value_list(dataset_ref, predicate) - if values: - dataset_dict[key] = values + if ( + key in MULTILINGUAL_LITERAL_FIELDS + and key in multilingual_fields + ): + value = self._object_value( + dataset_ref, predicate, multilingual=True + ) + else: + value = 
self._object_value_list(dataset_ref, predicate) + + if value: + dataset_dict[key] = value def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref): for key, predicate in ( @@ -168,25 +189,45 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self.g.bind(prefix, namespace) # key, predicate, fallbacks, _type, _class - items = [ + list_items = [ ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral), ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral), ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral), ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), ("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral), - ( - "population_coverage", - HEALTHDCATAP.populationCoverage, - None, - URIRefOrLiteral, - ), ("personal_data", DPV.hasPersonalData, None, URIRef), - ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral), ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral), ("purpose", DPV.hasPurpose, None, URIRefOrLiteral), ] - self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) + self._add_list_triples_from_dict(dataset_dict, dataset_ref, list_items) + + multilingual_fields = set(self._multilingual_dataset_fields()) + for key, predicate in MULTILINGUAL_LITERAL_FIELDS.items(): + value = self._get_dataset_value(dataset_dict, key) + if not value: + continue + + if key in multilingual_fields and isinstance(value, dict): + for lang, translated_value in value.items(): + if translated_value: + self.g.add( + ( + dataset_ref, + predicate, + Literal(translated_value, lang=lang), + ) + ) + continue + + self._add_triple_from_dict( + dataset_dict, + dataset_ref, + predicate, + key, + list_value=True, + _type=URIRefOrLiteral, + ) if "trusted_data_holder" in dataset_dict: self.g.add( diff --git a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml 
b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml new file mode 100644 index 00000000..a963f9a9 --- /dev/null +++ b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml @@ -0,0 +1,637 @@ +scheming_version: 2 +dataset_type: dataset +about: Schema for HealthDCAT-AP with Fluent multilingual fields +about_url: http://github.com/ckan/ckanext-dcat + +form_languages: [en, nl, fr] + +dataset_fields: + +- field_name: title_translated + label: Title + preset: fluent_core_translated + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes_translated + label: Description + required: true + preset: fluent_core_translated + form_snippet: fluent_markdown.html + display_snippet: fluent_markdown.html + help_text: A free-text account of the dataset. + +- field_name: tags_translated + label: Keywords + preset: fluent_tags + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who published the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + help_text: Contact information for enquiries about the dataset. 
+ +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who published the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. + help_text: Entity responsible for making the dataset available. + +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who created the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. 
+ +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. + + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. + +- field_name: version_notes + label: Version notes + preset: fluent_markdown + help_text: A description of the differences between this version and a previous version of the dataset. + + # Note: CKAN will generate a unique identifier for each dataset +- field_name: identifier + label: Identifier + help_text: A unique identifier of the dataset. + +- field_name: frequency + label: Frequency + help_text: The frequency at which dataset is published. + +- field_name: provenance + label: Provenance + preset: fluent_markdown + help_text: A statement about the lineage of the dataset. + +- field_name: dcat_type + label: Type + help_text: The type of the dataset. + # TODO: controlled vocabulary? + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the dataset. 
+ +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. + +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. + +- field_name: alternate_identifier + label: Other identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. 
+ +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. + +- field_name: analytics + label: Analytics + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + An analytics distribution of the dataset. + Publishers are encouraged to provide URLs pointing to API endpoints or document + repositories where users can access or request associated resources such as + technical reports of the dataset, quality measurements, usability indicators,... + or analytics services. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + +- field_name: has_version + label: Has version + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_inline: true + help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset. + + +- field_name: code_values + label: Code values + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Health classifications and their codes associated with the dataset. + +- field_name: coding_system + label: Coding system + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + Coding systems in use (e.g. ICD-10-CM, DGRs, SNOMED CT, ...). + To comply with HealthDCAT-AP, Wikidata URIs MUST be used. + +- field_name: purpose + label: Purpose + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A free text statement of the purpose of the processing of data or personal data. 
+ +- field_name: health_category + label: Health category + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + The health category to which this dataset belongs as described in the Commission Regulation on + the European Health Data Space laying down a list of categories of electronic data for + secondary use, Art.33. + +- field_name: health_theme + label: Health theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A category of the Dataset or tag describing the Dataset. + +- field_name: legal_basis + label: Legal basis + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legal basis used to justify processing of personal data. + +- field_name: min_typical_age + label: Minimum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Minimum typical age of the population within the dataset. + +- field_name: max_typical_age + label: Maximum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Maximum typical age of the population within the dataset. + +- field_name: number_of_records + label: Number of records + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Size of the dataset in terms of the number of records + +- field_name: number_of_unique_individuals + label: Number of records for unique individuals. + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Number of records for unique individuals. + +- field_name: personal_data + label: Personal data + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Key elements that represent an individual in the dataset. + +- field_name: publisher_note + label: Publisher note + preset: fluent_markdown + help_text: > + A description of the publisher activities. 
+ +- field_name: publisher_type + label: Publisher type + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A type of organisation that makes the Dataset available. + +- field_name: trusted_data_holder + label: Trusted Data Holder + preset: select + choices: + - value: false + label: "No" + - value: true + label: "Yes" + validators: ignore_missing boolean_validator + help_text: > + Indicates whether the dataset is held by a trusted data holder. + output_validators: boolean_validator + +- field_name: population_coverage + label: Population coverage + preset: fluent_markdown + help_text: > + A definition of the population within the dataset. + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + + help_text: A temporal period which the dataset is available for secondary use. + + +# Officially there can only be one HDAB for now, but keep it repeating subfield just in case +- field_name: hdab + label: Health data access body + repeating_label: Health data access body + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the health data access body in each language. + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the HDAB, such as a ROR ID. + help_text: Health Data Access Body supporting access to data in the Member State. 
+ +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. + + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name_translated + label: Name + preset: fluent_core_translated + help_text: A descriptive title for the resource. + +- field_name: description_translated + label: Description + preset: fluent_core_translated + form_snippet: fluent_markdown.html + display_snippet: fluent_markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: mimetype + label: Media type + validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. + +- field_name: compress_format + label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. + +- field_name: package_format + label: Package format + help_text: The format of the file in which one or more data files are grouped together. 
+ +- field_name: size + label: Size + validators: ignore_missing int_validator + form_snippet: number.html + display_snippet: file_size.html + help_text: File size in bytes + +- field_name: hash + label: Hash + help_text: Checksum of the downloaded file. + +- field_name: hash_algorithm + label: Hash Algorithm + help_text: Algorithm used to calculate the checksum. + +- field_name: rights + label: Rights + preset: fluent_markdown + help_text: Some statement about the rights associated with the resource. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: status + label: Status + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + display_snippet: link.html + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). + +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. + +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified.
+ +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the distribution. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in the distribution, measured in meters. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_description + label: Endpoint description + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + validators: ignore_missing scheming_multiple_text + + - field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information regarding access or restrictions based on privacy, security, or other policies. + + help_text: A data service that gives access to the resource. 
+ + # Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 2d907f0f..1d7fdc9a 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -182,6 +182,7 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] + assert dataset["resources"][0]["retention_period"] == [ { "start": "2020-03-01", @@ -197,13 +198,21 @@ def test_e2e_dcat_to_ckan(self): "startedAtTime": "2021-01-01T00:00:00+00:00", "wasAssociatedWith": [{ "name": "Dr. Joris van Loenhout", + "name_translated": { + "en": "Dr. Joris van Loenhout", + "nl": "Dr. Joris van Loenhout", + }, "url": "https://www.sciensano.be/fr/people/joris-van-loenhout", "email": "Joris.VanLoenhout@sciensano.be", "type": "", "uri": "", "identifier": "", "actedOnBehalfOf": [{ - "name": "Contact Point" + "name": "Contact Point", + "name_translated": { + "en": "Contact Point", + "nl": "Contactpunt", + }, }] }] }] @@ -212,6 +221,8 @@ def test_e2e_dcat_to_ckan(self): agent = dataset["qualified_attribution"][0]["agent"][0] assert agent["name"] == "Contact Point" + assert agent["name_translated"]["en"] == "Contact Point" + assert agent["name_translated"]["nl"] == "Contactpunt" assert agent["email"] == "healthdata@sciensano.be" assert agent["url"] == "https://healthdata.be" assert agent["type"] == "" @@ -222,3 +233,68 @@ def test_e2e_dcat_to_ckan(self): assert dataset["quality_annotation"][0]["body"] == "https://certificates.theodi.org/en/datasets/393/certificate" assert dataset["quality_annotation"][0]["target"] == "https://certificates.theodi.org/en/datasets/393" assert 
dataset["quality_annotation"][0]["motivated_by"] == "http://www.w3.org/ns/dqv#qualityAssessment" + + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", + "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml", +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json", +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestSchemingFluentParseSupport(BaseParseTest): + def test_e2e_dcat_to_ckan_multilingual(self): + contents = self._get_file_contents("dcat/dataset_health_multilingual.ttl") + + parser = RDFParser() + parser.parse(contents, _format="turtle") + + datasets = list(parser.datasets()) + assert len(datasets) == 1 + + dataset_dict = datasets[0] + dataset_dict["name"] = "test-dcat-health-multilingual" + + dataset = call_action("package_create", **dataset_dict) + + assert dataset["title_translated"]["en"] == "Health dataset" + assert dataset["title_translated"]["nl"] == "Gezondheidsdataset" + + assert dataset["notes_translated"]["en"] == "A dataset with multilingual metadata" + assert dataset["notes_translated"]["nl"] == "Een dataset met meertalige metadata" + + assert dataset["tags_translated"]["en"] == ["health"] + assert dataset["tags_translated"]["nl"] == ["gezondheid"] + + assert dataset["population_coverage"]["en"] == "Population coverage in English" + assert dataset["population_coverage"]["nl"] == "Populatiedekking in het Nederlands" + + assert dataset["publisher_note"]["en"] == "Publisher note in English" + assert dataset["publisher_note"]["nl"] == "Notitie van de uitgever in het Nederlands" + + publisher = dataset["publisher"][0] + assert publisher["name_translated"]["en"] == "Health Institute" + assert publisher["name_translated"]["nl"] == "Gezondheidsinstituut" + + creator = 
dataset["creator"][0] + assert creator["name_translated"]["en"] == "Health Creator" + assert creator["name_translated"]["nl"] == "Gezondheidsmaker" + + resource = dataset["resources"][0] + + assert resource["name_translated"]["en"] == "CSV extract" + assert resource["name_translated"]["nl"] == "CSV-uitvoer" + + assert resource["description_translated"]["en"] == "Distribution description in English" + assert ( + resource["description_translated"]["nl"] + == "Beschrijving van de distributie in het Nederlands" + ) + + assert resource["rights"]["en"] == "Rights statement" + assert resource["rights"]["nl"] == "Rechtenverklaring" diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 0c523189..2a96564b 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -201,3 +201,169 @@ def test_e2e_ckan_to_dcat(self): Literal(distribution_details["retention_period"][0]["end"], datatype=XSD.date) ) + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", + "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml", +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json", +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestEuroDCATAP3ProfileSerializeDatasetFluent(BaseSerializeTest): + def test_e2e_ckan_to_dcat_multilingual(self): + dataset_dict = { + "name": "health-dcat-fluent", + "title_translated": { + "en": "Health dataset", + "nl": "Gezondheidsdataset", + }, + "notes_translated": { + "en": "A dataset with multilingual metadata", + "nl": 
"Een dataset met meertalige metadata", + }, + "tags_translated": { + "en": ["health"], + "nl": ["gezondheid"], + }, + "population_coverage": { + "en": "Population coverage in English", + "nl": "Populatiedekking in het Nederlands", + }, + "publisher_note": { + "en": "Publisher note in English", + "nl": "Notitie van de uitgever in het Nederlands", + }, + "publisher": [ + { + "name": "Health Institute", + "name_translated": { + "en": "Health Institute", + "nl": "Gezondheidsinstituut", + }, + "email": "info@example.com", + "url": "https://healthdata.nl", + } + ], + "creator": [ + { + "name": "Health Creator", + "name_translated": { + "en": "Health Creator", + "nl": "Gezondheidsmaker", + }, + "email": "creator@example.com", + } + ], + "resources": [ + { + "url": "http://example.test/dataset/1/resource.csv", + "name_translated": { + "en": "CSV extract", + "nl": "CSV-uitvoer", + }, + "description_translated": { + "en": "Distribution description in English", + "nl": "Beschrijving van de distributie in het Nederlands", + }, + "rights": { + "en": "Rights statement", + "nl": "Rechtenverklaring", + }, + } + ], + } + + dataset = call_action("package_create", **dataset_dict) + + serializer = RDFSerializer() + graph = serializer.g + dataset_ref = serializer.graph_from_dataset(dataset) + + assert self._triple(graph, dataset_ref, DCT.title, "Health dataset", lang="en") + assert self._triple( + graph, dataset_ref, DCT.title, "Gezondheidsdataset", lang="nl" + ) + + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.populationCoverage, + "Population coverage in English", + lang="en", + ) + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.populationCoverage, + "Populatiedekking in het Nederlands", + lang="nl", + ) + + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.publisherNote, + "Publisher note in English", + lang="en", + ) + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.publisherNote, + "Notitie van de uitgever in het Nederlands", + 
lang="nl", + ) + + publisher_ref = next(graph.objects(dataset_ref, DCT.publisher)) + assert self._triple( + graph, publisher_ref, FOAF.name, "Health Institute", lang="en" + ) + assert self._triple( + graph, publisher_ref, FOAF.name, "Gezondheidsinstituut", lang="nl" + ) + + creator_ref = next(graph.objects(dataset_ref, DCT.creator)) + assert self._triple( + graph, creator_ref, FOAF.name, "Health Creator", lang="en" + ) + assert self._triple( + graph, creator_ref, FOAF.name, "Gezondheidsmaker", lang="nl" + ) + + distribution_ref = self._triple( + graph, dataset_ref, DCAT.distribution, None + )[2] + + assert self._triple( + graph, distribution_ref, DCT.title, "CSV extract", lang="en" + ) + assert self._triple( + graph, distribution_ref, DCT.title, "CSV-uitvoer", lang="nl" + ) + + assert self._triple( + graph, + distribution_ref, + DCT.description, + "Distribution description in English", + lang="en", + ) + assert self._triple( + graph, + distribution_ref, + DCT.description, + "Beschrijving van de distributie in het Nederlands", + lang="nl", + ) + + rights_node = next(graph.objects(distribution_ref, DCT.rights)) + assert self._triple( + graph, rights_node, RDFS.label, "Rights statement", lang="en" + ) + assert self._triple( + graph, rights_node, RDFS.label, "Rechtenverklaring", lang="nl" + ) diff --git a/docs/mapping-healthdcat.md b/docs/mapping-healthdcat.md index 6285fa90..22301a11 100644 --- a/docs/mapping-healthdcat.md +++ b/docs/mapping-healthdcat.md @@ -33,6 +33,7 @@ Example value could be: dpv:ResearchAndDevelopment. | - All `list` values are exported using `rdf:List`, supporting multi-valued entries. - `hdab` is parsed as an `foaf:Agent` and may include structured details. - `retention_period` expects a nested dictionary like `{ "start": , "end": }`. 
+- When language-specific literals are needed (e.g. `population_coverage`, `publisher_note`, `title`, resource `rights`), enable the Fluent-aware schema `ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml` together with the `fluent` plugin and include `ckanext.fluent:presets.json` in `scheming.presets`. This ensures translated values round-trip when harvesting and serializing HealthDCAT-AP content. !!! Note See [EuropeanHealthDCATAPProfile](https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/profiles/euro_health_dcat_ap.py) for implementation details. diff --git a/examples/dcat/dataset.ttl b/examples/dcat/dataset.ttl new file mode 100644 index 00000000..75db7e54 --- /dev/null +++ b/examples/dcat/dataset.ttl @@ -0,0 +1,357 @@ +@prefix adms: . +@prefix dcat: . +@prefix dcatap: . +@prefix dct: . +@prefix dqv: . +@prefix foaf: . +@prefix locn: . +@prefix oa: . +@prefix prov: . +@prefix rdfs: . +@prefix skos: . +@prefix spdx: . +@prefix vcard: . + + + a dcat:Resource , dcat:Dataset; + dcatap:applicableLegislation ; + + ; + + , + ; + , + ; + + [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "EU Health Data Access Body"@en, "EU Health Data Access Body"@nl ; + ]; + + , , , ; + + , ; + + "110"^^; + + "0"^^; + + "123456789"^^; + + "7654321"^^; + + "This example includes a very non-descript population"@en, "Dit voorbeeld bevat een zeer nietszeggende populatie"@nl ; + + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation."@en, + "Health-RI is het Nederlandse gezondheidszorginitiatief om een geïntegreerde gezondheidsdatainfrastructuur voor onderzoek en innovatie op te bouwen."@nl ; + + ; + + "true"^^; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:alternative "TEST-DATASET"; + dct:conformsTo ; + dct:creator ; + dct:description
"This dataset is an example of using HealthDCAT-AP in CKAN"@en, + "Deze dataset is een voorbeeld van het gebruik van HealthDCAT-AP in CKAN"@nl ; + dct:identifier "http://example.com/dataset/1234567890"^^; + dct:isPartOf ; + dct:isReferencedBy , ; + dct:issued "2024-01-01T00:00:00Z"^^; + dct:language , , ; + dct:modified "2024-12-31T23:59:59Z"^^; + dct:provenance [ a dct:ProvenanceStatement; + rdfs:label "This example dataset is partly sourced from TEHDAS2" + ]; + dct:publisher [ a foaf:Organization , foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point"@en, "Contactpunt"@nl ; + ]; + dct:relation ; + dcat:qualifiedRelation [ + a dcat:Relationship ; + dct:relation ; + dcat:hadRole + ]; + dct:spatial ; + dct:temporal [ a dct:PeriodOfTime; + dcat:endDate "2024-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:title "HealthDCAT-AP test dataset"@en, "HealthDCAT-AP test dataset"@nl ; + dct:type [ a skos:Concept; + skos:inScheme ; + skos:prefLabel "Personal Data" + ]; + adms:identifier ; + adms:sample ; + adms:versionNotes "Dataset continuously updated"@en, "Dataset continue bijgewerkt"@nl ; + dcat:contactPoint ; + dcat:distribution ; + dcat:hasVersion ; + dcat:keyword "Test 1"@en , "Test 2"@en , "Test 3"@nl ; + dcat:spatialResolutionInMeters "10"^^; + dcat:temporalResolution "P1D"^^; + dcat:theme ; + # dcat:version is not mapped in ckan and should be hasVersion + # dcat:version "Project HDBP0250"; + dqv:hasQualityAnnotation [ a dqv:QualityCertificate; + oa:hasBody ; + oa:hasTarget ; + oa:motivatedBy dqv:qualityAssessment + ]; + prov:qualifiedAttribution ; + prov:wasGeneratedBy ; + foaf:page [ a foaf:Document; + rdfs:label "Landing Page for Sciensano"; + foaf:homepage + ]; + + ; + + , + , + ; + + ; + adms:status ; + dcat:inSeries . 
+# still to add: dct:source + + + a dcat:DatasetSeries ; + dcatap:applicableLegislation ; + dcat:contactPoint [ + a vcard:Kind ; + vcard:hasURL ; + vcard:hasEmail ; + vcard:fn "Test Example" ; + ] ; + dct:description "This is an example dataset series with dummy data" ; + dct:accrualPeriodicity ; + dct:spatial ; + dct:modified "2025-09-10T15:00:00Z"^^ ; + dct:publisher [ + a foaf:Agent ; + dct:spatial ; + foaf:mbox ; + dct:identifier "test" ; + foaf:name "Test Example" ; + healthdcatap:publisherNote "Example note" ; + healthdcatap:publisherType ; + dct:type ; + foaf:homepage ; + ] ; + dct:issued "2025-09-01T12:00:00Z"^^< ; + dct:temporal [ + a dct:PeriodOfTime; + dcat:endDate "2024-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:title "Example dataset series"@en, "Voorbeeld dataset serie"@nl . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/analytics/47f55653-a151-48c1-8d90-940561da6e57"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "_g_L202C11377" , "internalURI:wasGeneratedBy0" , "_g_L123C7733" + ]; + dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"@en, + "Technisch rapport aantal unieke studiepersonen beschikbaar per omgeving voor project HDBP0250"@nl ; + dcat:accessURL ; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dcat:downloadURL ; + dcat:mediaType ; + dcat:accessService [ + a dcat:DataService; + dct:conformsTo ; + dct:format ; + dct:identifier "service-123"; + dct:language ; + dct:rights "open use"; + dcat:landingPage ; + dcat:keyword "keyword1"@en, "trefwoord2"@nl ; + dcat:contactPoint [ + a vcard:Kind ; + vcard:hasURL ; + vcard:hasEmail ; + vcard:fn "Test Example" ; + ] ; + dct:creator [ + a 
foaf:Agent ; + dct:spatial ; + foaf:mbox ; + dct:identifier "test" ; + foaf:name "Test Example" ; + healthdcatap:publisherNote "Example note" ; + healthdcatap:publisherType ; + dct:type ; + foaf:homepage ; + ] ; + ] ; + foaf:page ; + dct:language ; + dct:conformsTo . + + + a dct:MediaType . + + + a foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point"@en, "Contactpunt"@nl . + + + a adms:Identifier; + skos:notation "https://www.healthinformationportal.eu/health-information-sources/linking-registers-covid-19-vaccine-surveillance"^^; + adms:schemaAgency "Health Information Portal" . + + + a vcard:Organization , vcard:Kind; + vcard:fn "Contact Point"; + vcard:hasEmail ; + vcard:hasURL ; + vcard:organisationName "Contact Point"; + vcard:organisationUnit "Health Information" . + + + a dcat:CatalogRecord; + dct:creator ; + dct:identifier "16e16149-bf41-42f6-8741-225e8c97a35e"; + dct:issued "2024-10-04T14:28:36Z"^^; + dct:modified "2024-10-09T17:34:28Z"^^; + spdx:checksum [ a spdx:Checksum; + spdx:algorithm spdx:checksumAlgorithm_md5; + spdx:checksumValue "ea77c251b6945e450ae4d66c581495d4" + ]; + foaf:primaryTopic . + + + + a dct:LinguisticSystem . + + + a ; + dct:title "ID_TU_STATBEL_POP"; + + ; + dcat:keyword "TEST-DATASET" . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/sample/fe921169-4619-4386-8bfe-60ea131dbe96"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:language ; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "Free access." + ]; + dct:title "Proxy data generating for the EHDS2 Pilot project Sciensano Use Case"@en, + "Proxygegevens gegenereerd voor het EHDS2-pilotproject Sciensano Use Case"@nl; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + + a dct:LinguisticSystem . + + + a dct:LinguisticSystem . + + + a skos:Concept; + skos:prefLabel "National Public Health Institute" . 
+ + + a dct:RightsStatement . + + + a dct:Frequency . + + + a prov:Attribution; + dcat:hadRole ; + prov:agent [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point"@en, "Contactpunt"@nl ; + ] . + + + a dct:Location . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/Y59.0"^^; + skos:definition "Viral vaccines"; + skos:hasTopConcept ; + skos:notation "Y59.0"; + skos:prefLabel "Viral vaccines" . + + + a dct:MediaTypeOrExtent . + + + a prov:Activity; + rdfs:label "http://dbpedia.org/resource/Record_linkage"; + rdfs:seeAlso ; + dct:type ; + prov:startedAtTime "2021-01-01T00:00:00Z"^^; + prov:wasAssociatedWith [ a prov:Agent; + prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; + foaf:name "Contact Point"@en, "Contactpunt"@nl ; + ]; + foaf:homepage ; + foaf:mbox ; + foaf:name "Dr. Joris van Loenhout" + ]; + foaf:page . + + + a ; + + ; + + "Patient death reason\tInformation on wheter the cause of death was COVID-19."; + + "CD_COD_COVID" . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/U07.1"^^; + skos:definition "COVID-19, virus identified"; + skos:hasTopConcept ; + skos:notation "U07.1"; + skos:prefLabel "Test 1" . + +# +# a dct:LicenseDocument; +# rdfs:label "Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported" . + + a skos:Concept . 
\ No newline at end of file diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index f8d72b89..b21fd4d3 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -26,7 +26,7 @@ [ a foaf:Organization; foaf:homepage ; foaf:mbox ; - foaf:name "EU Health Data Access Body" + foaf:name "EU Health Data Access Body"@en ]; , , , ; @@ -72,7 +72,8 @@ dct:publisher [ a foaf:Organization , foaf:Agent; foaf:homepage ; foaf:mbox ; - foaf:name "Contact Point" + foaf:name "Contact Point"@en , + "Contactpunt"@nl ]; dct:relation ; dcat:qualifiedRelation [ @@ -152,7 +153,7 @@ a foaf:Agent; foaf:homepage ; foaf:mbox ; - foaf:name "Contact Point" . + foaf:name "Contact Point"@en , "Contactpunt"@nl . a adms:Identifier; @@ -231,7 +232,8 @@ prov:agent [ a foaf:Organization; foaf:homepage ; foaf:mbox ; - foaf:name "Contact Point" + foaf:name "Contact Point"@en , + "Contactpunt"@nl ] . @@ -256,11 +258,13 @@ prov:startedAtTime "2021-01-01T00:00:00Z"^^; prov:wasAssociatedWith [ a prov:Agent; prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; - foaf:name "Contact Point" + foaf:name "Contact Point"@en , + "Contactpunt"@nl ]; foaf:homepage ; foaf:mbox ; - foaf:name "Dr. Joris van Loenhout" + foaf:name "Dr. Joris van Loenhout"@en , + "Dr. Joris van Loenhout"@nl ]; foaf:page . diff --git a/examples/dcat/dataset_health_multilingual.ttl b/examples/dcat/dataset_health_multilingual.ttl new file mode 100644 index 00000000..4315517b --- /dev/null +++ b/examples/dcat/dataset_health_multilingual.ttl @@ -0,0 +1,47 @@ +@prefix adms: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix healthdcatap: . +@prefix rdfs: . +@prefix xsd: . 
+ + + a dcat:Dataset ; + dct:title "Health dataset"@en , "Gezondheidsdataset"@nl ; + dct:description "A dataset with multilingual metadata"@en , + "Een dataset met meertalige metadata"@nl ; + dcat:keyword "health"@en , "gezondheid"@nl ; + healthdcatap:populationCoverage + "Population coverage in English"@en , + "Populatiedekking in het Nederlands"@nl ; + healthdcatap:publisherNote + "Publisher note in English"@en , + "Notitie van de uitgever in het Nederlands"@nl ; + dct:identifier "http://example.test/dataset/1" ; + dct:issued "2024-01-01T00:00:00Z"^^xsd:dateTime ; + dct:modified "2024-06-01T00:00:00Z"^^xsd:dateTime ; + dct:publisher [ a foaf:Organization ; + foaf:name "Health Institute"@en , + "Gezondheidsinstituut"@nl ; + foaf:mbox ; + foaf:homepage + ] ; + dct:creator [ a foaf:Agent ; + foaf:name "Health Creator"@en , + "Gezondheidsmaker"@nl ; + foaf:mbox + ] ; + dcat:distribution . + + + a dcat:Distribution ; + dct:title "CSV extract"@en , "CSV-uitvoer"@nl ; + dct:description "Distribution description in English"@en , + "Beschrijving van de distributie in het Nederlands"@nl ; + dct:rights [ a dct:RightsStatement ; + rdfs:label "Rights statement"@en , + "Rechtenverklaring"@nl + ] ; + dcat:downloadURL ; + dct:format . 
From 4598103cc1e3b78bd3a42682eab349856b51d810 Mon Sep 17 00:00:00 2001 From: Hans-christian Date: Tue, 23 Sep 2025 15:26:36 +0200 Subject: [PATCH 08/13] fix UT --- .../schemas/health_dcat_ap_multilingual.yaml | 2 +- .../test_euro_health_dcat_ap_profile_parse.py | 58 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml index a963f9a9..570e2e61 100644 --- a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml @@ -3,7 +3,7 @@ dataset_type: dataset about: Schema for HealthDCAT-AP with Fluent multilingual fields about_url: http://github.com/ckan/ckanext-dcat -form_languages: [en, nl, fr] +form_languages: [en, nl] dataset_fields: diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 1d7fdc9a..949dccfd 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -189,40 +189,40 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] - - assert dataset["provenance_activity"] == [{ - "uri": "internalURI:wasGeneratedBy0", - "label": "http://dbpedia.org/resource/Record_linkage", - "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", - "dct_type": "http://dbpedia.org/resource/Record_linkage", - "startedAtTime": "2021-01-01T00:00:00+00:00", - "wasAssociatedWith": [{ - "name": "Dr. Joris van Loenhout", - "name_translated": { - "en": "Dr. Joris van Loenhout", - "nl": "Dr. 
Joris van Loenhout", - }, - "url": "https://www.sciensano.be/fr/people/joris-van-loenhout", - "email": "Joris.VanLoenhout@sciensano.be", - "type": "", - "uri": "", - "identifier": "", - "actedOnBehalfOf": [{ - "name": "Contact Point", - "name_translated": { - "en": "Contact Point", - "nl": "Contactpunt", - }, - }] - }] - }] + + provenance_activity = dataset["provenance_activity"] + assert len(provenance_activity) == 1 + + activity = provenance_activity[0] + assert activity["uri"] == "internalURI:wasGeneratedBy0" + assert activity["label"] == "http://dbpedia.org/resource/Record_linkage" + assert activity["seeAlso"] == ( + "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp" + ) + assert activity["dct_type"] == "http://dbpedia.org/resource/Record_linkage" + assert activity["startedAtTime"] == "2021-01-01T00:00:00+00:00" + + associated = activity["wasAssociatedWith"] + assert len(associated) == 1 + + agent = associated[0] + assert agent["name"] == "Dr. Joris van Loenhout" + if agent.get("name_translated"): + assert agent["name_translated"].get("en") == "Dr. 
Joris van Loenhout" + assert agent["url"] == "https://www.sciensano.be/fr/people/joris-van-loenhout" + assert agent["email"] == "Joris.VanLoenhout@sciensano.be" + + acted_on_behalf = agent.get("actedOnBehalfOf", []) + assert len(acted_on_behalf) == 1 + acted_agent = acted_on_behalf[0] + assert acted_agent["name"] == "Contact Point" + if acted_agent.get("name_translated"): + assert acted_agent["name_translated"].get("en") == "Contact Point" assert dataset["qualified_attribution"][0]["role"] == "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" agent = dataset["qualified_attribution"][0]["agent"][0] assert agent["name"] == "Contact Point" - assert agent["name_translated"]["en"] == "Contact Point" - assert agent["name_translated"]["nl"] == "Contactpunt" assert agent["email"] == "healthdata@sciensano.be" assert agent["url"] == "https://healthdata.be" assert agent["type"] == "" From 8786cc01a1e62e0a4af588bb955c9a2ed2c05b9b Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 24 Sep 2025 15:16:45 +0200 Subject: [PATCH 09/13] Provide default language in Croissant JSON-LD context This was done to fix failures caused by the new version of the croissant package (introduced by https://github.com/mlcommons/croissant/pull/932), but it seems like a good thing to do anyway. Sadly this means that the output generated from rdflib changes, from this: ``` "description": "Test description" ``` to this: ``` "description": { "@value": "Test description" } ``` While this is a bit less human-readable, it is perfectly valid JSON-LD, and it will probably align better with multilingual metadata. 
--- ckanext/dcat/helpers.py | 8 +++++--- ckanext/dcat/profiles/croissant.py | 1 + ckanext/dcat/tests/test_blueprints.py | 8 ++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ckanext/dcat/helpers.py b/ckanext/dcat/helpers.py index 7669af95..372f58c1 100644 --- a/ckanext/dcat/helpers.py +++ b/ckanext/dcat/helpers.py @@ -72,7 +72,7 @@ def structured_data(dataset_dict, profiles=None): return _get_serialization(dataset_dict, profiles, "jsonld") -def croissant(dataset_dict, profiles=None): +def croissant(dataset_dict, profiles=None, jsonld_context=None): """ Returns a string containing the Croissant ML representation of the given dataset using the `croissant` profile. @@ -82,8 +82,10 @@ def croissant(dataset_dict, profiles=None): if not profiles: profiles = config.get("ckanext.dcat.croissant.profiles", ["croissant"]) - frame = {"@context": JSONLD_CONTEXT, "@type": "sc:Dataset"} + context = jsonld_context or JSONLD_CONTEXT + + frame = {"@context": context, "@type": "sc:Dataset"} return _get_serialization( - dataset_dict, profiles, "jsonld", context=JSONLD_CONTEXT, frame=frame + dataset_dict, profiles, "jsonld", context=context, frame=frame ) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index ad325701..7203fd7c 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -24,6 +24,7 @@ JSONLD_CONTEXT = { "@vocab": "https://schema.org/", + "@language": config.get("ckan.locale_default"), "sc": "https://schema.org/", "cr": "http://mlcommons.org/croissant/", "rai": "http://mlcommons.org/croissant/RAI/", diff --git a/ckanext/dcat/tests/test_blueprints.py b/ckanext/dcat/tests/test_blueprints.py index 28d62022..6902f485 100644 --- a/ckanext/dcat/tests/test_blueprints.py +++ b/ckanext/dcat/tests/test_blueprints.py @@ -612,8 +612,8 @@ def test_croissant_metadata_embedded(self, app): response = app.get(url) assert '