diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cecb1d70..9f83c6d8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -68,7 +68,8 @@ jobs: pip install -r ckanext-harvest/requirements.txt git clone https://github.com/ckan/ckanext-scheming pip install -e ckanext-scheming - pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent + git clone https://github.com/ckan/ckanext-fluent + pip install -e ckanext-fluent git clone https://github.com/ckan/ckanext-dataset-series pip install -e ckanext-dataset-series - name: Setup extension @@ -76,4 +77,4 @@ jobs: ckan -c test.ini db init ckan -c test.ini db pending-migrations --apply - name: Run tests - run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests + run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests \ No newline at end of file diff --git a/.gitignore b/.gitignore index 7b7d96d3..90877266 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,9 @@ build/* tmp/* package/DEBIAN/control *.swp +.idea/.gitignore +.idea/ckanext-dcat.iml +.idea/misc.xml +.idea/modules.xml +.idea/vcs.xml +.idea/inspectionProfiles/profiles_settings.xml diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ffbd217..37925701 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,18 @@ # Changelog -## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v2.4.0...HEAD) +## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v2.4.1...HEAD) + +## [v2.4.1](https://github.com/ckan/ckanext-dcat/compare/v2.4.0...v2.4.1) - 2025-09-25 + +* Fix regression redirect from /dataset to /dataset_series ([#362](https://github.com/ckan/ckanext-dcat/pull/362)) +* Provide default language in Croissant JSON-LD context ([#361](https://github.com/ckan/ckanext-dcat/pull/361)) +* Added 
[`IDCATURIGenerator`](https://docs.ckan.org/projects/ckanext-dcat/en/latest/uri-customization/) + plugin interface to allow customization of the URIs generation ([#351](https://github.com/ckan/ckanext-dcat/pull/351)) +* Added support for new fields to DCAT classes: `dcat:Dataset` (`prov:wasGeneratedBy`, `prov:qualifiedAttribution`, + `dcat:hasVersion`), `dcat:Catalog` (`foaf:homepage`), `dcat:DataService` (`dct:conformsTo`, `dct:format`, + `dct:identifier`, `dct:language`, `dct:rights`, `dcat:landingPage`, `dcat:keyword`) ([#352](https://github.com/ckan/ckanext-dcat/pull/352)) +* Add HealthDCAT-AP mapping to CKAN field mapping table ([#347](https://github.com/ckan/ckanext-dcat/pull/347)) +* Docs: Add HealthDCAT-AP mapping to CKAN field mapping table ([#347](https://github.com/ckan/ckanext-dcat/pull/347)) ## [v2.4.0](https://github.com/ckan/ckanext-dcat/compare/v2.3.0...v2.4.0) - 2025-05-20 diff --git a/ckanext/dcat/blueprints.py b/ckanext/dcat/blueprints.py index e224fb22..62f3859f 100644 --- a/ckanext/dcat/blueprints.py +++ b/ckanext/dcat/blueprints.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from flask import Blueprint, jsonify, make_response import ckantoolkit as toolkit @@ -12,11 +11,7 @@ config = toolkit.config -dcat = Blueprint( - 'dcat', - __name__, - url_defaults={u'package_type': u'dataset'} -) +dcat = Blueprint("dcat", __name__, url_defaults={"package_type": "dataset"}) def read_catalog(_format=None, package_type=None): @@ -30,23 +25,31 @@ def read_dataset(_id, _format=None, package_type=None): if endpoints_enabled(): # requirements={'_format': 'xml|rdf|n3|ttl|jsonld'} - dcat.add_url_rule(config.get('ckanext.dcat.catalog_endpoint', - utils.DEFAULT_CATALOG_ENDPOINT).replace( - '{_format}', '<_format>'), - view_func=read_catalog) + dcat.add_url_rule( + config.get( + "ckanext.dcat.catalog_endpoint", utils.DEFAULT_CATALOG_ENDPOINT + ).replace("{_format}", "<_format>"), + view_func=read_catalog, + ) # TODO: Generalize for all dataset types - 
dcat.add_url_rule('/dataset_series/<_id>.<_format>', view_func=read_dataset) - dcat.add_url_rule('/dataset/<_id>.<_format>', view_func=read_dataset) + dcat.add_url_rule( + "/dataset_series/<_id>.<_format>", + view_func=read_dataset, + endpoint="read_dataset_series", + ) + dcat.add_url_rule( + "/dataset/<_id>.<_format>", view_func=read_dataset, endpoint="read_dataset" + ) if toolkit.asbool(config.get(utils.ENABLE_CONTENT_NEGOTIATION_CONFIG)): - dcat.add_url_rule('/', view_func=read_catalog) + dcat.add_url_rule("/", view_func=read_catalog) - dcat.add_url_rule('/dataset/new', view_func=CreateView.as_view(str(u'new'))) - dcat.add_url_rule('/dataset/<_id>', view_func=read_dataset) + dcat.add_url_rule("/dataset/new", view_func=CreateView.as_view(str("new"))) + dcat.add_url_rule("/dataset/<_id>", view_func=read_dataset) -dcat_json_interface = Blueprint('dcat_json_interface', __name__) +dcat_json_interface = Blueprint("dcat_json_interface", __name__) def dcat_json(): @@ -54,12 +57,12 @@ def dcat_json(): return jsonify(datasets) -dcat_json_interface.add_url_rule(config.get('ckanext.dcat.json_endpoint', - '/dcat.json'), - view_func=dcat_json) +dcat_json_interface.add_url_rule( + config.get("ckanext.dcat.json_endpoint", "/dcat.json"), view_func=dcat_json +) -croissant = Blueprint('croissant', __name__) +croissant = Blueprint("croissant", __name__) def read_dataset_croissant(_id): @@ -72,15 +75,14 @@ def read_dataset_croissant(_id): ) context = { - 'user': user_name, + "user": user_name, } - data_dict = {'id': _id} + data_dict = {"id": _id} dataset_dict = toolkit.get_action("package_show")(context, data_dict) except (toolkit.ObjectNotFound, toolkit.NotAuthorized): return toolkit.abort( - 404, - toolkit._("Dataset not found or you have no permission to view it") + 404, toolkit._("Dataset not found or you have no permission to view it") ) response = make_response(croissant_serialization(dataset_dict)) @@ -88,4 +90,7 @@ def read_dataset_croissant(_id): return response 
-croissant.add_url_rule('/dataset/<_id>/croissant.jsonld', view_func=read_dataset_croissant) + +croissant.add_url_rule( + "/dataset/<_id>/croissant.jsonld", view_func=read_dataset_croissant +) diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index 00a7f91a..a22e0b97 100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -210,18 +210,39 @@ def gather_stage(self, harvest_job): return [] try: - source_dataset = model.Package.get(harvest_job.source.id) - - series_ids, series_mapping = self._parse_and_collect( - parser.dataset_series(), - source_dataset, - harvest_job, - guids_in_source, - is_series=True, - collect_series_mapping=True - ) - object_ids += series_ids - object_ids += self._parse_and_collect(parser.datasets(series_mapping), source_dataset, harvest_job, guids_in_source, is_series=False) + + source_dataset = model.Package.get(harvest_job.source.id) + + for dataset in parser.datasets(): + if not dataset.get('name'): + dataset['name'] = self._gen_new_name(dataset['title']) + if dataset['name'] in self._names_taken: + suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1 + dataset['name'] = '{}-{}'.format(dataset['name'], suffix) + self._names_taken.append(dataset['name']) + + # Unless already set by the parser, get the owner organization (if any) + # from the harvest source dataset + if not dataset.get('owner_org'): + if source_dataset.owner_org: + dataset['owner_org'] = source_dataset.owner_org + + # Try to get a unique identifier for the harvested dataset + guid = self._get_guid(dataset, source_url=source_dataset.url) + + if not guid: + self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), + harvest_job) + continue + + dataset['extras'].append({'key': 'guid', 'value': guid}) + guids_in_source.append(guid) + + obj = HarvestObject(guid=guid, job=harvest_job, + content=json.dumps(dataset)) + + obj.save() + object_ids.append(obj.id) 
except Exception as e: self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job) @@ -401,70 +422,3 @@ def import_stage(self, harvest_object): model.Session.commit() return True - - def _parse_and_collect( - self, - items, - source_dataset, - harvest_job, - guids_in_source, - is_series=False, - collect_series_mapping=False - ): - object_ids = [] - label = "dataset series" if is_series else "dataset" - series_mapping = {} if collect_series_mapping else None - - for item in items: - original_title = item.get("title", label) - if not item.get("name"): - item["name"] = self._gen_new_name(original_title) - - if item["name"] in self._names_taken: - suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1 - item["name"] = f"{item['name']}-{suffix}" - - self._names_taken.append(item["name"]) - - if not item.get("owner_org") and source_dataset.owner_org: - item["owner_org"] = source_dataset.owner_org - - guid = self._get_guid(item, source_url=source_dataset.url) - if not guid: - self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job) - continue - - item.setdefault("extras", []).append({"key": "guid", "value": guid}) - guids_in_source.append(guid) - - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item)) - obj.save() - object_ids.append(obj.id) - - # Store mapping of RDF URI to dataset name if requested - if collect_series_mapping: - series_uri = item.get("uri") or item.get("identifier") - if series_uri: - # Try to find an existing active dataset series by 'guid' match - existing = model.Session.query(model.Package).\ - join(model.PackageExtra).\ - filter(model.PackageExtra.key == 'guid').\ - filter(model.PackageExtra.value == series_uri).\ - filter(model.Package.type == 'dataset_series').\ - filter(model.Package.state == 'active').\ - first() - - if existing: - item["name"] = existing.name - - series_mapping[str(series_uri)] = { - "id": 
existing.id if existing else item.get("id"), - "name": item["name"] - } - - - if collect_series_mapping: - return object_ids, series_mapping - - return object_ids - diff --git a/ckanext/dcat/helpers.py b/ckanext/dcat/helpers.py index 7669af95..372f58c1 100644 --- a/ckanext/dcat/helpers.py +++ b/ckanext/dcat/helpers.py @@ -72,7 +72,7 @@ def structured_data(dataset_dict, profiles=None): return _get_serialization(dataset_dict, profiles, "jsonld") -def croissant(dataset_dict, profiles=None): +def croissant(dataset_dict, profiles=None, jsonld_context=None): """ Returns a string containing the Croissant ML representation of the given dataset using the `croissant` profile. @@ -82,8 +82,10 @@ def croissant(dataset_dict, profiles=None): if not profiles: profiles = config.get("ckanext.dcat.croissant.profiles", ["croissant"]) - frame = {"@context": JSONLD_CONTEXT, "@type": "sc:Dataset"} + context = jsonld_context or JSONLD_CONTEXT + + frame = {"@context": context, "@type": "sc:Dataset"} return _get_serialization( - dataset_dict, profiles, "jsonld", context=JSONLD_CONTEXT, frame=frame + dataset_dict, profiles, "jsonld", context=context, frame=frame ) diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index d255d582..79f35821 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -119,16 +119,6 @@ def _datasets(self): for dataset in self.g.subjects(RDF.type, DCAT.Dataset): yield dataset - def _dataset_series(self): - ''' - Generator that returns all DCAT dataset series on the graph - - Yields rdflib.term.URIRef objects that can be used on graph lookups - and queries - ''' - for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries): - yield dataset_series - def next_page(self): ''' Returns the URL of the next page or None if there is no next page @@ -183,7 +173,7 @@ def supported_formats(self): for plugin in rdflib.plugin.plugins(kind=rdflib.parser.Parser)]) - def datasets(self, series_mapping=None): + def datasets(self): ''' 
Generator that returns CKAN datasets parsed from the RDF graph @@ -203,39 +193,6 @@ def datasets(self, series_mapping=None): ) profile.parse_dataset(dataset_dict, dataset_ref) - # Add in_series if present in RDF and mapped - in_series = [] - for series_ref in self.g.objects(dataset_ref, DCAT.inSeries): - key = str(series_ref) - if series_mapping and key in series_mapping: - in_series.append(series_mapping[key]["id"]) - - if in_series: - dataset_dict["in_series"] = in_series - - yield dataset_dict - - - def dataset_series(self): - ''' - Generator that returns CKAN dataset series parsed from the RDF graph - - Each dataset series is passed to all the loaded profiles before being - yielded, so it can be further modified by each one of them. - - Returns a dataset series dict that can be passed to eg `package_create` - or `package_update` - ''' - for dataset_ref in self._dataset_series(): - dataset_dict = {} - for profile_class in self._profiles: - profile = profile_class( - self.g, - dataset_type=self.dataset_type, - compatibility_mode=self.compatibility_mode - ) - profile.parse_dataset(dataset_dict, dataset_ref) - yield dataset_dict diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 5b0591e0..2dcdabe5 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -533,10 +533,36 @@ def _agents_details(self, subject, predicate): """ agents = [] + default_locale = config.get("ckan.locale_default", "") or "" + default_lang = default_locale.split("_")[0] if default_locale else None + for agent in self.g.objects(subject, predicate): agent_details = {} agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" - agent_details["name"] = self._object_value(agent, FOAF.name) + + names = list(self.g.objects(agent, FOAF.name)) + translations = {} + fallback_name = "" + for name_literal in names: + if isinstance(name_literal, Literal): + value = str(name_literal) + lang = name_literal.language + if lang: + 
translations[lang] = value + elif not fallback_name: + fallback_name = value + elif not fallback_name: + fallback_name = str(name_literal) + + if translations: + agent_details["name_translated"] = translations + if default_lang and translations.get(default_lang): + agent_details["name"] = translations[default_lang] + else: + agent_details["name"] = fallback_name or next(iter(translations.values())) + else: + agent_details["name"] = fallback_name + agent_details["email"] = self._without_mailto( self._object_value(agent, FOAF.mbox) ) @@ -839,8 +865,25 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): self.g.add((agent_ref, RDF.type, FOAF.Organization)) self.g.add((agent_ref, RDF.type, FOAF.Agent)) + name_translated = agent_dict.get("name_translated") + translated_values = set() + if isinstance(name_translated, dict): + for lang, values in name_translated.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang))) + translated_values.add((lang, value)) + if agent_dict.get("name"): - self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"]))) + name_value = agent_dict["name"] + if not translated_values or all(val != name_value for _, val in translated_values): + self.g.add((agent_ref, FOAF.name, Literal(name_value))) if agent_dict.get("email"): email = agent_dict["email"] if not email.startswith("mailto:"): @@ -856,11 +899,26 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict): self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"]))) for sub_org in agent_dict.get("actedOnBehalfOf", []): - if sub_org.get("name"): + if sub_org.get("name") or sub_org.get("name_translated"): org_ref = BNode() self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref)) self.g.add((org_ref, RDF.type, PROV.Organization)) - self.g.add((org_ref, FOAF.name, 
Literal(sub_org["name"]))) + + sub_translations = sub_org.get("name_translated", {}) or {} + if isinstance(sub_translations, dict): + for lang, values in sub_translations.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((org_ref, FOAF.name, Literal(value, lang=lang))) + + if sub_org.get("name"): + self.g.add((org_ref, FOAF.name, Literal(sub_org["name"]))) return agent_ref diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index ad325701..7203fd7c 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -24,6 +24,7 @@ JSONLD_CONTEXT = { "@vocab": "https://schema.org/", + "@language": config.get("ckan.locale_default"), "sc": "https://schema.org/", "cr": "http://mlcommons.org/croissant/", "rai": "http://mlcommons.org/croissant/RAI/", diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py index 64220430..a99cadfe 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_3.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py @@ -30,17 +30,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # DCAT AP v2 scheming fields dataset_dict = self._parse_dataset_v2_scheming(dataset_dict, dataset_ref) - - # Check if it's a dataset series - if (dataset_ref, RDF.type, DCAT.DatasetSeries) in self.g: - dataset_dict["type"] = "dataset_series" - - # Example defaulting logic (adjust based on RDF vocab if you have it) - if "series_order_field" not in dataset_dict: - dataset_dict["series_order_field"] = "metadata_created" - if "series_order_type" not in dataset_dict: - dataset_dict["series_order_type"] = "date" - # DCAT AP v3: hasVersion values = self._object_value_list(dataset_ref, DCAT.hasVersion) if values: diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 4a7db6f0..078bbc1f 100644 --- 
a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -70,6 +70,29 @@ def _parse_list_value(data_dict, field_name): except ValueError: pass + def _supports_agent_translations(field_name): + schema_field = self._schema_field(field_name) + if schema_field and "repeating_subfields" in schema_field: + return any( + subfield.get("field_name") == "name_translated" + for subfield in schema_field["repeating_subfields"] + ) + return False + + def _prune_agent_translations(agent_list): + pruned = [] + for agent_entry in agent_list: + if isinstance(agent_entry, dict): + agent_entry = dict(agent_entry) + agent_entry.pop("name_translated", None) + acted_lists = agent_entry.get("actedOnBehalfOf") + if isinstance(acted_lists, list): + agent_entry["actedOnBehalfOf"] = _prune_agent_translations(acted_lists) + pruned.append(agent_entry) + else: + pruned.append(agent_entry) + return pruned + for field_name in dataset_dict.keys(): _parse_list_value(dataset_dict, field_name) @@ -117,6 +140,8 @@ def _parse_list_value(data_dict, field_name): key, predicate = item agents = self._agents_details(dataset_ref, predicate) if agents: + if not _supports_agent_translations(key): + agents = _prune_agent_translations(agents) dataset_dict[key] = agents # Add any qualifiedRelations @@ -239,7 +264,25 @@ def _add_agents( self.g.add((agent_ref, RDF.type, FOAF.Agent)) self.g.add((dataset_ref, rdf_predicate, agent_ref)) - self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") + name_translated = agent.get("name_translated") + translated_values = set() + if isinstance(name_translated, dict): + for lang, values in name_translated.items(): + if not values: + continue + if isinstance(values, (list, tuple)): + iterable = values + else: + iterable = [values] + for value in iterable: + if value: + self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang))) + translated_values.add((lang, value)) + + if agent.get("name"): + name_value = agent["name"] + 
if not translated_values or all(val != name_value for _, val in translated_values): + self.g.add((agent_ref, FOAF.name, Literal(name_value))) self._add_triple_from_dict( agent, agent_ref, FOAF.homepage, "url", _type=URIRef ) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 253f4ee9..b80a5fe6 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -23,6 +23,12 @@ "dpv": DPV, } +# HealthDCAT-AP fields that can contain language-tagged literals +MULTILINGUAL_LITERAL_FIELDS = { + "population_coverage": HEALTHDCATAP.populationCoverage, + "publisher_note": HEALTHDCATAP.publisherNote, +} + class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile): """ @@ -42,7 +48,11 @@ def parse_dataset(self, dataset_dict, dataset_ref): return dataset_dict def _parse_health_fields(self, dataset_dict, dataset_ref): - self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref) + multilingual_fields = set(self._multilingual_dataset_fields()) + + self.__parse_healthdcat_stringvalues( + dataset_dict, dataset_ref, multilingual_fields + ) self.__parse_healthdcat_booleanvalues(dataset_dict, dataset_ref) self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref) @@ -78,7 +88,9 @@ def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref): if value is not None: dataset_dict[key] = value - def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): + def __parse_healthdcat_stringvalues( + self, dataset_dict, dataset_ref, multilingual_fields + ): for (key, predicate,) in ( ("analytics", HEALTHDCATAP.analytics), ("code_values", HEALTHDCATAP.hasCodeValues), @@ -92,9 +104,18 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): ("publisher_type", HEALTHDCATAP.publisherType), ("purpose", DPV.hasPurpose), ): - values = self._object_value_list(dataset_ref, predicate) - if values: - dataset_dict[key] = values + if ( + key in 
MULTILINGUAL_LITERAL_FIELDS + and key in multilingual_fields + ): + value = self._object_value( + dataset_ref, predicate, multilingual=True + ) + else: + value = self._object_value_list(dataset_ref, predicate) + + if value: + dataset_dict[key] = value def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref): for key, predicate in ( @@ -162,31 +183,52 @@ def _parse_retention_period(self, subject_ref): return [retention_dict] if retention_dict else [] + def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) for prefix, namespace in namespaces.items(): self.g.bind(prefix, namespace) # key, predicate, fallbacks, _type, _class - items = [ + list_items = [ ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral), ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral), ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral), ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), ("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral), - ( - "population_coverage", - HEALTHDCATAP.populationCoverage, - None, - URIRefOrLiteral, - ), ("personal_data", DPV.hasPersonalData, None, URIRef), - ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral), ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral), ("purpose", DPV.hasPurpose, None, URIRefOrLiteral), ] - self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) + self._add_list_triples_from_dict(dataset_dict, dataset_ref, list_items) + + multilingual_fields = set(self._multilingual_dataset_fields()) + for key, predicate in MULTILINGUAL_LITERAL_FIELDS.items(): + value = self._get_dataset_value(dataset_dict, key) + if not value: + continue + + if key in multilingual_fields and isinstance(value, dict): + for lang, translated_value in value.items(): + if translated_value: + self.g.add( + ( + dataset_ref, + 
predicate, + Literal(translated_value, lang=lang), + ) + ) + continue + + self._add_triple_from_dict( + dataset_dict, + dataset_ref, + predicate, + key, + list_value=True, + _type=URIRefOrLiteral, + ) if "trusted_data_holder" in dataset_dict: self.g.add( diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index bfeb5791..5181df94 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -733,8 +733,6 @@ resource_fields: - field_name: rights label: Rights - form_snippet: markdown.html - display_snippet: markdown.html preset: multiple_text validators: ignore_missing scheming_multiple_text diff --git a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml new file mode 100644 index 00000000..570e2e61 --- /dev/null +++ b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml @@ -0,0 +1,637 @@ +scheming_version: 2 +dataset_type: dataset +about: Schema for HealthDCAT-AP with Fluent multilingual fields +about_url: http://github.com/ckan/ckanext-dcat + +form_languages: [en, nl] + +dataset_fields: + +- field_name: title_translated + label: Title + preset: fluent_core_translated + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes_translated + label: Description + required: true + preset: fluent_core_translated + form_snippet: fluent_markdown.html + display_snippet: fluent_markdown.html + help_text: A free-text account of the dataset. + +- field_name: tags_translated + label: Keywords + preset: fluent_tags + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. 
+ +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who published the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who published the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. + help_text: Entity responsible for making the dataset available. + +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the entity or person who created the dataset in each language. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. 
+ + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. + + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. + +- field_name: version_notes + label: Version notes + preset: fluent_markdown + help_text: A description of the differences between this version and a previous version of the dataset. + + # Note: CKAN will generate a unique identifier for each dataset +- field_name: identifier + label: Identifier + help_text: A unique identifier of the dataset. + +- field_name: frequency + label: Frequency + help_text: The frequency at which dataset is published. 
+ +- field_name: provenance + label: Provenance + preset: fluent_markdown + help_text: A statement about the lineage of the dataset. + +- field_name: dcat_type + label: Type + help_text: The type of the dataset. + # TODO: controlled vocabulary? + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the dataset. + +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. + +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. + +- field_name: alternate_identifier + label: Other identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. 
+ # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. + +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. + +- field_name: analytics + label: Analytics + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + An analytics distribution of the dataset. + Publishers are encouraged to provide URLs pointing to API endpoints or document + repositories where users can access or request associated resources such as + technical reports of the dataset, quality measurements, usability indicators,... + or analytics services. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + +- field_name: has_version + label: Has version + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_inline: true + help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset. + + +- field_name: code_values + label: Code values + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Health classifications and their codes associated with the dataset. 
+ +- field_name: coding_system + label: Coding system + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + Coding systems in use (e.g. ICD-10-CM, DGRs, SNOMED CT, ...). + To comply with HealthDCAT-AP, Wikidata URIs MUST be used. + +- field_name: purpose + label: Purpose + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A free text statement of the purpose of the processing of data or personal data. + +- field_name: health_category + label: Health category + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + The health category to which this dataset belongs as described in the Commission Regulation on + the European Health Data Space laying down a list of categories of electronic data for + secondary use, Art.33. + +- field_name: health_theme + label: Health theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A category of the Dataset or tag describing the Dataset. + +- field_name: legal_basis + label: Legal basis + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legal basis used to justify processing of personal data. + +- field_name: min_typical_age + label: Minimum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Minimum typical age of the population within the dataset. + +- field_name: max_typical_age + label: Maximum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Maximum typical age of the population within the dataset. + +- field_name: number_of_records + label: Number of records + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Size of the dataset in terms of the number of records + +- field_name: number_of_unique_individuals + label: Number of records for unique individuals. 
+ validators: ignore_missing int_validator + form_snippet: number.html + help_text: Number of records for unique individuals. + +- field_name: personal_data + label: Personal data + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Key elements that represent an individual in the dataset. + +- field_name: publisher_note + label: Publisher note + preset: fluent_markdown + help_text: > + A description of the publisher activities. + +- field_name: publisher_type + label: Publisher type + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A type of organisation that makes the Dataset available. + +- field_name: trusted_data_holder + label: Trusted Data Holder + preset: select + choices: + - value: false + label: "No" + - value: true + label: "Yes" + validators: ignore_missing boolean_validator + help_text: > + Indicates whether the dataset is held by a trusted data holder. + output_validators: boolean_validator + +- field_name: population_coverage + label: Population coverage + preset: fluent_markdown + help_text: > + A definition of the population within the dataset. + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + + help_text: A temporal period which the dataset is available for secondary use. + + +# Officially there can only be one HDAB for now, but keep it repeating subfield just in case +- field_name: hdab + label: Health data access body + repeating_label: Health data access body + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: name_translated + label: Name (translations) + preset: fluent_core_translated + help_text: Name of the health data access body in each language. 
+ + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the HDAB, such as a ROR ID. + help_text: Health Data Access Body supporting access to data in the Member State. + +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. + + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name_translated + label: Name + preset: fluent_core_translated + help_text: A descriptive title for the resource. + +- field_name: description_translated + label: Description + preset: fluent_core_translated + form_snippet: fluent_markdown.html + display_snippet: fluent_markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: mimetype + label: Media type + validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. + +- field_name: compress_format + label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. 
+ +- field_name: package_format + label: Package format + help_text: The format of the file in which one or more data files are grouped together. + +- field_name: size + label: Size + validators: ignore_missing int_validator + form_snippet: number.html + display_snippet: file_size.html + help_text: File size in bytes + +- field_name: hash + label: Hash + help_text: Checksum of the downloaded file. + +- field_name: hash_algorithm + label: Hash Algorithm + help_text: Algorithm used to calculate the checksum. + +- field_name: rights + label: Rights + preset: fluent_markdown + help_text: Some statement about the rights associated with the resource. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: status + label: Status + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + display_snippet: link.html + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). + +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. 
+ +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the distribution. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in the distribution, measured in meters. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_description + label: Endpoint description + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + validators: ignore_missing scheming_multiple_text + + - field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information regarding access or restrictions based on privacy, security, or other policies. + + help_text: A data service that gives access to the resource. 
+ + # Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 2d907f0f..949dccfd 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -182,31 +182,42 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] + assert dataset["resources"][0]["retention_period"] == [ { "start": "2020-03-01", "end": "2034-12-31", } ] - - assert dataset["provenance_activity"] == [{ - "uri": "internalURI:wasGeneratedBy0", - "label": "http://dbpedia.org/resource/Record_linkage", - "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp", - "dct_type": "http://dbpedia.org/resource/Record_linkage", - "startedAtTime": "2021-01-01T00:00:00+00:00", - "wasAssociatedWith": [{ - "name": "Dr. 
Joris van Loenhout", - "url": "https://www.sciensano.be/fr/people/joris-van-loenhout", - "email": "Joris.VanLoenhout@sciensano.be", - "type": "", - "uri": "", - "identifier": "", - "actedOnBehalfOf": [{ - "name": "Contact Point" - }] - }] - }] + + provenance_activity = dataset["provenance_activity"] + assert len(provenance_activity) == 1 + + activity = provenance_activity[0] + assert activity["uri"] == "internalURI:wasGeneratedBy0" + assert activity["label"] == "http://dbpedia.org/resource/Record_linkage" + assert activity["seeAlso"] == ( + "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp" + ) + assert activity["dct_type"] == "http://dbpedia.org/resource/Record_linkage" + assert activity["startedAtTime"] == "2021-01-01T00:00:00+00:00" + + associated = activity["wasAssociatedWith"] + assert len(associated) == 1 + + agent = associated[0] + assert agent["name"] == "Dr. Joris van Loenhout" + if agent.get("name_translated"): + assert agent["name_translated"].get("en") == "Dr. 
Joris van Loenhout" + assert agent["url"] == "https://www.sciensano.be/fr/people/joris-van-loenhout" + assert agent["email"] == "Joris.VanLoenhout@sciensano.be" + + acted_on_behalf = agent.get("actedOnBehalfOf", []) + assert len(acted_on_behalf) == 1 + acted_agent = acted_on_behalf[0] + assert acted_agent["name"] == "Contact Point" + if acted_agent.get("name_translated"): + assert acted_agent["name_translated"].get("en") == "Contact Point" assert dataset["qualified_attribution"][0]["role"] == "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor" @@ -222,3 +233,68 @@ def test_e2e_dcat_to_ckan(self): assert dataset["quality_annotation"][0]["body"] == "https://certificates.theodi.org/en/datasets/393/certificate" assert dataset["quality_annotation"][0]["target"] == "https://certificates.theodi.org/en/datasets/393" assert dataset["quality_annotation"][0]["motivated_by"] == "http://www.w3.org/ns/dqv#qualityAssessment" + + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", + "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml", +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json", +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestSchemingFluentParseSupport(BaseParseTest): + def test_e2e_dcat_to_ckan_multilingual(self): + contents = self._get_file_contents("dcat/dataset_health_multilingual.ttl") + + parser = RDFParser() + parser.parse(contents, _format="turtle") + + datasets = list(parser.datasets()) + assert len(datasets) == 1 + + dataset_dict = datasets[0] + dataset_dict["name"] = "test-dcat-health-multilingual" + + dataset = call_action("package_create", **dataset_dict) + + assert dataset["title_translated"]["en"] == "Health dataset" + assert 
dataset["title_translated"]["nl"] == "Gezondheidsdataset" + + assert dataset["notes_translated"]["en"] == "A dataset with multilingual metadata" + assert dataset["notes_translated"]["nl"] == "Een dataset met meertalige metadata" + + assert dataset["tags_translated"]["en"] == ["health"] + assert dataset["tags_translated"]["nl"] == ["gezondheid"] + + assert dataset["population_coverage"]["en"] == "Population coverage in English" + assert dataset["population_coverage"]["nl"] == "Populatiedekking in het Nederlands" + + assert dataset["publisher_note"]["en"] == "Publisher note in English" + assert dataset["publisher_note"]["nl"] == "Notitie van de uitgever in het Nederlands" + + publisher = dataset["publisher"][0] + assert publisher["name_translated"]["en"] == "Health Institute" + assert publisher["name_translated"]["nl"] == "Gezondheidsinstituut" + + creator = dataset["creator"][0] + assert creator["name_translated"]["en"] == "Health Creator" + assert creator["name_translated"]["nl"] == "Gezondheidsmaker" + + resource = dataset["resources"][0] + + assert resource["name_translated"]["en"] == "CSV extract" + assert resource["name_translated"]["nl"] == "CSV-uitvoer" + + assert resource["description_translated"]["en"] == "Distribution description in English" + assert ( + resource["description_translated"]["nl"] + == "Beschrijving van de distributie in het Nederlands" + ) + + assert resource["rights"]["en"] == "Rights statement" + assert resource["rights"]["nl"] == "Rechtenverklaring" diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 0c523189..2a96564b 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -201,3 +201,169 @@ def test_e2e_ckan_to_dcat(self): 
Literal(distribution_details["retention_period"][0]["end"], datatype=XSD.date) ) + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", + "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml", +) +@pytest.mark.ckan_config( + "scheming.presets", + "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json", +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestEuroDCATAP3ProfileSerializeDatasetFluent(BaseSerializeTest): + def test_e2e_ckan_to_dcat_multilingual(self): + dataset_dict = { + "name": "health-dcat-fluent", + "title_translated": { + "en": "Health dataset", + "nl": "Gezondheidsdataset", + }, + "notes_translated": { + "en": "A dataset with multilingual metadata", + "nl": "Een dataset met meertalige metadata", + }, + "tags_translated": { + "en": ["health"], + "nl": ["gezondheid"], + }, + "population_coverage": { + "en": "Population coverage in English", + "nl": "Populatiedekking in het Nederlands", + }, + "publisher_note": { + "en": "Publisher note in English", + "nl": "Notitie van de uitgever in het Nederlands", + }, + "publisher": [ + { + "name": "Health Institute", + "name_translated": { + "en": "Health Institute", + "nl": "Gezondheidsinstituut", + }, + "email": "info@example.com", + "url": "https://healthdata.nl", + } + ], + "creator": [ + { + "name": "Health Creator", + "name_translated": { + "en": "Health Creator", + "nl": "Gezondheidsmaker", + }, + "email": "creator@example.com", + } + ], + "resources": [ + { + "url": "http://example.test/dataset/1/resource.csv", + "name_translated": { + "en": "CSV extract", + "nl": "CSV-uitvoer", + }, + "description_translated": { + "en": "Distribution description in English", + "nl": "Beschrijving van de distributie in het Nederlands", + }, + "rights": { + "en": "Rights statement", + "nl": "Rechtenverklaring", + 
}, + } + ], + } + + dataset = call_action("package_create", **dataset_dict) + + serializer = RDFSerializer() + graph = serializer.g + dataset_ref = serializer.graph_from_dataset(dataset) + + assert self._triple(graph, dataset_ref, DCT.title, "Health dataset", lang="en") + assert self._triple( + graph, dataset_ref, DCT.title, "Gezondheidsdataset", lang="nl" + ) + + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.populationCoverage, + "Population coverage in English", + lang="en", + ) + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.populationCoverage, + "Populatiedekking in het Nederlands", + lang="nl", + ) + + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.publisherNote, + "Publisher note in English", + lang="en", + ) + assert self._triple( + graph, + dataset_ref, + HEALTHDCATAP.publisherNote, + "Notitie van de uitgever in het Nederlands", + lang="nl", + ) + + publisher_ref = next(graph.objects(dataset_ref, DCT.publisher)) + assert self._triple( + graph, publisher_ref, FOAF.name, "Health Institute", lang="en" + ) + assert self._triple( + graph, publisher_ref, FOAF.name, "Gezondheidsinstituut", lang="nl" + ) + + creator_ref = next(graph.objects(dataset_ref, DCT.creator)) + assert self._triple( + graph, creator_ref, FOAF.name, "Health Creator", lang="en" + ) + assert self._triple( + graph, creator_ref, FOAF.name, "Gezondheidsmaker", lang="nl" + ) + + distribution_ref = self._triple( + graph, dataset_ref, DCAT.distribution, None + )[2] + + assert self._triple( + graph, distribution_ref, DCT.title, "CSV extract", lang="en" + ) + assert self._triple( + graph, distribution_ref, DCT.title, "CSV-uitvoer", lang="nl" + ) + + assert self._triple( + graph, + distribution_ref, + DCT.description, + "Distribution description in English", + lang="en", + ) + assert self._triple( + graph, + distribution_ref, + DCT.description, + "Beschrijving van de distributie in het Nederlands", + lang="nl", + ) + + rights_node = 
next(graph.objects(distribution_ref, DCT.rights)) + assert self._triple( + graph, rights_node, RDFS.label, "Rights statement", lang="en" + ) + assert self._triple( + graph, rights_node, RDFS.label, "Rechtenverklaring", lang="nl" + ) diff --git a/ckanext/dcat/tests/test_blueprints.py b/ckanext/dcat/tests/test_blueprints.py index 28d62022..594b58fb 100644 --- a/ckanext/dcat/tests/test_blueprints.py +++ b/ckanext/dcat/tests/test_blueprints.py @@ -23,15 +23,22 @@ def _sort_query_params(url): parts = urlparse(url) qs = parse_qs(parts.query) ordered_qs = OrderedDict(sorted(qs.items())) - encoded_qs = urlencode(ordered_qs).replace('u%27', '%27') + encoded_qs = urlencode(ordered_qs).replace("u%27", "%27") return urlunparse( - (parts.scheme, parts.netloc, parts.path, parts.params, - encoded_qs, parts.fragment) + ( + parts.scheme, + parts.netloc, + parts.path, + parts.params, + encoded_qs, + parts.fragment, + ) ) -@pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index') -class TestEndpoints(): + +@pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index") +class TestEndpoints: def _object_value(self, graph, subject, predicate): @@ -40,22 +47,20 @@ def _object_value(self, graph, subject, predicate): def test_dataset_default(self, app): - dataset = factories.Dataset( - notes='Test dataset' - ) + dataset = factories.Dataset(notes="Test dataset") - url = url_for('dcat.read_dataset', _id=dataset['name'], _format='rdf') + url = url_for("dcat.read_dataset", _id=dataset["name"], _format="rdf") response = app.get(url) - assert response.headers['Content-Type'] == 'application/rdf+xml' + assert response.headers["Content-Type"] == "application/rdf+xml" content = response.body # Parse the contents to check it's an actual serialization p = RDFParser() - p.parse(content, _format='xml') + p.parse(content, _format="xml") dcat_datasets = [d for d in p.datasets()] @@ -63,27 +68,54 @@ def test_dataset_default(self, app): dcat_dataset = dcat_datasets[0] - assert 
dcat_dataset['title'] == dataset['title'] - assert dcat_dataset['notes'] == dataset['notes'] + assert dcat_dataset["title"] == dataset["title"] + assert dcat_dataset["notes"] == dataset["notes"] - def test_dataset_xml(self, app): + def test_dataset_default_no_redirects(self, app): + + dataset = factories.Dataset(notes="Test dataset") + + url = url_for("dcat.read_dataset", _id=dataset["name"], _format="rdf") + + assert url.startswith("/dataset/") + def test_dataset_default_private(self, app): + user = factories.UserWithToken() + org = factories.Organization( + users=[{"name": user["name"], "capacity": "admin"}] + ) dataset = factories.Dataset( - notes='Test dataset' + notes="Test dataset", owner_org=org["id"], private=True ) - url = url_for('dcat.read_dataset', _id=dataset['name'], _format='xml') + url = url_for("dcat.read_dataset", _id=dataset["name"], _format="rdf") + + # Unauthenticated request + response = app.get(url) + assert response.status_code == 403 + + # Authenticated request + headers = {"Authorization": user["token"]} + response = app.get(url, headers=headers) + + assert response.headers["Content-Type"] == "application/rdf+xml" + + def test_dataset_xml(self, app): + + dataset = factories.Dataset(notes="Test dataset") + + url = url_for("dcat.read_dataset", _id=dataset["name"], _format="xml") response = app.get(url) - assert response.headers['Content-Type'] == 'application/rdf+xml' + assert response.headers["Content-Type"] == "application/rdf+xml" content = response.body # Parse the contents to check it's an actual serialization p = RDFParser() - p.parse(content, _format='xml') + p.parse(content, _format="xml") dcat_datasets = [d for d in p.datasets()] @@ -91,27 +123,25 @@ def test_dataset_xml(self, app): dcat_dataset = dcat_datasets[0] - assert dcat_dataset['title'] == dataset['title'] - assert dcat_dataset['notes'] == dataset['notes'] + assert dcat_dataset["title"] == dataset["title"] + assert dcat_dataset["notes"] == dataset["notes"] def 
test_dataset_ttl(self, app): - dataset = factories.Dataset( - notes='Test dataset' - ) + dataset = factories.Dataset(notes="Test dataset") - url = url_for('dcat.read_dataset', _id=dataset['name'], _format='ttl') + url = url_for("dcat.read_dataset", _id=dataset["name"], _format="ttl") response = app.get(url) - assert response.headers['Content-Type'] == 'text/turtle' + assert response.headers["Content-Type"] == "text/turtle" content = response.body # Parse the contents to check it's an actual serialization p = RDFParser() - p.parse(content, _format='turtle') + p.parse(content, _format="turtle") dcat_datasets = [d for d in p.datasets()] @@ -119,27 +149,25 @@ def test_dataset_ttl(self, app): dcat_dataset = dcat_datasets[0] - assert dcat_dataset['title'] == dataset['title'] - assert dcat_dataset['notes'] == dataset['notes'] + assert dcat_dataset["title"] == dataset["title"] + assert dcat_dataset["notes"] == dataset["notes"] def test_dataset_n3(self, app): - dataset = factories.Dataset( - notes='Test dataset' - ) + dataset = factories.Dataset(notes="Test dataset") - url = url_for('dcat.read_dataset', _id=dataset['name'], _format='n3') + url = url_for("dcat.read_dataset", _id=dataset["name"], _format="n3") response = app.get(url) - assert response.headers['Content-Type'] == 'text/n3' + assert response.headers["Content-Type"] == "text/n3" content = response.body # Parse the contents to check it's an actual serialization p = RDFParser() - p.parse(content, _format='n3') + p.parse(content, _format="n3") dcat_datasets = [d for d in p.datasets()] @@ -147,27 +175,25 @@ def test_dataset_n3(self, app): dcat_dataset = dcat_datasets[0] - assert dcat_dataset['title'] == dataset['title'] - assert dcat_dataset['notes'] == dataset['notes'] + assert dcat_dataset["title"] == dataset["title"] + assert dcat_dataset["notes"] == dataset["notes"] def test_dataset_jsonld(self, app): - dataset = factories.Dataset( - notes='Test dataset' - ) + dataset = factories.Dataset(notes="Test dataset") - 
url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld') + url = url_for("dcat.read_dataset", _id=dataset["name"], _format="jsonld") response = app.get(url) - assert response.headers['Content-Type'] == 'application/ld+json' + assert response.headers["Content-Type"] == "application/ld+json" content = response.body # Parse the contents to check it's an actual serialization p = RDFParser() - p.parse(content, _format='json-ld') + p.parse(content, _format="json-ld") dcat_datasets = [d for d in p.datasets()] @@ -175,49 +201,52 @@ def test_dataset_jsonld(self, app): dcat_dataset = dcat_datasets[0] - assert dcat_dataset['title'] == dataset['title'] - assert dcat_dataset['notes'] == dataset['notes'] + assert dcat_dataset["title"] == dataset["title"] + assert dcat_dataset["notes"] == dataset["notes"] def test_dataset_profiles_jsonld(self, app): - dataset = factories.Dataset( - notes='Test dataset' - ) + dataset = factories.Dataset(notes="Test dataset") - url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld', profiles='schemaorg') + url = url_for( + "dcat.read_dataset", + _id=dataset["name"], + _format="jsonld", + profiles="schemaorg", + ) response = app.get(url) - assert response.headers['Content-Type'] == 'application/ld+json' + assert response.headers["Content-Type"] == "application/ld+json" content = response.body assert '"@type": "schema:Dataset"' in content - assert '"schema:description": "%s"' % dataset['notes'] in content + assert '"schema:description": "%s"' % dataset["notes"] in content def test_dataset_profiles_not_found(self, app): - dataset = factories.Dataset( - notes='Test dataset' - ) + dataset = factories.Dataset(notes="Test dataset") - url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld', profiles='nope') + url = url_for( + "dcat.read_dataset", _id=dataset["name"], _format="jsonld", profiles="nope" + ) response = app.get(url, status=409) - assert 'Unknown RDF profiles: nope' in response.body + assert 
"Unknown RDF profiles: nope" in response.body def test_dataset_not_found(self, app): import uuid - url = url_for('dcat.read_dataset', _id=str(uuid.uuid4()), _format='n3') + url = url_for("dcat.read_dataset", _id=str(uuid.uuid4()), _format="n3") app.get(url, status=404) def test_dataset_form_is_rendered(self, app): sysadmin = factories.Sysadmin() - env = {'REMOTE_USER': sysadmin['name'].encode('ascii')} - url = url_for('dataset.new') + env = {"REMOTE_USER": sysadmin["name"].encode("ascii")} + url = url_for("dataset.new") response = app.get(url, extra_environ=env) @@ -225,26 +254,23 @@ def test_dataset_form_is_rendered(self, app): assert '' in response.body -@pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index') -class TestCroissant(): +@pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index") +class TestCroissant: - @pytest.mark.ckan_config('ckan.plugins', 'dcat croissant') + @pytest.mark.ckan_config("ckan.plugins", "dcat croissant") def test_croissant_metadata_embedded(self, app): - dataset = factories.Dataset( - notes='test description' - ) + dataset = factories.Dataset(notes="test description") - url = url_for('dataset.read', id=dataset['name']) + url = url_for("dataset.read", id=dataset["name"]) response = app.get(url) assert '