diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index cecb1d70..9f83c6d8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -68,7 +68,8 @@ jobs:
pip install -r ckanext-harvest/requirements.txt
git clone https://github.com/ckan/ckanext-scheming
pip install -e ckanext-scheming
- pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent
+ git clone https://github.com/ckan/ckanext-fluent
+ pip install -e ckanext-fluent
git clone https://github.com/ckan/ckanext-dataset-series
pip install -e ckanext-dataset-series
- name: Setup extension
@@ -76,4 +77,4 @@ jobs:
ckan -c test.ini db init
ckan -c test.ini db pending-migrations --apply
- name: Run tests
- run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests
+ run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 7b7d96d3..90877266 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,9 @@ build/*
tmp/*
package/DEBIAN/control
*.swp
+.idea/.gitignore
+.idea/ckanext-dcat.iml
+.idea/misc.xml
+.idea/modules.xml
+.idea/vcs.xml
+.idea/inspectionProfiles/profiles_settings.xml
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ffbd217..37925701 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,18 @@
# Changelog
-## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v2.4.0...HEAD)
+## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v2.4.1...HEAD)
+
+## [v2.4.1](https://github.com/ckan/ckanext-dcat/compare/v2.4.0...v2.4.1) - 2025-09-25
+
+* Fix regression causing redirects from /dataset to /dataset_series ([#362](https://github.com/ckan/ckanext-dcat/pull/362))
+* Provide default language in Croissant JSON-LD context ([#361](https://github.com/ckan/ckanext-dcat/pull/361))
+* Added [`IDCATURIGenerator`](https://docs.ckan.org/projects/ckanext-dcat/en/latest/uri-customization/)
+  plugin interface to allow customization of URI generation ([#351](https://github.com/ckan/ckanext-dcat/pull/351))
+* Added support for new fields to DCAT classes: `dcat:Dataset` (`prov:wasGeneratedBy`, `prov:qualifiedAttribution`,
+ `dcat:hasVersion`), `dcat:Catalog` (`foaf:homepage`), `dcat:DataService` (`dct:conformsTo`, `dct:format`,
+ `dct:identifier`, `dct:language`, `dct:rights`, `dcat:landingPage`, `dcat:keyword`) ([#352](https://github.com/ckan/ckanext-dcat/pull/352))
+* Docs: Add HealthDCAT-AP mapping to CKAN field mapping table ([#347](https://github.com/ckan/ckanext-dcat/pull/347))
## [v2.4.0](https://github.com/ckan/ckanext-dcat/compare/v2.3.0...v2.4.0) - 2025-05-20
diff --git a/ckanext/dcat/blueprints.py b/ckanext/dcat/blueprints.py
index e224fb22..62f3859f 100644
--- a/ckanext/dcat/blueprints.py
+++ b/ckanext/dcat/blueprints.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
from flask import Blueprint, jsonify, make_response
import ckantoolkit as toolkit
@@ -12,11 +11,7 @@
config = toolkit.config
-dcat = Blueprint(
- 'dcat',
- __name__,
- url_defaults={u'package_type': u'dataset'}
-)
+dcat = Blueprint("dcat", __name__, url_defaults={"package_type": "dataset"})
def read_catalog(_format=None, package_type=None):
@@ -30,23 +25,31 @@ def read_dataset(_id, _format=None, package_type=None):
if endpoints_enabled():
# requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}
- dcat.add_url_rule(config.get('ckanext.dcat.catalog_endpoint',
- utils.DEFAULT_CATALOG_ENDPOINT).replace(
- '{_format}', '<_format>'),
- view_func=read_catalog)
+ dcat.add_url_rule(
+ config.get(
+ "ckanext.dcat.catalog_endpoint", utils.DEFAULT_CATALOG_ENDPOINT
+ ).replace("{_format}", "<_format>"),
+ view_func=read_catalog,
+ )
# TODO: Generalize for all dataset types
- dcat.add_url_rule('/dataset_series/<_id>.<_format>', view_func=read_dataset)
- dcat.add_url_rule('/dataset/<_id>.<_format>', view_func=read_dataset)
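+    # Explicit endpoint names keep the /dataset and /dataset_series rules
+    # distinct in Flask's URL map, fixing the redirect regression (#362)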
+ dcat.add_url_rule(
+ "/dataset_series/<_id>.<_format>",
+ view_func=read_dataset,
+ endpoint="read_dataset_series",
+ )
+ dcat.add_url_rule(
+ "/dataset/<_id>.<_format>", view_func=read_dataset, endpoint="read_dataset"
+ )
if toolkit.asbool(config.get(utils.ENABLE_CONTENT_NEGOTIATION_CONFIG)):
- dcat.add_url_rule('/', view_func=read_catalog)
+ dcat.add_url_rule("/", view_func=read_catalog)
- dcat.add_url_rule('/dataset/new', view_func=CreateView.as_view(str(u'new')))
- dcat.add_url_rule('/dataset/<_id>', view_func=read_dataset)
+ dcat.add_url_rule("/dataset/new", view_func=CreateView.as_view(str("new")))
+ dcat.add_url_rule("/dataset/<_id>", view_func=read_dataset)
-dcat_json_interface = Blueprint('dcat_json_interface', __name__)
+dcat_json_interface = Blueprint("dcat_json_interface", __name__)
def dcat_json():
@@ -54,12 +57,12 @@ def dcat_json():
return jsonify(datasets)
-dcat_json_interface.add_url_rule(config.get('ckanext.dcat.json_endpoint',
- '/dcat.json'),
- view_func=dcat_json)
+dcat_json_interface.add_url_rule(
+ config.get("ckanext.dcat.json_endpoint", "/dcat.json"), view_func=dcat_json
+)
-croissant = Blueprint('croissant', __name__)
+croissant = Blueprint("croissant", __name__)
def read_dataset_croissant(_id):
@@ -72,15 +75,14 @@ def read_dataset_croissant(_id):
)
context = {
- 'user': user_name,
+ "user": user_name,
}
- data_dict = {'id': _id}
+ data_dict = {"id": _id}
dataset_dict = toolkit.get_action("package_show")(context, data_dict)
except (toolkit.ObjectNotFound, toolkit.NotAuthorized):
return toolkit.abort(
- 404,
- toolkit._("Dataset not found or you have no permission to view it")
+ 404, toolkit._("Dataset not found or you have no permission to view it")
)
response = make_response(croissant_serialization(dataset_dict))
@@ -88,4 +90,7 @@ def read_dataset_croissant(_id):
return response
-croissant.add_url_rule('/dataset/<_id>/croissant.jsonld', view_func=read_dataset_croissant)
+
+croissant.add_url_rule(
+ "/dataset/<_id>/croissant.jsonld", view_func=read_dataset_croissant
+)
diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py
index 00a7f91a..a22e0b97 100644
--- a/ckanext/dcat/harvesters/rdf.py
+++ b/ckanext/dcat/harvesters/rdf.py
@@ -210,18 +210,39 @@ def gather_stage(self, harvest_job):
return []
try:
- source_dataset = model.Package.get(harvest_job.source.id)
-
- series_ids, series_mapping = self._parse_and_collect(
- parser.dataset_series(),
- source_dataset,
- harvest_job,
- guids_in_source,
- is_series=True,
- collect_series_mapping=True
- )
- object_ids += series_ids
- object_ids += self._parse_and_collect(parser.datasets(series_mapping), source_dataset, harvest_job, guids_in_source, is_series=False)
+
+ source_dataset = model.Package.get(harvest_job.source.id)
+
+ for dataset in parser.datasets():
+ if not dataset.get('name'):
+ dataset['name'] = self._gen_new_name(dataset['title'])
+ if dataset['name'] in self._names_taken:
+ suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1
+ dataset['name'] = '{}-{}'.format(dataset['name'], suffix)
+ self._names_taken.append(dataset['name'])
+
+ # Unless already set by the parser, get the owner organization (if any)
+ # from the harvest source dataset
+ if not dataset.get('owner_org'):
+ if source_dataset.owner_org:
+ dataset['owner_org'] = source_dataset.owner_org
+
+ # Try to get a unique identifier for the harvested dataset
+ guid = self._get_guid(dataset, source_url=source_dataset.url)
+
+ if not guid:
+ self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
+ harvest_job)
+ continue
+
+ dataset['extras'].append({'key': 'guid', 'value': guid})
+ guids_in_source.append(guid)
+
+ obj = HarvestObject(guid=guid, job=harvest_job,
+ content=json.dumps(dataset))
+
+ obj.save()
+ object_ids.append(obj.id)
except Exception as e:
            self._save_gather_error('Error when processing dataset: %r / %s' % (e, traceback.format_exc()),
harvest_job)
@@ -401,70 +422,3 @@ def import_stage(self, harvest_object):
model.Session.commit()
return True
-
- def _parse_and_collect(
- self,
- items,
- source_dataset,
- harvest_job,
- guids_in_source,
- is_series=False,
- collect_series_mapping=False
- ):
- object_ids = []
- label = "dataset series" if is_series else "dataset"
- series_mapping = {} if collect_series_mapping else None
-
- for item in items:
- original_title = item.get("title", label)
- if not item.get("name"):
- item["name"] = self._gen_new_name(original_title)
-
- if item["name"] in self._names_taken:
- suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1
- item["name"] = f"{item['name']}-{suffix}"
-
- self._names_taken.append(item["name"])
-
- if not item.get("owner_org") and source_dataset.owner_org:
- item["owner_org"] = source_dataset.owner_org
-
- guid = self._get_guid(item, source_url=source_dataset.url)
- if not guid:
- self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job)
- continue
-
- item.setdefault("extras", []).append({"key": "guid", "value": guid})
- guids_in_source.append(guid)
-
- obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item))
- obj.save()
- object_ids.append(obj.id)
-
- # Store mapping of RDF URI to dataset name if requested
- if collect_series_mapping:
- series_uri = item.get("uri") or item.get("identifier")
- if series_uri:
- # Try to find an existing active dataset series by 'guid' match
- existing = model.Session.query(model.Package).\
- join(model.PackageExtra).\
- filter(model.PackageExtra.key == 'guid').\
- filter(model.PackageExtra.value == series_uri).\
- filter(model.Package.type == 'dataset_series').\
- filter(model.Package.state == 'active').\
- first()
-
- if existing:
- item["name"] = existing.name
-
- series_mapping[str(series_uri)] = {
- "id": existing.id if existing else item.get("id"),
- "name": item["name"]
- }
-
-
- if collect_series_mapping:
- return object_ids, series_mapping
-
- return object_ids
-
diff --git a/ckanext/dcat/helpers.py b/ckanext/dcat/helpers.py
index 7669af95..372f58c1 100644
--- a/ckanext/dcat/helpers.py
+++ b/ckanext/dcat/helpers.py
@@ -72,7 +72,7 @@ def structured_data(dataset_dict, profiles=None):
return _get_serialization(dataset_dict, profiles, "jsonld")
-def croissant(dataset_dict, profiles=None):
+def croissant(dataset_dict, profiles=None, jsonld_context=None):
"""
Returns a string containing the Croissant ML representation of the given
dataset using the `croissant` profile.
@@ -82,8 +82,10 @@ def croissant(dataset_dict, profiles=None):
if not profiles:
profiles = config.get("ckanext.dcat.croissant.profiles", ["croissant"])
- frame = {"@context": JSONLD_CONTEXT, "@type": "sc:Dataset"}
+ context = jsonld_context or JSONLD_CONTEXT
+
+ frame = {"@context": context, "@type": "sc:Dataset"}
return _get_serialization(
- dataset_dict, profiles, "jsonld", context=JSONLD_CONTEXT, frame=frame
+ dataset_dict, profiles, "jsonld", context=context, frame=frame
)
diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py
index d255d582..79f35821 100644
--- a/ckanext/dcat/processors.py
+++ b/ckanext/dcat/processors.py
@@ -119,16 +119,6 @@ def _datasets(self):
for dataset in self.g.subjects(RDF.type, DCAT.Dataset):
yield dataset
- def _dataset_series(self):
- '''
- Generator that returns all DCAT dataset series on the graph
-
- Yields rdflib.term.URIRef objects that can be used on graph lookups
- and queries
- '''
- for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries):
- yield dataset_series
-
def next_page(self):
'''
Returns the URL of the next page or None if there is no next page
@@ -183,7 +173,7 @@ def supported_formats(self):
for plugin
in rdflib.plugin.plugins(kind=rdflib.parser.Parser)])
- def datasets(self, series_mapping=None):
+ def datasets(self):
'''
Generator that returns CKAN datasets parsed from the RDF graph
@@ -203,39 +193,6 @@ def datasets(self, series_mapping=None):
)
profile.parse_dataset(dataset_dict, dataset_ref)
- # Add in_series if present in RDF and mapped
- in_series = []
- for series_ref in self.g.objects(dataset_ref, DCAT.inSeries):
- key = str(series_ref)
- if series_mapping and key in series_mapping:
- in_series.append(series_mapping[key]["id"])
-
- if in_series:
- dataset_dict["in_series"] = in_series
-
- yield dataset_dict
-
-
- def dataset_series(self):
- '''
- Generator that returns CKAN dataset series parsed from the RDF graph
-
- Each dataset series is passed to all the loaded profiles before being
- yielded, so it can be further modified by each one of them.
-
- Returns a dataset series dict that can be passed to eg `package_create`
- or `package_update`
- '''
- for dataset_ref in self._dataset_series():
- dataset_dict = {}
- for profile_class in self._profiles:
- profile = profile_class(
- self.g,
- dataset_type=self.dataset_type,
- compatibility_mode=self.compatibility_mode
- )
- profile.parse_dataset(dataset_dict, dataset_ref)
-
yield dataset_dict
diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py
index 5b0591e0..2dcdabe5 100644
--- a/ckanext/dcat/profiles/base.py
+++ b/ckanext/dcat/profiles/base.py
@@ -533,10 +533,36 @@ def _agents_details(self, subject, predicate):
"""
agents = []
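+        # Derive the two-letter default language (e.g. "en" from "en_GB"),
+        # used below to pick which translation becomes the plain "name"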
+ default_locale = config.get("ckan.locale_default", "") or ""
+ default_lang = default_locale.split("_")[0] if default_locale else None
+
for agent in self.g.objects(subject, predicate):
agent_details = {}
agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else ""
- agent_details["name"] = self._object_value(agent, FOAF.name)
+
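+        # Language-tagged FOAF.name literals become translations; the first
+        # untagged literal is kept as a fallback display name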
+ names = list(self.g.objects(agent, FOAF.name))
+ translations = {}
+ fallback_name = ""
+ for name_literal in names:
+ if isinstance(name_literal, Literal):
+ value = str(name_literal)
+ lang = name_literal.language
+ if lang:
+ translations[lang] = value
+ elif not fallback_name:
+ fallback_name = value
+ elif not fallback_name:
+ fallback_name = str(name_literal)
+
+ if translations:
+ agent_details["name_translated"] = translations
+ if default_lang and translations.get(default_lang):
+ agent_details["name"] = translations[default_lang]
+ else:
+ agent_details["name"] = fallback_name or next(iter(translations.values()))
+ else:
+ agent_details["name"] = fallback_name
+
agent_details["email"] = self._without_mailto(
self._object_value(agent, FOAF.mbox)
)
@@ -839,8 +865,25 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict):
self.g.add((agent_ref, RDF.type, FOAF.Organization))
self.g.add((agent_ref, RDF.type, FOAF.Agent))
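+        # Emit one language-tagged FOAF.name per translation, tracking the
+        # values so the untagged name below is only added when it differs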
+ name_translated = agent_dict.get("name_translated")
+ translated_values = set()
+ if isinstance(name_translated, dict):
+ for lang, values in name_translated.items():
+ if not values:
+ continue
+ if isinstance(values, (list, tuple)):
+ iterable = values
+ else:
+ iterable = [values]
+ for value in iterable:
+ if value:
+ self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang)))
+ translated_values.add((lang, value))
+
if agent_dict.get("name"):
- self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"])))
+ name_value = agent_dict["name"]
+ if not translated_values or all(val != name_value for _, val in translated_values):
+ self.g.add((agent_ref, FOAF.name, Literal(name_value)))
if agent_dict.get("email"):
email = agent_dict["email"]
if not email.startswith("mailto:"):
@@ -856,11 +899,26 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict):
self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"])))
for sub_org in agent_dict.get("actedOnBehalfOf", []):
- if sub_org.get("name"):
+ if sub_org.get("name") or sub_org.get("name_translated"):
org_ref = BNode()
self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref))
self.g.add((org_ref, RDF.type, PROV.Organization))
- self.g.add((org_ref, FOAF.name, Literal(sub_org["name"])))
+
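+            # Acted-on-behalf-of organisations get the same translated-name
+            # treatment as the parent agent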
+ sub_translations = sub_org.get("name_translated", {}) or {}
+ if isinstance(sub_translations, dict):
+ for lang, values in sub_translations.items():
+ if not values:
+ continue
+ if isinstance(values, (list, tuple)):
+ iterable = values
+ else:
+ iterable = [values]
+ for value in iterable:
+ if value:
+ self.g.add((org_ref, FOAF.name, Literal(value, lang=lang)))
+
+ if sub_org.get("name"):
+ self.g.add((org_ref, FOAF.name, Literal(sub_org["name"])))
return agent_ref
diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index ad325701..7203fd7c 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -24,6 +24,7 @@
JSONLD_CONTEXT = {
"@vocab": "https://schema.org/",
+ "@language": config.get("ckan.locale_default"),
"sc": "https://schema.org/",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
diff --git a/ckanext/dcat/profiles/euro_dcat_ap_3.py b/ckanext/dcat/profiles/euro_dcat_ap_3.py
index 64220430..a99cadfe 100644
--- a/ckanext/dcat/profiles/euro_dcat_ap_3.py
+++ b/ckanext/dcat/profiles/euro_dcat_ap_3.py
@@ -30,17 +30,6 @@ def parse_dataset(self, dataset_dict, dataset_ref):
# DCAT AP v2 scheming fields
dataset_dict = self._parse_dataset_v2_scheming(dataset_dict, dataset_ref)
-
- # Check if it's a dataset series
- if (dataset_ref, RDF.type, DCAT.DatasetSeries) in self.g:
- dataset_dict["type"] = "dataset_series"
-
- # Example defaulting logic (adjust based on RDF vocab if you have it)
- if "series_order_field" not in dataset_dict:
- dataset_dict["series_order_field"] = "metadata_created"
- if "series_order_type" not in dataset_dict:
- dataset_dict["series_order_type"] = "date"
-
# DCAT AP v3: hasVersion
values = self._object_value_list(dataset_ref, DCAT.hasVersion)
if values:
diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py
index 4a7db6f0..078bbc1f 100644
--- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py
+++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py
@@ -70,6 +70,29 @@ def _parse_list_value(data_dict, field_name):
except ValueError:
pass
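+    # Only expose name_translated when the schema field defines that
+    # repeating subfield; otherwise strip it so validation passes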
+ def _supports_agent_translations(field_name):
+ schema_field = self._schema_field(field_name)
+ if schema_field and "repeating_subfields" in schema_field:
+ return any(
+ subfield.get("field_name") == "name_translated"
+ for subfield in schema_field["repeating_subfields"]
+ )
+ return False
+
+ def _prune_agent_translations(agent_list):
+ pruned = []
+ for agent_entry in agent_list:
+ if isinstance(agent_entry, dict):
+ agent_entry = dict(agent_entry)
+ agent_entry.pop("name_translated", None)
+ acted_lists = agent_entry.get("actedOnBehalfOf")
+ if isinstance(acted_lists, list):
+ agent_entry["actedOnBehalfOf"] = _prune_agent_translations(acted_lists)
+ pruned.append(agent_entry)
+ else:
+ pruned.append(agent_entry)
+ return pruned
+
for field_name in dataset_dict.keys():
_parse_list_value(dataset_dict, field_name)
@@ -117,6 +140,8 @@ def _parse_list_value(data_dict, field_name):
key, predicate = item
agents = self._agents_details(dataset_ref, predicate)
if agents:
+ if not _supports_agent_translations(key):
+ agents = _prune_agent_translations(agents)
dataset_dict[key] = agents
# Add any qualifiedRelations
@@ -239,7 +264,25 @@ def _add_agents(
self.g.add((agent_ref, RDF.type, FOAF.Agent))
self.g.add((dataset_ref, rdf_predicate, agent_ref))
- self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name")
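+        # Mirror the base profile: tagged names first, then the untagged
+        # name only when it is not already covered by a translation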
+ name_translated = agent.get("name_translated")
+ translated_values = set()
+ if isinstance(name_translated, dict):
+ for lang, values in name_translated.items():
+ if not values:
+ continue
+ if isinstance(values, (list, tuple)):
+ iterable = values
+ else:
+ iterable = [values]
+ for value in iterable:
+ if value:
+ self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang)))
+ translated_values.add((lang, value))
+
+ if agent.get("name"):
+ name_value = agent["name"]
+ if not translated_values or all(val != name_value for _, val in translated_values):
+ self.g.add((agent_ref, FOAF.name, Literal(name_value)))
self._add_triple_from_dict(
agent, agent_ref, FOAF.homepage, "url", _type=URIRef
)
diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py
index 253f4ee9..b80a5fe6 100644
--- a/ckanext/dcat/profiles/euro_health_dcat_ap.py
+++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py
@@ -23,6 +23,12 @@
"dpv": DPV,
}
+# HealthDCAT-AP fields that can contain language-tagged literals
+MULTILINGUAL_LITERAL_FIELDS = {
+ "population_coverage": HEALTHDCATAP.populationCoverage,
+ "publisher_note": HEALTHDCATAP.publisherNote,
+}
+
class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile):
"""
@@ -42,7 +48,11 @@ def parse_dataset(self, dataset_dict, dataset_ref):
return dataset_dict
def _parse_health_fields(self, dataset_dict, dataset_ref):
- self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref)
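+        # Resolve which schema fields are fluent/multilingual up front so
+        # the string parsing below can return {lang: value} dicts for them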
+ multilingual_fields = set(self._multilingual_dataset_fields())
+
+ self.__parse_healthdcat_stringvalues(
+ dataset_dict, dataset_ref, multilingual_fields
+ )
self.__parse_healthdcat_booleanvalues(dataset_dict, dataset_ref)
self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref)
@@ -78,7 +88,9 @@ def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref):
if value is not None:
dataset_dict[key] = value
- def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref):
+ def __parse_healthdcat_stringvalues(
+ self, dataset_dict, dataset_ref, multilingual_fields
+ ):
for (key, predicate,) in (
("analytics", HEALTHDCATAP.analytics),
("code_values", HEALTHDCATAP.hasCodeValues),
@@ -92,9 +104,18 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref):
("publisher_type", HEALTHDCATAP.publisherType),
("purpose", DPV.hasPurpose),
):
- values = self._object_value_list(dataset_ref, predicate)
- if values:
- dataset_dict[key] = values
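+            # population_coverage and publisher_note may be multilingual:
+            # read them as language-tagged literals when the schema says so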
+ if (
+ key in MULTILINGUAL_LITERAL_FIELDS
+ and key in multilingual_fields
+ ):
+ value = self._object_value(
+ dataset_ref, predicate, multilingual=True
+ )
+ else:
+ value = self._object_value_list(dataset_ref, predicate)
+
+ if value:
+ dataset_dict[key] = value
def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref):
for key, predicate in (
@@ -162,31 +183,52 @@ def _parse_retention_period(self, subject_ref):
return [retention_dict] if retention_dict else []
+
def graph_from_dataset(self, dataset_dict, dataset_ref):
super().graph_from_dataset(dataset_dict, dataset_ref)
for prefix, namespace in namespaces.items():
self.g.bind(prefix, namespace)
# key, predicate, fallbacks, _type, _class
- items = [
+ list_items = [
("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral),
("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral),
("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral),
("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral),
- (
- "population_coverage",
- HEALTHDCATAP.populationCoverage,
- None,
- URIRefOrLiteral,
- ),
("personal_data", DPV.hasPersonalData, None, URIRef),
- ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral),
("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral),
("purpose", DPV.hasPurpose, None, URIRefOrLiteral),
]
- self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)
+ self._add_list_triples_from_dict(dataset_dict, dataset_ref, list_items)
+
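+        # Fluent fields arrive as {lang: text} dicts; serialize each entry
+        # as a language-tagged literal, else fall back to list handling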
+ multilingual_fields = set(self._multilingual_dataset_fields())
+ for key, predicate in MULTILINGUAL_LITERAL_FIELDS.items():
+ value = self._get_dataset_value(dataset_dict, key)
+ if not value:
+ continue
+
+ if key in multilingual_fields and isinstance(value, dict):
+ for lang, translated_value in value.items():
+ if translated_value:
+ self.g.add(
+ (
+ dataset_ref,
+ predicate,
+ Literal(translated_value, lang=lang),
+ )
+ )
+ continue
+
+ self._add_triple_from_dict(
+ dataset_dict,
+ dataset_ref,
+ predicate,
+ key,
+ list_value=True,
+ _type=URIRefOrLiteral,
+ )
if "trusted_data_holder" in dataset_dict:
self.g.add(
diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml
index bfeb5791..5181df94 100644
--- a/ckanext/dcat/schemas/health_dcat_ap.yaml
+++ b/ckanext/dcat/schemas/health_dcat_ap.yaml
@@ -733,8 +733,6 @@ resource_fields:
- field_name: rights
label: Rights
- form_snippet: markdown.html
- display_snippet: markdown.html
preset: multiple_text
validators: ignore_missing scheming_multiple_text
diff --git a/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml
new file mode 100644
index 00000000..570e2e61
--- /dev/null
+++ b/ckanext/dcat/schemas/health_dcat_ap_multilingual.yaml
@@ -0,0 +1,637 @@
+scheming_version: 2
+dataset_type: dataset
+about: Schema for HealthDCAT-AP with Fluent multilingual fields
+about_url: http://github.com/ckan/ckanext-dcat
+
+form_languages: [en, nl]
+
+dataset_fields:
+
+- field_name: title_translated
+ label: Title
+ preset: fluent_core_translated
+ required: true
+ help_text: A descriptive title for the dataset.
+
+- field_name: name
+ label: URL
+ preset: dataset_slug
+ form_placeholder: eg. my-dataset
+
+- field_name: notes_translated
+ label: Description
+ required: true
+ preset: fluent_core_translated
+ form_snippet: fluent_markdown.html
+ display_snippet: fluent_markdown.html
+ help_text: A free-text account of the dataset.
+
+- field_name: tags_translated
+ label: Keywords
+ preset: fluent_tags
+ form_placeholder: eg. economy, mental health, government
+ help_text: Keywords or tags describing the dataset. Use commas to separate multiple values.
+
+- field_name: contact
+ label: Contact points
+ repeating_label: Contact point
+ repeating_subfields:
+
+ - field_name: uri
+ label: URI
+
+ - field_name: name
+ label: Name
+
+ - field_name: name_translated
+ label: Name (translations)
+ preset: fluent_core_translated
+ help_text: Name of the entity or person who published the dataset in each language.
+
+ - field_name: email
+ label: Email
+ display_snippet: email.html
+
+ - field_name: identifier
+ label: Identifier
+    help_text: Unique identifier for the contact point, such as a ROR ID.
+
+ help_text: Contact information for enquiries about the dataset.
+
+- field_name: publisher
+ label: Publisher
+ repeating_label: Publisher
+ repeating_once: true
+ repeating_subfields:
+
+ - field_name: uri
+ label: URI
+
+ - field_name: name
+ label: Name
+
+ - field_name: name_translated
+ label: Name (translations)
+ preset: fluent_core_translated
+ help_text: Name of the entity or person who published the dataset in each language.
+
+ - field_name: email
+ label: Email
+ display_snippet: email.html
+
+ - field_name: url
+ label: URL
+ display_snippet: link.html
+
+ - field_name: type
+ label: Type
+
+ - field_name: identifier
+ label: Identifier
+ help_text: Unique identifier for the publisher, such as a ROR ID.
+ help_text: Entity responsible for making the dataset available.
+
+- field_name: creator
+ label: Creator
+ repeating_label: Creator
+ repeating_once: true
+ repeating_subfields:
+
+ - field_name: uri
+ label: URI
+ help_text: URI of the creator, if available.
+
+ - field_name: name
+ label: Name
+ help_text: Name of the entity or person who created the dataset.
+
+ - field_name: name_translated
+ label: Name (translations)
+ preset: fluent_core_translated
+ help_text: Name of the entity or person who created the dataset in each language.
+
+ - field_name: email
+ label: Email
+ display_snippet: email.html
+ help_text: Contact email of the creator.
+
+ - field_name: url
+ label: URL
+ display_snippet: link.html
+ help_text: URL for more information about the creator.
+
+ - field_name: type
+ label: Type
+ help_text: Type of creator (e.g., Organization, Person).
+
+ - field_name: identifier
+ label: Identifier
+ help_text: Unique identifier for the creator, such as an ORCID or ROR ID.
+
+- field_name: license_id
+ label: License
+ form_snippet: license.html
+ help_text: License definitions and additional information can be found at http://opendefinition.org/.
+
+- field_name: owner_org
+ label: Organization
+ preset: dataset_organization
+ help_text: The CKAN organization the dataset belongs to.
+
+- field_name: url
+ label: Landing page
+ form_placeholder: http://example.com/dataset.json
+ display_snippet: link.html
+ help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information.
+
+ # Note: this will fall back to metadata_created if not present
+- field_name: issued
+ label: Release date
+ preset: dcat_date
+ help_text: Date of publication of the dataset.
+
+ # Note: this will fall back to metadata_modified if not present
+- field_name: modified
+ label: Modification date
+ preset: dcat_date
+ help_text: Most recent date on which the dataset was changed, updated or modified.
+
+- field_name: version
+ label: Version
+ validators: ignore_missing unicode_safe package_version_validator
+ help_text: Version number or other version designation of the dataset.
+
+- field_name: version_notes
+ label: Version notes
+ preset: fluent_markdown
+ help_text: A description of the differences between this version and a previous version of the dataset.
+
+ # Note: CKAN will generate a unique identifier for each dataset
+- field_name: identifier
+ label: Identifier
+ help_text: A unique identifier of the dataset.
+
+- field_name: frequency
+ label: Frequency
+  help_text: The frequency at which the dataset is published.
+
+- field_name: provenance
+ label: Provenance
+ preset: fluent_markdown
+ help_text: A statement about the lineage of the dataset.
+
+- field_name: dcat_type
+ label: Type
+ help_text: The type of the dataset.
+ # TODO: controlled vocabulary?
+
+- field_name: temporal_coverage
+ label: Temporal coverage
+ repeating_subfields:
+
+ - field_name: start
+ label: Start
+ preset: dcat_date
+
+ - field_name: end
+ label: End
+ preset: dcat_date
+ help_text: The temporal period or periods the dataset covers.
+
+- field_name: temporal_resolution
+ label: Temporal resolution
+ help_text: Minimum time period resolvable in the dataset.
+
+- field_name: spatial_coverage
+ label: Spatial coverage
+ repeating_subfields:
+
+ - field_name: uri
+ label: URI
+
+ - field_name: text
+ label: Label
+
+ - field_name: geom
+ label: Geometry
+
+ - field_name: bbox
+ label: Bounding Box
+
+ - field_name: centroid
+ label: Centroid
+ help_text: A geographic region that is covered by the dataset.
+
+- field_name: spatial_resolution_in_meters
+ label: Spatial resolution in meters
+ help_text: Minimum spatial separation resolvable in a dataset, measured in meters.
+
+- field_name: access_rights
+ label: Access rights
+ validators: ignore_missing unicode_safe
+ help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public.
+
+- field_name: alternate_identifier
+ label: Other identifier
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc.
+
+- field_name: theme
+ label: Theme
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: A category of the dataset. A Dataset may be associated with multiple themes.
+
+- field_name: language
+ label: Language
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: Language or languages of the dataset.
+ # TODO: language form snippet / validator / graph
+
+- field_name: documentation
+ label: Documentation
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: A page or document about this dataset.
+
+- field_name: conforms_to
+ label: Conforms to
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: An implementing rule or other specification that the dataset follows.
+
+- field_name: is_referenced_by
+ label: Is referenced by
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset.
+
+- field_name: analytics
+ label: Analytics
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: >
+ An analytics distribution of the dataset.
+ Publishers are encouraged to provide URLs pointing to API endpoints or document
+ repositories where users can access or request associated resources such as
+    technical reports of the dataset, quality measurements, usability indicators, etc.,
+ or analytics services.
+
+- field_name: applicable_legislation
+ label: Applicable legislation
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: The legislation that mandates the creation or management of the dataset.
+
+- field_name: has_version
+ label: Has version
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_inline: true
+ help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset.
+
+- field_name: code_values
+ label: Code values
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: Health classifications and their codes associated with the dataset.
+
+- field_name: coding_system
+ label: Coding system
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: >
+ Coding systems in use (e.g. ICD-10-CM, DGRs, SNOMED CT, ...).
+ To comply with HealthDCAT-AP, Wikidata URIs MUST be used.
+
+- field_name: purpose
+ label: Purpose
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: A free text statement of the purpose of the processing of data or personal data.
+
+- field_name: health_category
+ label: Health category
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: >
+ The health category to which this dataset belongs as described in the Commission Regulation on
+ the European Health Data Space laying down a list of categories of electronic data for
+ secondary use, Art.33.
+
+- field_name: health_theme
+ label: Health theme
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: >
+    A category of the Dataset or a tag describing the Dataset.
+
+- field_name: legal_basis
+ label: Legal basis
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: The legal basis used to justify processing of personal data.
+
+- field_name: min_typical_age
+ label: Minimum typical age
+ validators: ignore_missing int_validator
+ form_snippet: number.html
+ help_text: Minimum typical age of the population within the dataset.
+
+- field_name: max_typical_age
+ label: Maximum typical age
+ validators: ignore_missing int_validator
+ form_snippet: number.html
+ help_text: Maximum typical age of the population within the dataset.
+
+- field_name: number_of_records
+ label: Number of records
+ validators: ignore_missing int_validator
+ form_snippet: number.html
+  help_text: Size of the dataset in terms of the number of records.
+
+- field_name: number_of_unique_individuals
+  label: Number of records for unique individuals
+ validators: ignore_missing int_validator
+ form_snippet: number.html
+ help_text: Number of records for unique individuals.
+
+- field_name: personal_data
+ label: Personal data
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: Key elements that represent an individual in the dataset.
+
+- field_name: publisher_note
+ label: Publisher note
+ preset: fluent_markdown
+ help_text: >
+ A description of the publisher activities.
+
+- field_name: publisher_type
+ label: Publisher type
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: >
+ A type of organisation that makes the Dataset available.
+
+- field_name: trusted_data_holder
+ label: Trusted Data Holder
+ preset: select
+ choices:
+ - value: false
+ label: "No"
+ - value: true
+ label: "Yes"
+ validators: ignore_missing boolean_validator
+ help_text: >
+ Indicates whether the dataset is held by a trusted data holder.
+ output_validators: boolean_validator
+
+- field_name: population_coverage
+ label: Population coverage
+ preset: fluent_markdown
+ help_text: >
+ A definition of the population within the dataset.
+
+- field_name: retention_period
+ label: Retention period
+ repeating_subfields:
+
+ - field_name: start
+ label: Start
+ preset: dcat_date
+
+ - field_name: end
+ label: End
+ preset: dcat_date
+
+  help_text: A temporal period during which the dataset is available for secondary use.
+
+
+# Officially there can only be one HDAB for now, but keep it repeating subfield just in case
+- field_name: hdab
+ label: Health data access body
+ repeating_label: Health data access body
+ repeating_once: true
+ repeating_subfields:
+
+ - field_name: uri
+ label: URI
+
+ - field_name: name
+ label: Name
+
+ - field_name: name_translated
+ label: Name (translations)
+ preset: fluent_core_translated
+ help_text: Name of the health data access body in each language.
+
+ - field_name: email
+ label: Email
+ display_snippet: email.html
+
+ - field_name: url
+ label: URL
+ display_snippet: link.html
+
+ - field_name: type
+ label: Type
+
+ - field_name: identifier
+ label: Identifier
+ help_text: Unique identifier for the HDAB, such as a ROR ID.
+ help_text: Health Data Access Body supporting access to data in the Member State.
+
+- field_name: qualified_relation
+ label: Qualified relation
+ repeating_label: Relationship
+ repeating_subfields:
+
+ - field_name: uri
+ label: URI
+
+ - field_name: relation
+ label: Relation
+ help_text: The resource related to the source resource.
+
+ - field_name: role
+ label: Role
+ help_text: The function of an entity or agent with respect to another entity or resource.
+ help_text: A description of a relationship with another resource.
+
+# Note: if not provided, this will be autogenerated
+- field_name: uri
+ label: URI
+  help_text: A URI for this dataset (if not provided it will be autogenerated).
+
+# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc)
+#
+resource_fields:
+
+- field_name: url
+ label: URL
+ preset: resource_url_upload
+
+- field_name: name_translated
+ label: Name
+ preset: fluent_core_translated
+ help_text: A descriptive title for the resource.
+
+- field_name: description_translated
+ label: Description
+ preset: fluent_core_translated
+ form_snippet: fluent_markdown.html
+ display_snippet: fluent_markdown.html
+ help_text: A free-text account of the resource.
+
+- field_name: format
+ label: Format
+ preset: resource_format_autocomplete
+ help_text: File format. If not provided it will be guessed.
+
+- field_name: mimetype
+ label: Media type
+ validators: if_empty_guess_format ignore_missing unicode_safe
+ help_text: Media type for this format. If not provided it will be guessed.
+
+- field_name: compress_format
+ label: Compress format
+ help_text: The format of the file in which the data is contained in a compressed form.
+
+- field_name: package_format
+ label: Package format
+ help_text: The format of the file in which one or more data files are grouped together.
+
+- field_name: size
+ label: Size
+ validators: ignore_missing int_validator
+ form_snippet: number.html
+ display_snippet: file_size.html
+ help_text: File size in bytes
+
+- field_name: hash
+ label: Hash
+ help_text: Checksum of the downloaded file.
+
+- field_name: hash_algorithm
+ label: Hash Algorithm
+  help_text: Algorithm used to calculate the checksum.
+
+- field_name: rights
+ label: Rights
+ preset: fluent_markdown
+ help_text: Some statement about the rights associated with the resource.
+
+- field_name: availability
+ label: Availability
+ help_text: Indicates how long it is planned to keep the resource available.
+
+- field_name: status
+ label: Status
+ preset: select
+ choices:
+ - value: http://purl.org/adms/status/Completed
+ label: Completed
+ - value: http://purl.org/adms/status/UnderDevelopment
+ label: Under Development
+ - value: http://purl.org/adms/status/Deprecated
+ label: Deprecated
+ - value: http://purl.org/adms/status/Withdrawn
+ label: Withdrawn
+ help_text: The status of the resource in the context of maturity lifecycle.
+
+- field_name: license
+ label: License
+ help_text: License in which the resource is made available. If not provided will be inherited from the dataset.
+
+ # Note: this falls back to the standard resource url field
+- field_name: access_url
+ label: Access URL
+ help_text: URL that gives access to the dataset (defaults to the standard resource URL).
+
+ # Note: this falls back to the standard resource url field
+- field_name: download_url
+ label: Download URL
+ display_snippet: link.html
+ help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL).
+
+- field_name: issued
+ label: Release date
+ preset: dcat_date
+ help_text: Date of publication of the resource.
+
+- field_name: modified
+ label: Modification date
+ preset: dcat_date
+ help_text: Most recent date on which the resource was changed, updated or modified.
+
+- field_name: temporal_resolution
+ label: Temporal resolution
+ help_text: Minimum time period resolvable in the distribution.
+
+- field_name: spatial_resolution_in_meters
+ label: Spatial resolution in meters
+ help_text: Minimum spatial separation resolvable in the distribution, measured in meters.
+
+- field_name: language
+ label: Language
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: Language or languages of the resource.
+
+- field_name: documentation
+ label: Documentation
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: A page or document about this resource.
+
+- field_name: conforms_to
+ label: Conforms to
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: An established schema to which the described resource conforms.
+
+- field_name: applicable_legislation
+ label: Applicable legislation
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+ help_text: The legislation that mandates the creation or management of the resource.
+
+- field_name: access_services
+ label: Access services
+ repeating_label: Access service
+ repeating_subfields:
+
+ - field_name: uri
+ label: URI
+
+ - field_name: title
+ label: Title
+
+ - field_name: endpoint_description
+ label: Endpoint description
+
+ - field_name: endpoint_url
+ label: Endpoint URL
+ preset: multiple_text
+
+ - field_name: serves_dataset
+ label: Serves dataset
+ preset: multiple_text
+ validators: ignore_missing scheming_multiple_text
+
+ - field_name: access_rights
+ label: Access rights
+ validators: ignore_missing unicode_safe
+ help_text: Information regarding access or restrictions based on privacy, security, or other policies.
+
+ help_text: A data service that gives access to the resource.
+
+ # Note: if not provided, this will be autogenerated
+- field_name: uri
+ label: URI
+  help_text: A URI for this resource (if not provided it will be autogenerated).
diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py
index 2d907f0f..949dccfd 100644
--- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py
+++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py
@@ -182,31 +182,42 @@ def test_e2e_dcat_to_ckan(self):
"end": "2034-12-31",
}
]
+
assert dataset["resources"][0]["retention_period"] == [
{
"start": "2020-03-01",
"end": "2034-12-31",
}
]
-
- assert dataset["provenance_activity"] == [{
- "uri": "internalURI:wasGeneratedBy0",
- "label": "http://dbpedia.org/resource/Record_linkage",
- "seeAlso": "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp",
- "dct_type": "http://dbpedia.org/resource/Record_linkage",
- "startedAtTime": "2021-01-01T00:00:00+00:00",
- "wasAssociatedWith": [{
- "name": "Dr. Joris van Loenhout",
- "url": "https://www.sciensano.be/fr/people/joris-van-loenhout",
- "email": "Joris.VanLoenhout@sciensano.be",
- "type": "",
- "uri": "",
- "identifier": "",
- "actedOnBehalfOf": [{
- "name": "Contact Point"
- }]
- }]
- }]
+
+ provenance_activity = dataset["provenance_activity"]
+ assert len(provenance_activity) == 1
+
+ activity = provenance_activity[0]
+ assert activity["uri"] == "internalURI:wasGeneratedBy0"
+ assert activity["label"] == "http://dbpedia.org/resource/Record_linkage"
+ assert activity["seeAlso"] == (
+ "https://www.ehealth.fgov.be/ehealthplatform/fr/service-codage-anonymisation-et-ttp"
+ )
+ assert activity["dct_type"] == "http://dbpedia.org/resource/Record_linkage"
+ assert activity["startedAtTime"] == "2021-01-01T00:00:00+00:00"
+
+ associated = activity["wasAssociatedWith"]
+ assert len(associated) == 1
+
+ agent = associated[0]
+ assert agent["name"] == "Dr. Joris van Loenhout"
+ if agent.get("name_translated"):
+ assert agent["name_translated"].get("en") == "Dr. Joris van Loenhout"
+ assert agent["url"] == "https://www.sciensano.be/fr/people/joris-van-loenhout"
+ assert agent["email"] == "Joris.VanLoenhout@sciensano.be"
+
+ acted_on_behalf = agent.get("actedOnBehalfOf", [])
+ assert len(acted_on_behalf) == 1
+ acted_agent = acted_on_behalf[0]
+ assert acted_agent["name"] == "Contact Point"
+ if acted_agent.get("name_translated"):
+ assert acted_agent["name_translated"].get("en") == "Contact Point"
assert dataset["qualified_attribution"][0]["role"] == "https://inspire.ec.europa.eu/metadata-codelist/ResponsiblePartyRole/processor"
@@ -222,3 +233,68 @@ def test_e2e_dcat_to_ckan(self):
assert dataset["quality_annotation"][0]["body"] == "https://certificates.theodi.org/en/datasets/393/certificate"
assert dataset["quality_annotation"][0]["target"] == "https://certificates.theodi.org/en/datasets/393"
assert dataset["quality_annotation"][0]["motivated_by"] == "http://www.w3.org/ns/dqv#qualityAssessment"
+
+
+@pytest.mark.usefixtures("with_plugins", "clean_db")
+@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent")
+@pytest.mark.ckan_config(
+ "scheming.dataset_schemas",
+ "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml",
+)
+@pytest.mark.ckan_config(
+ "scheming.presets",
+ "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json",
+)
+@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap")
+class TestSchemingFluentParseSupport(BaseParseTest):
+ def test_e2e_dcat_to_ckan_multilingual(self):
+ contents = self._get_file_contents("dcat/dataset_health_multilingual.ttl")
+
+ parser = RDFParser()
+ parser.parse(contents, _format="turtle")
+
+ datasets = list(parser.datasets())
+ assert len(datasets) == 1
+
+ dataset_dict = datasets[0]
+ dataset_dict["name"] = "test-dcat-health-multilingual"
+
+ dataset = call_action("package_create", **dataset_dict)
+
+ assert dataset["title_translated"]["en"] == "Health dataset"
+ assert dataset["title_translated"]["nl"] == "Gezondheidsdataset"
+
+ assert dataset["notes_translated"]["en"] == "A dataset with multilingual metadata"
+ assert dataset["notes_translated"]["nl"] == "Een dataset met meertalige metadata"
+
+ assert dataset["tags_translated"]["en"] == ["health"]
+ assert dataset["tags_translated"]["nl"] == ["gezondheid"]
+
+ assert dataset["population_coverage"]["en"] == "Population coverage in English"
+ assert dataset["population_coverage"]["nl"] == "Populatiedekking in het Nederlands"
+
+ assert dataset["publisher_note"]["en"] == "Publisher note in English"
+ assert dataset["publisher_note"]["nl"] == "Notitie van de uitgever in het Nederlands"
+
+ publisher = dataset["publisher"][0]
+ assert publisher["name_translated"]["en"] == "Health Institute"
+ assert publisher["name_translated"]["nl"] == "Gezondheidsinstituut"
+
+ creator = dataset["creator"][0]
+ assert creator["name_translated"]["en"] == "Health Creator"
+ assert creator["name_translated"]["nl"] == "Gezondheidsmaker"
+
+ resource = dataset["resources"][0]
+
+ assert resource["name_translated"]["en"] == "CSV extract"
+ assert resource["name_translated"]["nl"] == "CSV-uitvoer"
+
+ assert resource["description_translated"]["en"] == "Distribution description in English"
+ assert (
+ resource["description_translated"]["nl"]
+ == "Beschrijving van de distributie in het Nederlands"
+ )
+
+ assert resource["rights"]["en"] == "Rights statement"
+ assert resource["rights"]["nl"] == "Rechtenverklaring"
diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py
index 0c523189..2a96564b 100644
--- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py
+++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py
@@ -201,3 +201,169 @@ def test_e2e_ckan_to_dcat(self):
Literal(distribution_details["retention_period"][0]["end"], datatype=XSD.date)
)
+
+
+@pytest.mark.usefixtures("with_plugins", "clean_db")
+@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent")
+@pytest.mark.ckan_config(
+ "scheming.dataset_schemas",
+ "ckanext.dcat.schemas:health_dcat_ap_multilingual.yaml",
+)
+@pytest.mark.ckan_config(
+ "scheming.presets",
+ "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json",
+)
+@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap")
+class TestEuroDCATAP3ProfileSerializeDatasetFluent(BaseSerializeTest):
+ def test_e2e_ckan_to_dcat_multilingual(self):
+ dataset_dict = {
+ "name": "health-dcat-fluent",
+ "title_translated": {
+ "en": "Health dataset",
+ "nl": "Gezondheidsdataset",
+ },
+ "notes_translated": {
+ "en": "A dataset with multilingual metadata",
+ "nl": "Een dataset met meertalige metadata",
+ },
+ "tags_translated": {
+ "en": ["health"],
+ "nl": ["gezondheid"],
+ },
+ "population_coverage": {
+ "en": "Population coverage in English",
+ "nl": "Populatiedekking in het Nederlands",
+ },
+ "publisher_note": {
+ "en": "Publisher note in English",
+ "nl": "Notitie van de uitgever in het Nederlands",
+ },
+ "publisher": [
+ {
+ "name": "Health Institute",
+ "name_translated": {
+ "en": "Health Institute",
+ "nl": "Gezondheidsinstituut",
+ },
+ "email": "info@example.com",
+ "url": "https://healthdata.nl",
+ }
+ ],
+ "creator": [
+ {
+ "name": "Health Creator",
+ "name_translated": {
+ "en": "Health Creator",
+ "nl": "Gezondheidsmaker",
+ },
+ "email": "creator@example.com",
+ }
+ ],
+ "resources": [
+ {
+ "url": "http://example.test/dataset/1/resource.csv",
+ "name_translated": {
+ "en": "CSV extract",
+ "nl": "CSV-uitvoer",
+ },
+ "description_translated": {
+ "en": "Distribution description in English",
+ "nl": "Beschrijving van de distributie in het Nederlands",
+ },
+ "rights": {
+ "en": "Rights statement",
+ "nl": "Rechtenverklaring",
+ },
+ }
+ ],
+ }
+
+ dataset = call_action("package_create", **dataset_dict)
+
+ serializer = RDFSerializer()
+ graph = serializer.g
+ dataset_ref = serializer.graph_from_dataset(dataset)
+
+ assert self._triple(graph, dataset_ref, DCT.title, "Health dataset", lang="en")
+ assert self._triple(
+ graph, dataset_ref, DCT.title, "Gezondheidsdataset", lang="nl"
+ )
+
+ assert self._triple(
+ graph,
+ dataset_ref,
+ HEALTHDCATAP.populationCoverage,
+ "Population coverage in English",
+ lang="en",
+ )
+ assert self._triple(
+ graph,
+ dataset_ref,
+ HEALTHDCATAP.populationCoverage,
+ "Populatiedekking in het Nederlands",
+ lang="nl",
+ )
+
+ assert self._triple(
+ graph,
+ dataset_ref,
+ HEALTHDCATAP.publisherNote,
+ "Publisher note in English",
+ lang="en",
+ )
+ assert self._triple(
+ graph,
+ dataset_ref,
+ HEALTHDCATAP.publisherNote,
+ "Notitie van de uitgever in het Nederlands",
+ lang="nl",
+ )
+
+ publisher_ref = next(graph.objects(dataset_ref, DCT.publisher))
+ assert self._triple(
+ graph, publisher_ref, FOAF.name, "Health Institute", lang="en"
+ )
+ assert self._triple(
+ graph, publisher_ref, FOAF.name, "Gezondheidsinstituut", lang="nl"
+ )
+
+ creator_ref = next(graph.objects(dataset_ref, DCT.creator))
+ assert self._triple(
+ graph, creator_ref, FOAF.name, "Health Creator", lang="en"
+ )
+ assert self._triple(
+ graph, creator_ref, FOAF.name, "Gezondheidsmaker", lang="nl"
+ )
+
+ distribution_ref = self._triple(
+ graph, dataset_ref, DCAT.distribution, None
+ )[2]
+
+ assert self._triple(
+ graph, distribution_ref, DCT.title, "CSV extract", lang="en"
+ )
+ assert self._triple(
+ graph, distribution_ref, DCT.title, "CSV-uitvoer", lang="nl"
+ )
+
+ assert self._triple(
+ graph,
+ distribution_ref,
+ DCT.description,
+ "Distribution description in English",
+ lang="en",
+ )
+ assert self._triple(
+ graph,
+ distribution_ref,
+ DCT.description,
+ "Beschrijving van de distributie in het Nederlands",
+ lang="nl",
+ )
+
+ rights_node = next(graph.objects(distribution_ref, DCT.rights))
+ assert self._triple(
+ graph, rights_node, RDFS.label, "Rights statement", lang="en"
+ )
+ assert self._triple(
+ graph, rights_node, RDFS.label, "Rechtenverklaring", lang="nl"
+ )
diff --git a/ckanext/dcat/tests/test_blueprints.py b/ckanext/dcat/tests/test_blueprints.py
index 28d62022..594b58fb 100644
--- a/ckanext/dcat/tests/test_blueprints.py
+++ b/ckanext/dcat/tests/test_blueprints.py
@@ -23,15 +23,22 @@ def _sort_query_params(url):
parts = urlparse(url)
qs = parse_qs(parts.query)
ordered_qs = OrderedDict(sorted(qs.items()))
- encoded_qs = urlencode(ordered_qs).replace('u%27', '%27')
+ encoded_qs = urlencode(ordered_qs).replace("u%27", "%27")
return urlunparse(
- (parts.scheme, parts.netloc, parts.path, parts.params,
- encoded_qs, parts.fragment)
+ (
+ parts.scheme,
+ parts.netloc,
+ parts.path,
+ parts.params,
+ encoded_qs,
+ parts.fragment,
+ )
)
-@pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index')
-class TestEndpoints():
+
+@pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index")
+class TestEndpoints:
def _object_value(self, graph, subject, predicate):
@@ -40,22 +47,20 @@ def _object_value(self, graph, subject, predicate):
def test_dataset_default(self, app):
- dataset = factories.Dataset(
- notes='Test dataset'
- )
+ dataset = factories.Dataset(notes="Test dataset")
- url = url_for('dcat.read_dataset', _id=dataset['name'], _format='rdf')
+ url = url_for("dcat.read_dataset", _id=dataset["name"], _format="rdf")
response = app.get(url)
- assert response.headers['Content-Type'] == 'application/rdf+xml'
+ assert response.headers["Content-Type"] == "application/rdf+xml"
content = response.body
# Parse the contents to check it's an actual serialization
p = RDFParser()
- p.parse(content, _format='xml')
+ p.parse(content, _format="xml")
dcat_datasets = [d for d in p.datasets()]
@@ -63,27 +68,54 @@ def test_dataset_default(self, app):
dcat_dataset = dcat_datasets[0]
- assert dcat_dataset['title'] == dataset['title']
- assert dcat_dataset['notes'] == dataset['notes']
+ assert dcat_dataset["title"] == dataset["title"]
+ assert dcat_dataset["notes"] == dataset["notes"]
- def test_dataset_xml(self, app):
+ def test_dataset_default_no_redirects(self, app):
+
+ dataset = factories.Dataset(notes="Test dataset")
+
+ url = url_for("dcat.read_dataset", _id=dataset["name"], _format="rdf")
+
+ assert url.startswith("/dataset/")
+ def test_dataset_default_private(self, app):
+ user = factories.UserWithToken()
+ org = factories.Organization(
+ users=[{"name": user["name"], "capacity": "admin"}]
+ )
dataset = factories.Dataset(
- notes='Test dataset'
+ notes="Test dataset", owner_org=org["id"], private=True
)
- url = url_for('dcat.read_dataset', _id=dataset['name'], _format='xml')
+ url = url_for("dcat.read_dataset", _id=dataset["name"], _format="rdf")
+
+ # Unauthenticated request
+ response = app.get(url)
+ assert response.status_code == 403
+
+ # Authenticated request
+ headers = {"Authorization": user["token"]}
+ response = app.get(url, headers=headers)
+
+ assert response.headers["Content-Type"] == "application/rdf+xml"
+
+ def test_dataset_xml(self, app):
+
+ dataset = factories.Dataset(notes="Test dataset")
+
+ url = url_for("dcat.read_dataset", _id=dataset["name"], _format="xml")
response = app.get(url)
- assert response.headers['Content-Type'] == 'application/rdf+xml'
+ assert response.headers["Content-Type"] == "application/rdf+xml"
content = response.body
# Parse the contents to check it's an actual serialization
p = RDFParser()
- p.parse(content, _format='xml')
+ p.parse(content, _format="xml")
dcat_datasets = [d for d in p.datasets()]
@@ -91,27 +123,25 @@ def test_dataset_xml(self, app):
dcat_dataset = dcat_datasets[0]
- assert dcat_dataset['title'] == dataset['title']
- assert dcat_dataset['notes'] == dataset['notes']
+ assert dcat_dataset["title"] == dataset["title"]
+ assert dcat_dataset["notes"] == dataset["notes"]
def test_dataset_ttl(self, app):
- dataset = factories.Dataset(
- notes='Test dataset'
- )
+ dataset = factories.Dataset(notes="Test dataset")
- url = url_for('dcat.read_dataset', _id=dataset['name'], _format='ttl')
+ url = url_for("dcat.read_dataset", _id=dataset["name"], _format="ttl")
response = app.get(url)
- assert response.headers['Content-Type'] == 'text/turtle'
+ assert response.headers["Content-Type"] == "text/turtle"
content = response.body
# Parse the contents to check it's an actual serialization
p = RDFParser()
- p.parse(content, _format='turtle')
+ p.parse(content, _format="turtle")
dcat_datasets = [d for d in p.datasets()]
@@ -119,27 +149,25 @@ def test_dataset_ttl(self, app):
dcat_dataset = dcat_datasets[0]
- assert dcat_dataset['title'] == dataset['title']
- assert dcat_dataset['notes'] == dataset['notes']
+ assert dcat_dataset["title"] == dataset["title"]
+ assert dcat_dataset["notes"] == dataset["notes"]
def test_dataset_n3(self, app):
- dataset = factories.Dataset(
- notes='Test dataset'
- )
+ dataset = factories.Dataset(notes="Test dataset")
- url = url_for('dcat.read_dataset', _id=dataset['name'], _format='n3')
+ url = url_for("dcat.read_dataset", _id=dataset["name"], _format="n3")
response = app.get(url)
- assert response.headers['Content-Type'] == 'text/n3'
+ assert response.headers["Content-Type"] == "text/n3"
content = response.body
# Parse the contents to check it's an actual serialization
p = RDFParser()
- p.parse(content, _format='n3')
+ p.parse(content, _format="n3")
dcat_datasets = [d for d in p.datasets()]
@@ -147,27 +175,25 @@ def test_dataset_n3(self, app):
dcat_dataset = dcat_datasets[0]
- assert dcat_dataset['title'] == dataset['title']
- assert dcat_dataset['notes'] == dataset['notes']
+ assert dcat_dataset["title"] == dataset["title"]
+ assert dcat_dataset["notes"] == dataset["notes"]
def test_dataset_jsonld(self, app):
- dataset = factories.Dataset(
- notes='Test dataset'
- )
+ dataset = factories.Dataset(notes="Test dataset")
- url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld')
+ url = url_for("dcat.read_dataset", _id=dataset["name"], _format="jsonld")
response = app.get(url)
- assert response.headers['Content-Type'] == 'application/ld+json'
+ assert response.headers["Content-Type"] == "application/ld+json"
content = response.body
# Parse the contents to check it's an actual serialization
p = RDFParser()
- p.parse(content, _format='json-ld')
+ p.parse(content, _format="json-ld")
dcat_datasets = [d for d in p.datasets()]
@@ -175,49 +201,52 @@ def test_dataset_jsonld(self, app):
dcat_dataset = dcat_datasets[0]
- assert dcat_dataset['title'] == dataset['title']
- assert dcat_dataset['notes'] == dataset['notes']
+ assert dcat_dataset["title"] == dataset["title"]
+ assert dcat_dataset["notes"] == dataset["notes"]
def test_dataset_profiles_jsonld(self, app):
- dataset = factories.Dataset(
- notes='Test dataset'
- )
+ dataset = factories.Dataset(notes="Test dataset")
- url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld', profiles='schemaorg')
+ url = url_for(
+ "dcat.read_dataset",
+ _id=dataset["name"],
+ _format="jsonld",
+ profiles="schemaorg",
+ )
response = app.get(url)
- assert response.headers['Content-Type'] == 'application/ld+json'
+ assert response.headers["Content-Type"] == "application/ld+json"
content = response.body
assert '"@type": "schema:Dataset"' in content
- assert '"schema:description": "%s"' % dataset['notes'] in content
+ assert '"schema:description": "%s"' % dataset["notes"] in content
def test_dataset_profiles_not_found(self, app):
- dataset = factories.Dataset(
- notes='Test dataset'
- )
+ dataset = factories.Dataset(notes="Test dataset")
- url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld', profiles='nope')
+ url = url_for(
+ "dcat.read_dataset", _id=dataset["name"], _format="jsonld", profiles="nope"
+ )
response = app.get(url, status=409)
- assert 'Unknown RDF profiles: nope' in response.body
+ assert "Unknown RDF profiles: nope" in response.body
def test_dataset_not_found(self, app):
import uuid
- url = url_for('dcat.read_dataset', _id=str(uuid.uuid4()), _format='n3')
+ url = url_for("dcat.read_dataset", _id=str(uuid.uuid4()), _format="n3")
app.get(url, status=404)
def test_dataset_form_is_rendered(self, app):
sysadmin = factories.Sysadmin()
- env = {'REMOTE_USER': sysadmin['name'].encode('ascii')}
- url = url_for('dataset.new')
+ env = {"REMOTE_USER": sysadmin["name"].encode("ascii")}
+ url = url_for("dataset.new")
response = app.get(url, extra_environ=env)
@@ -225,26 +254,23 @@ def test_dataset_form_is_rendered(self, app):
assert '' in response.body
-@pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index')
-class TestCroissant():
+@pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index")
+class TestCroissant:
- @pytest.mark.ckan_config('ckan.plugins', 'dcat croissant')
+ @pytest.mark.ckan_config("ckan.plugins", "dcat croissant")
def test_croissant_metadata_embedded(self, app):
- dataset = factories.Dataset(
- notes='test description'
- )
+ dataset = factories.Dataset(notes="test description")
- url = url_for('dataset.read', id=dataset['name'])
+ url = url_for("dataset.read", id=dataset["name"])
response = app.get(url)
assert '