Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,13 @@ jobs:
pip install -r ckanext-harvest/requirements.txt
git clone https://github.com/ckan/ckanext-scheming
pip install -e ckanext-scheming
pip install git+https://github.com/ckan/ckanext-fluent.git@4e9340a#egg=ckanext-fluent
git clone https://github.com/ckan/ckanext-fluent
pip install -e ckanext-fluent
git clone https://github.com/ckan/ckanext-dataset-series
pip install -e ckanext-dataset-series
- name: Setup extension
run: |
ckan -c test.ini db init
ckan -c test.ini db pending-migrations --apply
- name: Run tests
run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests
run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ build/*
tmp/*
package/DEBIAN/control
*.swp
.idea/.gitignore
.idea/ckanext-dcat.iml
.idea/misc.xml
.idea/modules.xml
.idea/vcs.xml
.idea/inspectionProfiles/profiles_settings.xml
14 changes: 13 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
# Changelog

## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v2.4.0...HEAD)
## [Unreleased](https://github.com/ckan/ckanext-dcat/compare/v2.4.1...HEAD)

## [v2.4.1](https://github.com/ckan/ckanext-dcat/compare/v2.4.0...v2.4.1) - 2025-09-25

* Fix regression redirect from /dataset to /dataset_series ([#362](https://github.com/ckan/ckanext-dcat/pull/362))
* Provide default language in Croissant JSON-LD context ([#361](https://github.com/ckan/ckanext-dcat/pull/361))
* Added [`IDCATURIGenerator`](https://docs.ckan.org/projects/ckanext-dcat/en/latest/uri-customization/)
plugin interface to allow customization of the URIs generation ([#351](https://github.com/ckan/ckanext-dcat/pull/351))
* Added support for new fields to DCAT classes: `dcat:Dataset` (`prov:wasGeneratedBy`, `prov:qualifiedAttribution`,
`dcat:hasVersion`), `dcat:Catalog` (`foaf:homepage`), `dcat:DataService` (`dct:conformsTo`, `dct:format`,
`dct:identifier`, `dct:language`, `dct:rights`, `dcat:landingPage`, `dcat:keyword`) ([#352](https://github.com/ckan/ckanext-dcat/pull/352))
* Docs: Add HealthDCAT-AP mapping to CKAN field mapping table ([#347](https://github.com/ckan/ckanext-dcat/pull/347))


## [v2.4.0](https://github.com/ckan/ckanext-dcat/compare/v2.3.0...v2.4.0) - 2025-05-20
Expand Down
55 changes: 30 additions & 25 deletions ckanext/dcat/blueprints.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
from flask import Blueprint, jsonify, make_response

import ckantoolkit as toolkit
Expand All @@ -12,11 +11,7 @@
config = toolkit.config


dcat = Blueprint(
'dcat',
__name__,
url_defaults={u'package_type': u'dataset'}
)
dcat = Blueprint("dcat", __name__, url_defaults={"package_type": "dataset"})


def read_catalog(_format=None, package_type=None):
Expand All @@ -30,36 +25,44 @@ def read_dataset(_id, _format=None, package_type=None):
if endpoints_enabled():

# requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}
dcat.add_url_rule(config.get('ckanext.dcat.catalog_endpoint',
utils.DEFAULT_CATALOG_ENDPOINT).replace(
'{_format}', '<_format>'),
view_func=read_catalog)
dcat.add_url_rule(
config.get(
"ckanext.dcat.catalog_endpoint", utils.DEFAULT_CATALOG_ENDPOINT
).replace("{_format}", "<_format>"),
view_func=read_catalog,
)

# TODO: Generalize for all dataset types
dcat.add_url_rule('/dataset_series/<_id>.<_format>', view_func=read_dataset)
dcat.add_url_rule('/dataset/<_id>.<_format>', view_func=read_dataset)
dcat.add_url_rule(
"/dataset_series/<_id>.<_format>",
view_func=read_dataset,
endpoint="read_dataset_series",
)
dcat.add_url_rule(
"/dataset/<_id>.<_format>", view_func=read_dataset, endpoint="read_dataset"
)


if toolkit.asbool(config.get(utils.ENABLE_CONTENT_NEGOTIATION_CONFIG)):
dcat.add_url_rule('/', view_func=read_catalog)
dcat.add_url_rule("/", view_func=read_catalog)

dcat.add_url_rule('/dataset/new', view_func=CreateView.as_view(str(u'new')))
dcat.add_url_rule('/dataset/<_id>', view_func=read_dataset)
dcat.add_url_rule("/dataset/new", view_func=CreateView.as_view(str("new")))
dcat.add_url_rule("/dataset/<_id>", view_func=read_dataset)

dcat_json_interface = Blueprint('dcat_json_interface', __name__)
dcat_json_interface = Blueprint("dcat_json_interface", __name__)


def dcat_json():
datasets = utils.dcat_json_page()
return jsonify(datasets)


dcat_json_interface.add_url_rule(config.get('ckanext.dcat.json_endpoint',
'/dcat.json'),
view_func=dcat_json)
dcat_json_interface.add_url_rule(
config.get("ckanext.dcat.json_endpoint", "/dcat.json"), view_func=dcat_json
)


croissant = Blueprint('croissant', __name__)
croissant = Blueprint("croissant", __name__)


def read_dataset_croissant(_id):
Expand All @@ -72,20 +75,22 @@ def read_dataset_croissant(_id):
)

context = {
'user': user_name,
"user": user_name,
}
data_dict = {'id': _id}
data_dict = {"id": _id}

dataset_dict = toolkit.get_action("package_show")(context, data_dict)
except (toolkit.ObjectNotFound, toolkit.NotAuthorized):
return toolkit.abort(
404,
toolkit._("Dataset not found or you have no permission to view it")
404, toolkit._("Dataset not found or you have no permission to view it")
)

response = make_response(croissant_serialization(dataset_dict))
response.headers["Content-type"] = "application/ld+json"

return response

croissant.add_url_rule('/dataset/<_id>/croissant.jsonld', view_func=read_dataset_croissant)

croissant.add_url_rule(
"/dataset/<_id>/croissant.jsonld", view_func=read_dataset_croissant
)
112 changes: 33 additions & 79 deletions ckanext/dcat/harvesters/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,18 +210,39 @@ def gather_stage(self, harvest_job):
return []

try:
source_dataset = model.Package.get(harvest_job.source.id)

series_ids, series_mapping = self._parse_and_collect(
parser.dataset_series(),
source_dataset,
harvest_job,
guids_in_source,
is_series=True,
collect_series_mapping=True
)
object_ids += series_ids
object_ids += self._parse_and_collect(parser.datasets(series_mapping), source_dataset, harvest_job, guids_in_source, is_series=False)

source_dataset = model.Package.get(harvest_job.source.id)

for dataset in parser.datasets():
if not dataset.get('name'):
dataset['name'] = self._gen_new_name(dataset['title'])
if dataset['name'] in self._names_taken:
suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1
dataset['name'] = '{}-{}'.format(dataset['name'], suffix)
self._names_taken.append(dataset['name'])

# Unless already set by the parser, get the owner organization (if any)
# from the harvest source dataset
if not dataset.get('owner_org'):
if source_dataset.owner_org:
dataset['owner_org'] = source_dataset.owner_org

# Try to get a unique identifier for the harvested dataset
guid = self._get_guid(dataset, source_url=source_dataset.url)

if not guid:
self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
harvest_job)
continue

dataset['extras'].append({'key': 'guid', 'value': guid})
guids_in_source.append(guid)

obj = HarvestObject(guid=guid, job=harvest_job,
content=json.dumps(dataset))

obj.save()
object_ids.append(obj.id)
except Exception as e:
self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()),
harvest_job)
Expand Down Expand Up @@ -401,70 +422,3 @@ def import_stage(self, harvest_object):
model.Session.commit()

return True

def _parse_and_collect(
self,
items,
source_dataset,
harvest_job,
guids_in_source,
is_series=False,
collect_series_mapping=False
):
object_ids = []
label = "dataset series" if is_series else "dataset"
series_mapping = {} if collect_series_mapping else None

for item in items:
original_title = item.get("title", label)
if not item.get("name"):
item["name"] = self._gen_new_name(original_title)

if item["name"] in self._names_taken:
suffix = len([i for i in self._names_taken if i.startswith(item["name"] + "-")]) + 1
item["name"] = f"{item['name']}-{suffix}"

self._names_taken.append(item["name"])

if not item.get("owner_org") and source_dataset.owner_org:
item["owner_org"] = source_dataset.owner_org

guid = self._get_guid(item, source_url=source_dataset.url)
if not guid:
self._save_gather_error(f"Could not get a unique identifier for {label}: {item}", harvest_job)
continue

item.setdefault("extras", []).append({"key": "guid", "value": guid})
guids_in_source.append(guid)

obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(item))
obj.save()
object_ids.append(obj.id)

# Store mapping of RDF URI to dataset name if requested
if collect_series_mapping:
series_uri = item.get("uri") or item.get("identifier")
if series_uri:
# Try to find an existing active dataset series by 'guid' match
existing = model.Session.query(model.Package).\
join(model.PackageExtra).\
filter(model.PackageExtra.key == 'guid').\
filter(model.PackageExtra.value == series_uri).\
filter(model.Package.type == 'dataset_series').\
filter(model.Package.state == 'active').\
first()

if existing:
item["name"] = existing.name

series_mapping[str(series_uri)] = {
"id": existing.id if existing else item.get("id"),
"name": item["name"]
}


if collect_series_mapping:
return object_ids, series_mapping

return object_ids

8 changes: 5 additions & 3 deletions ckanext/dcat/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def structured_data(dataset_dict, profiles=None):
return _get_serialization(dataset_dict, profiles, "jsonld")


def croissant(dataset_dict, profiles=None):
def croissant(dataset_dict, profiles=None, jsonld_context=None):
"""
Returns a string containing the Croissant ML representation of the given
dataset using the `croissant` profile.
Expand All @@ -82,8 +82,10 @@ def croissant(dataset_dict, profiles=None):
if not profiles:
profiles = config.get("ckanext.dcat.croissant.profiles", ["croissant"])

frame = {"@context": JSONLD_CONTEXT, "@type": "sc:Dataset"}
context = jsonld_context or JSONLD_CONTEXT

frame = {"@context": context, "@type": "sc:Dataset"}

return _get_serialization(
dataset_dict, profiles, "jsonld", context=JSONLD_CONTEXT, frame=frame
dataset_dict, profiles, "jsonld", context=context, frame=frame
)
45 changes: 1 addition & 44 deletions ckanext/dcat/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,6 @@ def _datasets(self):
for dataset in self.g.subjects(RDF.type, DCAT.Dataset):
yield dataset

def _dataset_series(self):
'''
Generator that returns all DCAT dataset series on the graph

Yields rdflib.term.URIRef objects that can be used on graph lookups
and queries
'''
for dataset_series in self.g.subjects(RDF.type, DCAT.DatasetSeries):
yield dataset_series

def next_page(self):
'''
Returns the URL of the next page or None if there is no next page
Expand Down Expand Up @@ -183,7 +173,7 @@ def supported_formats(self):
for plugin
in rdflib.plugin.plugins(kind=rdflib.parser.Parser)])

def datasets(self, series_mapping=None):
def datasets(self):
'''
Generator that returns CKAN datasets parsed from the RDF graph

Expand All @@ -203,39 +193,6 @@ def datasets(self, series_mapping=None):
)
profile.parse_dataset(dataset_dict, dataset_ref)

# Add in_series if present in RDF and mapped
in_series = []
for series_ref in self.g.objects(dataset_ref, DCAT.inSeries):
key = str(series_ref)
if series_mapping and key in series_mapping:
in_series.append(series_mapping[key]["id"])

if in_series:
dataset_dict["in_series"] = in_series

yield dataset_dict


def dataset_series(self):
'''
Generator that returns CKAN dataset series parsed from the RDF graph

Each dataset series is passed to all the loaded profiles before being
yielded, so it can be further modified by each one of them.

Returns a dataset series dict that can be passed to eg `package_create`
or `package_update`
'''
for dataset_ref in self._dataset_series():
dataset_dict = {}
for profile_class in self._profiles:
profile = profile_class(
self.g,
dataset_type=self.dataset_type,
compatibility_mode=self.compatibility_mode
)
profile.parse_dataset(dataset_dict, dataset_ref)

yield dataset_dict


Expand Down
Loading