Skip to content

Commit 00e7a43

Browse files
authored
Merge pull request #21 from GenomicDataInfrastructure/improve-harvester-and-fix-issues
Improve harvester and fix issues; NB! Changes to CKAN and Solr scheme are required!
2 parents 887430c + 004038d commit 00e7a43

File tree

8 files changed

+408
-298
lines changed

8 files changed

+408
-298
lines changed

ckanext/fairdatapoint/profiles.py

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# check for multiple-text fields in the schema
44
# All changes are © Stichting Health-RI and are licensed under the AGPLv3 license
55

6-
from datetime import datetime
6+
from datetime import datetime, timezone
77
import re
88
import json
99
import logging
@@ -15,7 +15,7 @@
1515
from dateutil.parser import ParserError
1616
from json import JSONDecodeError
1717
from typing import Dict, List
18-
from rdflib import URIRef, Namespace
18+
from rdflib import URIRef, Namespace, DCAT
1919

2020
log = logging.getLogger(__name__)
2121

@@ -85,9 +85,11 @@ def convert_datetime_string(date_value: str) -> datetime:
8585
Converts datestrings (e.g. '2023-10-06T10:12:55.614000+00:00') to datetime class instance
8686
"""
8787
try:
88-
date_value = dateparser.parse(date_value)
88+
date_value = dateparser.parse(date_value, yearfirst=True)
89+
if date_value.tzinfo is not None:
90+
date_value = date_value.astimezone(timezone.utc)
8991
except ParserError:
90-
log.error('A date field string value can not be parsed to a date')
92+
log.error(f'A date field string value {date_value} can not be parsed to a date')
9193
return date_value
9294

9395

@@ -98,43 +100,42 @@ class FAIRDataPointDCATAPProfile(EuropeanDCATAP2Profile):
98100

99101
def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
100102
super(FAIRDataPointDCATAPProfile, self).parse_dataset(dataset_dict, dataset_ref)
103+
dataset_dict = self._parse_contact_point(dataset_dict, dataset_ref)
101104

102105
dataset_dict = _convert_extras_to_declared_schema_fields(dataset_dict)
103106

104107
dataset_dict['tags'] = validate_tags(dataset_dict['tags'])
105108

106109
return dataset_dict
107110

108-
def _contact_details(self, subject, predicate):
111+
def _contact_point_details(self, subject, predicate) -> List:
109112
"""
110113
Overrides RDFProfile._contact_details so uri is taken from hasUID for VCard
111114
"""
112-
contact = {}
113-
# todo fix for multiple
115+
contact_list = []
114116

115117
for agent in self.g.objects(subject, predicate):
118+
contact = {
119+
'contact_uri': (str(agent) if isinstance(agent, URIRef)
120+
else self._get_vcard_property_value(agent, VCARD.hasUID)),
121+
'contact_name': self._get_vcard_property_value(agent, VCARD.hasFN, VCARD.fn),
122+
'contact_email': self._without_mailto(self._get_vcard_property_value(agent, VCARD.hasEmail))}
116123

117-
contact['uri'] = (str(agent) if isinstance(agent, URIRef)
118-
else self._get_vcard_property_value(agent, VCARD.hasUID))
119-
120-
contact['name'] = self._get_vcard_property_value(agent, VCARD.hasFN, VCARD.fn)
121-
122-
contact['email'] = self._without_mailto(self._get_vcard_property_value(agent, VCARD.hasEmail))
123-
124-
return contact
125-
126-
# def graph_from_dataset(self, dataset_dict, dataset_ref):
127-
#
128-
# g = self.g
129-
#
130-
# spatial_text = self._get_dataset_value(dataset_dict, 'hello')
131-
#
132-
# if spatial_uri:
133-
# spatial_ref = URIRef(spatial_uri)
134-
# else:
135-
# spatial_ref = BNode()
136-
#
137-
# if spatial_text:
138-
# g.add((dataset_ref, DCT.spatial, spatial_ref))
139-
# g.add((spatial_ref, RDF.type, DCT.Location))
140-
# g.add((spatial_ref, RDFS.label, Literal(spatial_text)))
124+
contact_list.append(contact)
125+
126+
return contact_list
127+
128+
def _parse_contact_point(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
129+
"""
130+
ckan-dcat extension implies there can be just one contact point and in case a list is provided by the source, only
131+
the last value is taken. Besides, it never resolves the uri from a VCard object. This function parses DCAT.contactPoint
132+
information to a list of `contact_point` dictionaries and replaces ckan-dcat values
133+
"""
134+
contact_point = self._contact_point_details(subject=dataset_ref, predicate=DCAT.contactPoint)
135+
dcat_profile_contact_fields = ['contact_name', 'contact_email', 'contact_uri']
136+
if contact_point:
137+
dataset_dict['extras'].append({'key': 'contact_point', 'value': contact_point})
138+
# Remove the extras contact_ fields if they were parsed by dcat extension
139+
dataset_dict['extras'] = \
140+
[item for item in dataset_dict['extras'] if item.get('key') not in dcat_profile_contact_fields]
141+
return dataset_dict
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
@prefix dcterms: <http://purl.org/dc/terms/> .
2+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
3+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
4+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
5+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
6+
@prefix v: <http://www.w3.org/2006/vcard/ns#> .
7+
8+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
9+
a dcat:Resource, dcat:Dataset;
10+
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
11+
dcterms:title "Example";
12+
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
13+
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
14+
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
15+
dcterms:description "This is an example description.";
16+
dcat:contactPoint [ a v:VCard ;
17+
v:fn "Marc Bonten" ;
18+
v:hasUID <https://orcid.org/0000-0002-9095-9201> ;
19+
v:hasEmail <mailto:[email protected]> ] ,
20+
[ a v:VCard ;
21+
v:fn "Frits Rosendaal" ;
22+
v:hasUID <https://orcid.org/0000-0003-2558-7496> ;
23+
v:hasEmail <mailto:[email protected]> ] .
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
@prefix dcterms: <http://purl.org/dc/terms/> .
2+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
3+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
4+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
5+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
6+
7+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
8+
a dcat:Resource, dcat:Dataset;
9+
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
10+
dcterms:title "Example";
11+
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
12+
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
13+
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
14+
dcterms:description "This is an example description.";
15+
dcat:contactPoint <https://orcid.org/0000-0002-9095-9201>, <https://orcid.org/0000-0003-2558-7496> .
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
@prefix dcterms: <http://purl.org/dc/terms/> .
2+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
3+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
4+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
5+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
6+
7+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
8+
a dcat:Resource, dcat:Dataset;
9+
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
10+
dcterms:title "Example";
11+
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
12+
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
13+
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
14+
dcterms:description "This is an example description.";
15+
dcat:contactPoint <https://orcid.org/0000-0002-9095-9201> .
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
@prefix dcterms: <http://purl.org/dc/terms/> .
2+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
3+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
4+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
5+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
6+
@prefix v: <http://www.w3.org/2006/vcard/ns#> .
7+
8+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
9+
a dcat:Resource, dcat:Dataset;
10+
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
11+
dcterms:title "Example";
12+
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
13+
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
14+
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
15+
dcterms:description "This is an example description.";
16+
dcat:contactPoint [ a v:Kind ;
17+
v:fn "Marc Bonten" ;
18+
v:hasUID <https://orcid.org/0000-0002-9095-9201> ;
19+
v:hasEmail <mailto:[email protected]> ] .

0 commit comments

Comments
 (0)