Skip to content

Commit 49907b6

Browse files
Merge pull request #14 from GenomicDataInfrastructure/harvester]-parse-iso8601-compliant-timestamps
[Harvester] parse iso8601 compliant timestamps
2 parents 4a59131 + c84032b commit 49907b6

File tree

5 files changed

+241
-35
lines changed

5 files changed

+241
-35
lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ jobs:
1010
image: openknowledge/ckan-dev:2.10
1111
services:
1212
solr:
13-
image: ckan/ckan-solr-dev:2.10
13+
image: ckan/ckan-solr:2.10-solr9
1414
postgres:
1515
image: ckan/ckan-postgres-dev:2.10
1616
env:

ckanext/fairdatapoint/profiles.py

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,21 @@
33
# check for multiple-text fields in the schema
44
# All changes are © Stichting Health-RI and are licensed under the AGPLv3 license
55

6+
from datetime import datetime
7+
import re
8+
import json
9+
import logging
10+
611
from ckanext.dcat.profiles import EuropeanDCATAP2Profile
712
from ckan.plugins import toolkit
813
from ckan import model
9-
import json
10-
from typing import Dict
14+
import dateutil.parser as dateparser
15+
from dateutil.parser import ParserError
16+
from typing import Dict, List
1117
from rdflib import URIRef
1218

19+
log = logging.getLogger(__name__)
20+
1321

1422
def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
1523
"""
@@ -31,19 +39,52 @@ def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
3139
# Populate the declared schema fields, if they are present in the extras
3240
for extra_dict in dataset_dict.get('extras', []):
3341
field_key = extra_dict.get('key')
42+
field_value = extra_dict.get('value')
3443
if field_key in dataset_fields:
3544
preset = dataset_fields[field_key]
36-
if preset == "multiple_text" and extra_dict.get('value'):
37-
dataset_dict[field_key] = json.loads(extra_dict.get('value'))
45+
if preset == 'multiple_text' and field_value:
46+
dataset_dict[field_key] = json.loads(field_value)
47+
elif preset == 'date' and field_value:
48+
dataset_dict[field_key] = convert_datetime_string(field_value)
3849
else:
39-
dataset_dict[field_key] = extra_dict.get('value')
50+
dataset_dict[field_key] = field_value
4051

4152
# Remove the extras that have been populated into the declared schema fields
4253
dataset_dict['extras'] = [d for d in dataset_dict['extras'] if d.get('key') not in dataset_fields]
4354

4455
return dataset_dict
4556

4657

58+
def validate_tags(values_list: List[Dict]) -> List:
    """
    Validates tag strings to contain only allowed characters, replaces others with spaces

    Allowed characters are ASCII letters, digits, spaces, hyphens, underscores and dots.
    Every other character is replaced by a single space and a warning is logged for that tag.

    :param values_list: tag dictionaries; the tag text is read from the ``'name'`` key
    :return: a new list with one entry per input tag. Tags that are already valid are passed
        through as the same dictionary object; tags that needed cleaning are replaced by a
        new ``{'name': <cleaned value>}`` dictionary
    """
    # Raw string on purpose: '\-' and '\.' are not valid Python string escapes, so a plain
    # literal makes the interpreter emit an "invalid escape sequence" warning.
    illegal_pattern = re.compile(r'[^A-Za-z0-9\- _\.]')
    tags = []
    for item in values_list:
        tag_value = item['name']
        if illegal_pattern.search(tag_value):
            log.warning('Tag %s contains values other than alphanumeric characters, spaces, hyphens, '
                        'underscores or dots, they will be replaced with spaces', tag_value)
            tags.append({'name': illegal_pattern.sub(' ', tag_value)})
        else:
            tags.append(item)
    return tags
75+
76+
77+
def convert_datetime_string(date_value: str) -> datetime:
    """
    Converts datestrings (e.g. '2023-10-06T10:12:55.614000+00:00') to datetime class instance

    Parsing is delegated to ``dateutil.parser.parse``. This is a best-effort conversion: when
    the string can not be parsed the problem is logged and the input is returned unchanged,
    so callers may receive the original ``str`` instead of a ``datetime``.

    NOTE(review): dateutil fills components that are missing from the string (e.g. the day in
    '2006-09') from the current date, so the result for partial dates depends on the day the
    code runs - confirm this is acceptable for harvested metadata.

    :param date_value: date or date-time string to parse
    :return: the parsed ``datetime``, or ``date_value`` itself if parsing failed
    """
    try:
        return dateparser.parse(date_value)
    except (ParserError, OverflowError):
        # dateutil documents OverflowError for values that do not fit a C integer; treat it
        # like any other unparsable value. The value is logged so the record can be traced.
        log.error('A date field string value %r can not be parsed to a date', date_value)
        return date_value
86+
87+
4788
class FAIRDataPointDCATAPProfile(EuropeanDCATAP2Profile):
4889
"""
4990
An RDF profile for FAIR data points
@@ -54,10 +95,12 @@ def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
5495

5596
dataset_dict = _convert_extras_to_declared_schema_fields(dataset_dict)
5697

98+
dataset_dict['tags'] = validate_tags(dataset_dict['tags'])
99+
57100
# Example of adding a field
58-
dataset_dict['extras'].append({'key': 'hello',
59-
'value': "Hello from the FAIR data point profile. Use this function to do "
60-
"FAIR data point specific stuff during the import stage"})
101+
# dataset_dict['extras'].append({'key': 'hello',
102+
# 'value': 'Hello from the FAIR data point profile. Use this function to do '
103+
# 'FAIR data point specific stuff during the import stage'})
61104

62105
return dataset_dict
63106

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
@prefix dcat: <http://www.w3.org/ns/dcat#> .
2+
@prefix dcterms: <http://purl.org/dc/terms/> .
3+
@prefix ldp: <http://www.w3.org/ns/ldp#> .
4+
@prefix ns1: <https://w3id.org/fdp/fdp-o#> .
5+
@prefix ns2: <http://semanticscience.org/resource/> .
6+
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
7+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
8+
9+
<https://health-ri.sandbox.semlab-leiden.nl/distribution/> a ldp:DirectContainer ;
10+
dcterms:title "Distributions" ;
11+
ldp:contains <https://health-ri.sandbox.semlab-leiden.nl/distribution/931ed9c4-ad23-47ff-b121-2eb428e57423>,
12+
<https://health-ri.sandbox.semlab-leiden.nl/distribution/ad00299f-6efb-42aa-823d-5ff2337f38f7> ;
13+
ldp:hasMemberRelation dcat:distribution ;
14+
ldp:membershipResource <https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5> .
15+
16+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5> a dcat:Dataset,
17+
dcat:Resource ;
18+
rdfs:label "[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" ;
19+
dcterms:accessRights <https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#accessRights> ;
20+
dcterms:conformsTo <https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604> ;
21+
dcterms:description "Whole exome sequencing of 23 grade II glioma tumor/normal pairs." ;
22+
dcterms:identifier "lgg_ucsf_2014"^^xsd:token ;
23+
dcterms:isPartOf <https://health-ri.sandbox.semlab-leiden.nl/catalog/5c85cb9f-be4a-406c-ab0a-287fa787caa0> ;
24+
dcterms:isReferencedBy <https://pubmed.ncbi.nlm.nih.gov/24336570> ;
25+
dcterms:issued "2019-10-30 23:00:00" ;
26+
dcterms:language <http://id.loc.gov/vocabulary/iso639-1/en> ;
27+
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0> ;
28+
dcterms:modified "2019-10-30 23:00:00" ;
29+
dcterms:publisher <https://www.health-ri.nl> ;
30+
dcterms:temporal [ a dcterms:PeriodOfTime ] ;
31+
dcterms:title "[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" ;
32+
ns2:SIO_000628 <https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5/metrics/445c0a70d1e214e545b261559e2842f4>,
33+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5/metrics/5d27e854a9e78eb3f663331cd47cdc13> ;
34+
dcat:distribution <https://health-ri.sandbox.semlab-leiden.nl/distribution/931ed9c4-ad23-47ff-b121-2eb428e57423>,
35+
<https://health-ri.sandbox.semlab-leiden.nl/distribution/ad00299f-6efb-42aa-823d-5ff2337f38f7> ;
36+
dcat:keyword "CNS/Brain",
37+
"Diffuse Glioma",
38+
"Glioma" ;
39+
dcat:landingPage <https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014> ;
40+
ns1:metadataIdentifier <https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#identifier> ;
41+
ns1:metadataIssued "2024-01-22T12:58:04.249592+00:00"^^xsd:dateTime ;
42+
ns1:metadataModified "2024-01-22T12:58:05.109355+00:00"^^xsd:dateTime .
43+
44+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#accessRights> a dcterms:RightsStatement ;
45+
dcterms:description "This resource has no access restriction" .
46+
47+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#identifier> a <http://purl.org/spar/datacite/Identifier> ;
48+
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5" .
49+
50+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5/metrics/445c0a70d1e214e545b261559e2842f4> ns2:SIO_000332 <https://www.ietf.org/rfc/rfc3986.txt> ;
51+
ns2:SIO_000628 <https://www.ietf.org/rfc/rfc3986.txt> .
52+
53+
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5/metrics/5d27e854a9e78eb3f663331cd47cdc13> ns2:SIO_000332 <https://www.wikidata.org/wiki/Q8777> ;
54+
ns2:SIO_000628 <https://www.wikidata.org/wiki/Q8777> .
55+
56+
<https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604> rdfs:label "Dataset Profile" .
57+
58+
<https://health-ri.sandbox.semlab-leiden.nl/distribution/931ed9c4-ad23-47ff-b121-2eb428e57423> dcterms:description "Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" ;
59+
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0> ;
60+
dcterms:title "Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" ;
61+
dcat:accessURL <https://cbioportal.health-ri.nl/study/clinicalData?id=lgg_ucsf_2014> .
62+
63+
<https://health-ri.sandbox.semlab-leiden.nl/distribution/ad00299f-6efb-42aa-823d-5ff2337f38f7> dcterms:description "Mutation data from whole exome sequencing of 23 grade II glioma tumor/normal pairs. (MAF)" ;
64+
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0> ;
65+
dcterms:title "Mutations" ;
66+
dcat:accessURL <https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014> .

ckanext/fairdatapoint/tests/test_processors.py

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
"""
1818

1919
import pytest
20+
from datetime import datetime
21+
from dateutil.tz import tzutc
2022
from pathlib import Path
2123
from unittest.mock import patch
2224
from rdflib import Graph
@@ -59,10 +61,7 @@ def test_fdp_record_converter_dataset_dict(self):
5961
record=data)
6062
expected_dataset = {"extras":
6163
[
62-
{"key": "uri", "value": "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"},
63-
{"key": "hello",
64-
"value": "Hello from the FAIR data point profile. Use this function to do FAIR data point "
65-
"specific stuff during the import stage"}
64+
{"key": "uri", "value": "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"}
6665
],
6766
"resources": [],
6867
"title": "COVID-NL cohort MUMC+",
@@ -73,8 +72,8 @@ def test_fdp_record_converter_dataset_dict(self):
7372
"has_version": ["https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"],
7473
"contact_uri": "https://orcid.org/0000-0002-4348-707X",
7574
"publisher_uri": "https://opal.health-ri.nl/pub/",
76-
"temporal_start": "2020-01-01",
77-
"temporal_end": "2025-12-31"}
75+
"temporal_start": datetime(2020, 1, 1, 0, 0),
76+
"temporal_end": datetime(2025, 12, 31, 0, 0)}
7877
assert actual_dataset == expected_dataset
7978

8079
def test_fdp_record_converter_catalog_dict(self):
@@ -84,25 +83,22 @@ def test_fdp_record_converter_catalog_dict(self):
8483
guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
8584
record=data)
8685
expected = {
87-
"access_rights": "https://fair.healthinformationportal.eu/catalog/"
88-
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
89-
"conforms_to": ["https://fair.healthinformationportal.eu/profile/"
90-
"a0949e72-4466-4d53-8900-9436d1049a4b"],
91-
"extras": [{"key": "uri",
92-
"value": "https://fair.healthinformationportal.eu/catalog/"
93-
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"},
94-
{"key": "hello",
95-
"value": "Hello from the FAIR data point profile. Use this "
96-
"function to do FAIR data point specific stuff during "
97-
"the import stage"}],
98-
"has_version": ["1.0"],
99-
"issued": "2023-10-06T10:12:55.614000+00:00",
100-
"language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
101-
"license_id": "",
102-
"modified": "2023-10-06T10:12:55.614000+00:00",
103-
"publisher_name": "Automatic",
104-
"resources": [],
105-
"tags": [],
106-
"title": "Slovenia National Node"
86+
"access_rights": "https://fair.healthinformationportal.eu/catalog/"
87+
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
88+
"conforms_to": ["https://fair.healthinformationportal.eu/profile/"
89+
"a0949e72-4466-4d53-8900-9436d1049a4b"],
90+
"extras": [{"key": "uri",
91+
"value": "https://fair.healthinformationportal.eu/catalog/"
92+
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"},
93+
],
94+
"has_version": ["1.0"],
95+
"issued": datetime(2023, 10, 6, 10, 12, 55, 614000, tzinfo=tzutc()),
96+
"language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
97+
"license_id": "",
98+
"modified": datetime(2023, 10, 6, 10, 12, 55, 614000, tzinfo=tzutc()),
99+
"publisher_name": "Automatic",
100+
"resources": [],
101+
"tags": [],
102+
"title": "Slovenia National Node"
107103
}
108104
assert actual == expected
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""
2+
CKAN Fair Data Point extension Test Suite
3+
Copyright (C) 2024, Stichting Health-RI
4+
5+
This program is free software: you can redistribute it and/or modify
6+
it under the terms of the GNU Affero General Public License as
7+
published by the Free Software Foundation, either version 3 of the
8+
License, or (at your option) any later version.
9+
10+
This program is distributed in the hope that it will be useful,
11+
but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
GNU Affero General Public License for more details.
14+
15+
You should have received a copy of the GNU Affero General Public License
16+
along with this program. If not, see <https://www.gnu.org/licenses/>.
17+
"""
18+
19+
import pytest
20+
from datetime import datetime
21+
from dateutil.tz import tzutc, tzoffset
22+
from pathlib import Path
23+
from rdflib import Graph, URIRef
24+
from ckanext.fairdatapoint.profiles import validate_tags, convert_datetime_string
25+
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
26+
FairDataPointRecordToPackageConverter)
27+
28+
TEST_DATA_DIRECTORY = Path(Path(__file__).parent.resolve(), "test_data")
29+
30+
31+
@pytest.mark.parametrize(
    "input_tags,expected_tags",
    [
        # a slash is not an allowed tag character
        (
            [{"name": "CNS/Brain"}],
            [{"name": "CNS Brain"}],
        ),
        # hyphens and digits are kept, a backtick is replaced
        (
            [{"name": "COVID-19"}, {"name": "3`-DNA"}],
            [{"name": "COVID-19"}, {"name": "3 -DNA"}],
        ),
        # dots and spaces are allowed, nothing changes
        (
            [{"name": "something-1.1"}, {"name": "breast cancer"}],
            [{"name": "something-1.1"}, {"name": "breast cancer"}],
        ),
        # no tags at all
        ([], []),
    ],
)
def test_validate_tags(input_tags, expected_tags):
    """validate_tags replaces every disallowed character with a space and keeps valid tags."""
    assert validate_tags(input_tags) == expected_tags
40+
41+
42+
@pytest.mark.ckan_config("ckan.plugins", "scheming_datasets")
@pytest.mark.usefixtures("with_plugins")
def test_parse_dataset():
    """Dataset with keywords which should be modified

    The fixture carries the keyword "CNS/Brain", which is expected as the tag "CNS Brain";
    the issued/modified values are expected as datetime instances.
    """
    converter = FairDataPointRecordToPackageConverter(profile="fairdatapoint_dcat_ap")
    fixture_path = Path(TEST_DATA_DIRECTORY, "dataset_cbioportal.ttl")
    serialized_record = Graph().parse(fixture_path).serialize()
    record_guid = ("catalog=https://health-ri.sandbox.semlab-leiden.nl/catalog/5c85cb9f-be4a-406c-ab0a-287fa787caa0;"
                   "dataset=https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5")

    actual = converter.record_to_package(guid=record_guid, record=serialized_record)

    clinical_data_resource = {
        'name': 'Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)',
        'description': 'Clinical data for [PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)',
        'access_url': 'https://cbioportal.health-ri.nl/study/clinicalData?id=lgg_ucsf_2014',
        'license': 'http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0',
        'url': 'https://cbioportal.health-ri.nl/study/clinicalData?id=lgg_ucsf_2014',
        'uri': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/'
               '931ed9c4-ad23-47ff-b121-2eb428e57423',
        'distribution_ref': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/'
                            '931ed9c4-ad23-47ff-b121-2eb428e57423',
    }
    mutations_resource = {
        'name': 'Mutations',
        'description': 'Mutation data from whole exome sequencing of 23 grade II glioma tumor/normal '
                       'pairs. (MAF)',
        'access_url': 'https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014',
        'license': 'http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0',
        'url': 'https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014',
        'uri': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/'
               'ad00299f-6efb-42aa-823d-5ff2337f38f7',
        'distribution_ref': 'https://health-ri.sandbox.semlab-leiden.nl/distribution/'
                            'ad00299f-6efb-42aa-823d-5ff2337f38f7',
    }
    expected = {
        'extras': [
            {
                'key': 'uri',
                'value': 'https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5',
            },
        ],
        'resources': [clinical_data_resource, mutations_resource],
        'title': '[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)',
        'notes': 'Whole exome sequencing of 23 grade II glioma tumor/normal pairs.',
        'url': 'https://cbioportal.health-ri.nl/study/summary?id=lgg_ucsf_2014',
        'tags': [{'name': 'CNS Brain'}, {'name': 'Diffuse Glioma'}, {'name': 'Glioma'}],
        'license_id': '',
        'issued': datetime(2019, 10, 30, 23, 0),
        'modified': datetime(2019, 10, 30, 23, 0),
        'identifier': 'lgg_ucsf_2014',
        'language': ['http://id.loc.gov/vocabulary/iso639-1/en'],
        'conforms_to': ['https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604'],
        'publisher_uri': 'https://www.health-ri.nl',
        'access_rights': 'https://health-ri.sandbox.semlab-leiden.nl/dataset/'
                         'd9956191-1aff-4181-ac8b-16b829135ed5#accessRights',
        'is_referenced_by': '["https://pubmed.ncbi.nlm.nih.gov/24336570"]',
    }
    assert actual == expected
90+
91+
92+
@pytest.mark.parametrize("input_timestring,expected_output", [
    # full ISO 8601 timestamp with a UTC offset
    ("2023-10-06T10:12:55.614000+00:00",
     datetime(2023, 10, 6, 10, 12, 55, 614000, tzinfo=tzutc())),
    # space-separated timestamp with a non-UTC offset (+03:00 == 10800 seconds)
    ("2024-02-15 11:16:37+03:00",
     datetime(2024, 2, 15, 11, 16, 37, tzinfo=tzoffset(None, 10800))),
    # free-text date without a time component
    ("November 9, 1999", datetime(1999, 11, 9, 0, 0, 0)),
    # Partial date: dateutil takes the missing day from today's date (clamped to the length of
    # the parsed month, 30 for September). A hard-coded day only passes on that day of the
    # month, so the expectation is derived the same way.
    ("2006-09", datetime(2006, 9, min(datetime.today().day, 30)))])
def test_convert_datetime_string(input_timestring, expected_output):
    """convert_datetime_string parses the supported date string formats into datetime objects."""
    actual = convert_datetime_string(input_timestring)
    assert actual == expected_output

0 commit comments

Comments
 (0)