Skip to content

Commit 887430c

Browse files
authored
Merge pull request #20 from GenomicDataInfrastructure/improve-harvester-and-fix-issues
fix: Improve harvester and fix issues
2 parents 45e056a + 6cf5e10 commit 887430c

14 files changed

+66
-48
lines changed

ckanext/fairdatapoint/harvesters/civity_harvester.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ def import_stage(self, harvest_object):
359359
# Get the last harvested object (if any)
360360
previous_object = model.Session.query(HarvestObject) \
361361
.filter(HarvestObject.guid == harvest_object.guid) \
362-
.filter(HarvestObject.current is True) \
362+
.filter(HarvestObject.current == True) \
363363
.first()
364364

365365
# Flag previous object as not current anymore
@@ -427,7 +427,7 @@ def _get_guids_to_package_ids_from_database(harvest_job):
427427
:return:
428428
"""
429429
query = model.Session.query(HarvestObject.guid, HarvestObject.package_id). \
430-
filter(HarvestObject.current is True). \
430+
filter(HarvestObject.current == True). \
431431
filter(HarvestObject.harvest_source_id == harvest_job.source.id)
432432

433433
guid_to_package_id = {}

ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier
1212
from ckanext.fairdatapoint.harvesters.domain.fair_data_point import FairDataPoint
1313

14-
from rdflib import Namespace, URIRef, Literal, DCAT, DCTERMS, Graph, RDF
14+
from requests import JSONDecodeError, HTTPError
15+
16+
from rdflib import Namespace, URIRef, Literal, DCAT, DCTERMS, Graph, RDF, BNode
1517
from rdflib.term import Node
1618
from typing import Dict, Iterable, Union
1719

@@ -84,10 +86,14 @@ def get_record_by_id(self, guid: str) -> str:
8486

8587
subject_uri = URIRef(subject_url)
8688

89+
self._remove_fdp_defaults(g, subject_uri)
90+
8791
# Add information from distribution to graph
8892
for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution):
8993
distribution_g = self.fair_data_point.get_graph(distribution_uri)
9094

95+
self._remove_fdp_defaults(g, distribution_uri)
96+
9197
for predicate in [
9298
DCTERMS.description,
9399
DCTERMS.format,
@@ -100,18 +106,32 @@ def get_record_by_id(self, guid: str) -> str:
100106

101107
# Look-up contact information
102108
for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint):
103-
if 'orcid' in contact_point_uri:
104-
orcid_response = requests.get(str(contact_point_uri) + '/public-record.json')
105-
json_orcid_response = orcid_response.json()
106-
name = json_orcid_response['displayName']
107-
name_literal = Literal(name)
108-
g.add((subject_uri, VCARD.fn, name_literal))
109-
# TODO add original Orcid URL in a field
109+
if isinstance(contact_point_uri, URIRef):
110+
self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri)
110111

111112
result = g.serialize(format='ttl')
112113

113114
return result
114115

116+
@staticmethod
def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRef):
    """
    Replaces contact point URI with a VCard

    The direct ``subject_uri dcat:contactPoint contact_point_uri`` triple is removed and
    replaced by a blank node typed ``vcard:Kind`` which keeps the original URI under
    ``vcard:hasUID``. When the URI contains ``orcid`` the public ORCID record is requested
    and its ``displayName`` is stored on the VCard as ``vcard:fn``.

    The ORCID look-up is best effort: any network, HTTP or payload problem is logged and
    the VCard is simply left without a name, so one unreachable record cannot abort the
    harvest of the dataset.

    :param g: graph to modify in place
    :param subject_uri: resource that owns the contact point
    :param contact_point_uri: URI currently used as the contact point
    """
    g.remove((subject_uri, DCAT.contactPoint, contact_point_uri))
    vcard_node = BNode()
    g.add((subject_uri, DCAT.contactPoint, vcard_node))
    g.add((vcard_node, RDF.type, VCARD.Kind))
    g.add((vcard_node, VCARD.hasUID, contact_point_uri))

    if 'orcid' not in str(contact_point_uri):
        return

    record_url = str(contact_point_uri).rstrip('/') + '/public-record.json'
    try:
        # requests waits forever by default; a stalled server must not block the harvest job
        orcid_response = requests.get(record_url, timeout=10)
        # requests only raises HTTPError for 4xx/5xx responses when explicitly asked to
        orcid_response.raise_for_status()
        name = orcid_response.json()['displayName']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # RequestException: connection/timeout/HTTP errors; ValueError: body is not JSON;
        # KeyError/TypeError: JSON does not have the expected shape
        log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}')
        return

    # displayName may be null or empty; do not store a meaningless literal
    if name:
        g.add((vcard_node, VCARD.fn, Literal(name)))
134+
115135
@staticmethod
116136
def get_values(graph: Graph,
117137
subject: Union[str, URIRef, Node],
@@ -121,3 +141,11 @@ def get_values(graph: Graph,
121141

122142
for value in graph.objects(subject=subject_uri, predicate=predicate_uri):
123143
yield value
144+
145+
@staticmethod
def _remove_fdp_defaults(g, subject_uri):
    """
    Removes the default access rights statement of a resource from the graph

    The default is recognised by its URI, ``<subject_uri>#accessRights``. When the resource
    points to that node through ``dcterms:accessRights``, both the link and every triple
    describing the node are removed. Any other access rights value is left untouched.

    :param g: graph to modify in place
    :param subject_uri: URI of the resource (dataset, distribution, ...) to clean up
    """
    access_rights_default = URIRef(f'{subject_uri}#accessRights')
    default_link = (subject_uri, DCTERMS.accessRights, access_rights_default)

    # A membership test avoids removing triples while iterating over a live
    # g.triples() generator, which is only safe if the store snapshots internally
    if default_link in g:
        g.remove(default_link)
        g.remove((access_rights_default, None, None))

ckanext/fairdatapoint/profiles.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
from dateutil.parser import ParserError
1616
from json import JSONDecodeError
1717
from typing import Dict, List
18-
from rdflib import URIRef
18+
from rdflib import URIRef, Namespace
1919

2020
log = logging.getLogger(__name__)
2121

22+
VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")
23+
2224

2325
def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
2426
"""
@@ -101,13 +103,26 @@ def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
101103

102104
dataset_dict['tags'] = validate_tags(dataset_dict['tags'])
103105

104-
# Example of adding a field
105-
# dataset_dict['extras'].append({'key': 'hello',
106-
# 'value': 'Hello from the FAIR data point profile. Use this function to do '
107-
# 'FAIR data point specific stuff during the import stage'})
108-
109106
return dataset_dict
110107

108+
def _contact_details(self, subject, predicate):
    """
    Overrides RDFProfile._contact_details so uri is taken from hasUID for VCard

    Returns a dict with the keys ``uri``, ``name`` and ``email`` describing the contact
    found at ``predicate`` of ``subject``, or an empty dict when there is none. For a
    contact that is itself a URIRef the URI is used directly; otherwise it is read from
    the ``vcard:hasUID`` property of the contact node.
    """
    details = {}

    # todo fix for multiple: each contact overwrites the previous one, the last one wins
    for node in self.g.objects(subject, predicate):
        if isinstance(node, URIRef):
            uri = str(node)
        else:
            uri = self._get_vcard_property_value(node, VCARD.hasUID)

        name = self._get_vcard_property_value(node, VCARD.hasFN, VCARD.fn)
        email_value = self._get_vcard_property_value(node, VCARD.hasEmail)

        details['uri'] = uri
        details['name'] = name
        details['email'] = self._without_mailto(email_value)

    return details
125+
111126
# def graph_from_dataset(self, dataset_dict, dataset_ref):
112127
#
113128
# g = self.g

ckanext/fairdatapoint/tests/test_data/Project_27866022694497978_out.ttl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@
1616
dcat:endDate "2025-12-31"^^xsd:date ;
1717
dcat:startDate "2020-01-01"^^xsd:date ] ;
1818
dcterms:title "COVID-NL cohort MUMC+"@en ;
19-
v:fn "N.K. De Vries" ;
20-
dcat:contactPoint <https://orcid.org/0000-0002-4348-707X> .
19+
dcat:contactPoint [ a v:Kind ;
20+
v:fn "N.K. De Vries" ;
21+
v:hasUID <https://orcid.org/0000-0002-4348-707X> ] .

ckanext/fairdatapoint/tests/test_data/dataset_898ca4b8-197b-4d40-bc81-d9cd88197670.ttl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
@prefix prov: <http://www.w3.org/ns/prov#> .
88
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
99
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
10+
@prefix ns3: <http://www.w3.org/2006/vcard/ns#> .
1011

1112
<https://fair.healthinformationportal.eu/distribution/> a ldp:DirectContainer ;
1213
dcterms:title "Distributions" ;
@@ -16,7 +17,6 @@
1617
<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670> a dcat:Dataset,
1718
dcat:Resource ;
1819
rdfs:label "Slovenian income, poverty and social exclusion indicators" ;
19-
dcterms:accessRights <https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670#accessRights> ;
2020
dcterms:conformsTo <https://fair.healthinformationportal.eu/profile/2f08228e-1789-40f8-84cd-28e3288c3604> ;
2121
dcterms:creator "Statisti&#269;ni urad Republike Slovenije"@en ;
2222
dcterms:description "The purpose of data collection is to show the quality of life in Slovenia in view of allocation of disposable income among households, relative poverty and social exclusion for different socio-economic groups of persons and households and to highlight which groups of population are relatively worse off than the rest of the population and are thus more vulnerable to poverty, material deprivation and unemployment. One of the main purposes is also collecting data on health (disability, unmet needs, etc). {\"Topics\": \"Self-perceived health/morbidity, Disability, Wellbeing\", \"Data collection period\": \"2005-01-01 2021-12-31\", \"Funding\": \"State Budget\", \"Geo coverage\": \"Nuts 3\", \"Target population\": \"General population\", \"Age range from\": \"16\", \"Age range to\": \"100\", \"Updating periodicity\": \"Annually\", \"Sample size\": \"\", \"Personal identifier\": \"National identifier\", \"Level of aggregation\": \"Individual\", \"Linkage possible\": \"Only to some\", \"Permanent identifier of the data source\": \"\", \"Regulations for data sharing\": \"\"}"@en ;
@@ -36,7 +36,8 @@
3636
dcterms:title "Slovenian income, poverty and social exclusion indicators"@en ;
3737
ns2:SIO_000628 <https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670/metrics/445c0a70d1e214e545b261559e2842f4>,
3838
<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670/metrics/5d27e854a9e78eb3f663331cd47cdc13> ;
39-
dcat:contactPoint <https://healthinformationportal.eu> ;
39+
dcat:contactPoint [ a ns3:Kind ;
40+
ns3:hasUID <https://healthinformationportal.eu> ] ;
4041
dcat:keyword "Self-perceived health, poverty"@en ;
4142
dcat:landingPage <https://www.healthinformationportal.eu/health-information-sources/slovenian-income-poverty-and-social-exclusion-indicators> ;
4243
dcat:theme <http://publications.europa.eu/resource/authority/data-theme/HEAL>,
@@ -52,9 +53,6 @@
5253
ns1:metadataIssued "2023-10-06T10:13:09.627000+00:00"^^xsd:dateTime ;
5354
ns1:metadataModified "2023-10-25T14:01:34.351000+00:00"^^xsd:dateTime .
5455

55-
<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670#accessRights> a dcterms:RightsStatement ;
56-
dcterms:description "This resource has no access restriction" .
57-
5856
<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670#identifier> a <http://purl.org/spar/datacite/Identifier> ;
5957
dcterms:identifier "https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670" .
6058

ckanext/fairdatapoint/tests/test_data/dataset_cbioportal.ttl

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5> a dcat:Dataset,
1717
dcat:Resource ;
1818
rdfs:label "[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" ;
19-
dcterms:accessRights <https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#accessRights> ;
2019
dcterms:conformsTo <https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604> ;
2120
dcterms:description "Whole exome sequencing of 23 grade II glioma tumor/normal pairs." ;
2221
dcterms:identifier "lgg_ucsf_2014"^^xsd:token ;
@@ -41,9 +40,6 @@
4140
ns1:metadataIssued "2024-01-22T12:58:04.249592+00:00"^^xsd:dateTime ;
4241
ns1:metadataModified "2024-01-22T12:58:05.109355+00:00"^^xsd:dateTime .
4342

44-
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#accessRights> a dcterms:RightsStatement ;
45-
dcterms:description "This resource has no access restriction" .
46-
4743
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#identifier> a <http://purl.org/spar/datacite/Identifier> ;
4844
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5" .
4945

ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25.ttl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,6 @@
3131
ns1:metadataIssued "2023-09-05T12:00:36.276171+00:00"^^xsd:dateTime ;
3232
ns1:metadataModified "2023-09-05T12:03:28.843400+00:00"^^xsd:dateTime .
3333

34-
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#accessRights> a dcterms:RightsStatement ;
35-
dcterms:description "This resource has no access restriction" .
36-
3734
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#identifier> a <http://purl.org/spar/datacite/Identifier> ;
3835
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25" .
3936

ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25_out.ttl

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25> a dcat:Dataset,
1818
dcat:Resource ;
1919
rdfs:label "Example" ;
20-
dcterms:accessRights <https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#accessRights> ;
2120
dcterms:conformsTo <https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604> ;
2221
dcterms:description "This is an example description." ;
2322
dcterms:isPartOf <https://health-ri.sandbox.semlab-leiden.nl/catalog/e3faf7ad-050c-475f-8ce4-da7e2faa5cd0> ;
@@ -31,9 +30,6 @@
3130
ns1:metadataIssued "2023-09-05T12:00:36.276171+00:00"^^xsd:dateTime ;
3231
ns1:metadataModified "2023-09-05T12:03:28.843400+00:00"^^xsd:dateTime .
3332

34-
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#accessRights> a dcterms:RightsStatement ;
35-
dcterms:description "This resource has no access restriction" .
36-
3733
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#identifier> a <http://purl.org/spar/datacite/Identifier> ;
3834
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25" .
3935

ckanext/fairdatapoint/tests/test_data/distribution_f9b9dff8-a039-4ca2-be9b-da72a61e3bac.ttl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@
2323
ns2:metadataIssued "2023-09-05T12:03:28.782932+00:00"^^xsd:dateTime ;
2424
ns2:metadataModified "2023-09-05T12:03:28.782932+00:00"^^xsd:dateTime .
2525

26-
<https://health-ri.sandbox.semlab-leiden.nl/distribution/f9b9dff8-a039-4ca2-be9b-da72a61e3bac#accessRights> a dcterms:RightsStatement ;
27-
dcterms:description "This resource has no access restriction" .
28-
2926
<https://health-ri.sandbox.semlab-leiden.nl/distribution/f9b9dff8-a039-4ca2-be9b-da72a61e3bac#identifier> a <http://purl.org/spar/datacite/Identifier> ;
3027
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/distribution/f9b9dff8-a039-4ca2-be9b-da72a61e3bac" .
3128

ckanext/fairdatapoint/tests/test_data/fdp_catalog.ttl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@
3636
ns2:metadataIssued "2023-10-06T10:12:55.614000+00:00"^^xsd:dateTime ;
3737
ns2:metadataModified "2023-10-25T14:02:23.680000+00:00"^^xsd:dateTime .
3838

39-
<https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights> a dcterms:RightsStatement ;
40-
dcterms:description "This resource has no access restriction" .
41-
4239
<https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#identifier> a <http://purl.org/spar/datacite/Identifier> ;
4340
dcterms:identifier "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d" .
4441

0 commit comments

Comments
 (0)