From 56bfc4701edafe21c6a23df63930d3dadd0d2795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Thu, 25 Sep 2025 11:16:25 +0200 Subject: [PATCH 01/10] feat: merge quickstart.md and README.md to one document --- README.md | 309 ++++++++++++++++++++++++++++++++++++++++------ quickstart-sda.md | 284 ------------------------------------------ 2 files changed, 273 insertions(+), 320 deletions(-) delete mode 100644 quickstart-sda.md diff --git a/README.md b/README.md index da06e0c..b05732f 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,37 @@ -# starter-kit-storage-and-interfaces +# Quickstart guide to the SDA -There exist two compose files at the root of the repo. Details on how to use them are provided below. +This document contains information on how to work with the Sensitive Data Archive (SDA) from the GDI user perspective. -**Note:** Before deploying the stack, please make sure that all configuration files are in place. The following files need to be created from their respective examples: +## Brief description of the storage and interfaces stack + +The storage and interfaces software stack for the GDI-starter-kit consists of the following services: + +| Component | Description | +|--------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| broker | RabbitMQ based message broker, [SDA-MQ](https://github.com/neicnordic/sensitive-data-archive/tree/main/rabbitmq). | +| database | PostgreSQL database, [SDA-DB](https://github.com/neicnordic/sensitive-data-archive/tree/main/postgresql). | +| storage | S3 object store, demo uses Minio S3. | +| auth | OpenID Connect relaying party and authentication service, [SDA-auth](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda-auth). 
| +| s3inbox | Proxy inbox to the S3 backend store, [SDA-S3Inbox](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda). | +| download | Data out solution for downloading files from the SDA, [SDA-download](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda-download). | +| SDA-pipeline | The ingestion pipeline of the SDA, [SDA-pipeline](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda). This comprises of the following core components: `ingest`, `verify`, `finalize` and `mapper`. | + +Detailed documentation on the `sda-pipeline` can be found at: [https://neic-sda.readthedocs.io/en/latest/services/pipeline)](https://neic-sda.readthedocs.io/en/latest/services/pipeline). + +NeIC Sensitive Data Archive documentation can be found at: [https://neic-sda.readthedocs.io/en/latest/](https://neic-sda.readthedocs.io/en/latest/) . + +## Deployment + +There exist two compose files at the root of the repo. +- [docker-compose.yml](docker-compose.yml) + - To bootstrap the *full stack* of `storage-and-interfaces` services + - Requires a running [`LS-AAI-mock`](https://github.com/GenomicDataInfrastructure/starter-kit-lsaai-mock) service. To configure the LS-AAI-mock service follow the instructions in that repo. +- [docker-compose-demo.yml](docker-compose-demo.yml) + - Used to start the `storage-and-interfaces` services in *demo* mode with an example dataset preloaded and ingested to the sensitive data archive when the deployment is done. + - This comes with its own python implementation of a mock-oidc in place of LS-AAI and can be run as standalone for demonstration purposes. + + +Before deploying the stack, please make sure that all configuration files are in place. 
The following files need to be created from their respective examples: ```shell cp ./config/config.yaml.example ./config/config.yaml @@ -12,58 +41,242 @@ cp ./.env.example ./.env no further editing to the above files is required for running the stack locally. -## Starting the full stack with LS-AAI-mock +The storage and interfaces stack can be deployed with the use of the provided `docker-compose.yml` file by running + +```shell +docker compose -f docker-compose.yml up -d +``` + +from the root of this repo. Please note that in the current form of the compose file, services are configured to work out-of-the-box with the [LS-AAI-mock](https://github.com/GenomicDataInfrastructure/starter-kit-lsaai-mock) service and some configuration of the latter is needed beforehand, see [here](./README.md##Starting-the-full-stack-with-LS-AAI-mock) for step-by-step instructions. The rationale behind this setup is to allow for a seamless transition to an environment with a *live* LS-AAI service as discussed briefly below. + +Configuration can be further customized by changing the files listed at the top of the `./.env` file along with the `.env` file itself. Please bear in mind that environment variables take precedence over the `config.yaml` file. + +Lastly, this repo includes a `docker-compose-demo.yml` file which deploys a standalone stack along with a demo of the sda services' functionality with test data. Details can be found [here](./README.md##Starting-the-stack-in-standalone-demo-mode). + +### Adding TLS to internet facing services + +Internet facing services such as `s3inbox`, `download` and `auth`, need to be secured via TLS certification. This can be most conveniently achieved by using [Let's Encrypt](https://letsencrypt.org/getting-started/) as Certificate Authority. Assuming shell access to your web host, a convenient way to set this up is through installing Certbot (or any other ACME client supported by Let's Encrypt). 
Detailed instructions on setting up Certbot for different systems can be found [here](https://certbot.eff.org/). + +## Authentication for users with LS-AAI (mock or alive) + +To interact with SDA services, users need to provide [JSON Web Token](https://jwt.io/) (JWT) authorization. Ultimately, tokens can be fetched by [LS-AAI](https://lifescience-ri.eu/ls-login/) upon user login to an OpenID Connect (OIDC) relaying party (RP) service that is [registered with LS-AAI](https://spreg-legacy.aai.elixir-czech.org/). An example of such an RP service is the [sda-auth](https://github.com/neicnordic/sda-auth), which is included in the present stack. + +### sda-auth + +Assuming users with a valid LS-AAI ID, they can obtain a JWT by logging in to the `sda-auth` service. This can be done by navigating to the `sda-auth` service URL (e.g. `https://localhost:8085` for a local deployment or `https://login.gdi.nbis.se` for a live one) and clicking on the `Login` button. This will redirect the user to the LS-AAI login page where they can enter their credentials. Once authenticated, the user will be redirected back to the `sda-auth` service and a JWT will be issued. This is an access token which can be copied from the `sda-auth`'s page and used to interact with the SDA services like e.g. for authorizing calls to `sda-download`'s API as described in the [Downloading data](#downloading-data) section below. + +From `sda-auth`'s page users can also download a configuration file for accessing the `s3inbox` service. This `s3cmd.conf` file containes the aforementioned access token along with other necessary information and it is described in detail in the [Uploading data](#uploading-data) section below. + +## How to perform common user tasks + +### Data encryption + +The `sda-pipeline` only ingests files encrypted with the archive's `c4gh` public key. 
For instance, using the Go implementation of the [`crypt4gh` utility](https://github.com/neicnordic/crypt4gh) a file can be encrypted simply by running: + +```shell +crypt4gh encrypt -f -p +``` + +where `` is the archive's public key. Note that `docker-compose.yml` stores the archive's c4gh public key in a volume named `shared`, see [below](#how-to-perform-common-admin-tasks) for how to extract it. + +### Uploading data + +Users can upload data to the SDA by transferring them directly to the archive's `s3inbox` with an S3 client tool such as [`s3cmd`](https://s3tools.org/s3cmd): + +```shell +s3cmd -c s3cmd.conf put s3:/// +``` -To bootstrap the *full stack* of `storage-and-interfaces` services use the file `docker-compose.yml`. Note that this requires a running [`LS-AAI-mock`](https://github.com/GenomicDataInfrastructure/starter-kit-lsaai-mock) service. To configure the LS-AAI-mock service follow the instructions below. +where `USER_LS-AAI_ID` is the user's LS-AAI ID with the `@` replaced by a `_` and `s3cmd.conf` is a configuration file with the following content: + +```ini +[default] +access_key = +secret_key = +access_token= +check_ssl_certificate = False +check_ssl_hostname = False +encoding = UTF-8 +encrypt = False +guess_mime_type = True +host_base = +host_bucket = +human_readable_sizes = true +multipart_chunk_size_mb = 50 +use_https = True +socket_timeout = 30 +``` -First clone the [startet-kit-lsaai-mock](https://github.com/GenomicDataInfrastructure/starter-kit-lsaai-mock) repo. +It is possible to download the `s3cmd.conf` file from the `sda-auth` service as described in the [Authentication for users with LS-AAI (mock or alive)](#authentication-for-users-with-ls-aai-mock-or-alive) section above. However, do note that `s3cmd.conf` downloaded from this service lacks the section header `[default]` which needs to be added manually if one wishes to use the file directly with `s3cmd`. 
-Add the `sda-auth` client by creating a file `configuration/aai-mock/clients/client1.yaml` with the following contents: +For example, a `s3cmd.conf` file downloaded from `auth` after deploying the stack locally (with LS-AAI-mock as OIDC) would look like this: ```ini -client-name: "auth" -client-id: "XC56EL11xx" -client-secret: "wHPVQaYXmdDHg" -redirect-uris: ["http://localhost:8085/oidc/login"] -token-endpoint-auth-method: "client_secret_basic" -scope: ["openid", "profile", "email", "ga4gh_passport_v1", "eduperson_entitlement"] -grant-types: ["authorization_code"] -post-logout-redirect-uris: ["http://localhost:8085/oidc/login"] +access_key = jd123_lifescience-ri.eu +secret_key = jd123_lifescience-ri.eu +access_token=eyJraWQiOiJyc2ExIiwidH... +check_ssl_certificate = False +check_ssl_hostname = False +encoding = UTF-8 +encrypt = False +guess_mime_type = True +host_base = localhost:8000 +host_bucket = localhost:8000 +human_readable_sizes = true +multipart_chunk_size_mb = 50 +use_https = True +socket_timeout = 30 +``` + +where the access token has been truncated for brevity. Please note that the option `use_https = True` is missing from the above file (therefore set implicitly to `False`) since the local deployment of the stack does not use TLS. + +### The sda-cli tool + +Instead of the tools above, users are **encouraged** to use [`sda-cli`](https://github.com/NBISweden/sda-cli), which is a tool specifically developed to perform all common SDA user-related tasks in a convenient and unified manner. 
It is recommended to use precompiled executables for `sda-cli` which can be found at [https://github.com/NBISweden/sda-cli/releases](https://github.com/NBISweden/sda-cli/releases) + +To start using the tool run: + +```shell +./sda-cli help +``` + +#### Examples of common usage + +- Encrypt and upload a file to the SDA in one go: + +```shell +./sda-cli upload -config s3cmd.conf --encrypt-with-key +``` + +- Encrypt and upload a whole folder recursively to a specified path, which can be different from the source, in one go: + +```shell +./sda-cli upload -config s3cmd.conf --encrypt-with-key -r -targetDir +``` + +- List all uploaded files in the user's bucket recursively: + +```shell +./sda-cli list -config s3cmd.conf +``` + +For detailed documentation on the tool's capabilities and usage please refer [here](https://github.com/NBISweden/sda-cli#usage). + +### Downloading data + +Users can directly download data from the SDA via `sda-download`, for more details see the service's [api reference](https://github.com/neicnordic/sda-download/blob/main/docs/API.md). In short, given a [valid JW token](#sda-auth), `$token`, a user can download the file with file ID, `$fileID` by issuing the following command: + +```shell +curl --cacert -H "Authorization: Bearer $token" https:///files/$fileID -o ``` -## Starting storage-and-interfaces with LS-AAI-mock +where for example `sda-download_DOMAIN_NAME` can be `login.gdi.nbis.se` or `localhost:8443` depending on the deployment. In the case of a local deployment, the certificate file can be obtained by running: -From the root of the `starter-kit-storage-and-interfaces` folder and run: +```shell +docker cp download:/shared/cert/ca.crt . +``` + +The `fileID` is a unique file identifier that can be obtained by calls to `sda-download`'s `/datasets` endpoint. For details and a concrete example on how to use `sda-download` with demo data please see [here](./README.md#get-token-or-downloading-data). 
+ +### Data access permissions + +In order for a user to access a file, permission to access the dataset that the file belongs to is needed. This is granted through [REMS](https://github.com/CSCfi/rems) in the form of `GA4GH` visas. For details see [starter-kit documentation on REMS](https://github.com/GenomicDataInfrastructure/starter-kit-rems) and the links therein. + +## How to perform common admin tasks + +### The sda-admin tool + +Within the scope of the starter-kit, it is up to the system administrator to curate incoming uploads to the Sensitive Data Archive. To ease this task, we have created the `sda-admin` tool which is a shell script that can perform all the necessary steps in order for an unencrypted file to end up properly ingested and archived by the SDA stack. The script can be found under `scripts/` and can be used to upload and ingest files as well as assigning accession ID to archived files and linking them to a dataset. + +In the background it utilizes the `sda-cli` for encrypting and uploading files and automates generating and sending broker messages between the SDA services. Detailed documentation on its usage along with examples can be retrieved upon running the command: + +```shell +./sda-admin help +``` + +Below we provide a step-by-step example of `sda-admin` usage. + +Create a test file: + +```shell +dd if=/dev/random of=test_file count=1 bs=$(( 1024 * 1024 * 1 )) iflag=fullblock +``` + +Fetch the archive's `c4gh` public key (assuming shell access to the host machine): + +```shell +docker cp ingest:/shared/c4gh.pub.pem . +``` + +#### Encrypt and upload + +To encrypt and upload `test_file` to the s3inbox, first get a token and prepare a `s3cmd` configuration file as described in the section [Uploading data](#uploading-data) above. 
Then run the following: + +```shell +./sda-admin --sda-config s3cmd.conf --sda-key c4gh.pub.pem upload test_file +``` + +One can verify that the encrypted file is uploaded in the archive's inbox by the following command: + +```shell +sda-cli list --config s3cmd.conf +``` + +#### Ingesting + +To list the filenames currently in the "inbox" queue waiting to be ingested run: + +```shell +./sda-admin ingest +``` + +If `test_file.c4gh` is in the returned list, run: + +```shell +./sda-admin --user ingest test_file +``` + +to trigger ingestion of the file. + +#### Adding accession IDs + +In brief, accesion IDs are unique identifiers that are assigned to files in order to be able to reference them in the future. Check that the file has been ingested by listing the filenames currently in the "verified" queue waiting to have accession IDs assigned to them: ```shell -docker compose up -d +./sda-admin accession ``` -and then from the root folder of the `starter-kit-lsaai-mock` run: +If `test_file.c4gh` is in the returned list, we can proceed with accession: ```shell -docker compose up -d +./sda-admin accession MYID001 test_file ``` -## Starting the stack in standalone demo mode +where `MYID001` is the `accession ID` we wish to assign to the file. + +#### Mapping to datasets -The file `docker-compose-demo.yml` is used to start the `storage-and-interfaces` services in *demo* mode with an example dataset preloaded and ingested to the sensitive data archive when the deployment is done. This comes with its own python implementation of a mock-oidc in place of LS-AAI and can be run as standalone for demonstration purposes. 
+Check that the file got an accession ID by listing the filenames currently in the "completed" queue waiting to be associated with a dataset ID: -The files imported by the data loading script come from [here:](https://github.com/ga4gh/htsget-refserver/tree/main/data/gcp/gatk-test-data/wgs_bam) +```shell +./sda-admin dataset +``` -To deploy use the following command: +Lastly, associate the file with a dataset ID: ```shell -docker compose -f docker-compose-demo.yml up -d +./sda-admin dataset MYSET001 test_file ``` -After deployment is done, follow the instructions below to test that the demo worked as expected. +Note that all the above steps can be done for multiple files at a time except from assigning accession IDs which needs to be done for one file at a time. -### **Download unencrypted files directly** +### Actions by calling the APIs directly with curl -### Get token for downloading data +In place of the [`sda-cli`](https://github.com/NBISweden/sda-cli) tool mentioned above certain actions are available by calling the APIs directly with curl. -For the purpose of the demo stack, tokens can be issued by the included `oidc` service and be used to authorize calls to the `download` service's API. The `oidc` is a simple Python implementation that mimics the basic OIDC functionality of LS-AAI. It does not require user authentication and serves a valid token through its `/token` endpoint: +#### Get token for downloading data + +For the purpose of the demo stack([docker-compose-demo.yml](docker-compose-demo.yml)), tokens can be issued by the included `oidc` service and be used to authorize calls to the `download` service's API. The `oidc` is a simple Python implementation that mimics the basic OIDC functionality of LS-AAI. 
It does not require user authentication and serves a valid token through its `/token` endpoint: ```shell token=$(curl -s -k https://localhost:8080/tokens | jq -r '.[0]') @@ -71,20 +284,20 @@ token=$(curl -s -k https://localhost:8080/tokens | jq -r '.[0]') This token is created upon deployment. See `scripts/make_credentials.sh` for more details. Note that the API returns a list of tokens where the first element is the token of interest, and the rest are tokens for [testing `sda-download`](https://github.com/neicnordic/sda-download/blob/main/dev_utils/README.md#get-a-token). -### List datasets +#### List datasets ```shell curl -s -H "Authorization: Bearer $token" http://localhost:8443/metadata/datasets | jq . ``` -### List files in a dataset +#### List files in a dataset ```shell datasetID=$(curl -s -H "Authorization: Bearer $token" http://localhost:8443/metadata/datasets | jq -r .'[0]') curl -s -H "Authorization: Bearer $token" "http://localhost:8443/metadata/datasets/$datasetID/files" | jq . ``` -### Download a specific file +#### Download a specific file The `sda-download` service offers multiple methods for downloading files through the API, with options for both encrypted and unencrypted results. Below, you will find an example illustrating each of these methods. @@ -95,18 +308,18 @@ filename="htsnexus_test_NA12878.bam" fileID=$(curl -s -H "Authorization: Bearer $token" "http://localhost:8443/metadata/datasets/$datasetID/files" | jq -r --arg filename "$filename".c4gh '.[] | select(.displayFileName==$filename) | .fileId') ``` -#### 1. Download unencrypted file from the `/files` endpoint +##### 1. Download unencrypted file from the `/files` endpoint ```bash curl -s -H "Authorization: Bearer $token" http://localhost:8443/files/$fileID -o "$filename" ``` After successful execution, the BAM file `htsnexus_test_NA12878.bam` will be downloaded to your current folder. -#### 2. Download unencrypted file from the `/s3` endpoint +##### 2. 
Download unencrypted file from the `/s3` endpoint ```bash curl -s -H "Authorization: Bearer $token" http://localhost:8443/s3/$datasetID/$filename -o "$filename" ``` -#### 3. Download encrypted file from the `/s3-encrypted` endpoint +##### 3. Download encrypted file from the `/s3-encrypted` endpoint To download an encrypted file that is re-encrypted with a custom Crypt4GH public key, you need to first create a key pair by the [`sda-cli`](https://github.com/NBISweden/sda-cli) tool, instructions can be found [here](https://github.com/NBISweden/sda-cli?tab=readme-ov-file#create-keys). ```bash @@ -122,4 +335,28 @@ After successful execution, the Crypt4GH encrypted BAM file `htsnexus_test_NA128 ```bash sda-cli decrypt -key c4gh.sec.pem htsnexus_test_NA12878.bam.c4gh -``` \ No newline at end of file +``` + +### Monitoring the status of services + +Assuming access to a terminal session in the host machine of the deployed docker compose stack, the status of all running containers can be checked as per usual with the command: `docker ps` whereas all logs from the deployed services can be monitored in real time as per usual by the command: + +```shell +docker compose -f docker-compose.yml logs -f +``` + +or per service as: + +```shell +docker compose -f docker-compose.yml logs -f +``` + +Note that when applicable periodic `healthchecks` are in place to ensure that services are running normally. All containers are configured to always restart upon failure. + +### Working with RabbitMQ + +As stated, we use [RabbitMQ](https://www.rabbitmq.com/) as our message broker between different services in this stack. Monitoring the status of the broker service can most conveniently be done via the web interface, which is accessible at `http://localhost:15672/` (use `https` if TLS is enabled). By default, `user:password` credentials with values `test:test` are created upon deployment and can be changed by editing the `docker-compose.yml` file. 
There are two ways to create a password hash for RabbitMQ as described [here](https://www.rabbitmq.com/passwords.html#computing-password-hash) + +Broker messages are most conveniently generated by `scripts/sda-admin` as described above. If for some reason one wants to send MQ messages manually instead, there exist step-by-step examples [here](https://github.com/neicnordic/sda-pipeline/tree/master/dev_utils#json-formatted-messages). + + diff --git a/quickstart-sda.md b/quickstart-sda.md deleted file mode 100644 index 223533e..0000000 --- a/quickstart-sda.md +++ /dev/null @@ -1,284 +0,0 @@ -# Quickstart guide to the SDA - -This document contains information on how to work with the Sensitive Data Archive (SDA) from the GDI user perspective. - -## Brief description of the storage and interfaces stack - -The storage and interfaces software stack for the GDI-starter-kit consists of the following services: - -| Component | Description | -|---------------|------| -| broker | RabbitMQ based message broker, [SDA-MQ](https://github.com/neicnordic/sensitive-data-archive/tree/main/rabbitmq). | -| database | PostgreSQL database, [SDA-DB](https://github.com/neicnordic/sensitive-data-archive/tree/main/postgresql). | -| storage | S3 object store, demo uses Minio S3. | -| auth | OpenID Connect relaying party and authentication service, [SDA-auth](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda-auth). | -| s3inbox | Proxy inbox to the S3 backend store, [SDA-S3Inbox](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda). | -| download | Data out solution for downloading files from the SDA, [SDA-download](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda-download). | -| SDA-pipeline | The ingestion pipeline of the SDA, [SDA-pipeline](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda). 
This comprises of the following core components: `ingest`, `verify`, `finalize` and `mapper`.| - -Detailed documentation on the `sda-pipeline` can be found at: [https://neic-sda.readthedocs.io/en/latest/services/pipeline)](https://neic-sda.readthedocs.io/en/latest/services/pipeline). - -NeIC Sensitive Data Archive documentation can be found at: [https://neic-sda.readthedocs.io/en/latest/](https://neic-sda.readthedocs.io/en/latest/) . - -## Deployment - -Before deploying the stack, please make sure that all configuration files are in place. The following files need to be created from their respective examples: - -```shell -cp ./config/config.yaml.example ./config/config.yaml -cp ./config/iss.json.example ./config/iss.json -cp ./.env.example ./.env -``` - -no further editing to the above files is required for running the stack locally. - -The storage and interfaces stack can be deployed with the use of the provided `docker-compose.yml` file by running - -```shell -docker compose -f docker-compose.yml up -d -``` - -from the root of this repo. Please note that in the current form of the compose file, services are configured to work out-of-the-box with the [LS-AAI-mock](https://github.com/GenomicDataInfrastructure/starter-kit-lsaai-mock) service and some configuration of the latter is needed beforehand, see [here](./README.md##Starting-the-full-stack-with-LS-AAI-mock) for step-by-step instructions. The rationale behind this setup is to allow for a seamless transition to an environment with a *live* LS-AAI service as discussed briefly below. - -Configuration can be further customized by changing the files listed at the top of the `./.env` file along with the `.env` file itself. Please bear in mind that environment variables take precedence over the `config.yaml` file. - -Lastly, this repo includes a `docker-compose-demo.yml` file which deploys a standalone stack along with a demo of the sda services' functionality with test data. 
Details can be found [here](./README.md##Starting-the-stack-in-standalone-demo-mode). - -### Adding TLS to internet facing services - -Internet facing services such as `s3inbox`, `download` and `auth`, need to be secured via TLS certification. This can be most conveniently achieved by using [Let's Encrypt](https://letsencrypt.org/getting-started/) as Certificate Authority. Assuming shell access to your web host, a convenient way to set this up is through installing Certbot (or any other ACME client supported by Let's Encrypt). Detailed instructions on setting up Certbot for different systems can be found [here](https://certbot.eff.org/). - -## Authentication for users with LS-AAI (mock or alive) - -To interact with SDA services, users need to provide [JSON Web Token](https://jwt.io/) (JWT) authorization. Ultimately, tokens can be fetched by [LS-AAI](https://lifescience-ri.eu/ls-login/) upon user login to an OpenID Connect (OIDC) relaying party (RP) service that is [registered with LS-AAI](https://spreg-legacy.aai.elixir-czech.org/). An example of such an RP service is the [sda-auth](https://github.com/neicnordic/sda-auth), which is included in the present stack. - -### sda-auth - -Assuming users with a valid LS-AAI ID, they can obtain a JWT by logging in to the `sda-auth` service. This can be done by navigating to the `sda-auth` service URL (e.g. `https://localhost:8085` for a local deployment or `https://login.gdi.nbis.se` for a live one) and clicking on the `Login` button. This will redirect the user to the LS-AAI login page where they can enter their credentials. Once authenticated, the user will be redirected back to the `sda-auth` service and a JWT will be issued. This is an access token which can be copied from the `sda-auth`'s page and used to interact with the SDA services like e.g. for authorizing calls to `sda-download`'s API as described in the [Downloading data](#downloading-data) section below. 
- -From `sda-auth`'s page users can also download a configuration file for accessing the `s3inbox` service. This `s3cmd.conf` file containes the aforementioned access token along with other necessary information and it is described in detail in the [Uploading data](#uploading-data) section below. - -## How to perform common user tasks - -### Data encryption - -The `sda-pipeline` only ingests files encrypted with the archive's `c4gh` public key. For instance, using the Go implementation of the [`crypt4gh` utility](https://github.com/neicnordic/crypt4gh) a file can be encrypted simply by running: - -```shell -crypt4gh encrypt -f -p -``` - -where `` is the archive's public key. Note that `docker-compose.yml` stores the archive's c4gh public key in a volume named `shared`, see [below](#how-to-perform-common-admin-tasks) for how to extract it. - -### Uploading data - -Users can upload data to the SDA by transferring them directly to the archive's `s3inbox` with an S3 client tool such as [`s3cmd`](https://s3tools.org/s3cmd): - -```shell -s3cmd -c s3cmd.conf put s3:/// -``` - -where `USER_LS-AAI_ID` is the user's LS-AAI ID with the `@` replaced by a `_` and `s3cmd.conf` is a configuration file with the following content: - -```ini -[default] -access_key = -secret_key = -access_token= -check_ssl_certificate = False -check_ssl_hostname = False -encoding = UTF-8 -encrypt = False -guess_mime_type = True -host_base = -host_bucket = -human_readable_sizes = true -multipart_chunk_size_mb = 50 -use_https = True -socket_timeout = 30 -``` - -It is possible to download the `s3cmd.conf` file from the `sda-auth` service as described in the [Authentication for users with LS-AAI (mock or alive)](#authentication-for-users-with-ls-aai-mock-or-alive) section above. However, do note that `s3cmd.conf` downloaded from this service lacks the section header `[default]` which needs to be added manually if one wishes to use the file directly with `s3cmd`. 
- -For example, a `s3cmd.conf` file downloaded from `auth` after deploying the stack locally (with LS-AAI-mock as OIDC) would look like this: - -```ini -access_key = jd123_lifescience-ri.eu -secret_key = jd123_lifescience-ri.eu -access_token=eyJraWQiOiJyc2ExIiwidH... -check_ssl_certificate = False -check_ssl_hostname = False -encoding = UTF-8 -encrypt = False -guess_mime_type = True -host_base = localhost:8000 -host_bucket = localhost:8000 -human_readable_sizes = true -multipart_chunk_size_mb = 50 -use_https = True -socket_timeout = 30 -``` - -where the acces token has been truncated for brevity. Please note that the option `use_https = True` is missing from the above file (therefore set implicitly to `False`) since the local deployment of the stack does not use TLS. - -### The sda-cli tool - -Instead of the tools above, users are **encouraged** to use [`sda-cli`](https://github.com/NBISweden/sda-cli), which is a tool specifically developed to perform all common SDA user-related tasks in a convenient and unified manner. It is recommended to use precompiled executables for `sda-cli` which can be found at [https://github.com/NBISweden/sda-cli/releases](https://github.com/NBISweden/sda-cli/releases) - -To start using the tool run: - -```shell -./sda-cli help -``` - -#### Examples of common usage - -- Encrypt and upload a file to the SDA in one go: - -```shell -./sda-cli upload -config s3cmd.conf --encrypt-with-key -``` - -- Encrypt and upload a whole folder recursively to a specified path, which can be different from the source, in one go: - -```shell -./sda-cli upload -config s3cmd.conf --encrypt-with-key -r -targetDir -``` - -- List all uploaded files in the user's bucket recursively: - -```shell -./sda-cli list -config s3cmd.conf -``` - -For detailed documentation on the tool's capabilities and usage please refer [here](https://github.com/NBISweden/sda-cli#usage). 
- -### Downloading data - -Users can directly download data from the SDA via `sda-download`, for more details see the service's [api reference](https://github.com/neicnordic/sda-download/blob/main/docs/API.md). In short, given a [valid JW token](#sda-auth), `$token`, a user can download the file with file ID, `$fileID` by issuing the following command: - -```shell -curl --cacert -H "Authorization: Bearer $token" https:///files/$fileID -o -``` - -where for example `sda-download_DOMAIN_NAME` can be `login.gdi.nbis.se` or `localhost:8443` depending on the deployment. In the case of a local deployment, the certificate file can be obtained by running: - -```shell -docker cp download:/shared/cert/ca.crt . -``` - -The `fileID` is a unique file identifier that can be obtained by calls to `sda-download`'s `/datasets` endpoint. For details and a concrete example on how to use `sda-download` with demo data please see [here](./README.md#get-token-or-downloading-data). - -### Data access permissions - -In order for a user to access a file, permission to access the dataset that the file belongs to is needed. This is granted through [REMS](https://github.com/CSCfi/rems) in the form of `GA4GH` visas. For details see [starter-kit documentation on REMS](https://github.com/GenomicDataInfrastructure/starter-kit-rems) and the links therein. - -## How to perform common admin tasks - -### The sda-admin tool - -Within the scope of the starter-kit, it is up to the system administrator to curate incoming uploads to the Sensitive Data Archive. To ease this task, we have created the `sda-admin` tool which is a shell script that can perform all the necessary steps in order for an unencrypted file to end up properly ingested and archived by the SDA stack. The script can be found under `scripts/` and can be used to upload and ingest files as well as assigning accession ID to archived files and linking them to a dataset. 
- -In the background it utilizes the `sda-cli` for encrypting and uploading files and automates generating and sending broker messages between the SDA services. Detailed documentation on its usage along with examples can be retrieved upon running the command: - -```shell -./sda-admin help -``` - -Below we provide a step-by-step example of `sda-admin` usage. - -Create a test file: - -```shell -dd if=/dev/random of=test_file count=1 bs=$(( 1024 * 1024 * 1 )) iflag=fullblock -``` - -Fetch the archive's `c4gh` public key (assuming shell access to the host machine): - -```shell -docker cp ingest:/shared/c4gh.pub.pem . -``` - -#### Encrypt and upload - -To encrypt and upload `test_file` to the s3inbox, first get a token and prepare a `s3cmd` configuration file as described in the section [Uploading data](#uploading-data) above. Then run the following: - -```shell -./sda-admin --sda-config s3cmd.conf --sda-key c4gh.pub.pem upload test_file -``` - -One can verify that the encrypted file is uploaded in the archive's inbox by the following command: - -```shell -sda-cli list --config s3cmd.conf -``` - -#### Ingesting - -To list the filenames currently in the "inbox" queue waiting to be ingested run: - -```shell -./sda-admin ingest -``` - -If `test_file.c4gh` is in the returned list, run: - -```shell -./sda-admin --user ingest test_file -``` - -to trigger ingestion of the file. - -#### Adding accession IDs - -In brief, accesion IDs are unique identifiers that are assigned to files in order to be able to reference them in the future. Check that the file has been ingested by listing the filenames currently in the "verified" queue waiting to have accession IDs assigned to them: - -```shell -./sda-admin accession -``` - -If `test_file.c4gh` is in the returned list, we can proceed with accession: - -```shell -./sda-admin accession MYID001 test_file -``` - -where `MYID001` is the `accession ID` we wish to assign to the file. 
- -#### Mapping to datasets - -Check that the file got an accession ID by listing the filenames currently in the "completed" queue waiting to be associated with a dataset ID: - -```shell -./sda-admin dataset -``` - -Lastly, associate the file with a dataset ID: - -```shell -./sda-admin dataset MYSET001 test_file -``` - -Note that all the above steps can be done for multiple files at a time except from assigning accession IDs which needs to be done for one file at a time. - -### Monitoring the status of services - -Assuming access to a terminal session in the host machine of the deployed docker compose stack, the status of all running containers can be checked as per usual with the command: `docker ps` whereas all logs from the deployed services can be monitored in real time as per usual by the command: - -```shell -docker compose -f docker-compose.yml logs -f -``` - -or per service as: - -```shell -docker compose -f docker-compose.yml logs -f -``` - -Note that when applicable periodic `healthchecks` are in place to ensure that services are running normally. All containers are configured to always restart upon failure. - -### Working with RabbitMQ - -As stated, we use [RabbitMQ](https://www.rabbitmq.com/) as our message broker between different services in this stack. Monitoring the status of the broker service can most conveniently be done via the web interface, which is accessible at `http://localhost:15672/` (use `https` if TLS is enabled). By default, `user:password` credentials with values `test:test` are created upon deployment and can be changed by editing the `docker-compose.yml` file. There are two ways to create a password hash for RabbitMQ as described [here](https://www.rabbitmq.com/passwords.html#computing-password-hash) - -Broker messages are most conveniently generated by `scripts/sda-admin` as described above. 
If for some reason one wants to send MQ messages manually instead, there exist step-by-step examples [here](https://github.com/neicnordic/sda-pipeline/tree/master/dev_utils#json-formatted-messages). From 5e35b143c612ae39c74f0cea5fcf0f3dc09ff4fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Thu, 25 Sep 2025 11:23:11 +0200 Subject: [PATCH 02/10] feat: add info noting that current version of sda-pipeline does not support starter-kit-htsget --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b05732f..6f8f6b5 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,14 @@ Detailed documentation on the `sda-pipeline` can be found at: [https://neic-sda. NeIC Sensitive Data Archive documentation can be found at: [https://neic-sda.readthedocs.io/en/latest/](https://neic-sda.readthedocs.io/en/latest/) . +> [!IMPORTANT] +> The current version of the `sda-pipeline` does not support the [starter-kit-htsget](https://github.com/GenomicDataInfrastructure/starter-kit-htsget) for the moment. +> +> For the [starter-kit-htsget](https://github.com/GenomicDataInfrastructure/starter-kit-htsget) to function an earlier version of the `download` service needs to be deployed until further notice. + ## Deployment -There exist two compose files at the root of the repo. +There exist two docker compose files at the root of the repo. - [docker-compose.yml](docker-compose.yml) - To bootstrap the *full stack* of `storage-and-interfaces` services - Requires a running [`LS-AAI-mock`](https://github.com/GenomicDataInfrastructure/starter-kit-lsaai-mock) service. To configure the LS-AAI-mock service follow the instructions in that repo. 
From d4d5a2ec57eabdca1709e39fd109565172b917d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Thu, 25 Sep 2025 13:46:37 +0200 Subject: [PATCH 03/10] feat: update images, python:3.10-slim -> python:3.10-slim-bookworm, update to latest sda stack, use ghcr.io repo for sda download image, update config example to the now expected c4gh formating --- README.md | 27 +++++++-------------------- config/config.yaml.example | 5 +++-- docker-compose-demo.yml | 2 +- docker-compose.yml | 23 +++++++++++------------ 4 files changed, 22 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 6f8f6b5..3b8be63 100644 --- a/README.md +++ b/README.md @@ -302,7 +302,7 @@ datasetID=$(curl -s -H "Authorization: Bearer $token" http://localhost:8443/meta curl -s -H "Authorization: Bearer $token" "http://localhost:8443/metadata/datasets/$datasetID/files" | jq . ``` -#### Download a specific file +#### Download a specific encrypted file The `sda-download` service offers multiple methods for downloading files through the API, with options for both encrypted and unencrypted results. Below, you will find an example illustrating each of these methods. @@ -313,33 +313,20 @@ filename="htsnexus_test_NA12878.bam" fileID=$(curl -s -H "Authorization: Bearer $token" "http://localhost:8443/metadata/datasets/$datasetID/files" | jq -r --arg filename "$filename".c4gh '.[] | select(.displayFileName==$filename) | .fileId') ``` -##### 1. Download unencrypted file from the `/files` endpoint -```bash -curl -s -H "Authorization: Bearer $token" http://localhost:8443/files/$fileID -o "$filename" -``` -After successful execution, the BAM file `htsnexus_test_NA12878.bam` will be downloaded to your current folder. +Create a crypt4gh key pair -##### 2. 
Download unencrypted file from the `/s3` endpoint ```bash -curl -s -H "Authorization: Bearer $token" http://localhost:8443/s3/$datasetID/$filename -o "$filename" +crypt4gh generate -n demokey +pubkey=$(base64 -w0 demokey.pub.pem) ``` - -##### 3. Download encrypted file from the `/s3-encrypted` endpoint -To download an encrypted file that is re-encrypted with a custom Crypt4GH public key, you need to first create a key pair by the [`sda-cli`](https://github.com/NBISweden/sda-cli) tool, instructions can be found [here](https://github.com/NBISweden/sda-cli?tab=readme-ov-file#create-keys). - -```bash -# create a crypt4gh key pair -sda-cli createKey c4gh -``` -```bash -pubkey=$(base64 -w0 c4gh.pub.pem) -curl -s -H "Authorization: Bearer $token" -H "Client-Public-Key: $pubkey" http://localhost:8443/s3-encrypted/$datasetID/$filename -o "$filename.c4gh" +```bash +curl -s -H "Authorization: Bearer $token" -H "Client-Public-Key: $pubkey" http://localhost:8443/s3/$datasetID/$filename -o "$filename.c4gh" ``` After successful execution, the Crypt4GH encrypted BAM file `htsnexus_test_NA12878.bam.c4gh` will be downloaded to your current folder. 
This file can be decrypted using the private key of the key pair you have created by ```bash -sda-cli decrypt -key c4gh.sec.pem htsnexus_test_NA12878.bam.c4gh +sda-cli decrypt -key demokey.sec.pem htsnexus_test_NA12878.bam.c4gh ``` ### Monitoring the status of services diff --git a/config/config.yaml.example b/config/config.yaml.example index a60c699..1145c81 100644 --- a/config/config.yaml.example +++ b/config/config.yaml.example @@ -28,8 +28,9 @@ broker: ssl: "false" c4gh: - passphrase: "c4ghpass" - filepath: "/shared/c4gh.sec.pem" + privateKeys: + - filepath: "/shared/c4gh.sec.pem" + passphrase: "c4ghpass" db: host: "postgres" diff --git a/docker-compose-demo.yml b/docker-compose-demo.yml index 0f08651..51d1194 100644 --- a/docker-compose-demo.yml +++ b/docker-compose-demo.yml @@ -38,7 +38,7 @@ services: interval: 5s timeout: 10s retries: 20 - image: python:3.10-slim + image: python:3.10-slim-bookworm networks: - public ports: diff --git a/docker-compose.yml b/docker-compose.yml index f89519f..1ea38fc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,7 +16,7 @@ services: - PGPASSWORD=${credentials_PGPASSWORD} env_file: - .env - image: python:3.10-slim + image: python:3.10-slim-bookworm networks: - secure volumes: @@ -27,7 +27,7 @@ services: auth: container_name: auth command: sda-auth - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 depends_on: credentials: condition: service_completed_successfully @@ -65,7 +65,7 @@ services: interval: 5s timeout: 20s retries: 20 - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25-rabbitmq + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-rabbitmq networks: - secure ports: @@ -89,7 +89,7 @@ services: interval: 5s timeout: 20s retries: 20 - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25-postgres + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-postgres networks: - secure restart: always @@ -139,10 +139,9 @@ 
services: - DB_PASSWORD=${download_DB_PASSWORD} - DB_USER=${download_DB_USER} - OIDC_CONFIGURATION_URL=http://${DOCKERHOST:-dockerhost}:8080/oidc/.well-known/openid-configuration - - ARCHIVE_TYPE=s3seekable extra_hosts: - ${DOCKERHOST:-dockerhost}:host-gateway - image: harbor.nbis.se/gdi/sda-download:20240415 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-download networks: - public - secure @@ -173,7 +172,7 @@ services: - BROKER_USER=${finalize_BROKER_USER} - DB_PASSWORD=${finalize_DB_PASSWORD} - DB_USER=${finalize_DB_USER} - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -199,7 +198,7 @@ services: - BROKER_USER=${ingest_BROKER_USER} - DB_PASSWORD=${ingest_DB_PASSWORD} - DB_USER=${ingest_DB_USER} - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -225,7 +224,7 @@ services: - BROKER_USER=${mapper_BROKER_USER} - DB_PASSWORD=${mapper_DB_PASSWORD} - DB_USER=${mapper_DB_USER} - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -251,7 +250,7 @@ services: - BROKER_USER=${verify_BROKER_USER} - DB_PASSWORD=${verify_DB_PASSWORD} - DB_USER=${verify_DB_USER} - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -280,7 +279,7 @@ services: - SERVER_JWTPUBKEYURL=http://${DOCKERHOST:-dockerhost}:8080/oidc/jwk extra_hosts: - ${DOCKERHOST:-dockerhost}:host-gateway - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - public - secure @@ -297,7 +296,7 @@ services: depends_on: credentials: condition: service_completed_successfully - image: ghcr.io/neicnordic/sensitive-data-archive:v0.3.25 
+ image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always From 2628b1e82a0cf7d68fb1a4fde6a99d1eecaf53f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Thu, 25 Sep 2025 14:46:14 +0200 Subject: [PATCH 04/10] feat: update images and config for the TLS example --- config/TLS-example/README.md | 4 +++- config/TLS-example/config.yaml | 5 +++-- config/TLS-example/docker-compose.yml | 18 +++++++++--------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/config/TLS-example/README.md b/config/TLS-example/README.md index 141c9f5..bb87dc4 100644 --- a/config/TLS-example/README.md +++ b/config/TLS-example/README.md @@ -22,7 +22,9 @@ inbox: secretkey: c4gh: - passphrase: + privateKeys: + - filepath: + passphrase: elixir: id: diff --git a/config/TLS-example/config.yaml b/config/TLS-example/config.yaml index 84a9a5a..8de06fe 100644 --- a/config/TLS-example/config.yaml +++ b/config/TLS-example/config.yaml @@ -31,8 +31,9 @@ broker: # clientKey: /certificates/tls.key c4gh: - passphrase: "" - filepath: "/c4gh/gdi.sec.pem" + privateKeys: + - filepath: "/c4gh/gdi.sec.pem" + passphrase: "" db: host: "postgres" diff --git a/config/TLS-example/docker-compose.yml b/config/TLS-example/docker-compose.yml index 4d13fda..a2243c4 100644 --- a/config/TLS-example/docker-compose.yml +++ b/config/TLS-example/docker-compose.yml @@ -1,7 +1,7 @@ services: auth: container_name: auth - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23-auth + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-auth networks: - public ports: @@ -31,7 +31,7 @@ services: interval: 5s timeout: 20s retries: 20 - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23-rabbitmq + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-rabbitmq networks: - secure ports: @@ -63,7 +63,7 @@ services: interval: 5s timeout: 20s retries: 20 - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23-postgres + image: 
ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-postgres networks: - secure restart: always @@ -80,7 +80,7 @@ services: environment: - DB_PASSWORD=${download_DB_PASSWORD} - DB_USER=download - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23-download + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-download networks: - public - secure @@ -109,7 +109,7 @@ services: - BROKER_USER=finalize - DB_PASSWORD=${finalize_DB_PASSWORD} - DB_USER=finalize - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -132,7 +132,7 @@ services: - BROKER_USER=ingest - DB_PASSWORD=${ingest_DB_PASSWORD} - DB_USER=ingest - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -155,7 +155,7 @@ services: - BROKER_USER=mapper - DB_PASSWORD=${mapper_DB_PASSWORD} - DB_USER=mapper - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -178,7 +178,7 @@ services: - BROKER_USER=verify - DB_PASSWORD=${verify_DB_PASSWORD} - DB_USER=verify - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - secure restart: always @@ -201,7 +201,7 @@ services: - BROKER_USER=inbox - DB_PASSWORD=${s3inbox_DB_PASSWORD} - DB_USER=inbox - image: ghcr.io/neicnordic/sensitive-data-archive:v0.2.23 + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - public - secure From 0c6d1054ddc1b702d74fe8b6c00a718fe6985bbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Thu, 25 Sep 2025 14:49:01 +0200 Subject: [PATCH 05/10] feat: add refernce to TLS-example/README.md from README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 3b8be63..d6ab4af 100644 --- 
a/README.md +++ b/README.md @@ -62,6 +62,8 @@ Lastly, this repo includes a `docker-compose-demo.yml` file which deploys a stan Internet facing services such as `s3inbox`, `download` and `auth`, need to be secured via TLS certification. This can be most conveniently achieved by using [Let's Encrypt](https://letsencrypt.org/getting-started/) as Certificate Authority. Assuming shell access to your web host, a convenient way to set this up is through installing Certbot (or any other ACME client supported by Let's Encrypt). Detailed instructions on setting up Certbot for different systems can be found [here](https://certbot.eff.org/). +For an example of how to set up the `sda-pipeline` with TLS, see [TLS-example/README.md](config/TLS-example/README.md) + ## Authentication for users with LS-AAI (mock or alive) To interact with SDA services, users need to provide [JSON Web Token](https://jwt.io/) (JWT) authorization. Ultimately, tokens can be fetched by [LS-AAI](https://lifescience-ri.eu/ls-login/) upon user login to an OpenID Connect (OIDC) relaying party (RP) service that is [registered with LS-AAI](https://spreg-legacy.aai.elixir-czech.org/). An example of such an RP service is the [sda-auth](https://github.com/neicnordic/sda-auth), which is included in the present stack. 
From a0b3e3e3137ff6d5dae65db2101679d4231bb26e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Fri, 26 Sep 2025 08:56:44 +0200 Subject: [PATCH 06/10] feat: fix auth service image in TlS-example as it no longer in its own image --- config/TLS-example/docker-compose.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config/TLS-example/docker-compose.yml b/config/TLS-example/docker-compose.yml index a2243c4..52ae972 100644 --- a/config/TLS-example/docker-compose.yml +++ b/config/TLS-example/docker-compose.yml @@ -1,7 +1,8 @@ services: auth: container_name: auth - image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14-auth + command: sda-auth + image: ghcr.io/neicnordic/sensitive-data-archive:v3.0.14 networks: - public ports: From 4e411246b7af1abbeaec3f6c77608a0cdd000b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Fri, 26 Sep 2025 09:02:01 +0200 Subject: [PATCH 07/10] feat: update README.md, fix links to outdated docs, fix sda-cli commands examples, remove unecessary header --- README.md | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d6ab4af..eb3ec1e 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,17 @@ This document contains information on how to work with the Sensitive Data Archiv The storage and interfaces software stack for the GDI-starter-kit consists of the following services: -| Component | Description | -|--------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| broker | RabbitMQ based message broker, [SDA-MQ](https://github.com/neicnordic/sensitive-data-archive/tree/main/rabbitmq). | -| database | PostgreSQL database, [SDA-DB](https://github.com/neicnordic/sensitive-data-archive/tree/main/postgresql). 
| -| storage | S3 object store, demo uses Minio S3. | -| auth | OpenID Connect relaying party and authentication service, [SDA-auth](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda-auth). | -| s3inbox | Proxy inbox to the S3 backend store, [SDA-S3Inbox](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda). | -| download | Data out solution for downloading files from the SDA, [SDA-download](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda-download). | -| SDA-pipeline | The ingestion pipeline of the SDA, [SDA-pipeline](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda). This comprises of the following core components: `ingest`, `verify`, `finalize` and `mapper`. | - -Detailed documentation on the `sda-pipeline` can be found at: [https://neic-sda.readthedocs.io/en/latest/services/pipeline)](https://neic-sda.readthedocs.io/en/latest/services/pipeline). +| Component | Description | +|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| broker | RabbitMQ based message broker, [SDA-MQ](https://github.com/neicnordic/sensitive-data-archive/tree/main/rabbitmq). | +| database | PostgreSQL database, [SDA-DB](https://github.com/neicnordic/sensitive-data-archive/tree/main/postgresql). | +| storage | S3 object store, demo uses Minio S3. | +| auth | OpenID Connect relaying party and authentication service, [SDA-auth](https://github.com/neicnordic/sensitive-data-archive/tree/main/cmd/auth/auth.md). | +| s3inbox | Proxy inbox to the S3 backend store, [SDA-S3Inbox](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda/cmd/s3inbox/s3inbox.md). 
| +| download | Data out solution for downloading files from the SDA, [SDA-download](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda-download). | +| SDA-pipeline | The ingestion pipeline of the SDA, [SDA-pipeline](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda/sda.md). This comprises of the following core components: `auth`, `ingest`, `verify`, `finalize`, `s3inbox`, and `mapper`. | + +Detailed documentation on the `sda-pipeline` can be found at: [https://neic-sda.readthedocs.io/en/latest/services/sda)](https://neic-sda.readthedocs.io/en/latest/services/sda). NeIC Sensitive Data Archive documentation can be found at: [https://neic-sda.readthedocs.io/en/latest/](https://neic-sda.readthedocs.io/en/latest/) . @@ -66,7 +66,7 @@ For an example of how to set up the `sda-pipeline` with TLS, see [TLS-example/RE ## Authentication for users with LS-AAI (mock or alive) -To interact with SDA services, users need to provide [JSON Web Token](https://jwt.io/) (JWT) authorization. Ultimately, tokens can be fetched by [LS-AAI](https://lifescience-ri.eu/ls-login/) upon user login to an OpenID Connect (OIDC) relaying party (RP) service that is [registered with LS-AAI](https://spreg-legacy.aai.elixir-czech.org/). An example of such an RP service is the [sda-auth](https://github.com/neicnordic/sda-auth), which is included in the present stack. +To interact with SDA services, users need to provide [JSON Web Token](https://jwt.io/) (JWT) authorization. Ultimately, tokens can be fetched by [LS-AAI](https://lifescience-ri.eu/ls-login/) upon user login to an OpenID Connect (OIDC) relaying party (RP) service that is [registered with LS-AAI](https://spreg-legacy.aai.elixir-czech.org/). An example of such an RP service is the [sda-auth](https://github.com/neicnordic/sensitive-data-archive/tree/main/sda/cmd/auth), which is included in the present stack. 
### sda-auth @@ -152,26 +152,26 @@ To start using the tool run: - Encrypt and upload a file to the SDA in one go: ```shell -./sda-cli upload -config s3cmd.conf --encrypt-with-key +./sda-cli -config s3cmd.conf upload --encrypt-with-key ``` - Encrypt and upload a whole folder recursively to a specified path, which can be different from the source, in one go: ```shell -./sda-cli upload -config s3cmd.conf --encrypt-with-key -r -targetDir +./sda-cli -config s3cmd.conf upload --encrypt-with-key -r -targetDir ``` - List all uploaded files in the user's bucket recursively: ```shell -./sda-cli list -config s3cmd.conf +./sda-cli -config s3cmd.conf list ``` For detailed documentation on the tool's capabilities and usage please refer [here](https://github.com/NBISweden/sda-cli#usage). ### Downloading data -Users can directly download data from the SDA via `sda-download`, for more details see the service's [api reference](https://github.com/neicnordic/sda-download/blob/main/docs/API.md). In short, given a [valid JW token](#sda-auth), `$token`, a user can download the file with file ID, `$fileID` by issuing the following command: +Users can directly download data from the SDA via `sda-download`, for more details see the service's [api reference](https://github.com/neicnordic/sensitive-data-archive/blob/main/sda-download/api/api.md). In short, given a [valid JW token](#sda-auth), `$token`, a user can download the file with file ID, `$fileID` by issuing the following command: ```shell curl --cacert -H "Authorization: Bearer $token" https:///files/$fileID -o @@ -189,8 +189,6 @@ The `fileID` is a unique file identifier that can be obtained by calls to `sda-d In order for a user to access a file, permission to access the dataset that the file belongs to is needed. This is granted through [REMS](https://github.com/CSCfi/rems) in the form of `GA4GH` visas. 
For details see [starter-kit documentation on REMS](https://github.com/GenomicDataInfrastructure/starter-kit-rems) and the links therein. -## How to perform common admin tasks - ### The sda-admin tool Within the scope of the starter-kit, it is up to the system administrator to curate incoming uploads to the Sensitive Data Archive. To ease this task, we have created the `sda-admin` tool which is a shell script that can perform all the necessary steps in order for an unencrypted file to end up properly ingested and archived by the SDA stack. The script can be found under `scripts/` and can be used to upload and ingest files as well as assigning accession ID to archived files and linking them to a dataset. @@ -306,7 +304,7 @@ curl -s -H "Authorization: Bearer $token" "http://localhost:8443/metadata/datase #### Download a specific encrypted file -The `sda-download` service offers multiple methods for downloading files through the API, with options for both encrypted and unencrypted results. Below, you will find an example illustrating each of these methods. +The `sda-download` service offers a method for downloading files through the API, which will be encrypted with the public key provided in the request. Below, you will find an example illustrating each of this methods. To download the file `htsnexus_test_NA12878.bam`, first obtain the respective `fileID` using the following command. 
The `datasetID`, which is `DATASET0001`, can be obtained by following the instructions at [List datasets](#list-datasets) From af473e642077eac0eed2fa906814afe9e70846ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Fri, 26 Sep 2025 13:56:47 +0200 Subject: [PATCH 08/10] feat: update version of helm charts and config for the terraform example --- terraform/main.tf | 8 ++++ terraform/postgresql.tf | 2 + terraform/rabbitmq.tf | 2 + terraform/sda.tf | 40 ++++++++++++++++-- terraform/terraform.tfvars | 83 ++++++++++++++++++++++++++++++++++++-- terraform/variables.tf | 3 ++ 6 files changed, 131 insertions(+), 7 deletions(-) diff --git a/terraform/main.tf b/terraform/main.tf index 7785968..f4c94a8 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -23,6 +23,14 @@ resource "random_password" "verify" { length = 16 special = false } +resource "random_password" "auth" { + length = 16 + special = false +} +resource "random_password" "api" { + length = 16 + special = false +} ### namespace ### resource "kubernetes_namespace_v1" "namespace" { diff --git a/terraform/postgresql.tf b/terraform/postgresql.tf index 5527e2c..58fdfb2 100644 --- a/terraform/postgresql.tf +++ b/terraform/postgresql.tf @@ -45,6 +45,8 @@ resource "kubernetes_secret_v1" "db_post_start_script" { psql -h postgres-sda-db -U postgres -d sda -c "ALTER ROLE ingest LOGIN PASSWORD '${random_password.ingest.result}';" psql -h postgres-sda-db -U postgres -d sda -c "ALTER ROLE mapper LOGIN PASSWORD '${random_password.mapper.result}';" psql -h postgres-sda-db -U postgres -d sda -c "ALTER ROLE verify LOGIN PASSWORD '${random_password.verify.result}';" + psql -h postgres-sda-db -U postgres -d sda -c "ALTER ROLE api LOGIN PASSWORD '${random_password.api.result}';" + psql -h postgres-sda-db -U postgres -d sda -c "ALTER ROLE auth LOGIN PASSWORD '${random_password.auth.result}';" EOF } } diff --git a/terraform/rabbitmq.tf b/terraform/rabbitmq.tf index efe0edb..ee39d9e 100644 --- 
a/terraform/rabbitmq.tf +++ b/terraform/rabbitmq.tf @@ -63,6 +63,8 @@ curl --cacert /certs/ca.crt -s -u "$MQ_USER:$MQ_PASS" -X PUT https://${helm_rele -H "content-type:application/json" -d '{"password":"'"${random_password.mapper.result}"'", "tags":"none"}' curl --cacert /certs/ca.crt -s -u "$MQ_USER:$MQ_PASS" -X PUT https://${helm_release.sda_mq.name}-sda-mq:15671/api/users/verify \ -H "content-type:application/json" -d '{"password":"'"${random_password.verify.result}"'", "tags":"none"}' +curl --cacert /certs/ca.crt -s -u "$MQ_USER:$MQ_PASS" -X PUT https://${helm_release.sda_mq.name}-sda-mq:15671/api/users/api \ + -H "content-type:application/json" -d '{"password":"'"${random_password.api.result}"'", "tags":"none"}' curl --cacert /certs/ca.crt -s -u "$MQ_USER:$MQ_PASS" -X PUT https://${helm_release.sda_mq.name}-sda-mq:15671/api/permissions/sda/finalize \ -H "content-type:application/json" -d '{"configure":"","write":"sda","read":"accession"}' curl --cacert /certs/ca.crt -s -u "$MQ_USER:$MQ_PASS" -X PUT https://${helm_release.sda_mq.name}-sda-mq:15671/api/permissions/sda/inbox \ diff --git a/terraform/sda.tf b/terraform/sda.tf index 07aaa82..3fe2bb5 100644 --- a/terraform/sda.tf +++ b/terraform/sda.tf @@ -28,6 +28,12 @@ resource "helm_release" "sda_pipeline" { yamlencode( { "credentials" : { + api : { + dbUser : "api" + dbPassword : random_password.api.result + mqUser : "api" + mqPassword : random_password.api.result + }, "download" : { "dbUser" : "download", "dbPassword" : random_password.download.result, @@ -64,8 +70,15 @@ resource "helm_release" "sda_pipeline" { "mqUser" : "verify", "mqPassword" : random_password.verify.result, }, + "auth" : { + "dbUser" : "auth", + "dbPassword" : random_password.auth.result, + }, }, "global" : { + "api" : { + "rbacFileSecret" : kubernetes_secret_v1.api_admins.metadata[0].name + } "archive" : { "storageType" : "s3", "s3Url" : var.s3URL, @@ -90,10 +103,14 @@ resource "helm_release" "sda_pipeline" { "prefetchCount" : "0", }, 
"c4gh" : { + "privateKeys": [ + { + "keyName" : "c4gh.sec.pem" + "passphrase" : var.repository-c4gh-passphrase, + }, + ] + "publicKey" : "c4gh.pub.pem", "secretName" : kubernetes_secret_v1.c4gh.metadata[0].name, - "keyFile" : "c4gh.sec.pem", - "passphrase" : var.repository-c4gh-passphrase, - "publicFile" : "c4gh.pub.pem", }, "db" : { "host" : "${helm_release.sda_db.name}-sda-db", @@ -119,13 +136,18 @@ resource "helm_release" "sda_pipeline" { "ingress" : { "deploy" : var.ingress-deploy, "hostName" : { + "api" : "api.${var.ingress-base}" "auth" : "login.${var.ingress-base}", "download" : "download.${var.ingress-base}", - "inbox" : "inbox.${var.ingress-base}", + "s3Inbox" : "inbox.${var.ingress-base}", }, "ingressClassName" : var.ingress-class, "issuer" : var.letsencrypt-issuer, }, + "reencrypt" : { + "host": "pipeline-sda-svc-reencrypt" + "port": 50443 + }, "oidc" : { "provider" : var.oidc-provider, "id" : var.oidc-client-id, @@ -226,3 +248,13 @@ resource "kubernetes_network_policy_v1" "pipeline" { policy_types = ["Egress", "Ingress"] } } +resource "kubernetes_secret_v1" "api_admins" { + depends_on = [kubernetes_namespace_v1.namespace] + metadata { + name = "api-admins" + namespace = var.namespace + } + data = { + "rbac.json" = "${jsonencode(var.api-admins)}" + } +} diff --git a/terraform/terraform.tfvars b/terraform/terraform.tfvars index 9163a3e..486dc7a 100644 --- a/terraform/terraform.tfvars +++ b/terraform/terraform.tfvars @@ -6,9 +6,9 @@ kubeconfig-path = "" sda-services-backup-version = "v0.1.34" ### helm chart versions -sda-db-version = "0.8.8" -sda-mq-version = "0.7.9" -sda-svc-version = "0.23.4" +sda-db-version = "2.0.20" +sda-mq-version = "2.0.20" +sda-svc-version = "3.0.13" ### ingress ### ingress-base is the root domain, all exposed services will be reachable as sub domains. 
@@ -40,3 +40,80 @@ s3SecretKey = ""
s3BackupURL = ""
s3BackupAccessKey = ""
s3BackupSecretKey = ""
+
+api-admins = {
+  "policy" : [
+    {
+      "role" : "admin",
+      "path" : "/c4gh-keys/*",
+      "action" : "(GET)|(POST)|(PUT)"
+    },
+    {
+      "role" : "admin",
+      "path" : "/file/verify/:accession",
+      "action" : "PUT"
+    },
+    {
+      "role" : "admin",
+      "path" : "/dataset/verify/:dataset",
+      "action" : "PUT"
+    },
+    {
+      "role" : "submission",
+      "path" : "/datasets/*",
+      "action" : "GET"
+    },
+    {
+      "role" : "submission",
+      "path" : "/dataset/*",
+      "action" : "(POST)|(PUT)"
+    },
+    {
+      "role" : "submission",
+      "path" : "/file/ingest",
+      "action" : "POST"
+    },
+    {
+      "role" : "submission",
+      "path" : "/file/accession",
+      "action" : "POST"
+    },
+    {
+      "role" : "submission",
+      "path" : "/file/*",
+      "action" : "DELETE"
+    },
+    {
+      "role" : "submission",
+      "path" : "/users",
+      "action" : "GET"
+    },
+    {
+      "role" : "submission",
+      "path" : "/users/:username/files",
+      "action" : "GET"
+    },
+    {
+      "role" : "*",
+      "path" : "/datasets",
+      "action" : "GET"
+    },
+    {
+      "role" : "*",
+      "path" : "/files",
+      "action" : "GET"
+    }
+  ],
+  "roles" : [
+    {
+      "role" : "admin",
+      "rolebinding" : "submission"
+    },
+    # Add here the users that should have the admin or submission role
+    # Example
+    # {
+    #   "role" : "${USER_ID}",
+    #   "rolebinding" : "admin"
+    # },
+  ]
+} \ No newline at end of file diff --git a/terraform/variables.tf b/terraform/variables.tf index 682933b..20102b6 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -77,3 +77,6 @@ variable "s3BackupSecretKey" { variable "s3BackupURL" { type = string } +variable "api-admins" { + type = map(list(map(string))) +} \ No newline at end of file From 1856c27f198dd1899193426af107bde162af2b1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Fri, 26 Sep 2025 14:03:24 +0200 Subject: [PATCH 09/10] feat: update README.md, add specific version of sda-cli that is **encouraged** to be used --- README.md | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/README.md b/README.md index eb3ec1e..c2aea91 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ where the access token has been truncated for brevity. Please note that the opti ### The sda-cli tool -Instead of the tools above, users are **encouraged** to use [`sda-cli`](https://github.com/NBISweden/sda-cli), which is a tool specifically developed to perform all common SDA user-related tasks in a convenient and unified manner. It is recommended to use precompiled executables for `sda-cli` which can be found at [https://github.com/NBISweden/sda-cli/releases](https://github.com/NBISweden/sda-cli/releases) +Instead of the tools above, users are **encouraged** to use [`sda-cli@v0.2.1`](https://github.com/NBISweden/sda-cli/releases/tag/v0.2.1), which is a tool specifically developed to perform all common SDA user-related tasks in a convenient and unified manner. To start using the tool run: From 9584d8e4c14e4951a8e14b66f7f112611154fdff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Gr=C3=B6nberg?= Date: Fri, 26 Sep 2025 14:42:06 +0200 Subject: [PATCH 10/10] feat: replace elixir-czech oidc url with new oidc url --- config/TLS-example/config.yaml | 6 +++--- config/TLS-example/iss.json | 4 ++-- terraform/terraform.tfvars | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config/TLS-example/config.yaml b/config/TLS-example/config.yaml index 8de06fe..5b84e2d 100644 --- a/config/TLS-example/config.yaml +++ b/config/TLS-example/config.yaml @@ -50,7 +50,7 @@ db: elixir: # used by the auth service id: "" # LS AAI client ID secret: "" # LS AAI client secret - provider: "https://login.elixir-czech.org/oidc/" + provider: "https://login.aai.lifescience-ri.eu/oidc/" jwkpath: "jwk" redirectUrl: "" # public url to the auth endpoint @@ -69,7 +69,7 @@ log: oidc: configuration: - url: "https://login.elixir-czech.org/oidc/.well-known/openid-configuration" + url: 
"https://login.aai.lifescience-ri.eu/oidc/.well-known/openid-configuration" trusted: iss: "/iss.json" @@ -77,7 +77,7 @@ schema: type: isolated server: - jwtpubkeyurl: "https://login.elixir-czech.org/oidc/jwk" + jwtpubkeyurl: "https://login.aai.lifescience-ri.eu/oidc/jwk" cert: "/certificates/fullchain.pem" key: "/certificates/privkey.pem" diff --git a/config/TLS-example/iss.json b/config/TLS-example/iss.json index 3a0d2c9..a578bd4 100644 --- a/config/TLS-example/iss.json +++ b/config/TLS-example/iss.json @@ -1,6 +1,6 @@ [ { - "iss": "https://login.elixir-czech.org/oidc/", - "jku": "https://login.elixir-czech.org/oidc/jwk" + "iss": "https://login.aai.lifescience-ri.eu/oidc/", + "jku": "https://login.aai.lifescience-ri.eu/oidc/jwk" } ] \ No newline at end of file diff --git a/terraform/terraform.tfvars b/terraform/terraform.tfvars index 486dc7a..f8fab43 100644 --- a/terraform/terraform.tfvars +++ b/terraform/terraform.tfvars @@ -25,7 +25,7 @@ namespace = "" rabbitmq-admin-user = "admin" ### Storage backend configuration -oidc-provider = "https://login.elixir-czech.org/oidc" +oidc-provider = "https://login.aai.lifescience-ri.eu/oidc" oidc-client-id = "" oidc-client-secret = ""