diff --git a/.nf-core.yml b/.nf-core.yml index d8d5aea..ab8281b 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -34,8 +34,8 @@ repository_type: pipeline template: author: "Famke Bäuerle, Dorothy Ellis" - description: Nextflow pipeline to convert (g)vcfs to matrices suitable for statistical - analysis + description: Nextflow pipeline to convert (g)vcfs to matrices suitable for + statistical analysis force: false is_nfcore: false name: vcftocounts @@ -46,4 +46,4 @@ template: - codespaces - fastqc - adaptivecard - version: 2.0.2dev + version: 2.1.0dev diff --git a/CHANGELOG.md b/CHANGELOG.md index ba7d527..0219ea4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v2.0.2dev +## v2.1.0dev ### `Added` - [#34](https://github.com/qbic-pipelines/vcftocounts/pull/34) - Swap CI tests to nf-test and fix small channel issue +- [#39](https://github.com/qbic-pipelines/vcftocounts/pull/39) - Add random subsampling as alternative to filtering ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index a662707..2feaab4 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,18 @@ ## Pipeline tools +- [BCFTools](https://pubmed.ncbi.nlm.nih.gov/21903627/) + + > Li H: A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics. 2011 Nov 1;27(21):2987-93. doi: 10.1093/bioinformatics/btr509. PubMed PMID: 21903627; PubMed Central PMCID: PMC3198575. + +- [GATK](https://pubmed.ncbi.nlm.nih.gov/20644199/) + + > McKenna A, Hanna M, Banks E, et al.: The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303. doi: 10.1101/gr.107524.110. Epub 2010 Jul 19. PubMed PMID: 20644199; PubMed Central PMCID: PMC2928508. 
+ +- [Tabix](https://academic.oup.com/bioinformatics/article/27/5/718/262743) + + > Li H, Tabix: fast retrieval of sequence features from generic TAB-delimited files, Bioinformatics, Volume 27, Issue 5, 1 March 2011, Pages 718–719, doi: 10.1093/bioinformatics/btq671. PubMed PMID: 21208982. PubMed Central PMCID: PMC3042176. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/README.md b/README.md index b21b117..5d58d3b 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,9 @@ 1. Indexes (g.)vcf files ([`tabix`](http://www.htslib.org/doc/tabix.html)) 2. Converts g.vcf files to vcf with `genotypegvcf` ([`GATK`](https://gatk.broadinstitute.org/hc/en-us)) -3. Filters the VCF based on a string given to the `filter` param with `bcftools/view` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - Turned off by default. +3. Optional filtering of VCF files + 3.1 Filtering based on a string given to the `filter` param with `bcftools/view` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - Turned off by default. + 3.2 Keeping only a fraction of random variants based on the `subset` param with a custom bash script using `bcftools/stats`, `view` and `sort` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - Turned off by default, should be used as alternative to filtering. 4. Concatenates all vcfs that have the same id and the same label with `bcftools/concat` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) 5. 
Changes the sample name in the vcf file to the filename with `bcftools/reheader` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - This can be turned off by adding `--rename false` to the `nextflow run` command. 6. Merges all vcfs from the same sample with `bcftools/merge` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) @@ -73,8 +75,6 @@ If you would like to contribute to this pipeline, please see the [contributing g If you use qbic-pipelines/vcftocounts for your analysis, please cite it using the following doi: [10.5281/zenodo.14616650](https://doi.org/10.5281/zenodo.14616650) - - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE). diff --git a/bin/randomsubset.sh b/bin/randomsubset.sh new file mode 100755 index 0000000..28e1191 --- /dev/null +++ b/bin/randomsubset.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Usage: randomsubset.sh +if [[ $# -ne 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +input_vcf="$1" +output_vcf="$2" +fraction="$3" + +# Create temp files and directories +tmpdir=$(mktemp -d) +tmp_vcf="$tmpdir/tmp.vcf" +tmp_sorted_vcf="$tmpdir/tmp.sorted.vcf" + +# Calculate number of records to sample +subset_count=$(bcftools stats "$input_vcf" | awk -v frac="$fraction" -F'\t' '$3=="number of records:" {print int($4*frac)}') + +echo "Sampling $subset_count records from $input_vcf" + +# Write header +bcftools view --header-only "$input_vcf" > "$tmp_vcf" + +# Randomly sample records +bcftools view --no-header "$input_vcf" | \ + awk '{printf("%f\t%s\n",rand(),$0);}' | \ + sort -t $'\t' -T "$tmpdir" -k1,1g | \ + head -n "$subset_count" | \ + cut -f 2- >> "$tmp_vcf" || true + +# Sort and write to output +bcftools sort -T "$tmpdir" -o 
"$output_vcf" "$tmp_vcf" + +# Clean up +rm -rf "$tmpdir" diff --git a/conf/modules.config b/conf/modules.config index 1b991d4..7676628 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -67,6 +67,14 @@ process { ] } + withName: 'RANDOMSUBSET' { + ext.prefix = { "${meta.id}.subset" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/bcftools/subset/${meta.label}/" }, + ] + } + withName: 'BCFTOOLS_MERGE' { ext.args = { "--force-samples --output-type z --write-index=tbi" } ext.prefix = { "${meta.id}.merge" } diff --git a/docs/images/vcftocounts-subway.excalidraw.png b/docs/images/vcftocounts-subway.excalidraw.png index e7d3d1b..deddecd 100644 Binary files a/docs/images/vcftocounts-subway.excalidraw.png and b/docs/images/vcftocounts-subway.excalidraw.png differ diff --git a/docs/output.md b/docs/output.md index 37071dd..d898864 100644 --- a/docs/output.md +++ b/docs/output.md @@ -13,6 +13,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Tabix](#tabix) - Indexes (g.)vcf files - [GenotypeGVCFs](#genotypegvcfs) - Converts g.vcf files to vcf with GATK - [Filter VCFs](#filter-vcfs) - Filters the VCF based on a string given to the `filter` param with bcftools/view +- [Subset VCFs](#subset-vcfs) - Keeps only a fraction of random variants based on the `subset` param - [Concatenate VCFs](#concatenate-vcfs) - Concatenates all vcfs that have the same id and the same label with bcftools/concat - [Rename Samples](#rename-samples) - Changes the sample name in the vcf file to the label with bcftools/reheader - [Merge VCFs](#merge-vcfs) - Merges all vcfs from the same sample with bcftools/merge @@ -59,6 +60,19 @@ The GATK GenotypeGVCFs module translates genotype (g) vcf files into classic vcf VEP annotated VCF files can be filtered for certain flags present after VEP annotation. Notably, this enables filtering for variants with certain impact levels or consequences. 
Filtering will produces VCF files holding just the variants matching the specific patterns. +### Subset VCFs + +
+Output files + +- `bcftools/subset/{meta.label}/` + - `{filename}.subset.vcf.gz`: vcf file containing the randomly selected fraction of variants. + - `{filename}.subset.vcf.gz.tbi`: tabix index of the vcf file. + +
+ +VCF files can be randomly subsampled to keep only a specific fraction of variants (set with the `subset` param). The resulting random set of variants can then be compared against the filtered variants. + ### Concatenate VCFs 
diff --git a/docs/usage.md b/docs/usage.md index 9bcd1ba..0061480 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -96,6 +96,12 @@ Notably, this enables filtering for variants with certain impact levels or conse > [!NOTE] > The filtering step only works with conda for nextflow versions above 24.10.2 (use docker or singularity if you want to use an older nextflow version) +### Subset VCFs + +VCF files can be randomly subsampled to keep only a specific fraction of variants, given by the `subset` param. This enables comparison to the filtered variants. + +You can determine an appropriate fraction by comparing the number of filtered variants with the total number of variants. This can be done with a script that runs `bcftools stats` on the filtered and on the unfiltered file and divides the number of filtered variants by the total number of variants. The more VCF files you use for comparison, the more robust the fraction becomes (we compared around 90 files and obtained an average fraction of 0.00175 when using `--filter 'INFO/CSQ ~ "HIGH"'`). + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. 
To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/modules/local/randomsubset/environment.yml b/modules/local/randomsubset/environment.yml new file mode 100644 index 0000000..a394b02 --- /dev/null +++ b/modules/local/randomsubset/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::bcftools=1.21" diff --git a/modules/local/randomsubset/main.nf b/modules/local/randomsubset/main.nf new file mode 100644 index 0000000..62eba56 --- /dev/null +++ b/modules/local/randomsubset/main.nf @@ -0,0 +1,49 @@ +process RANDOMSUBSET { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5a/5acacb55c52bec97c61fd34ffa8721fce82ce823005793592e2a80bf71632cd0/data': + 'community.wave.seqera.io/library/bcftools:1.21--4335bec1d7b44d11' }" + + input: + tuple val(meta), path(vcf), path(index) + val(fraction) + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + tuple val(meta), path("*.tbi") , emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + randomsubset.sh ${vcf} ${prefix}.vcf ${fraction} + + bgzip ${prefix}.vcf + tabix -p vcf ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo | gzip > ${prefix}.vcf.gz + touch ${prefix}.tbi + 
+ cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/randomsubset/tests/main.nf.test b/modules/local/randomsubset/tests/main.nf.test new file mode 100644 index 0000000..f98a25b --- /dev/null +++ b/modules/local/randomsubset/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_process { + + name "Test Process RANDOMSUBSET" + script "../main.nf" + process "RANDOMSUBSET" + + tag "modules" + tag "modules_" + tag "randomsubset" + + test("sarscov2 - [vcf, tbi]") { + + when { + process { + """ + // The input VCF has 9 records so we expect 4 records in the output VCF + input[0] = [ + [ id:'out', single_end:false ], // meta map + file('https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file('https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = 0.5 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + path(process.out.vcf.get(0).get(1)).vcf.summary, + file(process.out.tbi.get(0).get(1)).name, + process.out.versions + ).match() }, + ) + } + + } + + test("sarscov2 - [vcf, tbi] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'out', single_end:false ], // meta map + file('https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file('https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = 0.00175 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } +} diff --git a/modules/local/randomsubset/tests/main.nf.test.snap 
b/modules/local/randomsubset/tests/main.nf.test.snap new file mode 100644 index 0000000..43ef2b2 --- /dev/null +++ b/modules/local/randomsubset/tests/main.nf.test.snap @@ -0,0 +1,69 @@ +{ + "sarscov2 - [vcf, tbi] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "out", + "single_end": false + }, + "out.subset.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "out", + "single_end": false + }, + "out.subset.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,ee7626565a01c36b7fb7a05f41e0653e" + ], + "tbi": [ + [ + { + "id": "out", + "single_end": false + }, + "out.subset.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "out", + "single_end": false + }, + "out.subset.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ee7626565a01c36b7fb7a05f41e0653e" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.3" + }, + "timestamp": "2025-06-18T10:53:32.966380045" + }, + "sarscov2 - [vcf, tbi]": { + "content": [ + "VcfFile [chromosomes=[MT192765.1], sampleCount=1, variantCount=4, phased=false, phasedAutodetect=false]", + "out.subset.vcf.gz.tbi", + [ + "versions.yml:md5,ee7626565a01c36b7fb7a05f41e0653e" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.3" + }, + "timestamp": "2025-06-18T10:53:26.441286474" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 8a8d4b6..2d0d8a7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null rename = true filter = null + subset = null removeIDs = true // References @@ -249,7 +250,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'master' nextflowVersion = '!>=24.04.2' - version = '2.0.2dev' + version = '2.1.0dev' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 8460300..51c4c48 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,6 +32,10 @@ "type": "string", 
"description": "Add a filtering criterium suitable for bcftools/view. For example 'INFO/CSQ ~ \"HIGH\"'." }, + "subset": { + "type": "number", + "description": "Get a random subset of variants. Set this variable to the fraction you want to keep (f.ex. 0.5 if you want to keep half of the variants)." + }, "removeIDs": { "type": "boolean", "default": true, diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 1c122e1..7914acb 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -22,8 +22,8 @@ "@id": "./", "@type": "Dataset", "creativeWorkStatus": "InProgress", - "datePublished": "2025-04-15T13:25:04+00:00", - "description": "# qbic-pipelines/vcftocounts\n\n[![GitHub Actions CI Status](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/ci.yml/badge.svg)](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/linting.yml/badge.svg)](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.14616650-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.14616650)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with 
docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/qbic-pipelines/vcftocounts)\n\n## Introduction\n\n**qbic-pipelines/vcftocounts** is a bioinformatics pipeline that processes g.vcf files to a matrix suitable for downstream analysis. The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:\n\n1. Indexes (g.)vcf files ([`tabix`](http://www.htslib.org/doc/tabix.html))\n2. Converts g.vcf files to vcf with `genotypegvcf` ([`GATK`](https://gatk.broadinstitute.org/hc/en-us))\n3. Filters the VCF based on a string given to the `filter` param with `bcftools/view` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - Turned off by default.\n4. Concatenates all vcfs that have the same id and the same label with `bcftools/concat` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html))\n5. Changes the sample name in the vcf file to the filename with `bcftools/reheader` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - This can be turned off by adding `--rename false` to the `nextflow run` command.\n6. Merges all vcfs from the same sample with `bcftools/merge` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html))\n7. Removes entries in the ID column with `bcftools/annotate` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - his can be turned off by adding `--removeIDs false` to the `nextflow run` command.\n8. Converts the (merged) vcfs to a matrix using a custom R script written by @ellisdoro ([`R`](https://www.r-project.org/))\n9. 
Collects all reports into a MultiQC report ([`MultiQC`](http://multiqc.info/))\n\n![](./docs/images/vcftocounts-subway.excalidraw.png)\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,label,gvcf,vcf_path,vcf_index_path\nSAMPLE-1,pipelineA-callerA,false,path/to/vcf.gz,path/to/.vcf.gz.tbi\nSAMPLE-1,pipelineB-callerA,false,path/to/vcf.gz,path/to/.vcf.gz.tbi\nSAMPLE-2,pipelineB-callerB,true,path/to/g.vcf.gz,path/to/g.vcf.gz.tbi\nSAMPLE-2,pipelineB-callerB,true,path/to/g.vcf.gz,path/to/g.vcf.gz.tbi\n```\n\nEach row represents a VCF file coming from a sample. The `label` column enables concatenation of vcfs (for example when the pipeline produces different vcfs for chrM and chrY). The `gvcf` column indicates whether the file is a g.vcf file or not. The `vcf_path` and `vcf_index_path` columns contain the path to the VCF file and its index, respectively.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run qbic-pipelines/vcftocounts \\\n -profile \\\n --input samplesheet.csv \\\n --genome GATK.GRCh38 \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\n## Credits\n\nqbic-pipelines/vcftocounts was originally written by Famke B\u00e4uerle, Dorothy Ellis.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use qbic-pipelines/vcftocounts for your analysis, please cite it using the following doi: [10.5281/zenodo.14616650](https://doi.org/10.5281/zenodo.14616650)\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "datePublished": "2025-06-18T07:57:14+00:00", + "description": "# qbic-pipelines/vcftocounts\n\n[![GitHub Actions CI Status](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/ci.yml/badge.svg)](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/linting.yml/badge.svg)](https://github.com/qbic-pipelines/vcftocounts/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.14616650-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.14616650)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/qbic-pipelines/vcftocounts)\n\n## Introduction\n\n**qbic-pipelines/vcftocounts** is a bioinformatics pipeline that processes g.vcf files to a matrix suitable for downstream analysis. 
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:\n\n1. Indexes (g.)vcf files ([`tabix`](http://www.htslib.org/doc/tabix.html))\n2. Converts g.vcf files to vcf with `genotypegvcf` ([`GATK`](https://gatk.broadinstitute.org/hc/en-us))\n3. Optional filtering of VCF files\n 3.1 Filtering based on a string given to the `filter` param with `bcftools/view` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - Turned off by default.\n 3.2 Keeping only a fraction of random variants based on the `subset` param with a custom bash script using `bcftools/stats`, `view` and `sort` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - Turned off by default, should be used as alternative to filtering.\n4. Concatenates all vcfs that have the same id and the same label with `bcftools/concat` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html))\n5. Changes the sample name in the vcf file to the filename with `bcftools/reheader` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - This can be turned off by adding `--rename false` to the `nextflow run` command.\n6. Merges all vcfs from the same sample with `bcftools/merge` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html))\n7. Removes entries in the ID column with `bcftools/annotate` ([`bcftools`](https://samtools.github.io/bcftools/bcftools.html)) - his can be turned off by adding `--removeIDs false` to the `nextflow run` command.\n8. Converts the (merged) vcfs to a matrix using a custom R script written by @ellisdoro ([`R`](https://www.r-project.org/))\n9. Collects all reports into a MultiQC report ([`MultiQC`](http://multiqc.info/))\n\n![](./docs/images/vcftocounts-subway.excalidraw.png)\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,label,gvcf,vcf_path,vcf_index_path\nSAMPLE-1,pipelineA-callerA,false,path/to/vcf.gz,path/to/.vcf.gz.tbi\nSAMPLE-1,pipelineB-callerA,false,path/to/vcf.gz,path/to/.vcf.gz.tbi\nSAMPLE-2,pipelineB-callerB,true,path/to/g.vcf.gz,path/to/g.vcf.gz.tbi\nSAMPLE-2,pipelineB-callerB,true,path/to/g.vcf.gz,path/to/g.vcf.gz.tbi\n```\n\nEach row represents a VCF file coming from a sample. The `label` column enables concatenation of vcfs (for example when the pipeline produces different vcfs for chrM and chrY). The `gvcf` column indicates whether the file is a g.vcf file or not. The `vcf_path` and `vcf_index_path` columns contain the path to the VCF file and its index, respectively.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run qbic-pipelines/vcftocounts \\\n -profile \\\n --input samplesheet.csv \\\n --genome GATK.GRCh38 \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\n## Credits\n\nqbic-pipelines/vcftocounts was originally written by Famke B\u00e4uerle, Dorothy Ellis.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use qbic-pipelines/vcftocounts for your analysis, please cite it using the following doi: [10.5281/zenodo.14616650](https://doi.org/10.5281/zenodo.14616650)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -102,7 +102,7 @@ }, "mentions": [ { - "@id": "#0a69c269-aa71-493a-8675-0f0bf0bbac87" + "@id": "#cd73230b-0ef5-4c7c-9749-31f88b8fd3de" } ], "name": "qbic-pipelines/vcftocounts" @@ -132,14 +132,14 @@ "creator": [ "qbic", { - "@id": "#45968370+famosab@users.noreply.github.com" + "@id": "#famke.baeuerle@gmail.com" }, { "@id": "https://orcid.org/0000-0003-1387-0251" } ], "dateCreated": "", - "dateModified": "2025-04-15T15:25:04Z", + "dateModified": "2025-06-18T09:57:15Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nf-core", @@ -167,7 +167,7 @@ "https://nf-co.re/qbic-pipelines/vcftocounts/dev/" ], "version": [ - "2.0.2dev" + "2.1.0dev" ] }, { @@ -183,11 +183,11 @@ "version": "!>=24.04.2" }, { - "@id": "#0a69c269-aa71-493a-8675-0f0bf0bbac87", + "@id": "#cd73230b-0ef5-4c7c-9749-31f88b8fd3de", "@type": "TestSuite", "instance": [ { - "@id": "#09be11e9-c228-48c5-b7a1-cefba1528138" + "@id": "#6680d03f-8568-4178-adcd-07f6308c9c90" } ], "mainEntity": { @@ -196,10 +196,10 @@ "name": "Test suite for qbic-pipelines/vcftocounts" }, { - "@id": "#09be11e9-c228-48c5-b7a1-cefba1528138", + "@id": "#6680d03f-8568-4178-adcd-07f6308c9c90", "@type": "TestInstance", "name": "GitHub Actions workflow for testing qbic-pipelines/vcftocounts", - "resource": "repos/qbic-pipelines/vcftocounts/actions/workflows/ci.yml", + "resource": "repos/qbic-pipelines/vcftocounts/actions/workflows/nf-test.yml", "runsOn": { "@id": "https://w3id.org/ro/terms/test#GithubService" }, @@ -330,9 +330,9 @@ "url": "https://nf-co.re/" }, { - "@id": "#45968370+famosab@users.noreply.github.com", + "@id": "#famke.baeuerle@gmail.com", "@type": "Person", - "email": "45968370+famosab@users.noreply.github.com", + "email": "famke.baeuerle@gmail.com", "name": "Famke Ba\u0308uerle" }, { diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap 
index d830b8c..d4b0c3f 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -19,7 +19,7 @@ "vcf2counts.R": "1.0.0" }, "Workflow": { - "qbic-pipelines/vcftocounts": "v2.0.2dev" + "qbic-pipelines/vcftocounts": "v2.1.0dev" } }, [ @@ -64,7 +64,7 @@ "nf-test": "0.9.2", "nextflow": "25.04.3" }, - "timestamp": "2025-06-13T14:58:31.890894061" + "timestamp": "2025-06-18T09:58:04.986919663" }, "-profile test": { "content": [ @@ -86,7 +86,7 @@ "vcf2counts.R": "1.0.0" }, "Workflow": { - "qbic-pipelines/vcftocounts": "v2.0.2dev" + "qbic-pipelines/vcftocounts": "v2.1.0dev" } }, [ @@ -151,6 +151,6 @@ "nf-test": "0.9.2", "nextflow": "25.04.3" }, - "timestamp": "2025-06-16T10:38:37.201720882" + "timestamp": "2025-06-18T09:57:52.144590003" } } \ No newline at end of file diff --git a/tests/filter.nf.test b/tests/filter.nf.test index 12b55d8..6c1674b 100644 --- a/tests/filter.nf.test +++ b/tests/filter.nf.test @@ -51,6 +51,51 @@ nextflow_pipeline { } } + test("-profile test --subset 0.005") { + + when { + params { + outdir = "$outputDir" + // Input data for subset test + input = "${projectDir}/tests/input-filter.csv" + subset = 0.005 + removeIDs = true + rename = true + + // Genome references + fasta = "https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/genome/genome.fasta" + fai = "https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/genome/genome.fasta.fai" + dict = "https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/genome/genome.dict" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore', ignore: 
['vcf2counts/chr21_ann_sample.csv']) + // vcf_files: All VCF files in ${params.outdir} + def vcf_files = getAllFilesFromDir(params.outdir, include: ['**/*.vcf{,.gz}']) + def csv_files = getAllFilesFromDir(params.outdir, include: ['**/*.csv']) + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/vcftocounts_software_mqc_versions.yml"), + // All stable path names, with a relative path + stable_name, + // All files with stable contents + stable_path, + // All vcf files + vcf_files.collect{ file -> file.name + ":numVariants=" + path(file.path).vcf.variants.size() }, + csv_files.collect{ file -> file.name + ":numVariants=" + (path(file.path).readLines().size() - 1) } + ).match() } + ) + } + } + test("-profile test --filter 'INFO/CSQ ~ 'HIGH'' - stub") { options "-stub" diff --git a/tests/filter.nf.test.snap b/tests/filter.nf.test.snap index 28d738c..cc4dcda 100644 --- a/tests/filter.nf.test.snap +++ b/tests/filter.nf.test.snap @@ -25,7 +25,7 @@ "vcf2counts.R": "1.0.0" }, "Workflow": { - "qbic-pipelines/vcftocounts": "v2.0.2dev" + "qbic-pipelines/vcftocounts": "v2.1.0dev" } }, [ @@ -111,7 +111,7 @@ "nf-test": "0.9.2", "nextflow": "25.04.3" }, - "timestamp": "2025-06-13T15:00:31.95746673" + "timestamp": "2025-06-18T09:58:55.060177619" }, "-profile test --filter 'INFO/CSQ ~ 'HIGH'' - stub": { "content": [ @@ -139,7 +139,7 @@ "vcf2counts.R": "1.0.0" }, "Workflow": { - "qbic-pipelines/vcftocounts": "v2.0.2dev" + "qbic-pipelines/vcftocounts": "v2.1.0dev" } }, [ @@ -208,6 +208,110 @@ "nf-test": "0.9.2", "nextflow": "25.04.3" }, - "timestamp": "2025-06-13T15:00:45.84005744" + "timestamp": "2025-06-18T09:59:36.172027044" + }, + "-profile test --subset 0.005": { + "content": [ + 12, + { + "BCFTOOLS_ANNOTATE": { +
"bcftools": 1.21 + }, + "BCFTOOLS_MERGE": { + "bcftools": 1.21 + }, + "BCFTOOLS_REHEADER": { + "bcftools": 1.21 + }, + "CREATE_SAMPLE_FILE": { + "create_sample_file": "1.0.0" + }, + "RANDOMSUBSET": { + "bcftools": 1.21 + }, + "TABIX_TABIX": { + "tabix": 1.21 + }, + "VCF2COUNTS": { + "vcf2counts.R": "1.0.0" + }, + "Workflow": { + "qbic-pipelines/vcftocounts": "v2.1.0dev" + } + }, + [ + "bcftools", + "bcftools/annotate", + "bcftools/annotate/chr21_ann_sample.IDremoved.vcf.gz", + "bcftools/annotate/chr21_ann_sample.IDremoved.vcf.gz.tbi", + "bcftools/annotate/versions.yml", + "bcftools/merge", + "bcftools/merge/chr21_ann_sample.merge.vcf.gz", + "bcftools/merge/chr21_ann_sample.merge.vcf.gz.tbi", + "bcftools/merge/versions.yml", + "bcftools/reheader", + "bcftools/reheader/chr21_ann_sample", + "bcftools/reheader/chr21_ann_sample/chr21_ann_sample.chr21_ann_callerA.reheader.vcf.gz", + "bcftools/reheader/chr21_ann_sample/chr21_ann_sample.chr21_ann_callerA.reheader.vcf.gz.tbi", + "bcftools/reheader/chr21_ann_sample/chr21_ann_sample.chr21_ann_callerB.reheader.vcf.gz", + "bcftools/reheader/chr21_ann_sample/chr21_ann_sample.chr21_ann_callerB.reheader.vcf.gz.tbi", + "bcftools/reheader/chr21_ann_sample/versions.yml", + "bcftools/reheader/samplefiles", + "bcftools/reheader/samplefiles/chr21_ann_callerA.txt", + "bcftools/reheader/samplefiles/chr21_ann_callerB.txt", + "bcftools/reheader/samplefiles/versions.yml", + "bcftools/subset", + "bcftools/subset/chr21_ann_callerA", + "bcftools/subset/chr21_ann_callerA/chr21_ann_sample.subset.vcf.gz", + "bcftools/subset/chr21_ann_callerA/chr21_ann_sample.subset.vcf.gz.tbi", + "bcftools/subset/chr21_ann_callerA/versions.yml", + "bcftools/subset/chr21_ann_callerB", + "bcftools/subset/chr21_ann_callerB/chr21_ann_sample.subset.vcf.gz", + "bcftools/subset/chr21_ann_callerB/chr21_ann_sample.subset.vcf.gz.tbi", + "bcftools/subset/chr21_ann_callerB/versions.yml", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/BETA-multiqc.parquet", + 
"multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/vcftocounts_software_mqc_versions.yml", + "tabix", + "tabix/annotated.vcf.gz.tbi", + "vcf2counts", + "vcf2counts/chr21_ann_sample.csv" + ], + [ + "versions.yml:md5,c805d5803ca9daca8965aa71800cb41e", + "versions.yml:md5,429703c7a1b835b7e3cc81f91a87a167", + "versions.yml:md5,033f0e38ea7e11362d7c210db7aa3f4c", + "chr21_ann_callerA.txt:md5,53d75ae10df3e0af2f8e2172826ed179", + "chr21_ann_callerB.txt:md5,c72e9e92762ccaf4b20f7e92126e7aef", + "versions.yml:md5,4afe7b519f82d5d8c539950934477c1a", + "versions.yml:md5,e0f218ef306127de5c57d21eb0b4d227", + "versions.yml:md5,e0f218ef306127de5c57d21eb0b4d227", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f" + ], + [ + "chr21_ann_sample.IDremoved.vcf.gz:numVariants=58", + "chr21_ann_sample.merge.vcf.gz:numVariants=58", + "chr21_ann_sample.chr21_ann_callerA.reheader.vcf.gz:numVariants=29", + "chr21_ann_sample.chr21_ann_callerB.reheader.vcf.gz:numVariants=29", + "chr21_ann_sample.subset.vcf.gz:numVariants=29", + "chr21_ann_sample.subset.vcf.gz:numVariants=29" + ], + [ + "chr21_ann_sample.csv:numVariants=58" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.3" + }, + "timestamp": "2025-06-18T10:44:41.225101511" } } \ No newline at end of file diff --git a/tests/full.nf.test.snap b/tests/full.nf.test.snap index 7f38e07..4ea31e0 100644 --- a/tests/full.nf.test.snap +++ b/tests/full.nf.test.snap @@ -28,7 +28,7 @@ "vcf2counts.R": "1.0.0" }, "Workflow": { - "qbic-pipelines/vcftocounts": "v2.0.2dev" + "qbic-pipelines/vcftocounts": "v2.1.0dev" } }, [ @@ -215,7 +215,7 @@ "nf-test": "0.9.2", "nextflow": "25.04.3" }, - "timestamp": "2025-06-13T15:01:55.67824494" + "timestamp": "2025-06-18T09:58:18.779339304" }, "-profile 
test_full - stub": { "content": [ @@ -246,7 +246,7 @@ "vcf2counts.R": "1.0.0" }, "Workflow": { - "qbic-pipelines/vcftocounts": "v2.0.2dev" + "qbic-pipelines/vcftocounts": "v2.1.0dev" } }, [ @@ -394,6 +394,6 @@ "nf-test": "0.9.2", "nextflow": "25.04.3" }, - "timestamp": "2025-06-13T15:02:16.0250005" + "timestamp": "2025-06-18T09:58:38.896324197" } } \ No newline at end of file diff --git a/workflows/vcftocounts.nf b/workflows/vcftocounts.nf index 2eaec76..63b3999 100644 --- a/workflows/vcftocounts.nf +++ b/workflows/vcftocounts.nf @@ -7,6 +7,7 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' include { TABIX_TABIX } from '../modules/nf-core/tabix/tabix/main' include { GATK4_GENOTYPEGVCFS } from '../modules/nf-core/gatk4/genotypegvcfs/main' include { BCFTOOLS_CONCAT } from '../modules/nf-core/bcftools/concat/main' +include { RANDOMSUBSET } from '../modules/local/randomsubset/main' include { CREATE_SAMPLE_FILE } from '../modules/local/createsamplefile/main' include { BCFTOOLS_REHEADER } from '../modules/nf-core/bcftools/reheader/main' include { BCFTOOLS_VIEW } from '../modules/nf-core/bcftools/view/main' @@ -102,7 +103,21 @@ workflow VCFTOCOUNTS { ch_filtered_vcf = BCFTOOLS_VIEW.out.vcf .join(BCFTOOLS_VIEW.out.tbi) - } else { + } else if (params.subset != null) { + // + // Get a random subset of variants with given fraction of variants + // + RANDOMSUBSET( + ch_vcf.map{ it -> [it[0], it[1], it[2]] }, + params.subset + ) + + ch_versions = ch_versions.mix(RANDOMSUBSET.out.versions) + + ch_filtered_vcf = RANDOMSUBSET.out.vcf + .join(RANDOMSUBSET.out.tbi) + } + else { ch_filtered_vcf = ch_vcf }