Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ You can use the `include_expression` `Filter="PASS"` to achieve this.
- `reference_fasta`: [Homo_sapiens_assembly38.fasta](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1) - need a valid google account, this is a link to the resource bundle from Broad GATK
- `reference_dict`: [Homo_sapiens_assembly38.dict](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1) - need a valid google account, this is a link to the resource bundle from Broad GATK
- `calling_regions`: [wgs_calling_regions.hg38.interval_list](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1) - need a valid google account, this is a link to the resource bundle from Broad GATK. **To create our canonical calling intervals, edit this file by leaving only entries related to chr1-22,X,Y,M. M may need to be added.**
- `cnv_blacklist_regions`: `somatic-hg38_CNV_and_centromere_blacklist.hg38liftover.list` Blacklist regions that include centromeres to exclude from CNV calling
- `cnv_blacklist_regions`: `somatic-hg38_CNV_and_centromere_blacklist.hg38liftover.bed` Blacklist regions that include centromeres to exclude from CNV calling
- `coding_sequence_regions`: `GRCh38.gencode.v31.CDS.merged.bed` For Lancet WGS, it's highly recommended to use CDS bed as the starting point and supplement with the regions of calls from Strelka2 & Mutect2. Our CDS regions were obtained from GENCODE, [release 31](https://www.gencodegenes.org/human/release_31.html) using this GTF file [gencode.v31.primary_assembly.annotation.gtf.gz](ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_31/gencode.v31.primary_assembly.annotation.gtf.gz) and parsing features for `UTR`, `start codon`, `stop codon`, and `exon`, then using bedtools sort and merge after converting coordinates into bed format.
- `cnvkit_annotation_file`: [refFlat_HG38.txt](http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refFlat.txt.gz) gunzip this file from UCSC
- `af_only_gnomad_vcf`: [af-only-gnomad.hg38.vcf.gz](https://console.cloud.google.com/storage/browser/gatk-best-practices/somatic-hg38) - need a valid google account, this is a link to the best practices google bucket from Broad GATK.
Expand Down
118 changes: 118 additions & 0 deletions ica/ica_json_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3

import json
from typing import Any
import yaml # type: ignore
import sys

def interpret_string(in_type: str, default: str | None = None) -> dict[str, str]:
"""
Interpret a string type and return a dictionary with the appropriate ICA attributes.
Arguments:
in_type: The type of the input.
default: The default value for the input.
Returns:
A dictionary with the ICA attributes for the input type.
"""
out: dict[str, Any] = {}
if default:
out["value"] = default
if in_type.endswith("?"):
in_type = in_type.replace("?", "")
else:
out["minValues"] = 1
if "[]" in in_type:
out["maxValues"] = 1000
in_type = in_type.replace("[]", "")
converter: dict[str, str] = {
"string": "textbox",
"float": "number",
"int": "integer",
"File": "data",
"Directory": "data"
}
if in_type == "null":
out["minValues"] = 0
elif in_type == "boolean":
if "value" in out: del out["value"]
out.update(interpret_bool(default))
elif in_type in converter:
out["type"] = converter[in_type]
else:
raise ValueError(f"Unaccounted type: {in_type}")
if in_type == "File":
out["dataFilter"] = {"dataType": "file"}
elif in_type == "Directory":
out["dataFilter"] = {"dataType": "directory"}
return out


def interpret_enum(enum_type: dict[str, str], default: str | None = None) -> dict[str, str]:
"""
Interpret an enum type and return a dictionary with the appropriate ICA attributes.
Arguments:
enum_type: A dictionary with the enum type.
default: The default value for the enum type.
Returns:
A dictionary with the ICA attributes for the enum type.
"""
out: dict[str, Any] = {"type": "select",
"choices": [{"value": i, "text": i, "selected": i == default} for i in enum_type["symbols"]]}
return out

def interpret_bool(default: str | None = None) -> dict[str, str]:
"""
Interpret a boolean type and return a dictionary with the appropriate ICA attributes.
Arguments:
default: The default value for the boolean type.
Returns:
A dictionary with the ICA attributes for the boolean type.
"""
out = {"type": "select",
"choices": [{"value": i, "text": str(i), "selected": i == default} for i in [True, False]]}
return out

def interpret_list(list_type: list[Any], default: Any | None = None) -> dict[str, Any]:
    """
    Interpret a list (CWL union) type and return a dictionary with the
    appropriate ICA attributes.
    Arguments:
        list_type: The member types of the union; strings are plain type names
            and dictionaries are enum declarations.
        default: The default value for the input, forwarded to the member
            interpreters.
    Returns:
        A dictionary with the merged ICA attributes for the union type.
    Raises:
        ValueError: If a member type is neither a string nor a dictionary.
    """
    out: dict[str, Any] = {"minValues": 1}
    for member in list_type:
        if isinstance(member, str):
            # Forward the default so unions like ["null", "int"] keep their value.
            out.update(interpret_string(member, default))
        elif isinstance(member, dict):
            out.update(interpret_enum(member, default))
        else:
            raise ValueError(f"I don't know what to do with {member}")
    return out


def main():
    """
    Build an ICA JSON input-form payload from a CWL workflow file.

    Reads the CWL workflow given as the last command-line argument, converts
    each workflow input into ICA form-field attributes, and prints the
    resulting payload as JSON on stdout.
    Raises:
        ValueError: If an input declares a type that cannot be converted.
    """
    with open(sys.argv[-1], "r") as file:
        wf = yaml.safe_load(file)
    # Flatten the inputs mapping into per-input records keyed by "id".
    inputs = [dict(v, **{"id": k}) for k, v in wf["inputs"].items()]
    payload: dict[str, Any] = {"fields": []}
    # "wf_input" rather than "input" to avoid shadowing the builtin.
    for wf_input in inputs:
        ica_attr: dict[str, Any] = {"id": wf_input["id"]}
        doc = wf_input.get("doc")
        if doc:
            ica_attr["helpText"] = doc
        default = wf_input.get("default")
        in_type = wf_input["type"]
        if isinstance(in_type, dict):
            # A dict type declaration is an enum.
            ica_attr.update(interpret_enum(in_type, default))
        elif isinstance(in_type, str):
            ica_attr.update(interpret_string(in_type, default))
        elif isinstance(in_type, list):
            # A list type declaration is a union of member types.
            ica_attr.update(interpret_list(in_type, default))
        else:
            raise ValueError(f"Cannot process input {in_type}")
        payload["fields"].append(ica_attr)
    print(json.dumps(payload, sort_keys=True, indent=2))


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions ica/in_clt_flatten_file_list.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"input_list": [["one","two"], ["three"], "four"]
}
14 changes: 14 additions & 0 deletions ica/in_prepare_regions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"calling_regions": {
"class": "File",
"path": "/home/ubuntu/volume/Ref/wgs_calling_regions.hg38.interval_list"
},
"blacklist_regions": {
"class": "File",
"path": "/home/ubuntu/volume/Ref/hg38-blacklist.v2.bed.gz"
},
"reference_dict": {
"class": "File",
"path": "/home/ubuntu/volume/Ref/Homo_sapiens_assembly38.dict"
}
}
22 changes: 22 additions & 0 deletions ica/in_runtime_validator.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"is_wgs": true,
"vardict": true,
"mutect2": true,
"strelka2": true,
"preexisting_vcf": true,
"lancet": true,
"controlfreec": true,
"cnvkit": true,
"amplicon_architect": true,
"theta2": true,
"manta": true,
"gatk_cnv": true,
"mosek_present": true,
"pon_present": false,
"exome_flag": null,
"cnvkit_wgs_mode": null,
"i_flag": null,
"lancet_padding": null,
"lancet_window": null,
"vardict_padding": null
}
Loading