Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ You can use the `include_expression` `Filter="PASS"` to achieve this.
- `reference_fasta`: [Homo_sapiens_assembly38.fasta](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1) - need a valid google account, this is a link to the resource bundle from Broad GATK
- `reference_dict`: [Homo_sapiens_assembly38.dict](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1) - need a valid google account, this is a link to the resource bundle from Broad GATK
- `calling_regions`: [wgs_calling_regions.hg38.interval_list](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1) - need a valid google account, this is a link to the resource bundle from Broad GATK. **To create our canonical calling intervals, edit this file by leaving only entries related to chr1-22,X,Y,M. M may need to be added.**
- `cnv_blacklist_regions`: `somatic-hg38_CNV_and_centromere_blacklist.hg38liftover.list` Blacklist regions that include centromeres to exclude from CNV calling
- `cnv_blacklist_regions`: `somatic-hg38_CNV_and_centromere_blacklist.hg38liftover.bed` Blacklist regions that include centromeres to exclude from CNV calling
- `coding_sequence_regions`: `GRCh38.gencode.v31.CDS.merged.bed` For Lancet WGS, it's highly recommended to use CDS bed as the starting point and supplement with the regions of calls from Strelka2 & Mutect2. Our CDS regions were obtained from GENCODE, [release 31](https://www.gencodegenes.org/human/release_31.html) using this GTF file [gencode.v31.primary_assembly.annotation.gtf.gz](ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_31/gencode.v31.primary_assembly.annotation.gtf.gz) and parsing features for `UTR`, `start codon`, `stop codon`, and `exon`, then using bedtools sort and merge after converting coordinates into bed format.
- `cnvkit_annotation_file`: [refFlat_HG38.txt](http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refFlat.txt.gz) gunzip this file from UCSC
- `af_only_gnomad_vcf`: [af-only-gnomad.hg38.vcf.gz](https://console.cloud.google.com/storage/browser/gatk-best-practices/somatic-hg38) - need a valid google account, this is a link to the best practices google bucket from Broad GATK.
Expand Down
118 changes: 118 additions & 0 deletions ica/ica_json_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3

import json
from typing import Any
import yaml # type: ignore
import sys

def interpret_string(in_type: str, default: str | None = None) -> dict[str, str]:
"""
Interpret a string type and return a dictionary with the appropriate ICA attributes.
Arguments:
in_type: The type of the input.
default: The default value for the input.
Returns:
A dictionary with the ICA attributes for the input type.
"""
out: dict[str, Any] = {}
if default:
out["value"] = default
if in_type.endswith("?"):
in_type = in_type.replace("?", "")
else:
out["minValues"] = 1
if "[]" in in_type:
out["maxValues"] = 1000
in_type = in_type.replace("[]", "")
converter: dict[str, str] = {
"string": "textbox",
"float": "number",
"int": "integer",
"File": "data",
"Directory": "data"
}
if in_type == "null":
out["minValues"] = 0
elif in_type == "boolean":
if "value" in out: del out["value"]
out.update(interpret_bool(default))
elif in_type in converter:
out["type"] = converter[in_type]
else:
raise ValueError(f"Unaccounted type: {in_type}")
if in_type == "File":
out["dataFilter"] = {"dataType": "file"}
elif in_type == "Directory":
out["dataFilter"] = {"dataType": "directory"}
return out


def interpret_enum(enum_type: dict[str, str], default: str | None = None) -> dict[str, str]:
"""
Interpret an enum type and return a dictionary with the appropriate ICA attributes.
Arguments:
enum_type: A dictionary with the enum type.
default: The default value for the enum type.
Returns:
A dictionary with the ICA attributes for the enum type.
"""
out: dict[str, Any] = {"type": "select",
"choices": [{"value": i, "text": i, "selected": i == default} for i in enum_type["symbols"]]}
return out

def interpret_bool(default: str | None = None) -> dict[str, str]:
"""
Interpret a boolean type and return a dictionary with the appropriate ICA attributes.
Arguments:
default: The default value for the boolean type.
Returns:
A dictionary with the ICA attributes for the boolean type.
"""
out = {"type": "select",
"choices": [{"value": i, "text": str(i), "selected": i == default} for i in [True, False]]}
return out

def interpret_list(list_type: list[Any], default: Any | None = None) -> dict[str, Any]:
    """
    Interpret a list (CWL union) type and return a dictionary with the
    appropriate ICA attributes.
    Arguments:
        list_type: The member types of the union; strings are plain type names
            and dictionaries are enum declarations.
        default: The default value for the input, forwarded to the member
            interpreters.
    Returns:
        A dictionary with the merged ICA attributes for the union type.
    Raises:
        ValueError: If a member type is neither a string nor a dictionary.
    """
    out: dict[str, Any] = {"minValues": 1}
    for member in list_type:
        if isinstance(member, str):
            # Forward the default so unions like ["null", "int"] keep their value.
            out.update(interpret_string(member, default))
        elif isinstance(member, dict):
            out.update(interpret_enum(member, default))
        else:
            raise ValueError(f"I don't know what to do with {member}")
    return out


def main():
    """
    Build an ICA JSON input-form payload from a CWL workflow file.

    Reads the CWL workflow given as the last command-line argument, converts
    each workflow input into ICA form-field attributes, and prints the
    resulting payload as JSON on stdout.
    Raises:
        ValueError: If an input declares a type that cannot be converted.
    """
    with open(sys.argv[-1], "r") as file:
        wf = yaml.safe_load(file)
    # Flatten the inputs mapping into per-input records keyed by "id".
    inputs = [dict(v, **{"id": k}) for k, v in wf["inputs"].items()]
    payload: dict[str, Any] = {"fields": []}
    # "wf_input" rather than "input" to avoid shadowing the builtin.
    for wf_input in inputs:
        ica_attr: dict[str, Any] = {"id": wf_input["id"]}
        doc = wf_input.get("doc")
        if doc:
            ica_attr["helpText"] = doc
        default = wf_input.get("default")
        in_type = wf_input["type"]
        if isinstance(in_type, dict):
            # A dict type declaration is an enum.
            ica_attr.update(interpret_enum(in_type, default))
        elif isinstance(in_type, str):
            ica_attr.update(interpret_string(in_type, default))
        elif isinstance(in_type, list):
            # A list type declaration is a union of member types.
            ica_attr.update(interpret_list(in_type, default))
        else:
            raise ValueError(f"Cannot process input {in_type}")
        payload["fields"].append(ica_attr)
    print(json.dumps(payload, sort_keys=True, indent=2))


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions ica/in_clt_flatten_file_list.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"input_list": [["one","two"], ["three"], "four"]
}
14 changes: 14 additions & 0 deletions ica/in_prepare_regions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"calling_regions": {
"class": "File",
"path": "/home/ubuntu/volume/Ref/wgs_calling_regions.hg38.interval_list"
},
"blacklist_regions": {
"class": "File",
"path": "/home/ubuntu/volume/Ref/hg38-blacklist.v2.bed.gz"
},
"reference_dict": {
"class": "File",
"path": "/home/ubuntu/volume/Ref/Homo_sapiens_assembly38.dict"
}
}
22 changes: 22 additions & 0 deletions ica/in_runtime_validator.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"is_wgs": true,
"vardict": true,
"mutect2": true,
"strelka2": true,
"preexisting_vcf": true,
"lancet": true,
"controlfreec": true,
"cnvkit": true,
"amplicon_architect": true,
"theta2": true,
"manta": true,
"gatk_cnv": true,
"mosek_present": true,
"pon_present": false,
"exome_flag": null,
"cnvkit_wgs_mode": null,
"i_flag": null,
"lancet_padding": null,
"lancet_window": null,
"vardict_padding": null
}
Loading