From 49c839cafb4d0a613a8aa6d881ae77c9331211f2 Mon Sep 17 00:00:00 2001 From: RichardBruskiewich Date: Wed, 24 Sep 2025 15:37:58 -0700 Subject: [PATCH 1/2] Added 'new-metadata' and 'validate-metadata' make/justfile targets (mainly coded inside the 'include' Makefile and justfile). Cleaned up other code along the way. The root README.md could be updated to be more descriptive about scripts, etc. (see the "resource-ingest-guide-schema" repo for an example of this) but that is perhaps a task for another day. --- Makefile | 6 +- README.md | 18 +++ justfile | 3 +- project.Makefile | 21 +++- project.justfile | 24 +++- src/{ => docs}/doc-templates/class.md.jinja2 | 0 .../doc-templates/class_diagram.md.jinja2 | 0 .../doc-templates/common_metadata.md.jinja2 | 0 src/{ => docs}/doc-templates/enum.md.jinja2 | 0 src/{ => docs}/doc-templates/index.md.jinja2 | 0 src/{ => docs}/doc-templates/schema.md.jinja2 | 0 src/{ => docs}/doc-templates/slot.md.jinja2 | 0 src/{ => docs}/doc-templates/subset.md.jinja2 | 0 src/{ => docs}/doc-templates/type.md.jinja2 | 0 src/docs/files/example-metadata.md | 7 ++ src/docs/files/ingest_metadata_template.yaml | 37 ++++++ src/scripts/create_metadata.py | 109 ++++++++++++++++++ 17 files changed, 219 insertions(+), 6 deletions(-) rename src/{ => docs}/doc-templates/class.md.jinja2 (100%) rename src/{ => docs}/doc-templates/class_diagram.md.jinja2 (100%) rename src/{ => docs}/doc-templates/common_metadata.md.jinja2 (100%) rename src/{ => docs}/doc-templates/enum.md.jinja2 (100%) rename src/{ => docs}/doc-templates/index.md.jinja2 (100%) rename src/{ => docs}/doc-templates/schema.md.jinja2 (100%) rename src/{ => docs}/doc-templates/slot.md.jinja2 (100%) rename src/{ => docs}/doc-templates/subset.md.jinja2 (100%) rename src/{ => docs}/doc-templates/type.md.jinja2 (100%) create mode 100644 src/docs/files/example-metadata.md create mode 100644 src/docs/files/ingest_metadata_template.yaml create mode 100644 src/scripts/create_metadata.py diff --git a/Makefile 
b/Makefile index df5871c..02a33d7 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ SRC = src DEST = project PYMODEL = $(SRC)/$(SCHEMA_NAME)/datamodel DOCDIR = docs -DOCTEMPLATES = $(SRC)/docs/templates +DOCTEMPLATES = $(SRC)/docs/doc-templates EXAMPLEDIR = examples # Use += to append variables from the variables file @@ -178,7 +178,7 @@ $(DOCDIR): gendoc: $(DOCDIR) cp -rf $(SRC)/docs/files/* $(DOCDIR) ; \ - $(RUN) gen-doc ${GEN_DOC_ARGS} -d $(DOCDIR) $(SOURCE_SCHEMA_PATH) + $(RUN) gen-doc ${GEN_DOC_ARGS} -d $(DOCDIR) --template-directory $(DOCTEMPLATES) $(SOURCE_SCHEMA_PATH) testdoc: gendoc serve @@ -194,7 +194,7 @@ git-add: .cruft.json git-commit: git commit -m 'chore: make setup was run' -a git-status: - git status + git status # only necessary if setting up via cookiecutter .cruft.json: diff --git a/README.md b/README.md index 785dda2..05d2aed 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ pip install uv ### Getting Started +Note: either 'make' or 'just' can be used to run the commands below. + 1. Clone the repository: ```bash git clone https://github.com/biolink/ingest-metadata.git @@ -53,6 +55,22 @@ uv sync make test # or use the `just test` command ``` +4. Creating a New Ingest Metadata file + +```bash +# Create a new Ingest Metadata file from the template +make new-metadata INFORES=infores:example + +# This creates src/docs/metadata/example_metadata.yaml +# Edit the file to fill in your specific information +``` + +or using the equivalent **`just`** command (note the reversed order of arguments): + +```bash +just INFORES=infores:example new-metadata +``` + ### Development Commands To run the commands, you may use good old make or the command runner [just](https://github.com/casey/just/) which is a better choice on Windows. 
diff --git a/justfile b/justfile index 8d95492..c8249f3 100644 --- a/justfile +++ b/justfile @@ -44,6 +44,7 @@ src := "src" dest := "project" pymodel := src / schema_name / "datamodel" docdir := "docs" +doctemplates := src / "docs" / "doc-templates" exampledir := "examples" # Main project manager is 'uv' @@ -150,7 +151,7 @@ lint: # Generate documentation _gendoc: _ensure_docdir cp -r {{src}}/docs/files/* {{docdir}} - {{run}} gen-doc {{gen_doc_args}} -d {{docdir}} {{source_schema_path}} + {{run}} gen-doc {{gen_doc_args}} -d {{docdir}} --template-directory {{doctemplates}} {{source_schema_path}} # Build docs and run test server testdoc: _gendoc _serve diff --git a/project.Makefile b/project.Makefile index 0dd0fa0..a43e5a7 100644 --- a/project.Makefile +++ b/project.Makefile @@ -1 +1,20 @@ -## Add your own custom Makefile targets here +## This is included by the main Makefile. + +# Create a new Ingest Metadata Specification YAML from template +# Usage: make new-metadata INFORES=infores:ctd +new-metadata: +ifndef INFORES + $(error INFORES is required. Usage: make new-metadata INFORES=infores:example) +endif + $(RUN) python $(SRC)/scripts/create_metadata.py --infores "$(INFORES)" + +# Validate all Ingest Metadata Specification files against the schema +validate-metadata: + @echo "Validating Ingest Metadata Specification files against schema..." + @for ingest_spec in $(SRC)/docs/metadata/*.yaml; do \ + if [ -f "$$ingest_spec" ]; then \ + echo "Validating $$ingest_spec"; \ + $(RUN) linkml-validate --schema $(SOURCE_SCHEMA_PATH) "$$ingest_spec"; \ + fi; \ + done + @echo "✓ All Ingest Metadata Specification files validated (with any errors as indicated)" diff --git a/project.justfile b/project.justfile index bf84fbc..783aefa 100644 --- a/project.justfile +++ b/project.justfile @@ -1 +1,23 @@ -## Add your own just recipes here. This is imported by the main justfile. +## This is imported by the main justfile. 
+ +INFORES:= "" + +# Create a new Ingest Metadata Specification YAML from template +# Usage: just INFORES=infores:ctd new-metadata +new-metadata: + @if [[ -z "{{INFORES}}" ]]; then \ + echo "INFORES is required. Usage: just INFORES=infores:example new-metadata"; \ + else \ + {{run}} python {{src}}/scripts/create_metadata.py --infores "{{INFORES}}"; \ + fi + +# Validate all Ingest Metadata Specification files against the schema +validate-metadata: + @echo "Validating Ingest Metadata Specification files against schema..." + @for ingest_spec in {{src}}/docs/metadata/*.yaml; do \ + if [ -f "$ingest_spec" ]; then \ + echo "Validating $ingest_spec"; \ + {{run}} linkml-validate --schema {{source_schema_path}} "$ingest_spec"; \ + fi; \ + done + @echo "✓ All Ingest Metadata Specification files validated (with any errors as indicated)" diff --git a/src/doc-templates/class.md.jinja2 b/src/docs/doc-templates/class.md.jinja2 similarity index 100% rename from src/doc-templates/class.md.jinja2 rename to src/docs/doc-templates/class.md.jinja2 diff --git a/src/doc-templates/class_diagram.md.jinja2 b/src/docs/doc-templates/class_diagram.md.jinja2 similarity index 100% rename from src/doc-templates/class_diagram.md.jinja2 rename to src/docs/doc-templates/class_diagram.md.jinja2 diff --git a/src/doc-templates/common_metadata.md.jinja2 b/src/docs/doc-templates/common_metadata.md.jinja2 similarity index 100% rename from src/doc-templates/common_metadata.md.jinja2 rename to src/docs/doc-templates/common_metadata.md.jinja2 diff --git a/src/doc-templates/enum.md.jinja2 b/src/docs/doc-templates/enum.md.jinja2 similarity index 100% rename from src/doc-templates/enum.md.jinja2 rename to src/docs/doc-templates/enum.md.jinja2 diff --git a/src/doc-templates/index.md.jinja2 b/src/docs/doc-templates/index.md.jinja2 similarity index 100% rename from src/doc-templates/index.md.jinja2 rename to src/docs/doc-templates/index.md.jinja2 diff --git a/src/doc-templates/schema.md.jinja2 
b/src/docs/doc-templates/schema.md.jinja2 similarity index 100% rename from src/doc-templates/schema.md.jinja2 rename to src/docs/doc-templates/schema.md.jinja2 diff --git a/src/doc-templates/slot.md.jinja2 b/src/docs/doc-templates/slot.md.jinja2 similarity index 100% rename from src/doc-templates/slot.md.jinja2 rename to src/docs/doc-templates/slot.md.jinja2 diff --git a/src/doc-templates/subset.md.jinja2 b/src/docs/doc-templates/subset.md.jinja2 similarity index 100% rename from src/doc-templates/subset.md.jinja2 rename to src/docs/doc-templates/subset.md.jinja2 diff --git a/src/doc-templates/type.md.jinja2 b/src/docs/doc-templates/type.md.jinja2 similarity index 100% rename from src/doc-templates/type.md.jinja2 rename to src/docs/doc-templates/type.md.jinja2 diff --git a/src/docs/files/example-metadata.md b/src/docs/files/example-metadata.md new file mode 100644 index 0000000..9924c12 --- /dev/null +++ b/src/docs/files/example-metadata.md @@ -0,0 +1,7 @@ +# Composing an Ingest Metadata File + +An Ingest Metadata File is a YAML file which specifies the metadata of a knowledge source ingest into a knowledge graph representation within the Translator Ingests repository. + +## Overview + +See [Ingest Metadata File template](ingest_metadata_template.yaml) for details. diff --git a/src/docs/files/ingest_metadata_template.yaml b/src/docs/files/ingest_metadata_template.yaml new file mode 100644 index 0000000..38e24a6 --- /dev/null +++ b/src/docs/files/ingest_metadata_template.yaml @@ -0,0 +1,37 @@ +# IngestMetadataFile schema based content. 
+# See schema at src/ingest_metadata/schema/ingest_metadata.yaml +file_name: # (required, range = string) +file_created_by: # (optional, multivalued, range = string) +file_creation_date: # (required, range = date) + +# Information about the code used to execute the ingest task +# (e.g., a specific version/branch of code used) +ingest_code_url: # (required, range = URIorCURIE) +ingest_code_version: # (required, range = string) + +# Information about when/how the source was accessed in performing the ingest +source_infores_id: # (required, range = URIorCURIE) +source_data_version: # (required, range = string) +source_access_date: # (required, range = date) +source_access_urls: # (optional, multivalued, range = URIorCURIE) +source_file_names: # (optional, multivalued, range = string) + +# Information about the target KGX graph produced by this execution of the source ingest task +target_name: # (required, range = string) +target_creation_date: # (required, range = date) +target_data_url: # (required, range = URIorCURIE) +target_data_version: # (required, range = string) +target_format: # (required, range = string) +target_model: # (required, range = string) # This will be Biolink Model for all Translator graphs +target_model_url: # (optional, range = string) +target_data_model_version: # (required, range = string) # e.g. "4.2.6-rc5" +node_normalizer: # (optional, range = string) +node_normalizer_version: # (optional, range = string) +node_normalizer_url: # (optional, range = string) # e.g. 
"https://github.com/TranslatorSRI/NodeNormalization" + +# Metrics about the content of the specific KGX graph produced by the ingest task +total_edge_count: # (optional, range = integer) +total_node_count: # (optional, range = integer) +orphan_node_count: # (optional, range = integer) +node_categories: # (optional, multivalued, range = string) +edge_predicates: # (optional, multivalued, range = string) diff --git a/src/scripts/create_metadata.py b/src/scripts/create_metadata.py new file mode 100644 index 0000000..9b48279 --- /dev/null +++ b/src/scripts/create_metadata.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +Script to create a new Ingest Metadata Specification (metadata) from the template. +""" + +from os import makedirs, path, sep +import sys +from pathlib import Path +import yaml +from datetime import datetime +import click + +METADATA_FILE_DIRECTORY = Path(__file__).parent.parent / "docs" / "metadata" + +def load_template(template_path): + """Load the metadata template from the YAML file.""" + with open(template_path, 'r') as f: + return yaml.safe_load(f) + + +def create_metadata(infores_id, output_file, output_path, template_path): + """ + Create a new Ingest Metadata Specification from the template, with user-specified values. 
+ + :param infores_id: InfoRes identifier of the primary knowledge source + :param output_file: file name of the metadata file + :param output_path: full path to the metadata file + :param template_path: full path to the template file + :return: None + """ + # Load template + template = load_template(template_path) + + # Update template with user values + template['file_name'] = output_file + template['source_infores_id'] = infores_id + template['file_creation_date'] = datetime.now().strftime('%Y-%m-%d') + + # Write the new metadata file + with open(output_path, 'w') as f: + yaml.dump(template, f, default_flow_style=False, sort_keys=False, indent=2) + + click.echo(f"Created new Ingest Metadata file: {output_file}") + click.echo(f" InfoRes ID: {infores_id}") + click.echo(f"\nNext steps:") + click.echo(f"1. Edit {output_path} to fill in the template sections") + click.echo(f"2. See src{sep}docs{sep}files{sep}example-metadata.md for detailed guidance") + + +@click.command() +@click.option( + '--infores', + required=True, + help='InfoRes identifier for the data source (e.g., infores:ctd)' +) +@click.option( + '--output', + help='Output filename for the new metadata (default: based on infores ID)' +) +@click.option( + '--template', + default=f"src{sep}docs{sep}files{sep}ingest_metadata_template.yaml", + help=f"Path to the metadata template file (default: src{sep}docs{sep}files{sep}ingest_metadata_template.yaml)" +) +def main(infores, output, template): + """Create a new Ingest Metadata file from the template. 
+ + Examples: + + \b + create_metadata.py --infores "infores:ctd" + create_metadata.py --infores "infores:pharmgkb" --output "my_own_pharmgkb_metadata.yaml" + """ + + # Validate infores format + if not infores.startswith('infores:'): + click.echo("Error: InfoRes ID must start with 'infores:'", err=True) + sys.exit(1) + + # Generate output filename if not provided + if not output: + # Extract a source file name from infores ID and create the output filename + source_name = infores.replace('infores:', '').replace(':', '_') + output = f"{source_name}_metadata.yaml" + + # Sanity check: ensure the Ingest Metadata file directory exists + makedirs(path.dirname(METADATA_FILE_DIRECTORY), exist_ok=True) + output_path = f"{METADATA_FILE_DIRECTORY}{sep}{output}" + + # Check if template exists + if not path.exists(template): + click.echo(f"Error: Template file not found: {template}", err=True) + sys.exit(1) + + # Check if an output file already exists + if path.exists(output_path): + if not click.confirm(f"File {output_path} already exists. 
Overwrite?"): + click.echo("Aborted.") + sys.exit(0) + + try: + create_metadata(infores, output, output_path, template) + except Exception as e: + click.echo(f"Error creating Ingest Metadata Specification: {e}", err=True) + sys.exit(1) + + +if __name__ == '__main__': + main() \ No newline at end of file From a55dfbd97c20aa03a9ef6975dc349b3148879782 Mon Sep 17 00:00:00 2001 From: RichardBruskiewich Date: Wed, 24 Sep 2025 16:03:04 -0700 Subject: [PATCH 2/2] bug fix: use abspath not dirname for making the METADATA_FILE_DIRECTORY --- src/scripts/create_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/create_metadata.py b/src/scripts/create_metadata.py index 9b48279..db2f5e7 100644 --- a/src/scripts/create_metadata.py +++ b/src/scripts/create_metadata.py @@ -84,7 +84,7 @@ def main(infores, output, template): output = f"{source_name}_metadata.yaml" # Sanity check: ensure the Ingest Metadata file directory exists - makedirs(path.dirname(METADATA_FILE_DIRECTORY), exist_ok=True) + makedirs(path.abspath(METADATA_FILE_DIRECTORY), exist_ok=True) output_path = f"{METADATA_FILE_DIRECTORY}{sep}{output}" # Check if template exists