diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 9a4daa3..c03c844 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -14,7 +14,7 @@ jobs: with: channel-priority: strict activate-environment: snakemake - environment-file: .test/environment_v7.yaml + environment-file: .test/environment_v9.yaml - name: Create environments shell: bash -el {0} run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --conda-create-envs-only --use-conda -c1 --conda-frontend conda @@ -29,7 +29,7 @@ jobs: with: channel-priority: strict activate-environment: snakemake - environment-file: .test/environment_v7.yaml + environment-file: .test/environment_v9.yaml - name: Dry run shell: bash -el {0} run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --dry-run diff --git a/.github/workflows/test_apptainer.yml b/.github/workflows/test_apptainer.yml index 452af0d..8ab2fdd 100644 --- a/.github/workflows/test_apptainer.yml +++ b/.github/workflows/test_apptainer.yml @@ -30,8 +30,8 @@ jobs: - name: Pack logs if: success() || failure() shell: bash -el {0} - run: tar czf logs.tar.gz .test/output .snakemake/log - - name: Upload output file + run: tar czf logs.tar.gz logs .snakemake/log + - name: Upload logs file if: success() || failure() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test_v7.yml b/.github/workflows/test_v7.yml deleted file mode 100644 index 3416131..0000000 --- a/.github/workflows/test_v7.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Test Sm v7 - -on: - workflow_dispatch: - push: - branches: - - main - - dev - pull_request: - branches: - - "**" - -jobs: - run_test_pipeline: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - name: Setup environment - uses: conda-incubator/setup-miniconda@v3 - with: - channel-priority: strict - activate-environment: snakemake - auto-activate-base: false 
- miniforge-version: latest - environment-file: .test/environment_v7.yaml - - name: Run test pipeline - shell: bash -el {0} - run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda --keep-going --notemp - - name: Pack logs - if: success() || failure() - shell: bash -el {0} - run: tar czf logs.tar.gz .test/output .snakemake/log - - name: Upload output file - if: success() || failure() - uses: actions/upload-artifact@v4 - with: - name: output-logs - path: logs.tar.gz diff --git a/.github/workflows/test_v9.yml b/.github/workflows/test_v9.yml index a3e211c..de6e379 100644 --- a/.github/workflows/test_v9.yml +++ b/.github/workflows/test_v9.yml @@ -30,8 +30,8 @@ jobs: - name: Pack logs if: success() || failure() shell: bash -el {0} - run: tar czf logs.tar.gz .test/output .snakemake/log - - name: Upload output file + run: tar czf logs.tar.gz logs .snakemake/log + - name: Upload logs file if: success() || failure() uses: actions/upload-artifact@v4 with: diff --git a/.gitignore b/.gitignore index 950a45d..cedc41e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,8 @@ # Analysis files /data/ -/output/ +/results/ +/logs/ # MAC .DS_Store diff --git a/.test/Snakefile b/.test/Snakefile index 7c3d4f8..cdb9d24 100644 --- a/.test/Snakefile +++ b/.test/Snakefile @@ -5,11 +5,14 @@ from snakemake.utils import min_version import subprocess -min_version("7.19") +min_version("9.12") # Workflow version __version__ = "test" +pathvars: + dataset = config["OUTPUT_NAME"] + # Rules include: "../workflow/core.smk" include: "../workflow/rules/demix.smk" @@ -19,4 +22,4 @@ include: "../workflow/rules/report.smk" rule all: input: - OUTDIR/f"{OUTPUT_NAME}.report.html" + "//report.html" diff --git a/.test/environment_v7.yaml b/.test/environment_v7.yaml deleted file mode 100644 index db4f147..0000000 --- a/.test/environment_v7.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: snakemake -channels: - - conda-forge - - bioconda - - 
defaults -dependencies: - - python==3.11.6 - - snakemake==7.32.4 - - pulp<2.8 - - setuptools<81 diff --git a/.test/targets.yaml b/.test/targets.yaml index bac704f..1f6960e 100644 --- a/.test/targets.yaml +++ b/.test/targets.yaml @@ -10,8 +10,6 @@ SAMPLES: fasta: ".test/data/fasta/sample3.fasta" METADATA: ".test/data/metadata.csv" -OUTPUT_DIRECTORY: - ".test/output" CONTEXT_FASTA: ".test/data/context.fasta" OUTPUT_NAME: diff --git a/README.md b/README.md index cf44879..29d0168 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,6 @@ [![Zenodo DOI](https://zenodo.org/badge/627797162.svg)](https://doi.org/10.5281/zenodo.15771867) ![Install workflow](https://github.com/PathoGenOmics-Lab/VIPERA/actions/workflows/install.yml/badge.svg) -![Test workflow with Snakemake v7](https://github.com/PathoGenOmics-Lab/VIPERA/actions/workflows/test_v7.yml/badge.svg) ![Test workflow with Snakemake v9](https://github.com/PathoGenOmics-Lab/VIPERA/actions/workflows/test_v9.yml/badge.svg) ![Test workflow with Snakemake v9 and Apptainer](https://github.com/PathoGenOmics-Lab/VIPERA/actions/workflows/test_apptainer.yml/badge.svg) @@ -24,7 +23,7 @@ configuring [the inputs and outputs](config/README.md#inputs-and-outputs) and [the context dataset](config/README.md#automated-construction-of-a-context-dataset): ```shell -snakemake --use-conda --cores 4 # command for Snakemake v7 +snakemake --sdm conda --cores 4 ``` We provide a simple script that downloads the [data](https://doi.org/10.20350/digitalCSIC/15648) from [our study](https://doi.org/10.1093/ve/veae018) @@ -40,8 +39,9 @@ It supports dependency management through either conda or Apptainer/Singularity, [run modes documentation](config/README.md#run-modes). We use continuous integration (CI) to automatically verify that all dependencies install correctly -with Snakemake v7.32.4 (see GitHub Action `Install`), and to test that VIPERA runs -successfully with Snakemake v7.32.4 and v9.15.0 using conda (Actions `Test Sm v(7|9)`). 
+(see GitHub Action `Install`), and to test that VIPERA runs +successfully using conda (Action `Test Sm v9`). +We run Snakemake v9.15.0 for these. We also test a containerized workflow with Snakemake v9.15.0 and Apptainer using a [remote image](https://hub.docker.com/r/ahmig/vipera) (Action `Test Sm v9 Apptainer`). This image is automatically updated in every version (Action `Deploy`). diff --git a/build_targets.py b/build_targets.py index 0c5867c..be71810 100644 --- a/build_targets.py +++ b/build_targets.py @@ -91,7 +91,6 @@ def find_file_with_extension(directory: Path, prefix: str, extensions: List[str] else: sys.exit(f"ERROR: metadata file '{args.metadata_csv}' does not exist") targets["OUTPUT_NAME"] = args.output_name - targets["OUTPUT_DIRECTORY"] = "output" targets["CONTEXT_FASTA"] = None targets["MAPPING_REFERENCES_FASTA"] = None diff --git a/config/README.md b/config/README.md index dc0914a..8376097 100644 --- a/config/README.md +++ b/config/README.md @@ -37,8 +37,7 @@ The path to these input files is set in two configuration files in YAML format: [config.yaml](/config/config.yaml) (for general workflow settings) and [targets.yaml](/config/targets.yaml) (for specific dataset-related settings). The latter must be modified by the user to point the `SAMPLES` and `METADATA` -parameters to your data. The `OUTPUT_DIRECTORY` parameter should point to your -desired results directory. +parameters to your data. The script [`build_targets.py`](/build_targets.py) simplifies the process of creating the targets configuration file. To run this script, you need to have PyYAML installed. It @@ -62,8 +61,6 @@ SAMPLES: ... METADATA: "path/to/metadata.csv" -OUTPUT_DIRECTORY: - "output" CONTEXT_FASTA: null MAPPING_REFERENCES_FASTA: @@ -258,11 +255,10 @@ should be modified to fit your needs. Read more about Snakemake profiles [here](https://snakemake.readthedocs.io/en/stable/executing/cli.html#executing-profiles). 
To use the profile, install the [Snakemake executor plugin for SLURM](https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/slurm.html) -and run one of the following commands: +and run the following command: ```shell -snakemake --slurm --profile profile/slurm # Snakemake v7 -snakemake --profile profile/slurm # Snakemake v8+ +snakemake --profile profile/slurm ``` Additionally, we offer the option of running the workflow within a containerized diff --git a/config/targets.yaml b/config/targets.yaml index ce273e7..1d1197d 100644 --- a/config/targets.yaml +++ b/config/targets.yaml @@ -39,8 +39,6 @@ METADATA: "data/metadata.csv" OUTPUT_NAME: "case_study" -OUTPUT_DIRECTORY: - "output" CONTEXT_FASTA: null MAPPING_REFERENCES_FASTA: diff --git a/workflow/Snakefile b/workflow/Snakefile index ea984fd..c7b6213 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -14,6 +14,9 @@ containerized: "docker://ahmig/vipera:v" + __version__ configfile: "config/config.yaml" configfile: "config/targets.yaml" +pathvars: + dataset = config["OUTPUT_NAME"] + include: "core.smk" include: "rules/context.smk" include: "rules/demix.smk" @@ -22,4 +25,4 @@ include: "rules/report.smk" rule all: input: - OUTDIR/f"{OUTPUT_NAME}.report.html" + "//report.html" diff --git a/workflow/core.smk b/workflow/core.smk index 38ff64e..78e1183 100644 --- a/workflow/core.smk +++ b/workflow/core.smk @@ -1,18 +1,5 @@ -BASE_PATH = Path(workflow.basedir).parent.resolve() - include: "rules/common.smk" -# Outputs -OUTPUT_NAME = config["OUTPUT_NAME"] -OUTDIR = Path(config["OUTPUT_DIRECTORY"]) - -# Logging -LOGDIR = OUTDIR / "logs" - -# Report -REPORT_DIR_PLOTS = Path(OUTDIR/"report/plots") -REPORT_DIR_TABLES = Path(OUTDIR/"report/tables") - include: "rules/fetch.smk" include: "rules/fasta.smk" include: "rules/asr.smk" diff --git a/workflow/rules/asr.smk b/workflow/rules/asr.smk index d20c6d4..6e3e744 100644 --- a/workflow/rules/asr.smk +++ b/workflow/rules/asr.smk @@ -4,22 +4,21 @@ rule 
reconstruct_ancestral_sequence: params: seed = 7291, seqtype = "DNA", - name = OUTPUT_NAME, outgroup = config["ALIGNMENT_REFERENCE"], model = config["TREE_MODEL"] input: - fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta" + fasta = "//aligned.masked.fasta" output: - folder = directory(OUTDIR/"tree"), - state_file = OUTDIR/"tree"/f"{OUTPUT_NAME}.state" + folder = directory("//tree"), + state_file = "//tree/asr.state" log: - LOGDIR / "reconstruct_ancestral_sequence" / "log.txt" + "//reconstruct_ancestral_sequence/log.txt" shell: "mkdir -p {output.folder} && " "iqtree2 -seed {params.seed} " - "-asr " - "-o {params.outgroup} -T AUTO --threads-max {threads} -s {input.fasta} " - "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name} >{log} 2>&1" + "-asr " + "-o {params.outgroup} -T AUTO --threads-max {threads} -s {input.fasta} " + "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/asr >{log} 2>&1" rule ancestor_fasta: @@ -30,10 +29,10 @@ rule ancestor_fasta: indeterminate_char = "N", name = "case_ancestor", input: - state_file = OUTDIR/"tree"/f"{OUTPUT_NAME}.state" + state_file = "//tree/asr.state" output: - fasta = report(OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta") + fasta = report("//ancestor.fasta") log: - LOGDIR / "ancestor_fasta" / "log.txt" + "//ancestor_fasta/log.txt" script: "../scripts/ancestor_fasta.py" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 26cadc2..c39d87f 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -49,7 +49,7 @@ def get_repo_version(base_dir: str, default: str, warn=False) -> str: def select_context_fasta(): """Set context to be fetched automatically if CONTEXT_FASTA=null""" if "CONTEXT_FASTA" not in config.keys() or config["CONTEXT_FASTA"] is None: - return OUTDIR/"context"/"sequences.fasta" + return "//context/sequences.fasta" if not Path(config["GISAID"]["CREDENTIALS"]).is_file(): raise FileNotFoundError(f"Tried to download a 
context dataset, but no GISAID credentials were found at '{config['GISAID']['CREDENTIALS']}' (see README.md).") elif Path(config["CONTEXT_FASTA"]).is_file(): @@ -61,7 +61,7 @@ def select_context_fasta(): def select_mapping_references_fasta(): """Set mapping references to be fetched automatically if MAPPING_REFERENCES_FASTA=null""" if "MAPPING_REFERENCES_FASTA" not in config.keys() or config["MAPPING_REFERENCES_FASTA"] is None: - return OUTDIR/"mapping_references.fasta" + return "//mapping_references.fasta" elif Path(config["MAPPING_REFERENCES_FASTA"]).is_file(): return config["MAPPING_REFERENCES_FASTA"] else: @@ -74,7 +74,7 @@ def is_url(string: str) -> bool: def select_problematic_vcf(): if is_url(config["PROBLEMATIC_VCF"]): - return OUTDIR/"problematic_sites.vcf" + return "//problematic_sites.vcf" elif Path(config["PROBLEMATIC_VCF"]).is_file(): return config["PROBLEMATIC_VCF"] else: diff --git a/workflow/rules/context.smk b/workflow/rules/context.smk index a6c5c9f..4290d0a 100644 --- a/workflow/rules/context.smk +++ b/workflow/rules/context.smk @@ -4,7 +4,7 @@ rule download_context: conda: "../envs/gisaidr.yaml" input: metadata = config["METADATA"], - pango_report = OUTDIR/f"{OUTPUT_NAME}.lineage_report.csv" + pango_report = "//lineage_report.csv" params: gisaid_creds = config["GISAID"]["CREDENTIALS"], date_window_span = 0.95, @@ -23,11 +23,11 @@ rule download_context: chunk_length = 3000, min_theoretical_combinations = config["DIVERSITY_REPS"] output: - fasta = temp(OUTDIR/"context"/"sequences.fasta"), - metadata = temp(OUTDIR/"context"/"metadata.csv"), - duplicate_accids = OUTDIR/"context"/"duplicate_accession_ids.txt" + fasta = temp("//context/sequences.fasta"), + metadata = temp("//context/metadata.csv"), + duplicate_accids = "//context/duplicate_accession_ids.txt", log: - LOGDIR / "download_context" / "log.txt" + "//download_context/log.txt" retries: 2 script: "../scripts/download_context.R" @@ -40,13 +40,13 @@ rule align_context: params: name = 
"context_sequences" input: - ref_fasta = OUTDIR/"reference.fasta", + ref_fasta = "//reference.fasta", fasta = lambda wildcards: select_context_fasta() output: - folder = directory(OUTDIR/"context"/"nextalign"), - fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.fasta" + folder = directory("//context/nextalign"), + fasta = "//context/nextalign/context_sequences.aligned.fasta" log: - LOGDIR / "align_context" / "log.txt" + "//align_context/log.txt" shell: "nextalign run -j {threads} -O {output.folder} -o {output.fasta} -n {params.name} --include-reference -r {input.ref_fasta} {input.fasta} >{log} 2>&1" @@ -59,13 +59,13 @@ rule mask_context: mask_character = "N", mask_class = ["mask"] input: - fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.fasta", - ref_fasta = OUTDIR/"reference.fasta", + fasta = "//context/nextalign/context_sequences.aligned.fasta", + ref_fasta = "//reference.fasta", vcf = lambda wildcards: select_problematic_vcf() output: - fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta" + fasta = "//context/nextalign/context_sequences.aligned.masked.fasta" log: - LOGDIR / "mask_context" / "log.txt" + "//mask_context/log.txt" script: "../scripts/mask_aln.py" @@ -77,20 +77,19 @@ rule ml_context_tree: params: seed = 7291, seqtype = "DNA", - name = OUTPUT_NAME, ufboot = config["UFBOOT"]["REPS"], alrt = config["SHALRT"]["REPS"], outgroup = config["ALIGNMENT_REFERENCE"], model = config["TREE_MODEL"], max_iterations_convergence = 1000, input: - fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta", - outgroup_aln = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta" + fasta = "//aligned.masked.fasta", + outgroup_aln = "//context/nextalign/context_sequences.aligned.masked.fasta" output: - folder = directory(OUTDIR/"tree_context"), - ml = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile" + folder = directory("//tree_context"), + ml = "//tree_context/context.treefile" log: - LOGDIR / 
"ml_context_tree" / "log.txt" + "//ml_context_tree/log.txt" shell: "exec >{log} && exec 2>&1; " "awk '/^>/{{p=seen[$0]++}}!p' {input.fasta} {input.outgroup_aln} >aln.fasta && " @@ -99,4 +98,4 @@ rule ml_context_tree: "-bb {params.ufboot} -alrt {params.alrt} " "-o {params.outgroup} -T AUTO --threads-max {threads} -s aln.fasta " "-nm {params.max_iterations_convergence} " - "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name}" + "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/context" diff --git a/workflow/rules/demix.smk b/workflow/rules/demix.smk index 4973b3e..ac7de05 100644 --- a/workflow/rules/demix.smk +++ b/workflow/rules/demix.smk @@ -6,15 +6,15 @@ rule demix_barcode_update: params: pathogen = config["DEMIX"]["PATHOGEN"] output: - folder = directory(OUTDIR/"demixing"/"freyja_data"), - curated_lineages = OUTDIR/"demixing"/"freyja_data"/"curated_lineages.json", - last_barcode_update = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt", - lineage_mutations = OUTDIR/"demixing"/"freyja_data"/"lineage_mutations.json", - lineage_yml = OUTDIR/"demixing"/"freyja_data"/"lineages.yml", - pathogens = OUTDIR/"demixing"/"freyja_data"/"pathogen_config.yml", - usher_barcodes = OUTDIR/"demixing"/"freyja_data"/"usher_barcodes.feather" + folder = directory("//demixing/freyja_data"), + curated_lineages = "//demixing/freyja_data/curated_lineages.json", + last_barcode_update = "//demixing/freyja_data/last_barcode_update.txt", + lineage_mutations = "//demixing/freyja_data/lineage_mutations.json", + lineage_yml = "//demixing/freyja_data/lineages.yml", + pathogens = "//demixing/freyja_data/pathogen_config.yml", + usher_barcodes = "//demixing/freyja_data/usher_barcodes.feather" log: - LOGDIR / "demix_barcode_update" / "log.txt" + "//demix_barcode_update/log.txt" shell: "mkdir -p {output.folder:q} && " "freyja update --outdir {output.folder:q} --pathogen {params.pathogen:q} >{log} 2>&1" @@ -31,11 +31,11 @@ rule 
demix_preprocessing: max_depth = config["DEMIX"]["MAX_DEPTH"], minq = config["DEMIX"]["MIN_QUALITY"], output: - depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt", - variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv", + depth_file = "//demixing/{sample}/{sample}_depth.txt", + variants_file = "//demixing/{sample}/{sample}_variants.tsv", log: - pileup = LOGDIR / "demix_preprocessing" / "{sample}_pileup.log.txt", - ivar = LOGDIR / "demix_preprocessing" / "{sample}_ivar.log.txt", + pileup = "//demix_preprocessing/{sample}_pileup.log.txt", + ivar = "//demix_preprocessing/{sample}_ivar.log.txt", shell: "set -euo pipefail && " "samtools mpileup -aa -A -d {params.max_depth} -Q {params.minq} -q 0 -B -f {input.ref_fasta:q} {input.bam:q} >sample.pileup 2>{log.pileup:q} && " @@ -49,11 +49,11 @@ rule demix: conda: "../envs/freyja.yaml" shadow: "minimal" input: - depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt", - variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv", - barcodes = OUTDIR/"demixing"/"freyja_data"/"usher_barcodes.feather", - curated_lineages = OUTDIR/"demixing"/"freyja_data"/"curated_lineages.json", - lineage_yml = OUTDIR/"demixing"/"freyja_data"/"lineages.yml", + depth_file = "//demixing/{sample}/{sample}_depth.txt", + variants_file = "//demixing/{sample}/{sample}_variants.tsv", + barcodes = "//demixing/freyja_data/usher_barcodes.feather", + curated_lineages = "//demixing/freyja_data/curated_lineages.json", + lineage_yml = "//demixing/freyja_data/lineages.yml", params: coverage_cutoff = config["DEMIX"]["COV_CUTOFF"], minimum_abundance = config["DEMIX"]["MIN_ABUNDANCE"], @@ -64,9 +64,9 @@ rule demix: relaxed_mrca_thresh = config["DEMIX"]["RELAXED_MRCA_THRESH"], solver = config["DEMIX"]["SOLVER"], output: - demix_file = OUTDIR/"demixing"/"samples"/"{sample}/{sample}_demixed.tsv" + demix_file = "//demixing/samples/{sample}/{sample}_demixed.tsv" log: - LOGDIR / "demix" / "{sample}.log.txt" + 
"//demix/{sample}.log.txt" shell: "freyja demix " "{input.variants_file:q} " @@ -92,10 +92,10 @@ rule summarise_demix: conda: "../envs/renv.yaml" shadow: "shallow" input: - tables = expand(OUTDIR/"demixing"/"samples"/"{sample}/{sample}_demixed.tsv", sample=iter_samples()) + tables = expand("//demixing/samples/{sample}/{sample}_demixed.tsv", sample=iter_samples()) output: - summary_df = report(OUTDIR/"demixing"/"summary.csv") + summary_df = report("//demixing/summary.csv") log: - LOGDIR / "summarise_demix" / "log.txt" + "//summarise_demix/log.txt" script: "../scripts/summarise_demix.R" diff --git a/workflow/rules/distances.smk b/workflow/rules/distances.smk index ca5f69d..62611ca 100644 --- a/workflow/rules/distances.smk +++ b/workflow/rules/distances.smk @@ -7,14 +7,14 @@ rule extract_afwdist_variants: frequency_col = "ALT_FREQ", mask_class = ["mask"], input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - mask_vcf = OUTDIR / "all_mask_sites.vcf", - ancestor = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", - reference = OUTDIR/"reference.fasta", + variants = "//variants.tsv", + mask_vcf = "//all_mask_sites.vcf", + ancestor = "//ancestor.fasta", + reference = "//reference.fasta", output: - variants = temp(OUTDIR/f"{OUTPUT_NAME}.variants.afwdist.csv"), + variants = temp("//variants.afwdist.csv"), log: - LOGDIR/"extract_afwdist_variants"/"log.txt" + "//extract_afwdist_variants/log.txt" script: "../scripts/extract_afwdist_variants.py" @@ -24,12 +24,12 @@ rule afwdist_weighted_distances: params: extra_args = "", input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.afwdist.csv", - reference = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", + variants = "//variants.afwdist.csv", + reference = "//ancestor.fasta", output: - distances = temp(OUTDIR/f"{OUTPUT_NAME}.distances.raw.csv"), + distances = temp("//distances.raw.csv"), log: - LOGDIR/"afwdist_weighted_distances"/"log.txt" + "//afwdist_weighted_distances/log.txt" shell: "afwdist " "-i {input.variants:q} " @@ -43,11 +43,11 @@ rule 
format_afwdist_results: params: samples = sorted(iter_samples()) + [config["ALIGNMENT_REFERENCE"]], input: - distances = OUTDIR/f"{OUTPUT_NAME}.distances.raw.csv", + distances = "//distances.raw.csv", output: - distances = OUTDIR/f"{OUTPUT_NAME}.distances.csv", + distances = "//distances.csv", log: - LOGDIR/"format_afwdist_results"/"log.txt" + "//format_afwdist_results/log.txt" script: "../scripts/format_afwdist_results.py" @@ -58,11 +58,11 @@ rule allele_freq_tree_data: use_bionj = config["USE_BIONJ"], outgroup_id = config["ALIGNMENT_REFERENCE"], input: - dist = OUTDIR/f"{OUTPUT_NAME}.distances.csv", + dist = "//distances.csv", output: - tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk", + tree = "//report/tables/allele_freq_tree.nwk", log: - LOGDIR / "allele_freq_tree_data" / "log.txt" + "//allele_freq_tree_data/log.txt" script: "../scripts/report/allele_freq_tree_data.R" @@ -73,12 +73,12 @@ rule time_signal_data: outgroup_id = config["ALIGNMENT_REFERENCE"], confidence_interval = 0.95, input: - tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"), + tree = report("//report/tables/allele_freq_tree.nwk"), metadata = config["METADATA"], output: - table = report(REPORT_DIR_TABLES/"time_signal.csv"), - json = REPORT_DIR_TABLES/"time_signal.json", + table = report("//report/tables/time_signal.csv"), + json = "//report/tables/time_signal.json", log: - LOGDIR / "time_signal_data" / "log.txt" + "//time_signal_data/log.txt" script: "../scripts/report/time_signal_data.R" diff --git a/workflow/rules/evolution.smk b/workflow/rules/evolution.smk index ef1895c..0de3a19 100644 --- a/workflow/rules/evolution.smk +++ b/workflow/rules/evolution.smk @@ -5,11 +5,11 @@ rule filter_genbank_features: included = config.get("GB_FEATURES", {}).get("INCLUDE", {}), excluded = config.get("GB_FEATURES", {}).get("EXCLUDE", {}), input: - gb = OUTDIR/"reference.gb", + gb = "//reference.gb", output: - gb = OUTDIR/"reference.cds.gb", + gb = "//reference.cds.gb", log: - LOGDIR / 
"filter_genbank_features" / "log.txt" + "//filter_genbank_features/log.txt" script: "../scripts/filter_genbank_features.py" @@ -20,14 +20,14 @@ rule n_s_sites: params: gb_qualifier_display = "gene", input: - fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", - masked = OUTDIR / "sites_masked.bed", - gb = OUTDIR/"reference.cds.gb", + fasta = "//ancestor.fasta", + masked = "//sites_masked.bed", + gb = "//reference.cds.gb", genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve(), output: - csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv"), + csv = temp("//ancestor.n_s.sites.csv"), log: - LOGDIR / "n_s_sites" / "log.txt" + "//n_s_sites/log.txt" script: "../scripts/n_s_sites_from_fasta.py" @@ -35,13 +35,13 @@ rule n_s_sites: rule calculate_dnds: conda: "../envs/renv.yaml" input: - n_s_sites = OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv", - masked = OUTDIR / "sites_masked.bed", - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + n_s_sites = "//ancestor.n_s.sites.csv", + masked = "//sites_masked.bed", + variants = "//variants.tsv", metadata = config["METADATA"] output: - table = OUTDIR/f"{OUTPUT_NAME}.dnds.csv", + table = "//dnds.csv", log: - LOGDIR / "calculate_dnds" / "log.txt" + "//calculate_dnds/log.txt" script: "../scripts/calculate_dnds.R" diff --git a/workflow/rules/fasta.smk b/workflow/rules/fasta.smk index da24498..8eaf83c 100644 --- a/workflow/rules/fasta.smk +++ b/workflow/rules/fasta.smk @@ -5,9 +5,9 @@ rule read_bam_refs: input: iter_files("bam") output: - temp(OUTDIR / "bam_ids.txt") + temp("//bam_ids.txt") log: - LOGDIR / "read_bam_refs" / "log.txt" + "//read_bam_refs/log.txt" shell: """ for bam_file in {input:q}; do @@ -21,9 +21,9 @@ rule rename_fastas: input: fasta = get_input_fasta output: - renamed = temp(OUTDIR/"renamed.{sample}.fasta") + renamed = temp("//renamed/{sample}.fasta") log: - LOGDIR / "rename_fastas" / "{sample}.log.txt" + "//rename_fastas/{sample}.log.txt" shell: "sed 's/>.*/>'{wildcards.sample}'/g' {input.fasta} > 
{output.renamed} 2> {log}" @@ -32,11 +32,11 @@ rule concat_fasta: threads: 1 shadow: "shallow" input: - expand(OUTDIR/"renamed.{sample}.fasta", sample = iter_samples()) + expand("//renamed/{sample}.fasta", sample = iter_samples()) output: - fasta = OUTDIR/f"{OUTPUT_NAME}.fasta" + fasta = "//sequences.fasta" log: - LOGDIR / "concat_fasta" / "log.txt" + "//concat_fasta/log.txt" shell: "cat {input} > {output.fasta} 2> {log}" @@ -45,18 +45,16 @@ rule align_fasta: threads: 32 shadow: "shallow" conda: "../envs/nextalign.yaml" - params: - name = OUTPUT_NAME input: - ref_fasta = OUTDIR/"reference.fasta", - fasta = OUTDIR/f"{OUTPUT_NAME}.fasta" + ref_fasta = "//reference.fasta", + fasta = "//sequences.fasta" output: - folder = directory(OUTDIR/"nextalign"), - fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.fasta" + folder = directory("//nextalign"), + fasta = "//nextalign/sequences.aligned.fasta" log: - LOGDIR / "align_fasta" / "log.txt" + "//align_fasta/log.txt" shell: - "nextalign run -j {threads} -O {output.folder} -o {output.fasta} -n {params.name} --include-reference -r {input.ref_fasta} {input.fasta} >{log} 2>&1" + "nextalign run -j {threads} -O {output.folder} -o {output.fasta} -n sequences --include-reference -r {input.ref_fasta} {input.fasta} >{log} 2>&1" rule mask_alignment: @@ -67,12 +65,12 @@ rule mask_alignment: mask_character = "N", mask_class = ["mask"] input: - fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.fasta", - ref_fasta = OUTDIR/"reference.fasta", + fasta = "//nextalign/sequences.aligned.fasta", + ref_fasta = "//reference.fasta", vcf = lambda wildcards: select_problematic_vcf() output: - fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta" + fasta = "//aligned.masked.fasta" log: - LOGDIR / "mask_alignment" / "log.txt" + "//mask_alignment/log.txt" script: "../scripts/mask_aln.py" diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk index 3e4860c..7283d76 100644 --- a/workflow/rules/fetch.smk +++ b/workflow/rules/fetch.smk 
@@ -4,9 +4,9 @@ rule fetch_alignment_reference: params: ref = config["ALIGNMENT_REFERENCE"] output: - fasta = OUTDIR/"reference.fasta" + fasta = "//reference.fasta" log: - LOGDIR / "fetch_alignment_reference" / "log.txt" + "//fetch_alignment_reference/log.txt" shell: "esearch -db nucleotide -query {params.ref} | efetch -format fasta > {output.fasta} 2> {log}" @@ -19,9 +19,9 @@ rule fetch_reference_gb: database = "nucleotide", format = "gb" output: - fasta = OUTDIR/"reference.gb" + fasta = "//reference.gb" log: - LOGDIR / "fetch_reference_gb" / "log.txt" + "//fetch_reference_gb/log.txt" shell: "esearch -db {params.database} -query {params.ref} | efetch -format {params.format} > {output.fasta} 2> {log}" @@ -31,11 +31,11 @@ rule fetch_mapping_references: threads: 1 conda: "../envs/fetch.yaml" input: - OUTDIR / "bam_ids.txt" + "//bam_ids.txt" output: fasta = select_mapping_references_fasta() log: - LOGDIR / "fetch_mapping_references" / "log.txt" + "//fetch_mapping_references/log.txt" shell: """ cat {input} | while read ref_id || [[ -n $ref_id ]]; do @@ -49,9 +49,9 @@ rule fetch_alignment_annotation: params: ref = config["ALIGNMENT_REFERENCE"] output: - temp(OUTDIR/"reference.gff3") + temp("//reference.gff3") log: - LOGDIR / "fetch_alignment_annotation" / "log.txt" + "//fetch_alignment_annotation/log.txt" shell: "curl 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id={params.ref}' -o {output} -s 2>{log}" @@ -61,7 +61,7 @@ rule fetch_problematic_vcf: params: url = config["PROBLEMATIC_VCF"] log: - LOGDIR / "fetch_problematic_vcf" / "log.txt" + "//fetch_problematic_vcf/log.txt" output: select_problematic_vcf() shell: @@ -70,4 +70,4 @@ rule fetch_problematic_vcf: rule create_empty_file: output: - temp(touch(OUTDIR/"empty.txt")) + temp(touch("//empty.txt")) diff --git a/workflow/rules/pangolin.smk b/workflow/rules/pangolin.smk index 248c296..3642d7d 100644 --- a/workflow/rules/pangolin.smk +++ b/workflow/rules/pangolin.smk @@ -2,11 +2,11 @@ rule 
pangolin_report: threads: 8 conda: "../envs/pangolin.yaml" input: - fastas = OUTDIR/f"{OUTPUT_NAME}.fasta" + fastas = "//sequences.fasta" output: - report = report(OUTDIR/f"{OUTPUT_NAME}.lineage_report.csv") + report = report("//lineage_report.csv") log: - LOGDIR / "pangolin_report" / "log.txt" + "//pangolin_report/log.txt" shell: """ pangolin {input.fastas} --outfile {output.report} -t {threads} >{log} 2>&1 diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 50a0574..61f37b2 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -1,12 +1,12 @@ rule demix_plot_data: conda: "../envs/renv.yaml" input: - summary_demixing = OUTDIR/"demixing"/"summary.csv", + summary_demixing = "//demixing/summary.csv", metadata = config["METADATA"] output: - data = REPORT_DIR_TABLES/"demix.csv" + data = "//report/tables/demix.csv" log: - LOGDIR / "demix_plot_data" / "log.txt" + "//demix_plot_data/log.txt" script: "../scripts/report/demix_plot_data.R" @@ -18,11 +18,11 @@ rule demix_plot: plot_width_mm = 159.2, plot_height_mm = 119.4, input: - data = REPORT_DIR_TABLES/"demix.csv" + data = "//report/tables/demix.csv" output: - plot = report(REPORT_DIR_PLOTS/"demix.png") + plot = report("//report/plots/demix.png") log: - LOGDIR / "demix_plot" / "log.txt" + "//demix_plot/log.txt" script: "../scripts/report/demix_plot.R" @@ -35,13 +35,13 @@ rule diversity_data: aln_reference = config["ALIGNMENT_REFERENCE"], seed = 7291, input: - study_fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta", - context_fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta", + study_fasta = "//aligned.masked.fasta", + context_fasta = "//context/nextalign/context_sequences.aligned.masked.fasta", output: - divs = REPORT_DIR_TABLES/"diversity.txt", - json = REPORT_DIR_TABLES/"diversity.json", + divs = "//report/tables/diversity.txt", + json = "//report/tables/diversity.json", log: - LOGDIR / "diversity_data" / "log.txt" + 
"//diversity_data/log.txt" script: "../scripts/report/diversity_data.R" @@ -54,12 +54,12 @@ rule diversity_plot: plot_width_mm = 159.2, plot_height_mm = 119.4, input: - divs = REPORT_DIR_TABLES/"diversity.txt", - json = REPORT_DIR_TABLES/"diversity.json", + divs = "//report/tables/diversity.txt", + json = "//report/tables/diversity.json", output: - plot = report(REPORT_DIR_PLOTS/"diversity.png"), + plot = report("//report/plots/diversity.png"), log: - LOGDIR / "diversity_plot" / "log.txt" + "//diversity_plot/log.txt" script: "../scripts/report/diversity_plot.R" @@ -69,11 +69,11 @@ rule extract_genbank_regions: params: gb_qualifier = "gene", input: - gb = OUTDIR/"reference.cds.gb", + gb = "//reference.cds.gb", output: - regions = temp(REPORT_DIR_TABLES/"genbank_regions.json"), + regions = temp("//report/tables/genbank_regions.json"), log: - LOGDIR / "extract_genbank_regions" / "log.txt" + "//extract_genbank_regions/log.txt" script: "../scripts/report/extract_genbank_regions.py" @@ -85,11 +85,11 @@ rule polymorphic_sites_over_time_plot: plot_width_mm = 159.2, plot_height_mm = 119.4, input: - table = REPORT_DIR_PLOTS/"polymorphic_sites_over_time.csv", + table = "//report/tables/polymorphic_sites_over_time.csv", output: - plot = report(REPORT_DIR_PLOTS/"polymorphic_sites_over_time.png"), + plot = report("//report/plots/polymorphic_sites_over_time.png"), log: - LOGDIR / "polymorphic_sites_over_time_plot" / "log.txt" + "//polymorphic_sites_over_time_plot/log.txt" script: "../scripts/report/polymorphic_sites_over_time_plot.R" @@ -102,37 +102,37 @@ rule nv_panel_plot: plot_height_mm = 250.0, plot_width_mm = 240.0, input: - panel = REPORT_DIR_TABLES/"nv_panel.csv", - window = REPORT_DIR_TABLES/"window.csv", - regions = REPORT_DIR_TABLES/"genbank_regions.json", + panel = "//report/tables/nv_panel.csv", + window = "//report/tables/window.csv", + regions = "//report/tables/genbank_regions.json", highlight_window_regions = config["PLOT_GENOME_REGIONS"], output: - plot = 
report(REPORT_DIR_PLOTS/"nv_panel.png"), + plot = report("//report/plots/nv_panel.png"), log: - LOGDIR / "nv_panel_plot" / "log.txt" + "//nv_panel_plot/log.txt" script: "../scripts/report/nv_panel_plot.R" use rule nv_panel_plot as nv_panel_plot_S with: input: - panel = REPORT_DIR_TABLES/"nv_panel.S.csv", - window = REPORT_DIR_TABLES/"window.S.csv", - regions = REPORT_DIR_TABLES/"genbank_regions.json", - highlight_window_regions = OUTDIR/"empty.txt", + panel = "//report/tables/nv_panel.S.csv", + window = "//report/tables/window.S.csv", + regions = "//report/tables/genbank_regions.json", + highlight_window_regions = "//empty.txt", output: - plot = report(REPORT_DIR_PLOTS/"nv_panel.S.png"), + plot = report("//report/plots/nv_panel.S.png"), log: - LOGDIR / "nv_panel_plot_S" / "log.txt" + "//nv_panel_plot_S/log.txt" rule merge_json_files: input: - REPORT_DIR_TABLES/"nv_panel.json", - REPORT_DIR_TABLES/"polymorphic_sites_over_time.json", - REPORT_DIR_TABLES/"window.json", + "//report/tables/nv_panel.json", + "//report/tables/polymorphic_sites_over_time.json", + "//report/tables/window.json", output: - json = REPORT_DIR_TABLES/"nv_panel_summary.json", + json = "//report/tables/nv_panel_summary.json", run: import json result = {} @@ -152,13 +152,13 @@ rule context_phylogeny_data: boot_th = config["UFBOOT"]["THRESHOLD"], alrt_th = config["SHALRT"]["THRESHOLD"], input: - target_fasta = OUTDIR/f"{OUTPUT_NAME}.fasta", - tree = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile", + target_fasta = "//sequences.fasta", + tree = "//tree_context/context.treefile", output: - json = REPORT_DIR_TABLES/"context_phylogeny.json", - annotation = REPORT_DIR_TABLES/"context_phylogeny.csv", + json = "//report/tables/context_phylogeny.json", + annotation = "//report/tables/context_phylogeny.csv", log: - LOGDIR / "context_phylogeny_data" / "log.txt" + "//context_phylogeny_data/log.txt" script: "../scripts/report/context_phylogeny_data.R" @@ -172,13 +172,13 @@ rule context_phylogeny_plot: boot_th = 
config["UFBOOT"]["THRESHOLD"], alrt_th = config["SHALRT"]["THRESHOLD"], input: - tree = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile", - json = REPORT_DIR_TABLES/"context_phylogeny.json", - annotation = REPORT_DIR_TABLES/"context_phylogeny.csv" + tree = "//tree_context/context.treefile", + json = "//report/tables/context_phylogeny.json", + annotation = "//report/tables/context_phylogeny.csv" output: - plot = report(REPORT_DIR_PLOTS/"context_phylogeny.png"), + plot = report("//report/plots/context_phylogeny.png"), log: - LOGDIR / "context_phylogeny_plot" / "log.txt" + "//context_phylogeny_plot/log.txt" script: "../scripts/report/context_phylogeny_plot.R" @@ -191,13 +191,13 @@ rule allele_freq_tree_plot: plot_height_mm = 119.4, plot_width_mm = 159.2, input: - tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"), - study_fasta = OUTDIR/f"{OUTPUT_NAME}.fasta", + tree = report("//report/tables/allele_freq_tree.nwk"), + study_fasta = "//sequences.fasta", metadata = config["METADATA"], output: - plot = report(REPORT_DIR_PLOTS/"allele_freq_tree.png"), + plot = report("//report/plots/allele_freq_tree.png"), log: - LOGDIR / "allele_freq_tree_plot" / "log.txt" + "//allele_freq_tree_plot/log.txt" script: "../scripts/report/allele_freq_tree_plot.R" @@ -209,11 +209,11 @@ rule time_signal_plot: plot_height_mm = 119.4, plot_width_mm = 159.2, input: - table = report(REPORT_DIR_TABLES/"time_signal.csv"), + table = report("//report/tables/time_signal.csv"), output: - plot = report(REPORT_DIR_PLOTS/"time_signal.png"), + plot = report("//report/plots/time_signal.png"), log: - LOGDIR / "time_signal_plot" / "log.txt" + "//time_signal_plot/log.txt" script: "../scripts/report/time_signal_plot.R" @@ -225,12 +225,12 @@ rule dnds_plots: plot_height_mm = 119.4, plot_width_mm = 159.2, input: - table = OUTDIR/f"{OUTPUT_NAME}.dnds.csv", + table = "//dnds.csv", output: - plot_dn_ds = report(REPORT_DIR_PLOTS/"dn_and_ds.png"), - plot_omega = report(REPORT_DIR_PLOTS/"dnds.png"), + plot_dn_ds = 
report("//report/plots/dn_and_ds.png"), + plot_omega = report("//report/plots/dnds.png"), log: - LOGDIR / "evo_plots" / "log.txt" + "//evo_plots/log.txt" script: "../scripts/report/dnds_plots.R" @@ -242,11 +242,11 @@ rule af_time_correlation_plot: plot_height_mm = 119.4, plot_width_mm = 159.2, input: - correlations = REPORT_DIR_TABLES/"af_time_correlation.csv", + correlations = "//report/tables/af_time_correlation.csv", output: - plot = report(REPORT_DIR_PLOTS/"af_time_correlation.png"), + plot = report("//report/plots/af_time_correlation.png"), log: - LOGDIR / "af_time_correlation_plot" / "log.txt" + "//af_time_correlation_plot/log.txt" script: "../scripts/report/af_time_correlation_plot.R" @@ -260,12 +260,12 @@ rule af_trajectory_panel_plot: plot_width_mm = 159.2, random_color_seed = 7291, input: - fmt_variants = REPORT_DIR_TABLES/"variants.filled.dated.tsv", - subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt" + fmt_variants = "//report/tables/variants.filled.dated.tsv", + subset = "//report/tables/af_time_correlation.subset.txt" output: - plot = report(REPORT_DIR_PLOTS/"af_trajectory_panel.png"), + plot = report("//report/plots/af_trajectory_panel.png"), log: - LOGDIR / "af_trajectory_panel_plot" / "log.txt" + "//af_trajectory_panel_plot/log.txt" script: "../scripts/report/af_trajectory_panel_plot.R" @@ -273,12 +273,12 @@ rule af_trajectory_panel_plot: rule summary_table: conda: "../envs/renv.yaml" input: - report = report(OUTDIR/f"{OUTPUT_NAME}.lineage_report.csv"), + report = report("//lineage_report.csv"), metadata = config["METADATA"] output: - table = REPORT_DIR_TABLES/"summary_table.csv" + table = "//report/tables/summary_table.csv" log: - LOGDIR / "summary_table" / "log.txt" + "//summary_table/log.txt" script: "../scripts/report/summary_table.R" @@ -289,27 +289,27 @@ rule report: input: qmd = Path(config["REPORT_QMD"]).resolve(), css = Path(config["REPORT_CSS"]).resolve(), - demix = report(REPORT_DIR_PLOTS/"demix.png"), - tree_ml = 
report(REPORT_DIR_PLOTS/"context_phylogeny.png"), - diversity = report(REPORT_DIR_PLOTS/"diversity.png"), - fig_cor = report(REPORT_DIR_PLOTS/"polymorphic_sites_over_time.png"), - SNV = report(REPORT_DIR_PLOTS/"nv_panel.png"), - SNV_spike = report(REPORT_DIR_PLOTS/"nv_panel.S.png"), - volcano = report(REPORT_DIR_PLOTS/"af_time_correlation.png"), - panel = report(REPORT_DIR_PLOTS/"af_trajectory_panel.png"), - tree = report(REPORT_DIR_PLOTS/"allele_freq_tree.png"), - temest = report(REPORT_DIR_PLOTS/"time_signal.png"), - evo = report(REPORT_DIR_PLOTS/"dn_and_ds.png"), - omega_plot = report(REPORT_DIR_PLOTS/"dnds.png"), - heat_table = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"), - freyja_ts = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt", - value = REPORT_DIR_TABLES/"diversity.json", - stats_lm = REPORT_DIR_TABLES/"time_signal.json", - stats_ml = REPORT_DIR_TABLES/"context_phylogeny.json", - table = REPORT_DIR_TABLES/"summary_table.csv", - sum_nv = REPORT_DIR_TABLES/"nv_panel_summary.json", + demix = report("//report/plots/demix.png"), + tree_ml = report("//report/plots/context_phylogeny.png"), + diversity = report("//report/plots/diversity.png"), + fig_cor = report("//report/plots/polymorphic_sites_over_time.png"), + SNV = report("//report/plots/nv_panel.png"), + SNV_spike = report("//report/plots/nv_panel.S.png"), + volcano = report("//report/plots/af_time_correlation.png"), + panel = report("//report/plots/af_trajectory_panel.png"), + tree = report("//report/plots/allele_freq_tree.png"), + temest = report("//report/plots/time_signal.png"), + evo = report("//report/plots/dn_and_ds.png"), + omega_plot = report("//report/plots/dnds.png"), + heat_table = report("//report/tables/pairwise_trajectory_correlation_matrix.csv"), + freyja_ts = "//demixing/freyja_data/last_barcode_update.txt", + value = "//report/tables/diversity.json", + stats_lm = "//report/tables/time_signal.json", + stats_ml = "//report/tables/context_phylogeny.json", 
+ table = "//report/tables/summary_table.csv", + sum_nv = "//report/tables/nv_panel_summary.json", params: - workflow_version = get_repo_version(BASE_PATH.as_posix(), __version__), + workflow_version = get_repo_version(Path(workflow.basedir).parent, __version__), min_ivar_freq = config["VC"]["MIN_FREQ"], ufboot_reps = config["UFBOOT"]["REPS"], shalrt_reps = config["SHALRT"]["REPS"], @@ -317,9 +317,9 @@ rule report: use_bionj = config["USE_BIONJ"], cor_method = config["COR"]["METHOD"], output: - html = report(OUTDIR/f"{OUTPUT_NAME}.report.html") + html = report("//report.html") log: - LOGDIR / "report" / "log.txt" + "//report/log.txt" shell: """ set +o pipefail diff --git a/workflow/rules/sites.smk b/workflow/rules/sites.smk index 9bbbd41..0dd368e 100644 --- a/workflow/rules/sites.smk +++ b/workflow/rules/sites.smk @@ -6,10 +6,10 @@ rule problematic_vcf_to_bed: input: vcf = lambda wildcards: select_problematic_vcf(), output: - vcf = temp(OUTDIR / "sites_masked.vcf"), - bed = temp(OUTDIR / "sites_masked.bed"), + vcf = temp("//sites_masked.vcf"), + bed = temp("//sites_masked.bed"), log: - LOGDIR / "problematic_vcf_to_bed" / "log.txt", + "//problematic_vcf_to_bed/log.txt", shell: "FILTER_STR=$(echo \"{params.filters}\" | tr ' ' ',') && " "bcftools view -f \"$FILTER_STR\" {input.vcf} >{output.vcf} 2>{log} && " @@ -25,13 +25,13 @@ rule bcftools_mpileup_all_sites: mpileup_extra = "--no-BAQ" input: bam = get_input_bam, - reference = OUTDIR/"vaf"/"{sample}.reference.fasta", + reference = "//vaf/{sample}.reference.fasta", output: - mpileup = temp(OUTDIR / "all_sites" / "{sample}.mpileup.vcf"), - query = temp(OUTDIR / "all_sites" / "{sample}.query.tsv"), + mpileup = temp("//all_sites/{sample}.mpileup.vcf"), + query = temp("//all_sites/{sample}.query.tsv"), log: - mpileup = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.txt", - query = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.query.txt", + mpileup = "//bcftools_mpileup_all_sites/{sample}.mpileup.txt", + 
query = "//bcftools_mpileup_all_sites/{sample}.query.txt", shell: "bcftools mpileup {params.mpileup_extra} -a AD,ADF,ADR --fasta-ref {input.reference:q} --threads {threads} -Q {params.min_bq} -q {params.min_mq} -Ov -o {output.mpileup:q} {input.bam:q} >{log.mpileup:q} 2>&1 && " "echo 'CHROM\tPOS\tREF\tALT\tDP\tAD\tADF\tADR' >{output.query:q} && " @@ -45,10 +45,10 @@ rule filter_mpileup_all_sites: min_total_ADF = 0, min_total_ADR = 0, input: - OUTDIR / "all_sites" / "{sample}.query.tsv", + "//all_sites/{sample}.query.tsv", output: - sites_pass = temp(OUTDIR / "all_sites" / "{sample}.filtered_sites.tsv"), - sites_fail = temp(OUTDIR / "all_sites" / "{sample}.fail_sites.tsv"), + sites_pass = temp("//all_sites/{sample}.filtered_sites.tsv"), + sites_fail = temp("//all_sites/{sample}.fail_sites.tsv"), run: import pandas as pd df = pd.read_csv(input[0], sep="\t") @@ -68,27 +68,27 @@ rule filter_mpileup_all_sites: use rule concat_vcf_fields as merge_filtered_mpileup_all_sites with: input: - expand(OUTDIR / "all_sites" / "{sample}.filtered_sites.tsv", sample=iter_samples()), + expand("//all_sites/{sample}.filtered_sites.tsv", sample=iter_samples()), output: - OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv", + "//filtered_sites.tsv", use rule concat_vcf_fields as merge_fail_mpileup_all_sites with: input: - expand(OUTDIR / "all_sites" / "{sample}.fail_sites.tsv", sample=iter_samples()), + expand("//all_sites/{sample}.fail_sites.tsv", sample=iter_samples()), output: - OUTDIR / f"{OUTPUT_NAME}.fail_sites.tsv", + "//fail_sites.tsv", rule fill_all_sites: conda: "../envs/renv.yaml" input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - sites = OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv", + variants = "//variants.tsv", + sites = "//filtered_sites.tsv", output: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", + variants = "//variants.all_sites.tsv", log: - LOGDIR / "fill_all_sites" / "log.txt" + "//fill_all_sites/log.txt" script: "../scripts/fill_all_sites.R" @@ -99,9 
+99,9 @@ rule compile_fail_sites_vcf: sub_text = "NA", exc_text = "site_qual", input: - sites = OUTDIR / f"{OUTPUT_NAME}.fail_sites.tsv", + sites = "//fail_sites.tsv", output: - sites = temp(OUTDIR / f"{OUTPUT_NAME}.fail_sites.vcf"), + sites = temp("//fail_sites.vcf"), run: import pandas as pd HEADER = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] @@ -121,9 +121,9 @@ rule compile_fail_sites_vcf: rule merge_mask_sites_vcf: input: lambda wildcards: select_problematic_vcf(), - OUTDIR / f"{OUTPUT_NAME}.fail_sites.vcf", + "//fail_sites.vcf", output: - sites = temp(OUTDIR / "all_mask_sites.vcf"), + sites = temp("//all_mask_sites.vcf"), run: import pandas as pd HEADER = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index 5a8ed5b..f7b8aa0 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -12,14 +12,14 @@ rule snps_to_ancestor: ivar_freq=config["VC"]["MIN_FREQ"], ivar_depth=config["VC"]["MIN_DEPTH"], input: - reference_fasta=OUTDIR / f"{OUTPUT_NAME}.ancestor.fasta", + reference_fasta="//ancestor.fasta", bam=get_input_bam, - gff=OUTDIR / "reference.gff3", + gff="//reference.gff3", output: - tsv=temp(OUTDIR / "vaf" / "vc" / "{sample}.tsv"), - reference_fasta_renamed=temp(OUTDIR / "vaf" / "{sample}.reference.fasta"), + tsv=temp("//vaf/vc/{sample}.tsv"), + reference_fasta_renamed=temp("//vaf/{sample}.reference.fasta"), log: - LOGDIR / "snps_to_ancestor" / "{sample}.log.txt", + "//snps_to_ancestor/{sample}.log.txt", shell: """ set -e @@ -62,12 +62,12 @@ rule mask_tsv: params: mask_class=["mask"], input: - tsv=OUTDIR / "vaf" / "vc" / "{sample}.tsv", + tsv="//vaf/vc/{sample}.tsv", vcf=lambda wildcards: select_problematic_vcf(), output: - masked_tsv=temp(OUTDIR / "vaf" / "masked" / "{sample}.tsv"), + masked_tsv=temp("//vaf/masked/{sample}.tsv"), log: - LOGDIR / "mask_tsv" / "{sample}.log.txt", + "//mask_tsv/{sample}.log.txt", script: "../scripts/mask_tsv.py" @@ -81,11 
+81,11 @@ rule filter_tsv: min_alt_rv=2, min_alt_dp=2, input: - tsv=OUTDIR / "vaf" / "masked" / "{sample}.tsv", + tsv="//vaf/masked/{sample}.tsv", output: - filtered_tsv=temp(OUTDIR / "vaf" / "filtered" / "{sample}.tsv"), + filtered_tsv=temp("//vaf/filtered/{sample}.tsv"), log: - LOGDIR / "filter_tsv" / "{sample}.log.txt", + "//filter_tsv/{sample}.log.txt", script: "../scripts/filter_tsv.R" @@ -97,11 +97,11 @@ rule tsv_to_vcf: params: ref_name=config["ALIGNMENT_REFERENCE"], input: - tsv=OUTDIR / "vaf" / "filtered" / "{sample}.tsv", + tsv="//vaf/filtered/{sample}.tsv", output: - vcf=temp(OUTDIR / "vaf" / "vcf" / "{sample}.vcf"), + vcf=temp("//vaf/vcf/{sample}.vcf"), log: - LOGDIR / "tsv_to_vcf" / "{sample}.log.txt", + "//tsv_to_vcf/{sample}.log.txt", script: "../scripts/tsv_to_vcf.py" @@ -114,13 +114,13 @@ rule variants_effect: "../envs/snpeff.yaml" params: ref_name=config["ALIGNMENT_REFERENCE"], - snpeff_data_dir=(BASE_PATH / "config" / "snpeff").resolve(), + snpeff_data_dir=(Path(workflow.basedir).parent / "config/snpeff").resolve(), input: - vcf=OUTDIR / "vaf" / "vcf" / "{sample}.vcf", + vcf="//vaf/vcf/{sample}.vcf", output: - ann_vcf=OUTDIR / "vaf" / "annotated" / "{sample}.vcf", + ann_vcf="//vaf/annotated/{sample}.vcf", log: - LOGDIR / "variants_effect" / "{sample}.log.txt", + "//variants_effect/{sample}.log.txt", retries: 2 shell: """ @@ -148,11 +148,11 @@ rule extract_vcf_fields: ], sep=",", input: - vcf=OUTDIR / "vaf" / "annotated" / "{sample}.vcf", + vcf="//vaf/annotated/{sample}.vcf", output: - tsv=OUTDIR / "vaf" / "fields" / "{sample}.tsv", + tsv="//vaf/fields/{sample}.tsv", log: - LOGDIR / "tsv_to_vcf" / "{sample}.log.txt", + "//tsv_to_vcf/{sample}.log.txt", shell: "SnpSift extractFields -e 'NA' -s {params.sep:q} {input.vcf:q} {params.extract_columns} >{output.tsv:q} 2>{log:q}" @@ -171,11 +171,11 @@ rule format_vcf_fields_longer: # lambda to deactivate automatic wildcard expansion in pattern sep=",", input: - tsv=OUTDIR / "vaf" / "fields" / 
"{sample}.tsv", + tsv="//vaf/fields/{sample}.tsv", output: - tsv=OUTDIR / "vaf" / "fields_longer" / "{sample}.tsv", + tsv="//vaf/fields_longer/{sample}.tsv", log: - LOGDIR / "format_vcf_fields_longer" / "{sample}.log.txt", + "//format_vcf_fields_longer/{sample}.log.txt", script: "../scripts/format_vcf_fields_longer.R" @@ -184,9 +184,9 @@ rule concat_vcf_fields: params: sep="\t", input: - expand(OUTDIR / "vaf" / "fields_longer" / "{sample}.tsv", sample=iter_samples()), + expand("//vaf/fields_longer/{sample}.tsv", sample=iter_samples()), output: - OUTDIR / f"{OUTPUT_NAME}.vcf_fields.longer.tsv", + "//vcf_fields.longer.tsv", run: import pandas as pd from functools import reduce @@ -204,21 +204,21 @@ rule merge_annotation: sample="{sample}", ref_name=config["ALIGNMENT_REFERENCE"], input: - tsv=OUTDIR / "vaf" / "filtered" / "{sample}.tsv", - annot=OUTDIR / "vaf" / "fields_longer" / "{sample}.tsv", + tsv="//vaf/filtered/{sample}.tsv", + annot="//vaf/fields_longer/{sample}.tsv", output: - tsv=OUTDIR / "vaf" / "variants" / "{sample}.tsv", + tsv="//vaf/variants/{sample}.tsv", log: - LOGDIR / "merge_annotation" / "{sample}.log.txt", + "//merge_annotation/{sample}.log.txt", script: "../scripts/merge_annotation.R" use rule concat_vcf_fields as concat_variants with: input: - expand(OUTDIR / "vaf" / "variants" / "{sample}.tsv", sample=iter_samples()), + expand("//vaf/variants/{sample}.tsv", sample=iter_samples()), output: - OUTDIR / f"{OUTPUT_NAME}.variants.tsv", + "//variants.tsv", rule window_data: @@ -230,13 +230,13 @@ rule window_data: features=config.get("GB_FEATURES", {}), gb_qualifier_display="gene", input: - variants=OUTDIR / f"{OUTPUT_NAME}.variants.tsv", - gb=OUTDIR / "reference.gb", + variants="//variants.tsv", + gb="//reference.gb", output: - window_df=REPORT_DIR_TABLES / "window.csv", - json=temp(REPORT_DIR_TABLES / "window.json"), + window_df="//report/tables/window.csv", + json=temp("//report/tables/window.json"), log: - LOGDIR / "window_data" / "log.txt", + 
"//window_data/log.txt", script: "../scripts/report/window_data.py" @@ -245,37 +245,37 @@ rule nv_panel_data: conda: "../envs/renv.yaml" input: - variants=OUTDIR / f"{OUTPUT_NAME}.variants.tsv", + variants="//variants.tsv", metadata=config["METADATA"], output: - table=REPORT_DIR_TABLES / "nv_panel.csv", - json=temp(REPORT_DIR_TABLES / "nv_panel.json"), + table="//report/tables/nv_panel.csv", + json=temp("//report/tables/nv_panel.json"), log: - LOGDIR / "nv_panel_data" / "log.txt", + "//nv_panel_data/log.txt", script: "../scripts/report/nv_panel_data.R" rule nv_panel_zoom_on_feature_data: input: - table=REPORT_DIR_TABLES / "nv_panel.csv", - regions=REPORT_DIR_TABLES / "genbank_regions.json", + table="//report/tables/nv_panel.csv", + regions="//report/tables/genbank_regions.json", output: - table=temp(REPORT_DIR_TABLES / "nv_panel.{region_name}.csv"), + table=temp("//report/tables/nv_panel.{region_name}.csv"), log: - LOGDIR / "nv_panel_zoom_on_feature_data" / "{region_name}.log.txt", + "//nv_panel_zoom_on_feature_data/{region_name}.log.txt", script: "../scripts/report/nv_panel_zoom_on_feature_data.py" rule window_zoom_on_feature_data: input: - table=REPORT_DIR_TABLES / "window.csv", - regions=REPORT_DIR_TABLES / "genbank_regions.json", + table="//report/tables/window.csv", + regions="//report/tables/genbank_regions.json", output: - table=temp(REPORT_DIR_TABLES / "window.{region_name}.csv"), + table=temp("//report/tables/window.{region_name}.csv"), log: - LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt", + "//window_zoom_on_feature_data/{region_name}.log.txt", script: "../scripts/report/window_zoom_on_feature_data.py" @@ -290,14 +290,14 @@ rule af_time_correlation_data: min_abs_cor_threshold = 0.0, min_diff_af_threshold = 0.0, input: - variants=OUTDIR / f"{OUTPUT_NAME}.variants.all_sites.tsv", + variants="//variants.all_sites.tsv", metadata=config["METADATA"], output: - fmt_variants=temp(REPORT_DIR_TABLES / "variants.filled.dated.tsv"), - 
correlations=report(REPORT_DIR_TABLES / "af_time_correlation.csv"), - subset=REPORT_DIR_TABLES / "af_time_correlation.subset.txt", + fmt_variants=temp("//report/tables/variants.filled.dated.tsv"), + correlations=report("//report/tables/af_time_correlation.csv"), + subset="//report/tables/af_time_correlation.subset.txt", log: - LOGDIR / "af_time_correlation_data" / "log.txt", + "//af_time_correlation_data/log.txt", script: "../scripts/report/af_time_correlation_data.R" @@ -309,13 +309,13 @@ rule pairwise_trajectory_correlation_data: cor_method=config["COR"]["METHOD"], cor_use="pairwise.complete.obs", input: - variants=OUTDIR / f"{OUTPUT_NAME}.variants.all_sites.tsv", + variants="//variants.all_sites.tsv", metadata=config["METADATA"], output: - table=REPORT_DIR_TABLES / "pairwise_trajectory_frequency_data.csv", - matrix=report(REPORT_DIR_TABLES / "pairwise_trajectory_correlation_matrix.csv"), + table="//report/tables/pairwise_trajectory_frequency_data.csv", + matrix=report("//report/tables/pairwise_trajectory_correlation_matrix.csv"), log: - LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt", + "//pairwise_trajectory_correlation_data/log.txt", script: "../scripts/report/pairwise_trajectory_correlation_data.R" @@ -326,12 +326,12 @@ rule polymorphic_sites_over_time_data: params: max_alt_freq=1.0 - config["VC"]["MIN_FREQ"], input: - variants=OUTDIR / f"{OUTPUT_NAME}.variants.tsv", + variants="//variants.tsv", metadata=config["METADATA"], output: - table=REPORT_DIR_PLOTS / "polymorphic_sites_over_time.csv", - json=temp(REPORT_DIR_TABLES / "polymorphic_sites_over_time.json"), + table="//report/tables/polymorphic_sites_over_time.csv", + json=temp("//report/tables/polymorphic_sites_over_time.json"), log: - LOGDIR / "polymorphic_sites_over_time_data" / "log.txt", + "//polymorphic_sites_over_time_data/log.txt", script: "../scripts/report/polymorphic_sites_over_time_data.R"