A snakemake for GTDB-tk

public 1yr ago 0 bookmarks

View Workflow

It uses GTDB-tk to taxonomically annotate your genomes and to build a tree.

Run as follows:

dbdir="databases"
genome_dir="genomes"
snakemake --use-conda --conda-prefix "$dbdir/conda_envs" \
    --config database_dir="$dbdir" genome_dir="$genome_dir"

where dbdir is the path to a (shared) directory in which the GTDB database and the conda environments are placed, and genome_dir is the folder containing all genome FASTA files.

This code was developed as part of metagenome-atlas. Don't forget to cite GTDB-tk.

Code Snippets

shell:
    " wget {GTDB_DATA_URL} -O {output} &> {log} "

SnakeMake From line 28 of rules/gtdbtk.smk

shell:
    'tar -xzvf {input} -C "{GTDBTK_DATA_PATH}" --strip 1 2> {log}; '
    'echo "Set the GTDBTK_DATA_PATH environment variable to {GTDBTK_DATA_PATH} " >> {log}; '
    "conda env config vars set GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} "

SnakeMake From line 44 of rules/gtdbtk.smk

shell:
    "gtdbtk identify "
    "--genome_dir {input.dir} "
    " --out_dir {params.outdir} "
    "--extension {params.extension} "
    "--cpus {threads} &> {log[0]}"

SnakeMake gtdbtk From line 68 of rules/gtdbtk.smk

shell:
    "gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} "
    "--cpus {threads} &> {log[0]}"

SnakeMake gtdbtk From line 91 of rules/gtdbtk.smk

shell:
    "gtdbtk classify --genome_dir {input.genome_dir} --align_dir {params.outdir} "
    "--out_dir {params.outdir} "
    " --tmpdir {resources.tmpdir} "
    "--extension {params.extension} "
    "--cpus {threads} &> {log[0]}"

SnakeMake gtdbtk From line 114 of rules/gtdbtk.smk

script:
    "../scripts/combine_taxonomy.py"

SnakeMake From line 130 of rules/gtdbtk.smk

shell:
    "gtdbtk infer --msa_file {input} "
    " --out_dir {params.outdir} "
    " --prefix {wildcards.msa} "
    " --cpus {threads} "
    "--tmpdir {resources.tmpdir} "

SnakeMake gtdbtk From line 147 of rules/gtdbtk.smk

script:
    "../scripts/root_tree.py"

SnakeMake From line 174 of rules/gtdbtk.smk

import os, sys
import logging, traceback

# Configure the root logger to write to the first log file that Snakemake
# provides for this job (`snakemake.log[0]`). Records at INFO level and above
# are kept, and each line is prefixed with a "YYYY-mm-dd HH:MM:SS" timestamp.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception, including its formatted traceback.

    The signature matches what ``sys.excepthook`` expects: the exception
    class, the exception instance and its traceback object.

    ``KeyboardInterrupt`` (and subclasses) is not logged; it is forwarded to
    the interpreter's original hook, ``sys.__excepthook__``, instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
        return

    # User interrupt: keep the default interpreter behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler: from here on, uncaught exceptions are passed to
# handle_exception (defined above) instead of the interpreter's default hook.
sys.excepthook = handle_exception

#### Beginning of script

import pandas as pd
import numpy as np
from utils.taxonomy import tax2table

from glob import glob

# Folder to search for summary tables, taken from the rule's `folder` input
# (presumably the output directory of `gtdbtk classify` -- confirm in the rule).
gtdb_classify_folder = snakemake.input.folder

# glob returns matches in arbitrary, filesystem-dependent order. The files are
# concatenated in list order below, so sort them to give both output tables the
# same row order on every run and every machine.
taxonomy_files = sorted(glob(f"{gtdb_classify_folder}/gtdbtk.*.summary.tsv"))

N_taxonomy_files = len(taxonomy_files)
logging.info(f"Found {N_taxonomy_files} gtdb taxonomy files.")

# Exactly one or two summary tables are accepted; anything else aborts the
# script (presumably one table per domain -- TODO confirm against GTDB-tk output).
if (0 == N_taxonomy_files) or (N_taxonomy_files > 2):

    raise Exception(
        f"Found {N_taxonomy_files} number of taxonomy files 'gtdbtk.*.summary.tsv' in {gtdb_classify_folder} expect 1 or 2."
    )


# Stack the tables row-wise; the first column of each file becomes the index.
DT = pd.concat([pd.read_table(file, index_col=0) for file in taxonomy_files], axis=0)

# Combined raw summary, written with pandas' default `to_csv` separator.
DT.to_csv(snakemake.output.combined)

# Convert the `classification` column with the project helper `tax2table`
# and write the result tab-separated.
Tax = tax2table(DT.classification, remove_prefix=True)
Tax.to_csv(snakemake.output.taxonomy, sep="\t")

Python Pandas numpy utils From line 2 of scripts/combine_taxonomy.py

import sys, os
import logging, traceback

# Configure the root logger to write to the first log file that Snakemake
# provides for this job (`snakemake.log[0]`). Records at INFO level and above
# are kept, and each line is prefixed with a "YYYY-mm-dd HH:MM:SS" timestamp.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Write an uncaught exception and its traceback to the log.

    Intended for use as ``sys.excepthook``, so it receives the exception
    class, the exception instance and the traceback object.

    A ``KeyboardInterrupt`` is passed on to ``sys.__excepthook__`` and is
    not logged.
    """
    interrupted = issubclass(exc_type, KeyboardInterrupt)
    if interrupted:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
    else:
        message_parts = ["Uncaught exception: "]
        message_parts.extend(
            traceback.format_exception(exc_type, exc_value, exc_traceback)
        )
        logging.error("".join(message_parts))


# Install exception handler: from here on, uncaught exceptions are passed to
# handle_exception (defined above) instead of the interpreter's default hook.
sys.excepthook = handle_exception

# start
import ete3

# Load the tree from the rule's `tree` input (ete3 format 1, unquoted node names).
tree = ete3.Tree(str(snakemake.input.tree), quoted_node_names=False, format=1)

try:
    tree.unroot(mode="keep")
    # Midpoint rooting is only attempted when len(tree) exceeds 2.
    if len(tree) > 2:
        midpoint = tree.get_midpoint_outgroup()
        tree.set_outgroup(midpoint)
except Exception as err:
    # Best effort: a rooting failure is logged and the tree is still written below.
    logging.error("Failed to root tree, keep unrooted. Reason was:\n\n" + str(err))

tree.write(outfile=snakemake.output.tree)