A Snakemake workflow for ecDNA detection in Nanopore or Illumina sequencing reads derived from DNA samples enriched for circular DNA.

public 1yr ago Version: v2.0.0 0 bookmarks

View Workflow

A Snakemake workflow for ecDNA detection in Nanopore or Illumina sequencing reads derived from DNA samples enriched for circular DNA.

Usage

The usage of this workflow is described in the Snakemake Workflow Catalog .

If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository.

Code Snippets

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False, parse_memory=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


shell("bcftools concat {bcftools_opts} {extra} {snakemake.input.calls} {log}")

Python Snakemake BCFtools snakemake-wrapper-utils From line 1 of concat/wrapper.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(
    snakemake, parse_ref=False, parse_output_format=False, parse_memory=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


if "--tbi" in extra or "--csi" in extra:
    raise ValueError(
        "You have specified index format (`--tbi/--csi`) in `params.extra`; this is automatically infered from the first output file."
    )

if snakemake.output[0].endswith(".tbi"):
    extra += " --tbi"
elif snakemake.output[0].endswith(".csi"):
    extra += " --csi"
else:
    raise ValueError("invalid index file format ('.tbi', '.csi').")


shell("bcftools index {bcftools_opts} {extra} {snakemake.input[0]} {log}")

Python Snakemake BCFtools snakemake-wrapper-utils From line 1 of index/wrapper.py

__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2020, Filipe G. Vieira"
__license__ = "MIT"


import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "bcftools sort"
        " {bcftools_opts}"
        " {extra}"
        " --temp-dir {tmpdir}"
        " {snakemake.input[0]}"
        " {log}"
    )

Python Snakemake BCFtools snakemake-wrapper-utils From line 1 of sort/wrapper.py

__author__ = "Tom Poorten"
__copyright__ = "Copyright 2017, Tom Poorten"
__email__ = "tom.poorten@gmail.com"
__license__ = "MIT"


from os import path
from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import infer_out_format
from snakemake_wrapper_utils.samtools import get_samtools_opts


samtools_opts = get_samtools_opts(snakemake, parse_output=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
sort = snakemake.params.get("sorting", "none")
sort_extra = snakemake.params.get("sort_extra", "")

out_ext = infer_out_format(snakemake.output[0])

pipe_cmd = ""
if out_ext != "PAF":
    # Add option for SAM output
    extra += " -a"

    # Determine which pipe command to use for converting to bam or sorting.
    if sort == "none":
        if out_ext != "SAM":
            # Simply convert to output format using samtools view.
            pipe_cmd = f"| samtools view -h {samtools_opts}"

    elif sort in ["coordinate", "queryname"]:
        # Add name flag if needed.
        if sort == "queryname":
            sort_extra += " -n"

        # Sort alignments.
        pipe_cmd = f"| samtools sort {sort_extra} {samtools_opts}"

    else:
        raise ValueError(f"Unexpected value for params.sort: {sort}")


shell(
    "(minimap2"
    " -t {snakemake.threads}"
    " {extra} "
    " {snakemake.input.target}"
    " {snakemake.input.query}"
    " {pipe_cmd}"
    " > {snakemake.output[0]}"
    ") {log}"
)

Python Snakemake SAMtools snakemake-wrapper-utils From line 1 of aligner/wrapper.py

__author__ = "Tom Poorten"
__copyright__ = "Copyright 2017, Tom Poorten"
__email__ = "tom.poorten@gmail.com"
__license__ = "MIT"

from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "(minimap2 -t {snakemake.threads} {extra} "
    "-d {snakemake.output[0]} {snakemake.input.target}) {log}"
)

Python Snakemake From line 1 of index/wrapper.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

import subprocess
import sys
from pathlib import Path
from snakemake.shell import shell


log = snakemake.log_fmt_shell(stdout=False, stderr=True)


species = snakemake.params.species.lower()
build = snakemake.params.build
release = int(snakemake.params.release)
out_fmt = Path(snakemake.output[0]).suffixes
out_gz = (out_fmt.pop() and True) if out_fmt[-1] == ".gz" else False
out_fmt = out_fmt.pop().lstrip(".")


branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"


flavor = snakemake.params.get("flavor", "")
if flavor:
    flavor += "."


suffix = ""
if out_fmt == "gtf":
    suffix = "gtf.gz"
elif out_fmt == "gff3":
    suffix = "gff3.gz"
else:
    raise ValueError(
        "invalid format specified. Only 'gtf[.gz]' and 'gff3[.gz]' are currently supported."
    )


url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
    release=release,
    build=build,
    species=species,
    out_fmt=out_fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)


try:
    if out_gz:
        shell("curl -L {url} > {snakemake.output[0]} {log}")
    else:
        shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)

Python Snakemake From line 1 of ensembl-annotation/wrapper.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

import subprocess as sp
import sys
from itertools import product
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

spec = spec.format(build=build, release=release)
url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether above URL is currently available (might be a temporal server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)

Python Snakemake From line 1 of ensembl-sequence/wrapper.py

__author__ = "Michael Chambers"
__copyright__ = "Copyright 2019, Michael Chambers"
__email__ = "greenkidneybean@gmail.com"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(
    snakemake, parse_threads=False, parse_write_index=False, parse_output_format=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("samtools faidx {samtools_opts} {extra} {snakemake.input[0]} {log}")

Python Snakemake SAMtools snakemake-wrapper-utils From line 1 of faidx/wrapper.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@
# One thread for samtools merge
# Other threads are *additional* threads passed to the '-@' argument
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}"
)

Python Snakemake SAMtools From line 1 of index/wrapper.py

shell:
    """cyrcular graph table {input.graph} {input.bcf} --reference {input.reference} --circle-table {output.overview} --segment-tables {output.details} 2> {log}"""

SnakeMake From line 19 of rules/annotate.smk

shell:
    "cyrcular graph annotate "
    "  --reference {input.reference} "
    "  --gene-annotation {input.gene_annotation} "
    "  --regulatory-annotation {input.regulatory_annotation} "
    "  --output {output.annotated} "
    "  {input.graph} "
    "2> {log} "

SnakeMake From line 45 of rules/annotate.smk

script:
    "../scripts/sort_bcf_header.py"

SnakeMake From line 85 of rules/annotate.smk

shell:
    """
    bcftools view -h {input.bcf} > {output.header} 2> {log}
    """

SnakeMake BCFtools From line 98 of rules/annotate.smk

shell:
    """
    bcftools view -h {input.vcf} | rg {params.fields:q} > {output.header} 2> {log}
    """

SnakeMake BCFtools From line 115 of rules/annotate.smk

wrapper:
    "v1.25.0/bio/bcftools/index"

SnakeMake From line 12 of rules/call.smk

wrapper:
    "v1.25.0/bio/bcftools/concat"

SnakeMake From line 34 of rules/call.smk

wrapper:
    "v1.25.0/bio/bcftools/sort"

SnakeMake From line 47 of rules/call.smk

shell:
    "varlociraptor "
    "call variants generic --obs {params.obs} "
    "--scenario {input.scenario} > {output} 2> {log}"

SnakeMake Varlociraptor From line 66 of rules/call.smk

shell:
    "varlociraptor estimate alignment-properties {input.ref} --bam {input.bam} > {output} 2> {log}"

SnakeMake Varlociraptor From line 83 of rules/call.smk

shell:
    "varlociraptor preprocess variants {input.ref} "
    "--candidates {input.candidates} "
    "--max-depth 200 "
    "--alignment-properties {input.alignment_props} "
    "--pairhmm-mode {params.mode} "
    "--bam {input.bam} --output {output} 2> {log}"

SnakeMake Varlociraptor From line 108 of rules/call.smk

shell:
    "rbt vcf-split {input} {output}"

SnakeMake rust-bio-tools From line 128 of rules/call.smk

wrapper:
    "v1.25.0/bio/bcftools/sort"

SnakeMake From line 141 of rules/call.smk

shell:
    """cyrcular\
    graph\
    breakends\
    {input.bam}\
    --reference {input.ref}\
    --min-read-depth {params.min_read_depth}\
    --min-split-reads {params.min_split_reads}\
    --max-paths-per-component {params.max_paths_per_component}\
    --max-deletion-length {params.max_deletion_length}\
    -t {threads}\
    --output {output.bnds}\
    --graph {output.graph}\
    --dot {output.dot}\
    2> {log}"""

SnakeMake From line 167 of rules/call.smk

script:
    "../scripts/copy_qc_plots.py"

SnakeMake From line 61 of rules/datavzrd.smk

script:
    "../scripts/copy_graph_plots.py"

SnakeMake From line 80 of rules/datavzrd.smk

shell:
    "datavzrd {input.config} --output {output} &> {log}"

SnakeMake From line 103 of rules/datavzrd.smk

script:
    "../scripts/filter_overview_table.py"

SnakeMake From line 15 of rules/filter.smk

shell:
    """
    varlociraptor filter-calls control-fdr --mode {params.mode} --events PRESENT --var BND --fdr {params.fdr} {input.calls} | varlociraptor decode-phred | bcftools sort -m 4G -Ob > {output.fdr_calls} 2> {log}
    """

SnakeMake BCFtools Varlociraptor From line 34 of rules/filter.smk

wrapper:
    "v1.25.0/bio/minimap2/aligner"

SnakeMake From line 16 of rules/map.smk

shell:
    """{params.cmd} {input.fastqs} | pigz -c > {output} 2> {log}"""

SnakeMake From line 35 of rules/map.smk

wrapper:
    "v1.25.0/bio/samtools/index"

SnakeMake From line 52 of rules/map.smk

wrapper:
    "v1.25.0/bio/samtools/faidx"

SnakeMake From line 69 of rules/map.smk

shell:
    """cyrcular\
    graph\
    plot\
    {input.bam}\
    --graph {input.graph}\
    --output {output.plots}\
    2> {log}"""

SnakeMake From line 18 of rules/plot.smk

shell:
    """
    mkdir -p {output.pdf_dir}
    count=`ls -1 {input.graph}/ 2>{log} | wc -l`
    for f in {input.graph}/*.dot; do (dot $f -Tpdf > "{output.pdf_dir}/graph_$(basename ${{f}} .dot).pdf" 2>>{log}); done
    """

SnakeMake From line 38 of rules/plot.smk

wrapper:
    "v1.25.0/bio/reference/ensembl-sequence"

SnakeMake From line 17 of rules/ref.smk

wrapper:
    "v1.25.0/bio/samtools/faidx"

SnakeMake From line 32 of rules/ref.smk

wrapper:
    "v1.25.0/bio/minimap2/index"

SnakeMake From line 56 of rules/ref.smk

shell:
    """wget https://ftp.ensembl.org/pub/release-{params.release}/regulation/homo_sapiens/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20220201.gff.gz --no-check-certificate -O {output} 2> {log}"""

SnakeMake From line 73 of rules/ref.smk

shell:
    """wget {params.download_link} --no-check-certificate -O {output} 2> {log}"""

SnakeMake From line 89 of rules/ref.smk

wrapper:
    "v1.25.0/bio/reference/ensembl-annotation"

SnakeMake From line 105 of rules/ref.smk

from contextlib import redirect_stderr

with open(snakemake.log[0], "w") as logfile:
    with redirect_stderr(logfile):
        import os
        import shutil
        import pandas as pd
        from pathlib import Path

        os.makedirs(snakemake.params.output_dir, exist_ok=True)
        overview = pd.read_csv(snakemake.input.overview, sep="\t")
        graph_ids = set(overview["graph_id"])
        for graph_id in graph_ids:
            shutil.copy(
                f"{snakemake.input.plots}/graph_{graph_id}.pdf", snakemake.params.output_dir
            )
        Path(snakemake.output.marker).touch()

Python Pandas From line 1 of scripts/copy_graph_plots.py

from contextlib import redirect_stderr

with open(snakemake.log[0], "w") as logfile:
    with redirect_stderr(logfile):
        import os
        import shutil
        import pandas as pd
        from pathlib import Path

        os.makedirs(snakemake.params.output_dir, exist_ok=True)
        overview = pd.read_csv(snakemake.input.overview, sep="\t")
        event_ids = {s.replace("-", "_") for s in overview["event_id"]}
        for event_id in event_ids:
            shutil.copy(
                f"{snakemake.input.plots}/graph_{event_id}.html",
                snakemake.params.output_dir,
            )
        Path(snakemake.output.marker).touch()

Python Pandas From line 1 of scripts/copy_qc_plots.py

from contextlib import redirect_stderr

with open(snakemake.log[0], "w") as logfile:
    with redirect_stderr(logfile):
        import pandas as pd

        df = pd.read_csv(snakemake.input.table, sep="\t")
        df.loc[:, ["gene_names", "gene_ids", "regulatory_features"]] = df.loc[
            :, ["gene_names", "gene_ids", "regulatory_features"]
        ].fillna("")
        df["category"] = df[["num_exons", "regulatory_features", "gene_names"]].apply(
            lambda r: "coding"
            if r["num_exons"] > 0
            else (
                "regulatory"
                if r["regulatory_features"]
                else ("intronic" if r["gene_names"] else "other")
            ),
            axis=1,
        )
        for kind in ["coding", "regulatory", "intronic", "other"]:
            part = df.query(f"category == '{kind}'")
            part.to_csv(getattr(snakemake.output, kind), sep="\t", index=False)
        df.to_csv(snakemake.output.categorized, sep="\t", index=False)

Python Pandas From line 1 of scripts/filter_overview_table.py

with open(snakemake.input.header, "rt") as header_file:
    header = [l.strip() for l in header_file.readlines()]
    file_format_line = header[0]
    chrom_line = header[-1]
    other_lines = header[1:-1]
    kinds = [
        "fileDate",
        "source",
        "reference",
        "contig",
        "phasing",
        "FILTER",
        "INFO",
        "FORMAT",
        "ALT",
        "assembly",
        "META",
        "SAMPLE",
        "PEDIGREE",
        "pedigreeDB",
    ]
    categories = {kind: [] for kind in kinds}
    others = []
    for line in other_lines:
        if "=" in line:
            kind = line.split("=")[0].lstrip("#")
            group = categories.get(kind, others)
        else:
            group = others
        group.append(line)

    with open(snakemake.output.sorted_header, "wt") as out:
        print(file_format_line, file=out)
        for kind in kinds:
            lines = categories[kind]
            for line in lines:
                print(line, file=out)
        for line in others:
            print(line, file=out)
        print(chrom_line, file=out)