Snakemake workflow for neoantigen prediction.
This workflow is still in development
This snakemake pipeline implements the GATK best-practices workflow
Step 1 Clone repository
Clone this repository to your local system 需要写一个脚本判断是否有conda,如果没有自动下载安装并创建snakemake环境
Step 2 Edit files
Configure the workflow accoreding to your needs via editing the files
config.yaml
,
sample.tsv
Step 3 Execute the process
This workflow will download reference genomes and annotation automatically. Test your configuration by performing a dry-run via
snakemake --use-conda -n
Execute the workflow locally via
snakemake --use-conda --cores N
using
N
cores or run it in a cluster environment via
snakemake --use-conda --cluster qsub --jobs 100
Step 4 Inspection results
After execution, you can create a self-contained interactive HTML report with all results via:
snakemake --report report.HTML
Code Snippets
15 16 | wrapper: "0.64.0/bio/bwa/mem" |
26 27 | wrapper: "0.64.0/bio/picard/markduplicates" |
41 42 | wrapper: "0.64.0/bio/gatk/baserecalibrator" |
56 57 | wrapper: "0.64.0/bio/gatk/applybqsr" |
16 17 | wrapper: "0.64.0/bio/gatk/mutect" |
17 18 | wrapper: "0.64.0/bio/fastp" |
32 33 | wrapper: "0.64.0/bio/fastp" |
13 14 | wrapper: "0.64.0/bio/reference/ensembl-sequence" |
26 27 | wrapper: "0.64.0/bio/bwa/index" |
40 41 | shell: "samtools dict {input} > {output} 2 > {log}" |
51 52 | wrapper: "0.64.0/bio/samtools/faidx" |
61 62 63 64 65 | wrapper: "0.64.0/bio/vep/plugins" ''' 有一个wildtype插件需要自己下 ''' |
76 77 | wrapper: "0.64.0/bio/vep/cache" |
12 13 | wrapper: "0.64.0/bio/razers3" |
11 12 | wrapper: "0.64.0/bio/optitype" |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | __author__ = "Patrik Smeds" __copyright__ = "Copyright 2016, Patrik Smeds" __email__ = "patrik.smeds@gmail.com" __license__ = "MIT" from os import path from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=False, stderr=True) # Check inputs/arguments. if len(snakemake.input) == 0: raise ValueError("A reference genome has to be provided!") elif len(snakemake.input) > 1: raise ValueError("Only one reference genome can be inputed!") # Prefix that should be used for the database prefix = snakemake.params.get("prefix", "") if len(prefix) > 0: prefix = "-p " + prefix # Contrunction algorithm that will be used to build the database, default is bwtsw construction_algorithm = snakemake.params.get("algorithm", "") if len(construction_algorithm) != 0: construction_algorithm = "-a " + construction_algorithm shell( "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | __author__ = "Johannes Köster, Julian de Ruiter" __copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter" __email__ = "koester@jimmy.harvard.edu, julianderuiter@gmail.com" __license__ = "MIT" from os import path from snakemake.shell import shell # Extract arguments. extra = snakemake.params.get("extra", "") sort = snakemake.params.get("sort", "none") sort_order = snakemake.params.get("sort_order", "coordinate") sort_extra = snakemake.params.get("sort_extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) # Check inputs/arguments. if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in { 1, 2, }: raise ValueError("input must have 1 (single-end) or " "2 (paired-end) elements") if sort_order not in {"coordinate", "queryname"}: raise ValueError("Unexpected value for sort_order ({})".format(sort_order)) # Determine which pipe command to use for converting to bam or sorting. if sort == "none": # Simply convert to bam using samtools view. pipe_cmd = "samtools view -Sbh -o {snakemake.output[0]} -" elif sort == "samtools": # Sort alignments using samtools sort. pipe_cmd = "samtools sort {sort_extra} -o {snakemake.output[0]} -" # Add name flag if needed. if sort_order == "queryname": sort_extra += " -n" prefix = path.splitext(snakemake.output[0])[0] sort_extra += " -T " + prefix + ".tmp" elif sort == "picard": # Sort alignments using picard SortSam. pipe_cmd = ( "picard SortSam {sort_extra} INPUT=/dev/stdin" " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}" ) else: raise ValueError("Unexpected value for params.sort ({})".format(sort)) shell( "(bwa mem" " -t {snakemake.threads}" " {extra}" " {snakemake.params.index}" " {snakemake.input.reads}" " | " + pipe_cmd + ") {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | __author__ = "Sebastian Kurscheid" __copyright__ = "Copyright 2019, Sebastian Kurscheid" __email__ = "sebastian.kurscheid@anu.edu.au" __license__ = "MIT" from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) n = len(snakemake.input.sample) assert ( n == 1 or n == 2 ), "input->sample must have 1 (single-end) or 2 (paired-end) elements." if n == 1: reads = "--in1 {}".format(snakemake.input.sample) else: reads = "--in1 {} --in2 {}".format(*snakemake.input.sample) trimmed_paths = snakemake.output.get("trimmed", None) if trimmed_paths is not None: if n == 1: trimmed = "--out1 {}".format(snakemake.output.trimmed) else: trimmed = "--out1 {} --out2 {}".format(*snakemake.output.trimmed) else: trimmed = "" html = "--html {}".format(snakemake.output.html) json = "--json {}".format(snakemake.output.json) shell( "(fastp --thread {snakemake.threads} {snakemake.params.extra} " "{reads} " "{trimmed} " "{json} " "{html} ) {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | __author__ = "Christopher Schröder" __copyright__ = "Copyright 2020, Christopher Schröder" __email__ = "christopher.schroeder@tu-dortmund.de" __license__ = "MIT" from snakemake.shell import shell extra = snakemake.params.get("extra", "") java_opts = snakemake.params.get("java_opts", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True) shell( "gatk --java-options '{java_opts}' ApplyBQSR {extra} -R {snakemake.input.ref} -I {snakemake.input.bam} " "--bqsr-recal-file {snakemake.input.recal_table} " "-O {snakemake.output.bam} {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | __author__ = "Christopher Schröder" __copyright__ = "Copyright 2020, Christopher Schröder" __email__ = "christopher.schroeder@tu-dortmund.de" __license__ = "MIT" from snakemake.shell import shell extra = snakemake.params.get("extra", "") java_opts = snakemake.params.get("java_opts", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) known = snakemake.input.get("known", "") if known: known = "--known-sites {}".format(known) shell( "gatk --java-options '{java_opts}' BaseRecalibrator {extra} " "-R {snakemake.input.ref} -I {snakemake.input.bam} " "-O {snakemake.output.recal_table} {known} {log}" ) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | __author__ = "Thibault Dayris" __copyright__ = "Copyright 2019, Dayris Thibault" __email__ = "thibault.dayris@gustaveroussy.fr" __license__ = "MIT" from snakemake.shell import shell from snakemake.utils import makedirs log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") shell( "gatk Mutect2 " # Tool and its subprocess "--input {snakemake.input.map} " # Path to input mapping file "--output {snakemake.output.vcf} " # Path to output vcf file "--reference {snakemake.input.fasta} " # Path to reference fasta file "{extra} " # Extra parameters "{log}" # Logging behaviour ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | __author__ = "Jan Forster" __copyright__ = "Copyright 2020, Jan Forster" __email__ = "j.forster@dkfz.de" __license__ = "MIT" import os from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) outdir = os.path.dirname(snakemake.output[0]) # get sequencing type seq_type = snakemake.params.get("sequencing_type", "dna") seq_type = "--{}".format(seq_type) # check if non-default config.ini is used config = snakemake.params.get("config", "") if any(config): config = "--config {}".format(config) shell( "(OptiTypePipeline.py" " --input {snakemake.input.reads}" " --outdir {outdir}" " --prefix {snakemake.wildcards.sample}" " {seq_type}" " {config}" " {extra})" " {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "picard MarkDuplicates {snakemake.params} INPUT={snakemake.input} " "OUTPUT={snakemake.output.bam} METRICS_FILE={snakemake.output.metrics} " "{log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | __author__ = "Jan Forster" __copyright__ = "Copyright 2020, Jan Forster" __email__ = "j.forster@dkfz.de" __license__ = "MIT" import os from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "(razers3" " -tc {snakemake.threads}" " {extra}" " -o {snakemake.output[0]}" " {snakemake.params.genome}" " {snakemake.input.reads})" " {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2019, Johannes Köster" __email__ = "johannes.koester@uni-due.de" __license__ = "MIT" import subprocess as sp import sys from itertools import product from snakemake.shell import shell species = snakemake.params.species.lower() release = int(snakemake.params.release) build = snakemake.params.build branch = "" if release >= 81 and build == "GRCh37": # use the special grch37 branch for new releases branch = "grch37/" log = snakemake.log_fmt_shell(stdout=False, stderr=True) spec = ("{build}" if int(release) > 75 else "{build}.{release}").format( build=build, release=release ) suffixes = "" datatype = snakemake.params.get("datatype", "") if datatype == "dna": suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"] elif datatype == "cdna": suffixes = ["cdna.all.fa.gz"] elif datatype == "cds": suffixes = ["cds.all.fa.gz"] elif datatype == "ncrna": suffixes = ["ncrna.fa.gz"] elif datatype == "pep": suffixes = ["pep.all.fa.gz"] else: raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep") success = False for suffix in suffixes: url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_cap}.{spec}.{suffix}".format( release=release, species=species, datatype=datatype, spec=spec.format(build=build, release=release), suffix=suffix, species_cap=species.capitalize(), branch=branch, ) try: shell("curl -sSf {url} > /dev/null 2> /dev/null") except sp.CalledProcessError: continue shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}") success = True break if not success: print( "Unable to download requested sequence data from Ensembl. " "Did you check that this combination of species, build, and release is actually provided?", file=sys.stderr, ) exit(1) |
1 2 3 4 5 6 7 8 9 10 | __author__ = "Michael Chambers" __copyright__ = "Copyright 2019, Michael Chambers" __email__ = "greenkidneybean@gmail.com" __license__ = "MIT" from snakemake.shell import shell shell("samtools faidx {snakemake.params} {snakemake.input[0]} > {snakemake.output[0]}") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2020, Johannes Köster" __email__ = "johannes.koester@uni-due.de" __license__ = "MIT" from pathlib import Path from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "vep_install --AUTO cf " "--SPECIES {snakemake.params.species} " "--ASSEMBLY {snakemake.params.build} " "--CACHE_VERSION {snakemake.params.release} " "--CACHEDIR {snakemake.output} " "--CONVERT " "--NO_UPDATE {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2020, Johannes Köster" __email__ = "johannes.koester@uni-due.de" __license__ = "MIT" import sys from pathlib import Path from urllib.request import urlretrieve from zipfile import ZipFile from tempfile import NamedTemporaryFile if snakemake.log: sys.stderr = open(snakemake.log[0], "w") outdir = Path(snakemake.output[0]) outdir.mkdir() with NamedTemporaryFile() as tmp: urlretrieve( "https://github.com/Ensembl/VEP_plugins/archive/release/{release}.zip".format( release=snakemake.params.release ), tmp.name, ) with ZipFile(tmp.name) as f: for member in f.infolist(): memberpath = Path(member.filename) if len(memberpath.parts) == 1: # skip root dir continue targetpath = outdir / memberpath.relative_to(memberpath.parts[0]) if member.is_dir(): targetpath.mkdir() else: with open(targetpath, "wb") as out: out.write(f.read(member.filename)) |
Support
- Future updates
Related Workflows





