Neoantigen Prediction Snakemake Workflow: Genomic Variant Detection and Peptidome Incorporation
This workflow detects genomic variants with Strelka and incorporates germline and somatic variants into a sample-specific peptidome using Microphaser. Somatic neopeptides are then filtered against germline peptides by sequence similarity and MHC affinity; affinity prediction is performed with netMHCpan and netMHCIIpan. The workflow allows easy definition of tumor-normal pairs, where multiple tumor samples (e.g. metastases) can be grouped with the same normal sample, as sketched below.
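For illustration only, a sample sheet along these lines could pair two tumor samples with one normal. The `sample` and `matched_normal` columns are the ones read by the workflow's scenario-rendering script; the file name and any further columns (e.g. FASTQ paths) are placeholders here, so check the schema shipped with the workflow:

```
# Hypothetical samples.tsv excerpt — column set beyond `sample` and
# `matched_normal` is illustrative.
sample        matched_normal
patientA_T1   patientA_N
patientA_T2   patientA_N
patientA_N    patientA_N
```

Here `patientA_T1` and `patientA_T2` (e.g. primary tumor and metastasis) are grouped with the same normal sample `patientA_N`.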
Software requirements
Unfortunately, the use of netMHCpan and netMHCIIpan is restricted to academic use only. To use those tools in the pipeline, please download them separately following the instructions at https://services.healthtech.dtu.dk/software.php, and specify the path to your netMHC folder in the config.yaml.
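As a sketch, a config.yaml entry could point the workflow at the locally installed tools. The key names below are hypothetical (the actual names are defined in the config.yaml template shipped with the workflow); what matters is that the configured value is the folder containing the netMHC binaries, since the rules invoke e.g. `{params.netMHC}/netMHCpan`:

```
# Hypothetical config.yaml excerpt — key names are illustrative.
affinity:
  netMHCpan:
    location: /path/to/netMHCpan-4.0     # folder from the DTU download
  netMHCIIpan:
    location: /path/to/netMHCIIpan-3.2
```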
Code Snippets
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

shell(
    "picard MarkDuplicates {snakemake.params} INPUT={snakemake.input} "
    "OUTPUT={snakemake.output.bam} METRICS_FILE={snakemake.output.metrics} "
    "&> {snakemake.log}"
)
```
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

import os
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

fq1 = snakemake.input.get("fq1")
assert fq1 is not None, "input-> fq1 is a required input parameter"
fq1 = (
    [snakemake.input.fq1]
    if isinstance(snakemake.input.fq1, str)
    else snakemake.input.fq1
)
fq2 = snakemake.input.get("fq2")
if fq2:
    fq2 = (
        [snakemake.input.fq2]
        if isinstance(snakemake.input.fq2, str)
        else snakemake.input.fq2
    )
    assert len(fq1) == len(
        fq2
    ), "input-> equal number of files required for fq1 and fq2"
input_str_fq1 = ",".join(fq1)
input_str_fq2 = ",".join(fq2) if fq2 is not None else ""
input_str = " ".join([input_str_fq1, input_str_fq2])

if fq1[0].endswith(".gz"):
    readcmd = "--readFilesCommand zcat"
else:
    readcmd = ""

outprefix = os.path.dirname(snakemake.output[0]) + "/"

shell(
    "STAR "
    "{extra} "
    "--runThreadN {snakemake.threads} "
    "--genomeDir {snakemake.params.index} "
    "--readFilesIn {input_str} "
    "{readcmd} "
    "--outFileNamePrefix {outprefix} "
    "--outStd Log "
    "{log}"
)
```
```
__author__ = "Thibault Dayris"
__copyright__ = "Copyright 2019, Dayris Thibault"
__email__ = "thibault.dayris@gustaveroussy.fr"
__license__ = "MIT"

from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")
sjdb_overhang = snakemake.params.get("sjdbOverhang", "100")

gtf = snakemake.input.get("gtf")
if gtf is not None:
    gtf = "--sjdbGTFfile " + gtf
    sjdb_overhang = "--sjdbOverhang " + sjdb_overhang
else:
    gtf = sjdb_overhang = ""

makedirs(snakemake.output)

shell(
    "STAR "  # Tool
    "--runMode genomeGenerate "  # Indexation mode
    "{extra} "  # Optional parameters
    "--runThreadN {snakemake.threads} "  # Number of threads
    "--genomeDir {snakemake.output} "  # Path to output
    "--genomeFastaFiles {snakemake.input.fasta} "  # Path to fasta files
    "{sjdb_overhang} "  # Read-len - 1
    "{gtf} "  # Highly recommended GTF
    "{log}"  # Logging
)
```
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | __author__ = "Patrik Smeds" __copyright__ = "Copyright 2016, Patrik Smeds" __email__ = "patrik.smeds@gmail.com" __license__ = "MIT" from os import path from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=False, stderr=True) # Check inputs/arguments. if len(snakemake.input) == 0: raise ValueError("A reference genome has to be provided!") elif len(snakemake.input) > 1: raise ValueError("Only one reference genome can be inputed!") # Prefix that should be used for the database prefix = snakemake.params.get("prefix", "") if len(prefix) > 0: prefix = "-p " + prefix # Contrunction algorithm that will be used to build the database, default is bwtsw construction_algorithm = snakemake.params.get("algorithm", "") if len(construction_algorithm) != 0: construction_algorithm = "-a " + construction_algorithm shell( "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2018, Johannes Köster" __email__ = "johannes.koester@protonmail.com" __license__ = "MIT" from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) shell( "picard " "CreateSequenceDictionary " "{extra} " "R={snakemake.input[0]} " "O={snakemake.output[0]} " "{log}" ) |
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

from snakemake.shell import shell

species = snakemake.params.species.lower()
release = snakemake.params.release
fmt = snakemake.params.fmt
build = snakemake.params.build

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffix = ""
if fmt == "gtf":
    suffix = "gtf.gz"
elif fmt == "gff3":
    suffix = "gff3.gz"

url = "ftp://ftp.ensembl.org/pub/release-{release}/{fmt}/{species}/{species_cap}.{build}.{release}.{suffix}".format(
    release=release,
    build=build,
    species=species,
    fmt=fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
)

shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
```
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

import subprocess as sp
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = snakemake.params.release
build = snakemake.params.build

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
if datatype == "dna":
    suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

success = False
for suffix in suffixes:
    url = "ftp://ftp.ensembl.org/pub/release-{release}/fasta/{species}/{datatype}/{species_cap}.{build}.{suffix}".format(
        release=release,
        species=species,
        datatype=datatype,
        build=build,
        suffix=suffix,
        species_cap=species.capitalize(),
    )

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    raise ValueError(
        "Requested sequence does not seem to exist on ensembl FTP servers or servers are unavailable (url {})".format(
            url
        )
    )
```
1 2 3 4 5 6 7 8 9 10 | __author__ = "Michael Chambers" __copyright__ = "Copyright 2019, Michael Chambers" __email__ = "greenkidneybean@gmail.com" __license__ = "MIT" from snakemake.shell import shell shell("samtools faidx {snakemake.params} {snakemake.input[0]} > {snakemake.output[0]}") |
```
__author__ = "Johannes Köster, Julian de Ruiter"
__copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter"
__email__ = "koester@jimmy.harvard.edu, julianderuiter@gmail.com"
__license__ = "MIT"

from os import path
from snakemake.shell import shell

# Extract arguments.
extra = snakemake.params.get("extra", "")
sort = snakemake.params.get("sort", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
    1,
    2,
}:
    raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")

if sort_order not in {"coordinate", "queryname"}:
    raise ValueError("Unexpected value for sort_order ({})".format(sort_order))

# Determine which pipe command to use for converting to bam or sorting.
if sort == "none":
    # Simply convert to bam using samtools view.
    pipe_cmd = "samtools view -Sbh -o {snakemake.output[0]} -"
elif sort == "samtools":
    # Sort alignments using samtools sort.
    pipe_cmd = "samtools sort {sort_extra} -o {snakemake.output[0]} -"
    # Add name flag if needed.
    if sort_order == "queryname":
        sort_extra += " -n"
    prefix = path.splitext(snakemake.output[0])[0]
    sort_extra += " -T " + prefix + ".tmp"
elif sort == "picard":
    # Sort alignments using picard SortSam.
    pipe_cmd = (
        "picard SortSam {sort_extra} INPUT=/dev/stdin"
        " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}"
    )
else:
    raise ValueError("Unexpected value for params.sort ({})".format(sort))

shell(
    "(bwa mem"
    " -t {snakemake.threads}"
    " {extra}"
    " {snakemake.params.index}"
    " {snakemake.input.reads}"
    " | " + pipe_cmd + ") {log}"
)
```
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | __author__ = "Johannes Köster, Derek Croote" __copyright__ = "Copyright 2020, Johannes Köster" __email__ = "johannes.koester@uni-due.de" __license__ = "MIT" import os import tempfile from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=True, stderr=True) outdir = os.path.dirname(snakemake.output[0]) if outdir: outdir = "--outdir {}".format(outdir) extra = snakemake.params.get("extra", "") with tempfile.TemporaryDirectory() as tmp: shell( "fasterq-dump --temp {tmp} --threads {snakemake.threads} " "{extra} {outdir} {snakemake.wildcards.accession} {log}" ) |
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

shell(
    "bcftools concat {snakemake.params} -o {snakemake.output[0]} "
    "{snakemake.input.calls}"
)
```
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | __author__ = "Julian de Ruiter" __copyright__ = "Copyright 2017, Julian de Ruiter" __email__ = "julianderuiter@gmail.com" __license__ = "MIT" from snakemake.shell import shell n = len(snakemake.input) assert n == 2, "Input must contain 2 (paired-end) elements." log = snakemake.log_fmt_shell(stdout=False, stderr=True) shell( "cutadapt" " {snakemake.params.adapters}" " {snakemake.params.others}" " -o {snakemake.output.fastq1}" " -p {snakemake.output.fastq2}" " -j {snakemake.threads}" " {snakemake.input}" " > {snakemake.output.qc} {log}" ) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | __author__ = "Julian de Ruiter" __copyright__ = "Copyright 2017, Julian de Ruiter" __email__ = "julianderuiter@gmail.com" __license__ = "MIT" from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=False, stderr=True) shell( "cutadapt" " {snakemake.params}" " -j {snakemake.threads}" " -o {snakemake.output.fastq}" " {snakemake.input[0]}" " > {snakemake.output.qc} {log}" ) |
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

import tempfile
import subprocess
import sys
import os
from snakemake.shell import shell
from snakemake.exceptions import WorkflowError

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build
type = snakemake.params.type

if release < 98:
    print("Ensembl releases <98 are unsupported.", file=open(snakemake.log[0], "w"))
    exit(1)

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

if type == "all":
    if species == "homo_sapiens" and release >= 93:
        suffixes = [
            "-chr{}".format(chrom) for chrom in list(range(1, 23)) + ["X", "Y", "MT"]
        ]
    else:
        suffixes = [""]
elif type == "somatic":
    suffixes = ["_somatic"]
elif type == "structural_variations":
    suffixes = ["_structural_variations"]
else:
    raise ValueError(
        "Unsupported type {} (only all, somatic, structural_variations are allowed)".format(
            type
        )
    )

species_filename = species if release >= 91 else species.capitalize()

urls = [
    "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}".format(
        release=release,
        species=species,
        suffix=suffix,
        species_filename=species_filename,
        branch=branch,
        ext=ext,
    )
    for suffix in suffixes
    for ext in ["vcf.gz", "vcf.gz.csi"]
]
names = [os.path.basename(url) for url in urls if url.endswith(".gz")]

try:
    gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls)))
    workdir = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        if snakemake.input.get("fai"):
            shell(
                "(cd {tmpdir}; {gather} && "
                "bcftools concat -Oz --naive {names} > concat.vcf.gz && "
                "bcftools reheader --fai {workdir}/{snakemake.input.fai} concat.vcf.gz "
                "> {workdir}/{snakemake.output}) {log}"
            )
        else:
            shell(
                "(cd {tmpdir}; {gather} && "
                "bcftools concat -Oz --naive {names} "
                "> {workdir}/{snakemake.output}) {log}"
            )
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download variation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided? ",
        file=sys.stderr,
    )
    exit(1)
```
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

shell("samtools index {snakemake.params} {snakemake.input[0]} {snakemake.output[0]}")
```
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell("tabix {snakemake.params} {snakemake.input[0]} {log}")
```
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

from pathlib import Path
from snakemake.shell import shell


def get_only_child_dir(path):
    children = [child for child in path.iterdir() if child.is_dir()]
    assert (
        len(children) == 1
    ), "Invalid VEP cache directory, only a single entry is allowed, make sure that cache was created with the snakemake VEP cache wrapper"
    return children[0]


extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

fork = "--fork {}".format(snakemake.threads) if snakemake.threads > 1 else ""
stats = snakemake.output.stats
cache = snakemake.input.cache
plugins = snakemake.input.plugins

entrypath = get_only_child_dir(get_only_child_dir(Path(cache)))
species = entrypath.parent.name
release, build = entrypath.name.split("_")

load_plugins = " ".join(map("--plugin {}".format, snakemake.params.plugins))

if snakemake.output.calls.endswith(".vcf.gz"):
    fmt = "z"
elif snakemake.output.calls.endswith(".bcf"):
    fmt = "b"
else:
    fmt = "v"

shell(
    "(bcftools view {snakemake.input.calls} | "
    "vep {extra} {fork} "
    "--format vcf "
    "--vcf "
    "--cache "
    "--cache_version {release} "
    "--species {species} "
    "--assembly {build} "
    "--dir_cache {cache} "
    "--dir_plugins {plugins} "
    "--offline "
    "{load_plugins} "
    "--output_file STDOUT "
    "--stats_file {stats} | "
    "bcftools view -O{fmt} > {snakemake.output.calls}) {log}"
)
```
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2020, Johannes Köster" __email__ = "johannes.koester@uni-due.de" __license__ = "MIT" from pathlib import Path from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "vep_install --AUTO cf " "--SPECIES {snakemake.params.species} " "--ASSEMBLY {snakemake.params.build} " "--CACHE_VERSION {snakemake.params.release} " "--CACHEDIR {snakemake.output} " "--CONVERT " "--NO_UPDATE {log}" ) |
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

import sys
from pathlib import Path
from urllib.request import urlretrieve
from zipfile import ZipFile
from tempfile import NamedTemporaryFile

if snakemake.log:
    sys.stderr = open(snakemake.log[0], "w")

outdir = Path(snakemake.output[0])
outdir.mkdir()

with NamedTemporaryFile() as tmp:
    urlretrieve(
        "https://github.com/Ensembl/VEP_plugins/archive/release/{release}.zip".format(
            release=snakemake.params.release
        ),
        tmp.name,
    )

    with ZipFile(tmp.name) as f:
        for member in f.infolist():
            memberpath = Path(member.filename)
            if len(memberpath.parts) == 1:
                # skip root dir
                continue
            targetpath = outdir / memberpath.relative_to(memberpath.parts[0])
            if member.is_dir():
                targetpath.mkdir()
            else:
                with open(targetpath, "wb") as out:
                    out.write(f.read(member.filename))
```
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

shell(
    "bcftools concat {snakemake.params} -o {snakemake.output[0]} "
    "{snakemake.input.calls}"
)
```
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

## Extract arguments
extra = snakemake.params.get("extra", "")

shell("bcftools index" " {extra}" " {snakemake.input[0]}")
```
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | __author__ = "Jan Forster" __copyright__ = "Copyright 2020, Jan Forster" __email__ = "j.forster@dkfz.de" __license__ = "MIT" from snakemake.shell import shell ## Extract arguments header = snakemake.input.get("header", "") if header: header_cmd = "-h " + header else: header_cmd = "" samples = snakemake.input.get("samples", "") if samples: samples_cmd = "-s " + samples else: samples_cmd = "" extra = snakemake.params.get("extra", "") view_extra = snakemake.params.get("view_extra", "") shell( "bcftools reheader " "{extra} " "{header_cmd} " "{samples_cmd} " "{snakemake.input.vcf} " "| bcftools view " "{view_extra} " "> {snakemake.output}" ) |
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

shell(
    "bcftools view {snakemake.params} {snakemake.input[0]} "
    "-o {snakemake.output[0]}"
)
```
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | __author__ = "Jan Forster" __copyright__ = "Copyright 2019, Jan Forster" __email__ = "j.forster@dkfz.de" __license__ = "MIT" from os import path from snakemake.shell import shell ## Extract arguments extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) shell( "(pre.py" " --threads {snakemake.threads}" " -r {snakemake.params.genome}" " {extra}" " {snakemake.input.variants}" " {snakemake.output})" " {log}" ) |
```
__author__ = "Jan Forster"
__copyright__ = "Copyright 2019, Jan Forster"
__email__ = "j.forster@dkfz.de"
__license__ = "MIT"

import os
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

discarded_fusions = snakemake.output.get("discarded", "")
if discarded_fusions:
    discarded_cmd = "-O " + discarded_fusions
else:
    discarded_cmd = ""

blacklist = snakemake.params.get("blacklist")
if blacklist:
    blacklist_cmd = "-b " + blacklist
else:
    blacklist_cmd = ""

known_fusions = snakemake.params.get("known_fusions")
if known_fusions:
    known_cmd = "-k " + known_fusions
else:
    known_cmd = ""

sv_file = snakemake.params.get("sv_file")
if sv_file:
    sv_cmd = "-d " + sv_file
else:
    sv_cmd = ""

shell(
    "arriba "
    "-x {snakemake.input.bam} "
    "-a {snakemake.input.genome} "
    "-g {snakemake.input.annotation} "
    "{blacklist_cmd} "
    "{known_cmd} "
    "{sv_cmd} "
    "-o {snakemake.output.fusions} "
    "{discarded_cmd} "
    "{extra} "
    "{log}"
)
```
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | __author__ = "Joël Simoneau" __copyright__ = "Copyright 2019, Joël Simoneau" __email__ = "simoneaujoel@gmail.com" __license__ = "MIT" from snakemake.shell import shell # Creating log log = snakemake.log_fmt_shell(stdout=True, stderr=True) # Placeholder for optional parameters extra = snakemake.params.get("extra", "") # Allowing for multiple FASTA files fasta = snakemake.input.get("fasta") assert fasta is not None, "input-> a FASTA-file is required" fasta = " ".join(fasta) if isinstance(fasta, list) else fasta shell( "kallisto index " # Tool "{extra} " # Optional parameters "--index={snakemake.output.index} " # Output file "{fasta} " # Input FASTA files "{log}" # Logging ) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | __author__ = "Joël Simoneau" __copyright__ = "Copyright 2019, Joël Simoneau" __email__ = "simoneaujoel@gmail.com" __license__ = "MIT" from snakemake.shell import shell # Creating log log = snakemake.log_fmt_shell(stdout=True, stderr=True) # Placeholder for optional parameters extra = snakemake.params.get("extra", "") # Allowing for multiple FASTQ files fastq = snakemake.input.get("fastq") assert fastq is not None, "input-> a FASTQ-file is required" fastq = " ".join(fastq) if isinstance(fastq, list) else fastq shell( "kallisto quant " # Tool "{extra} " # Optional parameters "--threads={snakemake.threads} " # Number of threads "--index={snakemake.input.index} " # Input file "--output-dir={snakemake.output} " # Output directory "{fastq} " # Input FASTQ files "{log}" # Logging ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | __author__ = "Jan Forster" __copyright__ = "Copyright 2020, Jan Forster" __email__ = "j.forster@dkfz.de" __license__ = "MIT" import os from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "(razers3" " -tc {snakemake.threads}" " {extra}" " -o {snakemake.output[0]}" " {snakemake.params.genome}" " {snakemake.input.reads})" " {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | __author__ = "David Laehnemann, Victoria Sack" __copyright__ = "Copyright 2018, David Laehnemann, Victoria Sack" __email__ = "david.laehnemann@hhu.de" __license__ = "MIT" import os from snakemake.shell import shell prefix = os.path.splitext(snakemake.output[0])[0] shell( "samtools bam2fq {snakemake.params} " " -@ {snakemake.threads} " " {snakemake.input[0]}" " >{snakemake.output[0]} " ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | __author__ = "Christopher Schröder" __copyright__ = "Copyright 2020, Christopher Schröder" __email__ = "christopher.schroeder@tu-dortmund.de" __license__ = "MIT" from snakemake.shell import shell extra = snakemake.params.get("extra", "") java_opts = snakemake.params.get("java_opts", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True) shell( "gatk --java-options '{java_opts}' ApplyBQSR {extra} -R {snakemake.input.ref} -I {snakemake.input.bam} " "--bqsr-recal-file {snakemake.input.recal_table} " "-O {snakemake.output.bam} {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | __author__ = "Christopher Schröder" __copyright__ = "Copyright 2020, Christopher Schröder" __email__ = "christopher.schroeder@tu-dortmund.de" __license__ = "MIT" from snakemake.shell import shell extra = snakemake.params.get("extra", "") spark_runner = snakemake.params.get("spark_runner", "LOCAL") spark_master = snakemake.params.get( "spark_master", "local[{}]".format(snakemake.threads) ) spark_extra = snakemake.params.get("spark_extra", "") java_opts = snakemake.params.get("java_opts", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) known = snakemake.input.get("known", "") if known: known = "--known-sites {}".format(known) shell( "gatk --java-options '{java_opts}' BaseRecalibratorSpark {extra} " "-R {snakemake.input.ref} -I {snakemake.input.bam} " "-O {snakemake.output.recal_table} {known} " "-- --spark-runner {spark_runner} --spark-master {spark_master} {spark_extra} " "{log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | __author__ = "Jan Forster" __copyright__ = "Copyright 2020, Jan Forster" __email__ = "j.forster@dkfz.de" __license__ = "MIT" import os from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) outdir = os.path.dirname(snakemake.output[0]) # get sequencing type seq_type = snakemake.params.get("sequencing_type", "dna") seq_type = "--{}".format(seq_type) # check if non-default config.ini is used config = snakemake.params.get("config", "") if any(config): config = "--config {}".format(config) shell( "(OptiTypePipeline.py" " --input {snakemake.input.reads}" " --outdir {outdir}" " --prefix {snakemake.wildcards.sample}" " {seq_type}" " {config}" " {extra})" " {log}" ) |
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

shell(
    "bcftools concat {snakemake.params} -o {snakemake.output[0]} "
    "{snakemake.input.calls}"
)
```
1 2 3 4 5 6 7 8 9 10 11 12 | __author__ = "Dayne Filer" __copyright__ = "Copyright 2019, Dayne Filer" __email__ = "dayne.filer@gmail.com" __license__ = "MIT" from snakemake.shell import shell shell( "bcftools norm {snakemake.params} {snakemake.input[0]} " "-o {snakemake.output[0]}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | __author__ = "Johannes Köster, Felix Mölder" __copyright__ = "Copyright 2017, Johannes Köster" __email__ = "johannes.koester@protonmail.com, felix.moelder@uni-due.de" __license__ = "MIT" from snakemake.shell import shell shell.executable("bash") log = snakemake.log_fmt_shell(stdout=False, stderr=True) params = snakemake.params.get("extra", "") pipe = "" if snakemake.output[0].endswith(".bcf"): pipe = "| bcftools view -Ob -" if snakemake.threads == 1: freebayes = "freebayes" else: chunksize = snakemake.params.get("chunksize", 100000) regions = "<(fasta_generate_regions.py {snakemake.input.ref}.fai {chunksize})".format( snakemake=snakemake, chunksize=chunksize ) if snakemake.input.get("regions", ""): regions = ( "<(bedtools intersect -a " r"<(sed 's/:\([0-9]*\)-\([0-9]*\)$/\t\1\t\2/' " "{regions}) -b {snakemake.input.regions} | " r"sed 's/\t\([0-9]*\)\t\([0-9]*\)$/:\1-\2/')" ).format(regions=regions, snakemake=snakemake) freebayes = ("freebayes-parallel {regions} {snakemake.threads}").format( snakemake=snakemake, regions=regions ) shell( "({freebayes} {params} -f {snakemake.input.ref}" " {snakemake.input.samples} {pipe} > {snakemake.output[0]}) {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | __author__ = "Jan Forster" __copyright__ = "Copyright 2019, Jan Forster" __email__ = "jan.forster@uk-essen.de" __license__ = "MIT" import os from pathlib import Path from snakemake.shell import shell config_extra = snakemake.params.get("config_extra", "") run_extra = snakemake.params.get("run_extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) bam = snakemake.input.get("bam") # input bam file, required assert bam is not None, "input-> bam is a required input parameter" if snakemake.output[0].endswith(".vcf.gz"): run_dir = Path(snakemake.output[0]).parents[2] else: run_dir = snakemake.output shell( "configureStrelkaGermlineWorkflow.py " # configure the strelka run "--bam {bam} " # input bam "--referenceFasta {snakemake.input.fasta} " # reference genome "--runDir {run_dir} " # output directory "{config_extra} " # additional parameters for the configuration "&& {run_dir}/runWorkflow.py " # run the strelka workflow "-m local " # run in local mode "-j {snakemake.threads} " # number of threads "{run_extra} " # additional parameters for the run "{log}" ) # logging |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | __author__ = "Thibault Dayris" __copyright__ = "Copyright 2019, Dayris Thibault" __email__ = "thibault.dayris@gustaveroussy.fr" __license__ = "MIT" from pathlib import Path from snakemake.shell import shell from snakemake.utils import makedirs log = snakemake.log_fmt_shell(stdout=True, stderr=True) config_extra = snakemake.params.get("config_extra", "") run_extra = snakemake.params.get("run_extra", "") # If a normal bam is given in input, # then it should be provided in the input # block, so Snakemake will perform additional # tests on file existance. normal = ( "--normalBam {}".format(snakemake.input["normal"]) if "normal" in snakemake.input.keys() else "" ) if snakemake.output[0].endswith("vcf.gz"): run_dir = Path(snakemake.output[0]).parents[2] else: run_dir = snakemake.output shell( "(configureStrelkaSomaticWorkflow.py " # Configuration script "{normal} " # Path to normal bam (if any) "--tumorBam {snakemake.input.tumor} " # Path to tumor bam "--referenceFasta {snakemake.input.fasta} " # Path to fasta file "--runDir {run_dir} " # Path to output directory "{config_extra} " # Extra parametersfor configuration " && " "{run_dir}/runWorkflow.py " # Run the pipeline "--mode local " # Stop internal job submission "--jobs {snakemake.threads} " # Nomber of threads "{run_extra}) " # Extra parameters for runWorkflow "{log}" # Logging behaviour ) |
```
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"

from snakemake.shell import shell

shell(
    "bcftools concat {snakemake.params} -o {snakemake.output[0]} "
    "{snakemake.input.calls}"
)
```
```
wrapper:
    "0.59.2/bio/vep/annotate"

wrapper:
    "0.59.2/bio/vep/annotate"

wrapper:
    "0.65.0/bio/strelka/somatic"

wrapper:
    "0.65.0/bio/strelka/germline"

wrapper:
    "0.60.0/bio/bcftools/view"

wrapper:
    "0.60.0/bio/bcftools/concat"

wrapper:
    "0.60.0/bio/bcftools/view"

wrapper:
    "0.60.0/bio/bcftools/reheader"

wrapper:
    "0.64.0/bio/bcftools/concat"

wrapper:
    "0.60.0/bio/hap.py/pre.py"

wrapper:
    "0.65.0/bio/bcftools/norm"

wrapper:
    "0.65.0/bio/freebayes"

shell:
    "rbt vcf-split {input} {output}"

shell:
    'vembrane filter --output-fmt bcf --output {output} "{params.filter}" {input} &> {log}'

shell:
    "varlociraptor filter-calls posterior-odds --events {params.events} --odds barely < {input} > {output} 2> {log}"

wrapper:
    "0.67.0/bio/bcftools/concat"

shell:
    "varlociraptor filter-calls control-fdr {input} --var {wildcards.vartype} "
    "--events {params.query[events]} --fdr {params.query[threshold]} > {output} 2> {log}"

wrapper:
    "0.59.2/bio/bcftools/concat"

shell:
    "echo -e 'normal {params.prefix}_N\ntumor {params.prefix}_T' > {output}"

wrapper:
    "0.60.0/bio/bcftools/reheader"

shell:
    "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.sample} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"

script:
    "../scripts/parse_HLA_types.py"

wrapper:
    "0.61.0/bio/razers3"

wrapper:
    "0.61.0/bio/samtools/bam2fq/interleaved"

wrapper:
    "0.63.0/bio/optitype"

shell:
    "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' "
    '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}'

wrapper:
    "0.56.0/bio/bwa/mem"

wrapper:
    "0.39.0/bio/picard/markduplicates"

wrapper:
    "0.62.0/bio/gatk/baserecalibratorspark"

wrapper:
    "0.62.0/bio/gatk/applybqsr"

run:
    alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0])
    cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
    shell(cmd)

run:
    alleles = ",".join(pd.read_csv(input.alleles, sep="\t")["Allele"].tolist())
    cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
    shell(cmd)

script:
    "../scripts/group_mhc_output.py"

script:
    "../scripts/merge_data.py"

script:
    "../scripts/add_rna_info.py"

shell:
    "microphaser somatic {input.bam} --variants {input.vcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
    "< {input.track} > {output.mt_fasta} 2> {log}"

shell:
    "microphaser normal {input.bam} --variants {input.vcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
    "< {input.track} > {output.wt_fasta} 2> {log}"

shell:
    "cat {input} > {output} 2> {log}"

shell:
    "microphaser build_reference -r {input} -o {output.bin} -l {params.length} --peptides {output.fasta} > {log} 2>&1"

shell:
    "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}"

shell:
    "xsv cat rows -d '\t' {input} | xsv fmt -t '\t' -d ',' > {output} 2>{log}"

wrapper:
    "0.45.1/bio/reference/ensembl-sequence"

wrapper:
    "0.45.1/bio/reference/ensembl-sequence"

wrapper:
    "0.60.1/bio/kallisto/index"

wrapper:
    "0.45.1/bio/reference/ensembl-annotation"

wrapper:
    "0.42.0/bio/star/index"

shell:
    "grep '^{wildcards.contig}\t' {input} > {output}"

wrapper:
    "0.45.1/bio/samtools/faidx"

wrapper:
    "0.45.1/bio/picard/createsequencedictionary"

shell:
    "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})"
    " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}"

wrapper:
    "0.59.2/bio/reference/ensembl-variation"

shell:
    "rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}"

wrapper:
    "0.45.1/bio/bwa/index"

shell:
    "cd resources/graphs && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
    "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz"

shell:
    "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"

wrapper:
    "0.59.2/bio/vep/cache"

wrapper:
    "0.59.2/bio/vep/plugins"

shell:
    "echo 'TUMOR' > {output}"

wrapper:
    "0.60.1/bio/kallisto/quant"

wrapper:
    "0.42.0/bio/star/align"

wrapper:
    "0.60.1/bio/arriba"

shell:
    "varlociraptor estimate tmb "
    "--plot-mode {wildcards.plotmode} "
    "--coding-genome-size {params.coding_genome_size} "
    "--somatic-tumor-events {params.somatic_events} "
    "--tumor-sample {params.tumor_sample} "
    "< {input} > {output} 2> {log}"

wrapper:
    "0.56.0/bio/sra-tools/fasterq-dump"

shell:
    "cat {input} > {output} 2> {log}"

wrapper:
    "0.59.2/bio/cutadapt/pe"

wrapper:
    "0.59.2/bio/cutadapt/se"

shell:
    "cat {input} > {output} 2> {log}"

wrapper:
    "0.60.0/bio/bcftools/index"

wrapper:
    "0.59.2/bio/samtools/index"

wrapper:
    "0.59.2/bio/tabix"

shell:
    "gzip < {input} > {output}"

script:
    "../scripts/tsv_to_xlsx.py"

script:
    "../scripts/render-scenario.py"

shell:
    "varlociraptor preprocess variants {params.omit_isize} --candidates {input.candidates} "
    "{input.ref} --bam {input.bam} --output {output} 2> {log}"

shell:
    "varlociraptor "
    "call variants generic --obs {params.obs} "
    "--scenario {input.scenario} > {output} 2> {log}"

shell:
    "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` "
    "-Ob {input} > {output} 2> {log}"

wrapper:
    "0.59.2/bio/bcftools/concat"

shell:
    "vl2svg {input} {output} > {log} 2>&1"
```
```
import pandas as pd

## load data table
data = pd.read_csv(snakemake.input["table"], sep='\t')

## Merge transcript count
transcript_count = pd.read_csv(snakemake.params["abundance"], sep='\t')
transcript_count = transcript_count[["target_id", "tpm"]]
transcript_count.columns = ["Transcript_ID", "TPM"]
transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split('.', expand=True)[0]

data = data.merge(transcript_count, on="Transcript_ID", how="left")
data.to_csv(snakemake.output[0], sep='\t', index=False)
```
```
import sys
import os
import pandas as pd
import numpy as np

first = True
out = open(snakemake.output[0], "w")
for e in snakemake.input:
    if os.path.getsize(e) > 0:
        mhcout = open(e, 'r')
        alleles = next(mhcout).split('\t')
        header = next(mhcout).rstrip().split('\t')
        if first:
            # Prefix each per-allele column with the allele name; the allele
            # row only names the first column of each block, so carry the
            # last seen allele forward across the following columns.
            allele = ''
            for i in range(0, len(header)):
                if i < len(alleles):
                    if alleles[i] != '':
                        allele = alleles[i].rstrip() + '_'
                header[i] = allele + header[i].rstrip()
            header[len(header) - 1] = "NB"
            first = False
            out.write('\t'.join(header) + '\n')
        for line in mhcout:
            out.write(line)
out.close()
```
```
import sys
import os
import pandas as pd
import numpy as np


def select_columns(mhc):
    rank_cols = [c for c in mhc.columns if "Rank" in c]
    affinity_cols = [c for c in mhc.columns if "nM" in c]
    mhc_cols = ["Pos"] + ["ID"] + ["Peptide"] + rank_cols + affinity_cols + ["NB"]
    mhc = mhc[mhc_cols]
    mhc["Rank_min"] = mhc[rank_cols].min(axis=1)
    mhc["Aff_min"] = mhc[affinity_cols].min(axis=1)
    mhc["Top_rank_HLA"] = mhc[rank_cols].idxmin(axis=1)
    mhc["Top_affinity_HLA"] = mhc[affinity_cols].idxmin(axis=1)
    mhc["Top_rank_HLA"] = mhc["Top_rank_HLA"].str.replace("_Rank", "")
    mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM", "")
    return mhc


def merge(info, tumor, normal, outfile):
    tumor = select_columns(tumor)
    normal = select_columns(normal)
    id_length = len(tumor.ID[0])
    print(info.columns)
    info["ID"] = info["id"].astype(str).str[:id_length]
    merged_mhc = tumor.merge(normal, how='left', on=['Pos', 'ID'])
    merged_mhc = merged_mhc.rename(
        columns={col: col.replace("_y", "_normal") for col in merged_mhc.columns}
    ).rename(columns={col: col.replace("_x", "_tumor") for col in merged_mhc.columns})
    info = info.rename(
        columns={
            "gene_id": "Gene_ID",
            "gene_name": "Gene_Symbol",
            "strand": "Strand",
            "positions": "Variant_Position",
            "chrom": "Chromosome",
            "somatic_aa_change": "Somatic_AminoAcid_Change",
        }
    )
    merged_dataframe = merged_mhc.merge(info, how='left', on='ID')
    merged_dataframe["Peptide_tumor"] = merged_dataframe[
        ["Peptide_tumor", "Peptide_normal"]
    ].apply(lambda x: diffEpitope(*x), axis=1)
    ## Are all possible variants in the peptide ("Cis") or not ("Trans")
    merged_dataframe["Variant_Orientation"] = "Cis"
    trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar
    merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans"
    ## check missense/silent mutation status
    nonsilent = merged_dataframe.Peptide_tumor != merged_dataframe.Peptide_normal
    merged_dataframe = merged_dataframe[nonsilent]
    merged_dataframe = merged_dataframe.drop_duplicates(
        subset=["transcript", "offset", "Peptide_tumor", "Somatic_AminoAcid_Change"]
    )
    data = merged_dataframe[
        ["ID", "transcript", "Gene_ID", "Gene_Symbol", "Chromosome", "offset",
         "freq", "depth", "Somatic_AminoAcid_Change", "nvar", "nsomatic",
         "somatic_positions", "Peptide_tumor", "NB_tumor", "Rank_min_tumor",
         "Aff_min_tumor", "Top_rank_HLA_tumor", "Top_affinity_HLA_tumor",
         "Peptide_normal", "NB_normal", "Rank_min_normal", "Aff_min_normal",
         "Top_rank_HLA_normal", "Top_affinity_HLA_normal"]
    ]
    data.columns = [
        "ID", "Transcript_ID", "Gene_ID", "Gene_Symbol", "Chromosome", "Position",
        "Frequency", "Read_Depth", "Somatic_AminoAcid_Change", "nvar", "nsomatic",
        "somatic_positions", "Peptide_tumor", "BindingHLAs_tumor", "Rank_min_tumor",
        "Affinity_min_tumor", "Top_rank_HLA_tumor", "Top_affinity_HLA_tumor",
        "Peptide_normal", "BindingHLAs_normal", "Rank_min_normal", "Aff_min_normal",
        "Top_rank_HLA_normal", "Top_affinity_HLA_normal",
    ]
    # data = data[data.BindingHLAs_tumor > 0]
    # data = data[(data.NB_normal.isna()) | (data.NB_normal == 0)]
    # data = data[(data.BindingHLAs_normal == 0)]
    ### Delete Stop-Codon including peptides
    data = data[data.Peptide_tumor.str.count("x") == 0]
    data = data[data.Peptide_tumor.str.count("X") == 0]
    data.sort_values(["Chromosome", "somatic_positions"], inplace=True)
    ### Remove duplicate kmers
    data = data.drop_duplicates(
        ["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"]
    )
    data.to_csv(outfile, index=False, sep='\t')


## highlight the difference between mutated neopeptide and wildtype
def diffEpitope(e1, e2):
    if str(e2) == 'nan' or str(e2) == '':
        return e1
    e1 = str(e1)
    e2 = str(e2)
    diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]]
    e_new = e1
    e2_new = e2
    for p in diff_pos:
        e_new = e_new[:p] + e_new[p].lower() + e_new[p + 1:]
        e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p + 1:]
    return e_new


def main():
    info = pd.read_csv(snakemake.input[0], sep='\t', dtype=str)
    tumor = pd.read_csv(snakemake.input[1], sep='\t')
    normal = pd.read_csv(snakemake.input[2], sep='\t')
    outfile = snakemake.output[0]
    merge(info, tumor, normal, outfile)


if __name__ == '__main__':
    sys.exit(main())
```
```
import pandas as pd

hlaI = ["A", "B", "C"]
hlaII = ["DRB1", "DPA1", "DPB1", "DQA1", "DQB1"]

hlas = pd.read_csv(snakemake.input[0], sep='\t')

hlasI = hlas[hlas.Locus.isin(hlaI)]
hlasI["Allele"] = "HLA-" + hlasI.Allele.str.split(":", expand=True)[[0, 1]].apply(lambda x: ''.join(x), axis=1).str.replace('*', '')
hlasI = hlasI[["Allele"]].drop_duplicates()
hlasI.to_csv(snakemake.output[0], sep='\t', index=False)

hlasII = hlas[hlas.Locus.isin(hlaII)]
hlasII["HLA"] = hlasII.Locus.str[0:2]
hlasII["Allele"] = hlasII.Allele.str.split(":", expand=True)[[0, 1]].apply(lambda x: ''.join(x), axis=1).str.replace('*', '')
hlasII = pd.DataFrame(
    "HLA-" + hlasII.groupby(["HLA", "Chromosome"])["Allele"].apply(lambda x: "-".join(x)).reset_index()["Allele"]
).drop_duplicates()
hlasII.loc[hlasII.Allele.str.contains("DRB"), "Allele"] = hlasII[hlasII.Allele.str.contains("DRB")]["Allele"].str.replace("HLA-DRB1", "DRB1_")
hlasII.to_csv(snakemake.output[1], sep='\t', index=False)
```
```
from jinja2 import Template
import pandas as pd

with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out:
    samples = snakemake.params.samples
    out.write(
        Template(template.read()).render(
            samples=samples[
                (samples["sample"] == snakemake.wildcards.pair)
                | (samples["sample"] == samples.loc[snakemake.wildcards.pair, "matched_normal"])
            ]
        )
    )
```
```
import sys
import pandas as pd

sys.stderr = open(snakemake.log[0], "w")

data = pd.read_csv(snakemake.input.tsv, sep="\t")
data.to_excel(snakemake.output.xlsx, index=False)
```