Help improve this workflow!
This workflow has been published but could be further improved with some additional metadata:
- Keyword(s) in the categories input, output, operation, and topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggesting changes, reporting issues, or requesting to become a maintainer of the workflow.
Authors
- Johannes Köster (@johanneskoester)
Usage
Step 1: Install workflow
If you simply want to use this workflow, download and extract the latest release. If you intend to modify and further develop this workflow, fork this repository. Please consider providing any generally applicable modifications via a pull request.
In any case, if you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this repository and, if available, its DOI (see above).
Step 2: Configure workflow
Configure the workflow according to your needs by editing the file `config.yaml`.
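For orientation, the sketch below shows roughly how a Snakemake workflow consumes such a config file via the `configfile:` directive and the global `config` dictionary. The key names and output paths are hypothetical placeholders, not this workflow's actual configuration; consult the workflow's own `config.yaml` for the real ones.

```python
# Minimal sketch of how a Snakefile reads config.yaml.
# The keys ("samples") and output pattern are hypothetical placeholders.
configfile: "config.yaml"

samples = config["samples"]  # e.g. a mapping of sample names to input read files

rule all:
    input:
        expand("plots/{sample}.quals.svg", sample=samples)
```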
Step 3: Execute workflow
Test your configuration by performing a dry run via

```
snakemake -n
```

Execute the workflow locally via

```
snakemake --cores $N
```

using `$N` cores, or run it in a cluster environment via

```
snakemake --cluster qsub --jobs 100
```

or

```
snakemake --drmaa --jobs 100
```
See the Snakemake documentation for further details.
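If you prefer to drive these runs from Python rather than the command line, Snakemake also ships a Python entry point. The sketch below is only an illustration, assuming the Snakemake 5.x series this workflow was developed against and a `Snakefile` in the working directory; it is not part of the workflow itself.

```python
# Sketch: invoking Snakemake through its Python API (Snakemake 5.x),
# mirroring the CLI calls shown above. Not part of this workflow.
from snakemake import snakemake

# dry run, equivalent to `snakemake -n`
snakemake("Snakefile", dryrun=True)

# local run, equivalent to `snakemake --cores $N` with N=4
snakemake("Snakefile", cores=4)
```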
Testing
Test cases are in the subfolder `.test`. They should be executed via continuous integration with Travis CI.
Code Snippets
```
script:
    "../scripts/plot-signals.py"
```
```
script:
    "../scripts/plot-read-lengths.py"
```
```
script:
    "../scripts/plot-quals.py"
```
```
script:
    "../scripts/plot-kmer-mapping.py"
```
```
shell:
    "dot -Tsvg {input} "
    "-Grankdir=TB -Nshape=box -Nstyle=rounded -Nfontname=sans "
    "-Nfontsize=10 -Npenwidth=2 -Epenwidth=2 -Ecolor=grey -Nbgcolor=white "  # -Ncolor='{params.color}'"
    "> {output}"
```
```
shell:
    """
    if [ -s {input.reads} ]
    then
        kraken --threads {threads} --db {input.db} {input.reads} > {output} 2> {log}
    else
        touch {output}
    fi
    """
```
```
shell:
    "kraken-report --db {input.db} {input.tsv} > {output} 2> {log}"
```
```
script:
    "../scripts/plot-classification.py"
```
```
shell:
    "cat {input} | gzip > {output}"
```
```
shell:
    "porechop -t {threads} -i {input} "
    "-b {params.output_dir} --format fastq.gz > {log} 2>&1; "
```
```
wrapper:
    "0.27.1/bio/fastqc"
```
```
shell:
    "crimson fastqc {input} {output}"
```
```
script:
    "../scripts/calc-colormap.py"
```
```python
import pickle

import common

full_tree = common.load_classification_tree(snakemake.input[0], min_percentage=0.0)
cmap = common.TreeCMap(full_tree)

with open(snakemake.output[0], "wb") as out:
    pickle.dump(cmap, out)
```
```python
import pickle

from networkx.drawing.nx_agraph import write_dot

import common

cmap = pickle.load(open(snakemake.input.colormap, "rb"))
tree = common.load_classification_tree(snakemake.input.classification, min_percentage=1.0)
cmap.color_tree(tree)
write_dot(tree, snakemake.output[0])
```
```python
import pickle

import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

import common

d = pd.read_table(snakemake.input.mapping,
                  names=["status", "seqid", "taxid", "seqlen", "kmer_mapping"])

mappings = []
for i, per_read_mapping in enumerate(d.kmer_mapping.str.split(" ")):
    j = 0

    def mapping_stretches(row):
        global j
        taxid, count = row.split(":")
        count = int(count)
        yield from ([taxid, x] for x in range(j, j + count))
        j += count

    m = pd.DataFrame([item for row in per_read_mapping
                      for item in mapping_stretches(row)])
    m.columns = ["taxid", "kmer"]
    m["read"] = i
    m = m.set_index(["read", "kmer"], drop=False)
    mappings.append(m)
    if i == 100:
        break

if mappings:
    mappings = pd.concat(mappings)
    mappings = mappings.taxid.unstack()
    mappings = mappings.iloc[:, :100]

    # color image in HSV representation of the circular layout of the taxid tree
    cmap = pickle.load(open(snakemake.input.colormap, "rb"))
    img = mappings.applymap(cmap.rgb)
    shape = list(img.shape) + [3]
    img = np.vstack(img.values.tolist()).reshape(shape)

    # plt.figure(figsize=(30, 10))
    plt.imshow(img, interpolation="nearest", aspect="auto")
    plt.xlabel("k-mer")
    plt.ylabel("read")

plt.savefig(snakemake.output[0])
```
```python
import json

import pandas as pd
import matplotlib
matplotlib.use("Agg")
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

fastqc = json.load(open(snakemake.input[0]))

qualdist = pd.DataFrame.from_records(fastqc["Per base sequence quality"]["contents"])
qualdist.Base = qualdist.Base.apply(lambda b: int(b.split("-")[0]))
qualdist = qualdist.replace("NaN", np.NaN)
single_value = qualdist["Lower Quartile"].isnull()
qualdist = qualdist[~single_value]

if not qualdist.empty:
    plt.fill_between(x=qualdist.Base,
                     y2=qualdist["Lower Quartile"],
                     y1=qualdist["Upper Quartile"],
                     color="#cee3a3")
    plt.plot(qualdist.Base, qualdist.Mean, "-", color="#649600")

sns.despine()
plt.xlabel("read base")
plt.ylabel("quality (PHRED scale)")
plt.xticks(rotation="vertical")
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import json

import pandas as pd
import matplotlib
matplotlib.use("Agg")
import seaborn as sns
import matplotlib.pyplot as plt

fastqc = json.load(open(snakemake.input[0]))

lendist = pd.DataFrame.from_records(fastqc["Sequence Length Distribution"]["contents"])
lendist["Length"] = lendist["Length"].apply(
    lambda l: int(l.split("-")[0]) if isinstance(l, str) else l)
total = lendist["Count"].sum()
lendist = lendist[lendist["Count"] > 1]

if not lendist.empty:
    sns.barplot(x="Length", y="Count", data=lendist, color="#649600")

sns.despine()
plt.title("total reads: {}".format(total))
plt.yscale("log")
plt.xlabel("min read length")
plt.ylabel("count")
plt.xticks(rotation="vertical")
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import json

import pandas as pd
import matplotlib
matplotlib.use("Agg")
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import h5py


def get_signals():
    for f in snakemake.input:
        f = h5py.File(f, "r")
        for read, group in f["Raw"]["Reads"].items():
            yield group["Signal"]


for signal in get_signals():
    l, u = snakemake.params.steps
    signal = signal[l:u]
    plt.plot(np.arange(signal.shape[0]), signal, "-", color="#649600")

sns.despine()
plt.xlabel("")
plt.ylabel("raw signal")
plt.xticks([])
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```
shell:
    "snakemake --nolock {input} --report {output}"
```
```python
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"


from os import path
from tempfile import TemporaryDirectory

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)


def basename_without_ext(file_path):
    """Returns basename of file path, without the file extension."""
    base = path.basename(file_path)
    split_ind = 2 if base.endswith(".gz") else 1
    base = ".".join(base.split(".")[:-split_ind])
    return base


# Run fastqc, since there can be race conditions if multiple jobs
# use the same fastqc dir, we create a temp dir.
with TemporaryDirectory() as tempdir:
    shell("fastqc {snakemake.params} --quiet "
          "--outdir {tempdir} {snakemake.input[0]}"
          " {log}")

    # Move outputs into proper position.
    output_base = basename_without_ext(snakemake.input[0])
    html_path = path.join(tempdir, output_base + "_fastqc.html")
    zip_path = path.join(tempdir, output_base + "_fastqc.zip")

    if snakemake.output.html != html_path:
        shell("mv {html_path} {snakemake.output.html}")

    if snakemake.output.zip != zip_path:
        shell("mv {zip_path} {snakemake.output.zip}")
```