Snakemake Workflow: De Novo Transcriptome Assembly with HISAT2 and StringTie
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
Snakemake workflow: De novo transcriptome assembly with Hisat2 and StringTie.
This pipeline comprises all steps of the 'new tuxedo suit' workflow published by Pertea et al. (1) and can be used to perform genome-guided de novo transcriptome assembly on bulk RNA-seq data with default parameters (without downstream analysis in R). Additionally, the piepline comprises:
-
adapter and quality trimmimg
-
read quality control with FASTQC
-
generation of a representative alingment file and bed file for the purpose of visualizing read coverage.
(1) Pertea, Mihaela; Kim, Daehwan; Pertea, Geo M.; Leek, Jeffrey T.; Salzberg, Steven L. (2016): Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown. In: Nature Protocols 11, 1650 EP
Code Snippets
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | __author__ = "Wibowo Arindrarto" __copyright__ = "Copyright 2016, Wibowo Arindrarto" __email__ = "bow@bow.web.id" __license__ = "BSD" from snakemake.shell import shell # Placeholder for optional parameters extra = snakemake.params.get("extra", "") # Run log log = snakemake.log_fmt_shell() # Input file wrangling reads = snakemake.input.get("reads") if isinstance(reads, str): input_flags = "-U {0}".format(reads) elif len(reads) == 1: input_flags = "-U {0}".format(reads[0]) elif len(reads) == 2: input_flags = "-1 {0} -2 {1}".format(*reads) else: raise RuntimeError( "Reads parameter must contain at least 1 and at most 2" " input files.") # Executed shell command shell( "(hisat2 {extra} --threads {snakemake.threads}" " -x {snakemake.params.idx} {input_flags}" " | samtools view -Sbh -o {snakemake.output[0]} -)" " {log}") |
74 75 | wrapper: "0.35.0/bio/trimmomatic/pe" |
94 95 | script: "scripts/hisat2_wrapper0.34.0.py" |
107 108 | wrapper: "0.35.1/bio/samtools/sort" |
115 116 | wrapper: "0.35.1/bio/samtools/index" |
150 151 | shell: "stringtie -v --merge -p {threads} -G {input.anno} -o {output} {input.gtf} 2> {log}" |
164 165 | shell: "gffcompare -G -r {input.anno} -o output/gffcompare/GFFcompare {input.st_transcripts}" |
178 | shell: "stringtie -e -B -p {threads} -G {input.anno} -o {output} {input.sbam}" |
190 191 192 193 194 195 | shell: """ cd output prepDE.py -i ballgown cd .. """ |
212 213 | wrapper: "0.34.0/bio/fastqc" |
226 227 | wrapper: "0.34.0/bio/fastqc" |
239 240 | wrapper: "0.35.1/bio/multiqc" |
257 258 | wrapper: "0.34.0/bio/fastqc" |
274 275 | wrapper: "0.35.1/bio/multiqc" |
294 295 296 297 298 299 | shell: """ nreads=$(samtools view -c {input}) rate=$(echo "scale=5;100000/$nreads" | bc) sambamba view -f bam -t 5 --subsampling-seed=42 -s $rate {input} | samtools sort -m 4G -@ 8 -T - > {output} 2> {log} """ |
311 312 | wrapper: "0.35.1/bio/samtools/merge" |
323 324 | wrapper: "0.35.1/bio/samtools/index" |
336 337 | shell: "bamCoverage -b {input.bam} -o {output}" |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | __author__ = "Julian de Ruiter" __copyright__ = "Copyright 2017, Julian de Ruiter" __email__ = "julianderuiter@gmail.com" __license__ = "MIT" from os import path from tempfile import TemporaryDirectory from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=False, stderr=True) def basename_without_ext(file_path): """Returns basename of file path, without the file extension.""" base = path.basename(file_path) split_ind = 2 if base.endswith(".gz") else 1 base = ".".join(base.split(".")[:-split_ind]) return base # Run fastqc, since there can be race conditions if multiple jobs # use the same fastqc dir, we create a temp dir. with TemporaryDirectory() as tempdir: shell("fastqc {snakemake.params} --quiet " "--outdir {tempdir} {snakemake.input[0]}" " {log}") # Move outputs into proper position. output_base = basename_without_ext(snakemake.input[0]) html_path = path.join(tempdir, output_base + "_fastqc.html") zip_path = path.join(tempdir, output_base + "_fastqc.zip") if snakemake.output.html != html_path: shell("mv {html_path} {snakemake.output.html}") if snakemake.output.zip != zip_path: shell("mv {zip_path} {snakemake.output.zip}") |
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | __author__ = "Johannes Köster, Jorge Langa" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" from snakemake.shell import shell # Distribute available threads between trimmomatic itself and any potential pigz instances def distribute_threads(input_files, output_files, available_threads): gzipped_input_files = sum(1 for file in input_files if file.endswith(".gz")) gzipped_output_files = sum(1 for file in output_files if file.endswith(".gz")) potential_threads_per_process = available_threads // (1 + gzipped_input_files + gzipped_output_files) if potential_threads_per_process > 0: # decompressing pigz creates at most 4 threads pigz_input_threads = min(4, potential_threads_per_process) if gzipped_input_files != 0 else 0 pigz_output_threads = \ (available_threads - pigz_input_threads * gzipped_input_files) // (1 + gzipped_output_files) \ if gzipped_output_files != 0 else 0 trimmomatic_threads = available_threads - pigz_input_threads * gzipped_input_files - \ pigz_output_threads * gzipped_output_files else: # not enough threads for pigz pigz_input_threads = 0 pigz_output_threads = 0 trimmomatic_threads = available_threads return trimmomatic_threads, pigz_input_threads, pigz_output_threads def compose_input_gz(filename, threads): if filename.endswith(".gz") and threads > 0: return "<(pigz -p {threads} --decompress --stdout {filename})".format( threads=threads, filename=filename ) return filename def compose_output_gz(filename, threads, compression_level): if filename.endswith(".gz") and threads > 0: return ">(pigz -p {threads} {compression_level} > {filename})".format( threads=threads, compression_level=compression_level, filename=filename ) return filename extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) compression_level = snakemake.params.get("compression_level", "-5") trimmer = " ".join(snakemake.params.trimmer) # Distribute threads input_files = [snakemake.input.r1, snakemake.input.r2] output_files = [snakemake.output.r1, snakemake.output.r1_unpaired, snakemake.output.r2, snakemake.output.r2_unpaired] trimmomatic_threads, input_threads, output_threads = distribute_threads( input_files, output_files, snakemake.threads ) input_r1, input_r2 = [compose_input_gz(filename, input_threads) for filename in input_files] output_r1, output_r1_unp, output_r2, output_r2_unp = [ compose_output_gz(filename, output_threads, compression_level) for filename in output_files ] shell( "trimmomatic PE -threads {trimmomatic_threads} {extra} " "{input_r1} {input_r2} " "{output_r1} {output_r1_unp} " "{output_r2} {output_r2_unp} " "{trimmer} " "{log}" ) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | __author__ = "Julian de Ruiter" __copyright__ = "Copyright 2017, Julian de Ruiter" __email__ = "julianderuiter@gmail.com" __license__ = "MIT" from os import path from snakemake.shell import shell input_dirs = set(path.dirname(fp) for fp in snakemake.input) output_dir = path.dirname(snakemake.output[0]) output_name = path.basename(snakemake.output[0]) log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "multiqc" " {snakemake.params}" " --force" " -o {output_dir}" " -n {output_name}" " {input_dirs}" " {log}") |
1 2 3 4 5 6 7 8 9 10 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" from snakemake.shell import shell shell("samtools index {snakemake.params} {snakemake.input[0]} {snakemake.output[0]}") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" from snakemake.shell import shell # Samtools takes additional threads through its option -@ # One thread for samtools merge # Other threads are *additional* threads passed to the '-@' argument threads = ( "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1) ) shell("samtools merge {threads} {snakemake.params} " "{snakemake.output[0]} {snakemake.input}") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" import os from snakemake.shell import shell prefix = os.path.splitext(snakemake.output[0])[0] # Samtools takes additional threads through its option -@ # One thread for samtools # Other threads are *additional* threads passed to the argument -@ threads = ( "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1) ) shell( "samtools sort {snakemake.params} {threads} -o {snakemake.output[0]} " "-T {prefix} {snakemake.input[0]}") |
Support
- Future updates
Related Workflows





