A Snakemake workflow to process paired-end RNA-Seq data.

public 1yr ago 0 bookmarks

View Workflow

A Snakemake workflow to process paired-end RNA-Seq data.

Steps:

The workflow consists of the following steps:

Quality control of the raw and/or trimmed data (FastQC, MultiQC)
Adapter trimming w/ trim_galore (Optional)
Contamination check and decontamination (Optional)
Alignment to the reference genome (hisat2, STAR)
Alignment quality control with RSeQC, QualiMap
Transcript/gene quantification (StringTie, featureCounts, RSEM)
Alignment-free transcript quantification (kallisto/salmon)

Future additions:

Differential gene expression analysis (deseq2)
Machine learning-based mapping uncertainty analysis (GeneQC)

dag

Code Snippets

shell:
    """
    fastqc --outdir {params.prefix} -t {threads} -f fastq {input}
    """

SnakeMake FastQC From line 17 of rules/01a_fastqc.smk

wrapper:
    "0.35.0/bio/multiqc"

SnakeMake MultiQC From line 9 of rules/01b_multiqc.smk

wrapper:
    "0.35.2/bio/trim_galore/pe"

SnakeMake From line 13 of rules/02_trim.smk

shell:
    """
    fastqc -t {threads} {input.r1} {input.r2} -q -f fastq -o {params.prefix}
    """

SnakeMake FastQC From line 30 of rules/02_trim.smk

wrapper:
    "0.49.0/bio/multiqc"

SnakeMake MultiQC From line 43 of rules/02_trim.smk

wrapper:
    "0.49.0/bio/multiqc"

SnakeMake MultiQC From line 54 of rules/02_trim.smk

shell:
    '''
    bwa mem -t {threads} {input.rrna} {input.r1} {input.r2} > {output}
    '''

SnakeMake BWA From line 11 of rules/03a_rrna_check.smk

shell:
    '''
    samtools view -@ {threads} -bS -o {output} {input.sam}
    '''

SnakeMake SAMtools From line 24 of rules/03a_rrna_check.smk

shell:
    '''
    samtools flagstat {input.bam} --threads {threads} > {output}
    '''

SnakeMake SAMtools From line 37 of rules/03a_rrna_check.smk

shell:
    '''
    samtools stats {input.bam} > {output}
    '''

SnakeMake SAMtools From line 50 of rules/03a_rrna_check.smk

wrapper:
    "0.48.0/bio/multiqc"

SnakeMake MultiQC From line 64 of rules/03a_rrna_check.smk

shell:
    """
    bbsplit.sh -Xmx120g threads={threads} in1={input.r1} in2={input.r2} ref_rrna={input.rrna} path={params.index} basename={params.out_rrna}/out_%.fq outu1={output.out1} outu2={output.out2} refstats={output.stats} 2> {log}
    """

SnakeMake From line 18 of rules/03b_rrna_clean.smk

shell:
    """
    fastqc -t {threads} {input.r1} {input.r2} -q -f fastq -o {params.prefix}
    """

SnakeMake FastQC From line 37 of rules/03b_rrna_clean.smk

wrapper:
    "0.47.0/bio/multiqc"

SnakeMake MultiQC From line 50 of rules/03b_rrna_clean.smk

shell:
    '''
    bwa mem -t {threads} {input.rrna} {input.r1} {input.r2} > {output}
    '''

SnakeMake BWA From line 13 of rules/03c_rrna_cleaned_check.smk

shell:
    '''
    samtools view -@ {threads} -bS -o {output} {input.sam}
    '''

SnakeMake SAMtools From line 26 of rules/03c_rrna_cleaned_check.smk

shell:
    '''
    samtools stats {input.bam} -@ {threads} > {output}
    '''

SnakeMake SAMtools From line 39 of rules/03c_rrna_cleaned_check.smk

wrapper:
    "0.48.0/bio/multiqc"

SnakeMake MultiQC From line 52 of rules/03c_rrna_cleaned_check.smk

shell:
    "gffread {input.anno} -T -o {output.gtf}"

SnakeMake gffread From line 8 of rules/04a_star.smk

wrapper:
    "0.49.0/bio/star/index"

SnakeMake From line 22 of rules/04a_star.smk

wrapper:
    "0.49.0/bio/star/align"

SnakeMake From line 41 of rules/04a_star.smk

shell:
    """
    cat {input} | awk '($5 > 0 && $7 > 2 && $6==0)' | cut -f1-6 | sort | uniq > {output}
    """

SnakeMake From line 49 of rules/04a_star.smk

wrapper:
    "0.49.0/bio/star/align"

SnakeMake From line 73 of rules/04a_star.smk

script:
    "../scripts/gtf2bed.py"

SnakeMake From line 11 of rules/04b_rseqc.smk

shell:
    "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log[0]} 2>&1"

SnakeMake From line 38 of rules/04b_rseqc.smk

shell:
    "bam_stat.py -i {input} > {output} 2> {log}"

SnakeMake From line 51 of rules/04b_rseqc.smk

shell:
    "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}"

SnakeMake From line 65 of rules/04b_rseqc.smk

shell:
    "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}"

SnakeMake From line 78 of rules/04b_rseqc.smk

shell:
    "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"

SnakeMake From line 92 of rules/04b_rseqc.smk

wrapper:
    "0.49.0/bio/multiqc"

SnakeMake MultiQC From line 126 of rules/04b_rseqc.smk

shell:
    '''
    samtools sort -@ {threads} -n -o {output} -T {params.prfx} {input.bam}
    '''

SnakeMake SAMtools From line 11 of rules/04c_qualimap.smk

shell:
    '''
    qualimap rnaseq -bam {input.sorted_bam} -gtf {params.gtf} --outdir {params.outdir} --sorted --paired
    '''

SnakeMake QualiMap From line 26 of rules/04c_qualimap.smk

wrapper:
    "0.49.0/bio/multiqc"

SnakeMake MultiQC From line 38 of rules/04c_qualimap.smk

shell:
    """
    rsem-prepare-reference \
    -p {threads} \
    --gff3 {input.gff} {input.fasta} {params.ref_name}
    """

SnakeMake RSEM From line 12 of rules/05a_rsem.smk

shell:
    """
    rsem-calculate-expression \
    --no-qualities \
    -p {threads} \
    --strandedness reverse \
    --alignments --paired-end {input.bam} {params.genomedir} {params.prefix}
    """

SnakeMake RSEM From line 31 of rules/05a_rsem.smk

wrapper:
    "0.49.0/bio/multiqc"

SnakeMake MultiQC From line 47 of rules/05a_rsem.smk

shell:
    """
    htseq-count -m intersection-nonempty --stranded=reverse --idattr gene_id -r pos -f bam {input} {params.gtf} > {output} 2> {log}
    """

SnakeMake htseqcount From line 13 of rules/05b_htseq.smk

shell:
    """
    multiqc -m htseq {params.prefix} --filename {output} 2> {log}
    """

SnakeMake MultiQC HTSeq From line 29 of rules/05b_htseq.smk

shell:
    """
    grep "^>" {input.ref} | cut -d " " -f 1 > {output.decoy}
    sed -i.bak -e 's/>//g' {output.decoy}
    cat {input.tcp} {input.ref} > {output.gent}
    """

SnakeMake From line 12 of rules/06a_salmon.smk

shell:
    """
    salmon index -p {threads} -t {input.gent} -d {input.decoy} -i {output} &> {log}
    """

SnakeMake Salmon From line 30 of rules/06a_salmon.smk

shell:
    """
    salmon quant -i {input.index} -l A -1 {input.r1} -2 {input.r2} -o {output} --validateMappings --gcBias --seqBias --writeUnmappedNames --writeMappings={output.mappings} -p {threads} --numBootstraps 100
    """

SnakeMake Quant Salmon From line 49 of rules/06a_salmon.smk

wrapper:
    "0.49.0/bio/multiqc"

SnakeMake MultiQC From line 61 of rules/06a_salmon.smk

shell:
    '''
    gffread -w {output} -g {input.ref} {input.gtf}
    '''

SnakeMake gffread From line 73 of rules/06a_salmon.smk

shell:
    '''
    salmon quant -t {input.tcp} -l A -a {input.bam} -o {output} --gcBias --seqBias --writeUnmappedNames -p {threads} -g {input.gtf} --numBootstraps 100
    '''

SnakeMake Quant Salmon From line 91 of rules/06a_salmon.smk

wrapper:
    "0.47.0/bio/multiqc"

SnakeMake MultiQC From line 106 of rules/06a_salmon.smk

shell:
    "kallisto index -i {output} {input} 2> {log}"

SnakeMake kallisto From line 11 of rules/06b_kallisto.smk

shell:
    "kallisto quant --threads {threads} -i {input.idx} -o {output} --gtf {input.gtf} "
    "{params.extra} {input.r1} {input.r2} 2> {log}"

SnakeMake Quant kallisto From line 29 of rules/06b_kallisto.smk

wrapper:
    "0.49.0/bio/multiqc"

SnakeMake MultiQC From line 40 of rules/06b_kallisto.smk

import gffutils

# Build a gffutils database from the first input file and store it at the
# path given by the `db` output. `snakemake` is provided by the Snakemake
# `script:` directive that runs this file.
annotation_db = gffutils.create_db(
    snakemake.input[0],
    dbfn=snakemake.output.db,
    force=True,
    keep_order=True,
    merge_strategy='merge',
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)

# Write one BED12 line per 'transcript' feature, iterating ordered by start.
with open(snakemake.output.bed, 'w') as bed_handle:
    transcripts = annotation_db.features_of_type('transcript', order_by='start')
    for transcript in transcripts:
        raw_line = annotation_db.bed12(transcript)
        fields = [field.strip() for field in raw_line.split('\t')]
        # Overwrite the fourth column (index 3) with the transcript's id.
        fields[3] = transcript.id
        bed_handle.write('\t'.join(fields) + '\n')

Python From line 3 of scripts/gtf2bed.py

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


# Wrapper body: run MultiQC on the directories containing the rule's input
# files and write one report whose directory and file name are taken from
# the first output path. `snakemake`, `path` and `shell` come from the
# surrounding wrapper script.

# Collapse the inputs to their unique parent directories. A bare file name
# has an empty dirname; use "." so the current directory is still passed to
# MultiQC instead of an empty argument.
input_dirs = set(path.dirname(fp) or "." for fp in snakemake.input)

# Same fallback for the report location: an empty value would leave `-o`
# without an argument on the command line.
output_dir = path.dirname(snakemake.output[0]) or "."
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# The {placeholders} below refer to the variables defined above, so those
# names must stay in sync with the command template.
shell(
    "multiqc"
    " {snakemake.params}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_dirs}"
    " {log}"
)

Python Snakemake MultiQC From line 3 of multiqc/wrapper.py

__author__ = "Kerrin Mendler"
__copyright__ = "Copyright 2018, Kerrin Mendler"
__email__ = "mendlerke@gmail.com"
__license__ = "MIT"


from snakemake.shell import shell
import os.path


# Wrapper body: validate the rule's inputs/outputs and run trim_galore in
# paired-end mode. `snakemake`, `shell` and `os.path` come from the
# surrounding wrapper script.

log = snakemake.log_fmt_shell()

# `extra` is optional; read it once with a default so both the `--fastqc`
# check and the command line below work when the rule defines no `extra`.
extra = snakemake.params.get("extra", "")

# Check that two input files were supplied
n = len(snakemake.input)
assert n == 2, "Input must contain 2 files. Given: %r." % n

# Don't run with `--fastqc` flag
if "--fastqc" in extra:
    raise ValueError(
        "The trim_galore Snakemake wrapper cannot "
        "be run with the `--fastqc` flag. Please "
        "remove the flag from extra params. "
        "You can use the fastqc Snakemake wrapper on "
        "the input and output files instead."
    )

# Check that four output files were supplied
m = len(snakemake.output)
assert m == 4, "Output must contain 4 files. Given: %r." % m

# Check that all output files are in the same directory
out_dir = os.path.dirname(snakemake.output[0])
for file_path in snakemake.output[1:]:
    assert out_dir == os.path.dirname(file_path), \
        "trim_galore can only output files to a single directory." \
        " Please indicate only one directory for the output files."

# Outputs given as bare file names have an empty dirname; use "." so that
# `-o` always receives a value instead of swallowing the next argument.
out_dir = out_dir or "."

shell(
    "(trim_galore"
    " {extra}"
    " --paired"
    " -o {out_dir}"
    " {snakemake.input})"
    " {log}"
)

Python Snakemake FastQC Trim_Galore From line 3 of pe/wrapper.py

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


# Wrapper body: run MultiQC on the directories containing the rule's input
# files and write one report whose directory and file name are taken from
# the first output path. `snakemake`, `path` and `shell` come from the
# surrounding wrapper script.

# Collapse the inputs to their unique parent directories. A bare file name
# has an empty dirname; use "." so the current directory is still passed to
# MultiQC instead of an empty argument.
input_dirs = set(path.dirname(fp) or "." for fp in snakemake.input)

# Same fallback for the report location: an empty value would leave `-o`
# without an argument on the command line.
output_dir = path.dirname(snakemake.output[0]) or "."
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# The {placeholders} below refer to the variables defined above, so those
# names must stay in sync with the command template.
shell(
    "multiqc"
    " {snakemake.params}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_dirs}"
    " {log}"
)

Python Snakemake MultiQC From line 3 of multiqc/wrapper.py

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


# Wrapper body: run MultiQC on the directories containing the rule's input
# files and write one report whose directory and file name are taken from
# the first output path. `snakemake`, `path` and `shell` come from the
# surrounding wrapper script.

# Collapse the inputs to their unique parent directories. A bare file name
# has an empty dirname; use "." so the current directory is still passed to
# MultiQC instead of an empty argument.
input_dirs = set(path.dirname(fp) or "." for fp in snakemake.input)

# Same fallback for the report location: an empty value would leave `-o`
# without an argument on the command line.
output_dir = path.dirname(snakemake.output[0]) or "."
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# The {placeholders} below refer to the variables defined above, so those
# names must stay in sync with the command template.
shell(
    "multiqc"
    " {snakemake.params}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_dirs}"
    " {log}"
)

Python Snakemake MultiQC From line 3 of multiqc/wrapper.py

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


# Wrapper body: run MultiQC on the directories containing the rule's input
# files and write one report whose directory and file name are taken from
# the first output path. `snakemake`, `path` and `shell` come from the
# surrounding wrapper script.

# Collapse the inputs to their unique parent directories. A bare file name
# has an empty dirname; use "." so the current directory is still passed to
# MultiQC instead of an empty argument.
input_dirs = set(path.dirname(fp) or "." for fp in snakemake.input)

# Same fallback for the report location: an empty value would leave `-o`
# without an argument on the command line.
output_dir = path.dirname(snakemake.output[0]) or "."
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# The {placeholders} below refer to the variables defined above, so those
# names must stay in sync with the command template.
shell(
    "multiqc"
    " {snakemake.params}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_dirs}"
    " {log}"
)

Python Snakemake MultiQC From line 3 of multiqc/wrapper.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"


import os
from snakemake.shell import shell

# Wrapper body: assemble the read-file arguments and run STAR against the
# index given in `params.index`. `snakemake`, `shell` and `os` come from the
# surrounding wrapper script.

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# fq1 is mandatory; accept either a single path or a list of paths.
fq1 = snakemake.input.get("fq1")
assert fq1 is not None, "input-> fq1 is a required input parameter"
fq1 = (
    [snakemake.input.fq1]
    if isinstance(snakemake.input.fq1, str)
    else snakemake.input.fq1
)

# fq2 is optional (paired-end); when present it must pair up with fq1.
fq2 = snakemake.input.get("fq2")
if fq2:
    fq2 = (
        [snakemake.input.fq2]
        if isinstance(snakemake.input.fq2, str)
        else snakemake.input.fq2
    )
    assert len(fq1) == len(
        fq2
    ), "input-> equal number of files required for fq1 and fq2"

# Each mate's files are comma-joined; the two mates are space-separated.
input_str_fq1 = ",".join(fq1)
input_str_fq2 = ",".join(fq2) if fq2 else ""
input_str = " ".join([input_str_fq1, input_str_fq2])

# Gzipped input (judged by the first fq1 file) is read through zcat.
if fq1[0].endswith(".gz"):
    readcmd = "--readFilesCommand zcat"
else:
    readcmd = ""

# STAR output goes next to the first declared output file. When that output
# has no directory component, dirname() is empty and a plain `+ "/"` would
# yield "/" (the filesystem root); fall back to the current directory.
outprefix = (os.path.dirname(snakemake.output[0]) or ".") + "/"

shell(
    "STAR "
    "{extra} "
    "--runThreadN {snakemake.threads} "
    "--genomeDir {snakemake.params.index} "
    "--readFilesIn {input_str} "
    "{readcmd} "
    "--outFileNamePrefix {outprefix} "
    "--outStd Log "
    "{log}"
)

Python Snakemake STAR From line 1 of align/wrapper.py

__author__ = "Thibault Dayris"
__copyright__ = "Copyright 2019, Dayris Thibault"
__email__ = "thibault.dayris@gustaveroussy.fr"
__license__ = "MIT"

from snakemake.shell import shell
from snakemake.utils import makedirs

# Wrapper body: build a STAR genome index in the rule's output location from
# `input.fasta`, optionally using a GTF annotation. `snakemake`, `shell` and
# `makedirs` come from the surrounding wrapper script.

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")
sjdb_overhang = snakemake.params.get("sjdbOverhang", "100")

# The splice-junction options are only emitted when a GTF is supplied.
gtf = snakemake.input.get("gtf")
if gtf is not None:
    gtf = "--sjdbGTFfile " + gtf
    # str() so the parameter may be given as an int (e.g. sjdbOverhang=99)
    # as well as a string; plain concatenation would raise TypeError.
    sjdb_overhang = "--sjdbOverhang " + str(sjdb_overhang)
else:
    gtf = sjdb_overhang = ""

makedirs(snakemake.output)

shell(
    "STAR "  # Tool
    "--runMode genomeGenerate "  # Indexation mode
    "{extra} "  # Optional parameters
    "--runThreadN {snakemake.threads} "  # Number of threads
    "--genomeDir {snakemake.output} "  # Path to output
    "--genomeFastaFiles {snakemake.input.fasta} "  # Path to fasta files
    "{sjdb_overhang} "  # Read-len - 1
    "{gtf} "  # Highly recommended GTF
    "{log}"  # Logging
)