Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
A Snakemake workflow for Short Variant Discovery in Host Genomes
Usage
-
Test that it works:
-
Make sure you have installed snakemake, samtools and bcftools. Either
-
install them with conda/mamba :
conda install -c bioconda samtools bcftools
). -
or create an environment (
conda create -n 3dohg -c bioconda snakemake samtools bcftools
), and activate it (conda activate 3dohg
)
-
-
Generate mock data with
bash workflow/scripts/generate_mock_data.sh
-
Run the pipeline:
snakemake --use-conda --jobs 8 all
. It will download all the necesary software through conda. It should take less than 5 minutes.
-
-
Run it with your own data:
-
Edit
config/samples.tsv
and add your samples and where are they located. -
Edit
config/features.tsv
with information regarding the reference you are using. -
Run the pipeline:
snakemake --use-conda --jobs 8 all
. -
(slurm users):
./run_slurm
-
Features
-
FASTQ processing with
fastp
-
Mapping with
bowtie2
-
Sample swap detection with
gtcheck
-
SNP calling with
GATK4
-
SNP annotation with
SNPEff
DAG
Code Snippets
11 12 | shell: "bcftools index {input} 2> {log} 1>&2" |
29 30 31 32 33 34 35 36 37 | shell: """ bowtie2-build \ --threads {threads} \ {params.extra} \ {input.reference} \ {params.output_path} \ 2> {log} 1>&2 """ |
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | shell: """ (bowtie2 \ -x {params.index_prefix} \ -1 {input.forward_} \ -2 {input.reverse_} \ -U {input.unpaired1},{input.unpaired2} \ --threads {threads} \ --rg-id '{params.rg_id}' \ --rg '{params.rg_extra}' \ {params.extra} \ | samtools sort \ -l 9 \ -M \ -m {params.samtools_mem} \ -o {output.cram} \ --reference {input.reference} \ --threads {threads} \ ) 2> {log} 1>&2 """ |
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | shell: """ fastp \ --in1 {input.forward_} \ --in2 {input.reverse_} \ --out1 {output.forward_} \ --out2 {output.reverse_} \ --unpaired1 {output.unpaired1} \ --unpaired2 {output.unpaired2} \ --html {output.html} \ --json {output.json} \ --compression 1 \ --verbose \ --adapter_sequence {params.adapter_forward} \ --adapter_sequence_r2 {params.adapter_reverse} \ --thread {threads} \ {params.extra} \ 2> {log} 1>&2 """ |
12 13 | shell: "fastqc {input} 2> {log} 1>&2" |
23 24 25 26 27 28 29 30 31 32 | shell: """ gatk BaseRecalibrator \ {params.extra} \ --input {input.bam} \ --reference {input.reference} \ --known-sites {input.known_sites} \ --output {output.table} \ 2> {log} 1>&2 """ |
65 66 67 68 69 70 71 72 73 74 | shell: """ gatk ApplyBQSR \ {params.extra} \ --input {input.bam} \ --reference {input.reference} \ --bqsr-recal-file {input.table} \ --output {output.bam} \ 2> {log} 1>&2 """ |
123 124 125 126 127 128 129 130 131 132 133 | shell: """ gatk HaplotypeCaller \ {params.extra} \ --reference {input.reference} \ --input {input.bam} \ --output {output.gvcf_gz} \ --emit-ref-confidence GVCF \ -ploidy {params.ploidy} \ 2> {log} 1>&2 """ |
169 170 171 172 173 174 175 176 177 | shell: """ gatk CombineGVCFs \ {params.extra} \ {params.variant_line} \ --reference {input.reference} \ --output {output.vcf_gz} \ 2> {log} 1>&2 """ |
205 206 207 208 209 210 211 212 213 | shell: """ gatk GenotypeGVCFs \ {params.extra} \ --variant {input.vcf_gz} \ --reference {input.reference} \ --output {output.vcf_gz} \ 2> {log} 1>&2 """ |
251 252 253 254 255 256 257 258 259 | shell: """ gatk CalculateGenotypePosteriors \ {params.extra} \ --output {output.vcf} \ --variant {input.vcf} \ --reference {input.reference} \ 2> {log} 1>&2 """ |
292 293 294 295 296 297 298 299 300 301 302 | shell: """ gatk VariantFiltration \ {params.extra} \ --reference {input.reference} \ --variant {input.vcf} \ --output {output.vcf} \ --filter-expression '{params.filter_expression}' \ --filter-name '{params.filter_name}' \ 2> {log} 1>&2 """ |
360 361 362 363 364 365 366 367 368 | shell: """ bcftools concat \ --output {output} \ --output-type z9 \ --threads {threads} \ {input} \ 2> {log} 1>&2 """ |
18 19 20 21 22 23 24 25 26 27 | shell: """ samtools view \ --bam \ --uncompressed \ --output {output.bam} \ {input.cram} \ {params.chromosome} \ 2> {log} 1>&2 """ |
57 58 59 60 61 62 63 64 65 66 67 | shell: """ picard MarkDuplicates \ --INPUT {input.bam} \ --OUTPUT {output.bam} \ --METRICS_FILE {output.metrics} \ --ASSUME_SORT_ORDER coordinate \ --COMPRESSION_LEVEL 1 \ --REFERENCE_SEQUENCE {input.reference} \ 2> {log} 1>&2 """ |
13 14 15 16 17 | shell: """ ln --symbolic $(readlink --canonicalize {input.forward_}) {output.forward_} ln --symbolic $(readlink --canonicalize {input.reverse_}) {output.reverse_} """ |
12 13 14 15 16 17 18 19 20 21 22 23 24 | shell: """ (gzip \ --decompres \ --stdout {input.fa_gz} \ | bgzip \ --compress-level 9 \ --threads {threads} \ --stdout \ /dev/stdin \ > {output.fa_gz} \ ) 2> {log} """ |
38 39 40 41 42 43 44 | shell: """ (gzip -dc {input.vcf_gz} \ | bgzip --threads {threads} \ > {output.vcf_gz}) \ 2> {log} """ |
16 17 18 19 20 21 22 23 24 25 | shell: """ multiqc \ --title {params.chromosome} \ --force \ --filename {params.chromosome} \ --outdir {params.out_dir} \ {input} \ 2> {log} 1>&2 """ |
24 25 26 27 28 29 30 31 32 33 | shell: """ multiqc \ --title {params.library} \ --force \ --filename {params.library} \ --outdir {params.out_dir} \ {input} \ 2> {log} 1>&2 """ |
13 14 15 16 17 18 19 20 21 22 | shell: """ multiqc \ --filename reads \ --title reads \ --force \ --outdir {params.dir} \ {input} \ 2> {log} 1>&2 """ |
37 38 39 40 41 42 43 44 45 46 | shell: """ multiqc \ --title fastp \ --force \ --filename fastp \ --outdir {params.dir} \ {input} \ 2> {log} 1>&2 """ |
61 62 63 64 65 66 67 68 69 70 | shell: """ multiqc \ --title bowtie2 \ --force \ --filename bowtie2 \ --outdir {params.dir} \ {input} \ 2> {log} 1>&2 """ |
85 86 87 88 89 90 91 92 93 94 | shell: """ multiqc \ --title picard \ --force \ --filename picard \ --outdir {params.dir} \ {input} \ 2> {log} 1>&2 """ |
109 110 111 112 113 114 115 116 117 118 | shell: """ multiqc \ --title gatk4 \ --force \ --filename gatk4 \ --outdir {params.dir} \ {input} \ 2> {log} 1>&2 """ |
133 134 135 136 137 138 139 140 141 142 | shell: """ multiqc \ --title snpeff \ --force \ --filename snpeff \ --outdir {params.dir} \ {input} \ 2> {log} 1>&2 """ |
11 12 | shell: "samtools index {input} 2> {log} 1>&2" |
25 26 | shell: "samtools index {input} 2> {log} 1>&2" |
39 40 | shell: "samtools dict {input} --output {output} 2> {log} 1>&2" |
53 54 | shell: "samtools dict {input} --output {output} 2> {log} 1>&2" |
67 68 | shell: "tabix {input} 2> {log} 1>&2" |
81 82 | shell: "bgzip {input} 2> {log} 1>&2" |
96 97 | shell: "samtools stats {input.bam} > {output.tsv} 2> {log}" |
111 112 | shell: "samtools stats {input.cram} > {output.tsv} 2> {log}" |
126 127 | shell: "samtools flagstats {input.bam} > {output.txt} 2> {log}" |
141 142 | shell: "samtools flagstats {input.cram} > {output.txt} 2> {log}" |
156 157 | shell: "samtools idxstats {input.bam} > {output.tsv} 2> {log}" |
171 172 | shell: "samtools idxstats {input.cram} > {output.tsv} 2> {log}" |
12 13 14 15 16 17 18 19 | shell: """ snpEff download \ {params.snpeff_db} \ -dataDir {params.datadir} \ -verbose \ 2> {log} 1>&2 """ |
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | shell: """ (snpEff ann \ {params.snpeff_db} \ -dataDir {params.datadir} \ -csvStats {output.csv} \ -verbose \ -i vcf \ -o gatk \ {input.vcf} \ | bgzip \ --compress-level 9 \ --stdout \ > {output.vcf} \ ) 2> {log} 1>&2 mv {params.html} {output.html} 2>> {log} 1>&2 """ |
15 16 17 18 19 20 21 22 23 24 25 26 | shell: """ picard AddOrReplaceReadGroups \ --INPUT {input.bam} \ --OUTPUT {output.bam} \ --RGLB {params.sample_library} \ --RGPL illumina \ --RGPU {params.sample_library} \ --RGSM {params.sample_library} \ --COMPRESSION_LEVEL 9 \ 2> {log} 1>&2 """ |
50 51 52 53 54 55 56 57 58 59 60 61 62 | shell: """ (bcftools mpileup \ --output-type z9 \ --fasta-ref {input.reference} \ {input} \ | bcftools call \ --variants-only \ --multiallelic-caller \ --output-type z9 \ --output {output.vcf} ) \ 2> {log} 1>&2 """ |
83 84 85 86 87 88 89 90 91 | shell: """ bcftools filter \ --include 'QUAL>{params.min_qual}' \ --output-type z9 \ --output {output.vcf} \ {input.vcf} \ 2> {log} 1>&2 """ |
110 111 112 113 114 115 116 117 | shell: """ bcftools concat \ --output-type z9 \ --output {output.vcf} \ {input.vcf} \ 2> {log} 1>&2 """ |
130 131 132 133 134 135 | shell: """ bcftools gtcheck \ {input.vcf} \ > {output.tsv} 2> {log} """ |
148 149 150 151 152 153 154 | shell: """ Rscript workflow/scripts/plot_gtcheck.R \ --infile {input} \ --outfile {output} \ 2> {log} 1>&2 """ |
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | library(getopt) library(tidyverse) option_matrix <- matrix( c( "infile", "i", 1, "character", "outfile", "o", 1, "character" ), byrow = TRUE, ncol = 4 ) options <- getopt(option_matrix) read_gtcheck <- function(gtcheck_filename) { read_tsv( file = gtcheck_filename, skip = 20, col_names = c( "dc", "query_sample", "genotyped_sample", "discordance", "log_p_hwe", "number_of_sites" ) ) %>% select(-dc) } create_distance_matrix <- function(gtcheck_long) { gtcheck_diagonal <- gtcheck_long %>% select(query_sample, genotyped_sample) %>% mutate(discordance = 0) %>% pivot_longer( query_sample:genotyped_sample, names_to = "type", values_to = "query_sample" ) %>% select(query_sample) %>% mutate( genotyped_sample = query_sample, discordance = 0.0 ) %>% distinct() gtcheck_upper <- gtcheck_long %>% select(query_sample, genotyped_sample, discordance) gtcheck_lower <- gtcheck_long %>% select( genotyped_sample = query_sample, query_sample = genotyped_sample, discordance ) gtcheck_discordances <- bind_rows(gtcheck_diagonal, gtcheck_upper, gtcheck_lower) %>% arrange(query_sample) gtcheck_discordances_matrix <- gtcheck_discordances %>% pivot_wider( names_from = genotyped_sample, values_from = discordance ) %>% column_to_rownames("query_sample") %>% as.matrix() return(gtcheck_discordances_matrix) } if (!interactive()) { gtcheck_long <- read_gtcheck(options$infile) gtcheck_discordances_matrix <- create_distance_matrix(gtcheck_long) pdf(file = options$outfile, paper = "a4") heatmap(gtcheck_discordances_matrix) dev.off() } |
Support
- Future updates
Related Workflows





