Bioinfo Macro Host Genome Short Variant Discovery Workflow

public 1yr ago 0 bookmarks

View Workflow

bioinfo_macro_host_genomics — View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional metadata:

Keyword(s) in categories input, output, operation

You can help improve this workflow by suggesting the addition or removal of keywords, suggesting changes, reporting issues, or requesting to become a maintainer of the workflow.

A Snakemake workflow for Short Variant Discovery in Host Genomes

Usage

Test that it works:
- Make sure you have installed snakemake, samtools and bcftools. Either
  - install them with conda/mamba ( conda install -c bioconda samtools bcftools ),
  - or create an environment ( conda create -n 3dohg -c bioconda snakemake samtools bcftools ), and activate it ( conda activate 3dohg )
- Generate mock data with bash workflow/scripts/generate_mock_data.sh
- Run the pipeline: snakemake --use-conda --jobs 8 all . It will download all the necessary software through conda. It should take less than 5 minutes.
Run it with your own data:
- Edit config/samples.tsv and add your samples and where they are located.
- Edit config/features.tsv with information regarding the reference you are using.
- Run the pipeline: snakemake --use-conda --jobs 8 all .
- (slurm users): ./run_slurm

Features

FASTQ processing with fastp
Mapping with bowtie2
SAM/BAM/CRAM processing with samtools and picard
Sample swap detection with gtcheck
SNP calling with GATK4
SNP annotation with SNPEff

DAG

host_genomics_pipeline

Code Snippets

shell:
    "bcftools index {input} 2> {log} 1>&2"

SnakeMake BCFtools From line 11 of rules/bcftools.smk

shell:
    """
    bowtie2-build \
        --threads {threads} \
        {params.extra} \
        {input.reference} \
        {params.output_path} \
    2> {log} 1>&2
    """

SnakeMake Bowtie 2 From line 29 of rules/bowtie2.smk

shell:
    """
    (bowtie2 \
        -x {params.index_prefix} \
        -1 {input.forward_} \
        -2 {input.reverse_} \
        -U {input.unpaired1},{input.unpaired2} \
        --threads {threads} \
        --rg-id '{params.rg_id}' \
        --rg '{params.rg_extra}' \
        {params.extra} \
    | samtools sort \
        -l 9 \
        -M \
        -m {params.samtools_mem} \
        -o {output.cram} \
        --reference {input.reference} \
        --threads {threads} \
    ) 2> {log} 1>&2
    """

SnakeMake SAMtools From line 78 of rules/bowtie2.smk

shell:
    """
    fastp \
        --in1 {input.forward_} \
        --in2 {input.reverse_} \
        --out1 {output.forward_} \
        --out2 {output.reverse_} \
        --unpaired1 {output.unpaired1} \
        --unpaired2 {output.unpaired2} \
        --html {output.html} \
        --json {output.json} \
        --compression 1 \
        --verbose \
        --adapter_sequence {params.adapter_forward} \
        --adapter_sequence_r2 {params.adapter_reverse} \
        --thread {threads} \
        {params.extra} \
    2> {log} 1>&2
    """

SnakeMake fastp From line 27 of rules/fastp.smk

shell:
    "fastqc {input} 2> {log} 1>&2"

SnakeMake FastQC From line 12 of rules/fastqc.smk

shell:
    """
    gatk BaseRecalibrator \
        {params.extra} \
        --input {input.bam} \
        --reference {input.reference} \
        --known-sites {input.known_sites} \
        --output {output.table} \
    2> {log} 1>&2
    """

SnakeMake gatk From line 23 of rules/gatk4.smk

shell:
    """
    gatk ApplyBQSR \
        {params.extra} \
        --input {input.bam} \
        --reference {input.reference} \
        --bqsr-recal-file {input.table} \
        --output {output.bam} \
    2> {log} 1>&2
    """

SnakeMake gatk From line 65 of rules/gatk4.smk

shell:
    """
    gatk HaplotypeCaller \
        {params.extra} \
        --reference {input.reference} \
        --input {input.bam} \
        --output {output.gvcf_gz} \
        --emit-ref-confidence GVCF \
        -ploidy {params.ploidy} \
    2> {log} 1>&2
    """

SnakeMake gatk From line 123 of rules/gatk4.smk

shell:
    """
    gatk CombineGVCFs \
        {params.extra} \
        {params.variant_line} \
        --reference {input.reference} \
        --output {output.vcf_gz} \
    2> {log} 1>&2
    """

SnakeMake gatk From line 169 of rules/gatk4.smk

shell:
    """
    gatk GenotypeGVCFs \
        {params.extra} \
        --variant {input.vcf_gz} \
        --reference {input.reference} \
        --output {output.vcf_gz} \
    2> {log} 1>&2
    """

SnakeMake gatk From line 205 of rules/gatk4.smk

shell:
    """
    gatk CalculateGenotypePosteriors \
        {params.extra} \
        --output {output.vcf} \
        --variant {input.vcf} \
        --reference {input.reference} \
    2> {log} 1>&2
    """

SnakeMake gatk From line 251 of rules/gatk4.smk

shell:
    """
    gatk VariantFiltration \
        {params.extra} \
        --reference {input.reference} \
        --variant {input.vcf} \
        --output {output.vcf} \
        --filter-expression '{params.filter_expression}' \
        --filter-name '{params.filter_name}' \
    2> {log} 1>&2
    """

SnakeMake gatk From line 292 of rules/gatk4.smk

shell:
    """
    bcftools concat \
        --output {output} \
        --output-type z9 \
        --threads {threads} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake BCFtools From line 360 of rules/gatk4.smk

shell:
    """
    samtools view \
        --bam \
        --uncompressed \
        --output {output.bam} \
        {input.cram} \
        {params.chromosome} \
    2> {log} 1>&2
    """

SnakeMake SAMtools From line 18 of rules/picard.smk

shell:
    """
    picard MarkDuplicates \
        --INPUT {input.bam} \
        --OUTPUT {output.bam} \
        --METRICS_FILE {output.metrics} \
        --ASSUME_SORT_ORDER coordinate \
        --COMPRESSION_LEVEL 1 \
        --REFERENCE_SEQUENCE {input.reference} \
    2> {log} 1>&2
    """

SnakeMake Picard From line 57 of rules/picard.smk

shell:
    """
    ln --symbolic $(readlink --canonicalize {input.forward_}) {output.forward_}
    ln --symbolic $(readlink --canonicalize {input.reverse_}) {output.reverse_}
    """

SnakeMake From line 13 of rules/reads.smk

# Decompress {input.fa_gz} with gzip and recompress the stream with bgzip
# (level 9) into {output.fa_gz}; stderr of the whole pipeline goes to {log}.
# The long option is spelled out in full instead of relying on getopt's
# unambiguous-prefix matching.
shell:
    """
    (gzip \
        --decompress \
        --stdout {input.fa_gz} \
    | bgzip \
        --compress-level 9 \
        --threads {threads} \
        --stdout \
        /dev/stdin \
    > {output.fa_gz} \
    ) 2> {log}
    """

SnakeMake From line 12 of rules/reference.smk

shell:
    """
    (gzip -dc {input.vcf_gz} \
    | bgzip --threads {threads} \
    > {output.vcf_gz}) \
    2> {log}
    """

SnakeMake From line 38 of rules/reference.smk

shell:
    """
    multiqc \
        --title {params.chromosome} \
        --force \
        --filename {params.chromosome} \
        --outdir {params.out_dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC From line 16 of rules/report_chromosome.smk

shell:
    """
    multiqc \
        --title {params.library} \
        --force \
        --filename {params.library} \
        --outdir {params.out_dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC From line 24 of rules/report_library.smk

shell:
    """
    multiqc \
        --filename reads \
        --title reads \
        --force \
        --outdir {params.dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC From line 13 of rules/report_step.smk

shell:
    """
    multiqc \
        --title fastp \
        --force \
        --filename fastp \
        --outdir {params.dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC fastp From line 37 of rules/report_step.smk

shell:
    """
    multiqc \
        --title bowtie2 \
        --force \
        --filename bowtie2 \
        --outdir {params.dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC Bowtie 2 From line 61 of rules/report_step.smk

shell:
    """
    multiqc \
        --title picard \
        --force \
        --filename picard \
        --outdir {params.dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC Picard From line 85 of rules/report_step.smk

shell:
    """
    multiqc \
        --title gatk4 \
        --force \
        --filename gatk4 \
        --outdir {params.dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC From line 109 of rules/report_step.smk

shell:
    """
    multiqc \
        --title snpeff \
        --force \
        --filename snpeff \
        --outdir {params.dir} \
        {input} \
    2> {log} 1>&2
    """

SnakeMake MultiQC From line 133 of rules/report_step.smk

shell:
    "samtools index {input} 2> {log} 1>&2"

SnakeMake SAMtools From line 11 of rules/samtools.smk

shell:
    "samtools index {input} 2> {log} 1>&2"

SnakeMake SAMtools From line 25 of rules/samtools.smk

shell:
    "samtools dict {input} --output {output} 2> {log} 1>&2"

SnakeMake SAMtools From line 39 of rules/samtools.smk

shell:
    "samtools dict {input} --output {output} 2> {log} 1>&2"

SnakeMake SAMtools From line 53 of rules/samtools.smk

shell:
    "tabix {input} 2> {log} 1>&2"

SnakeMake tabix From line 67 of rules/samtools.smk

shell:
    "bgzip {input} 2> {log} 1>&2"

SnakeMake From line 81 of rules/samtools.smk

shell:
    "samtools stats {input.bam} > {output.tsv} 2> {log}"

SnakeMake SAMtools From line 96 of rules/samtools.smk

shell:
    "samtools stats {input.cram} > {output.tsv} 2> {log}"

SnakeMake SAMtools From line 111 of rules/samtools.smk

shell:
    "samtools flagstats {input.bam} > {output.txt} 2> {log}"

SnakeMake SAMtools From line 126 of rules/samtools.smk

shell:
    "samtools flagstats {input.cram} > {output.txt} 2> {log}"

SnakeMake SAMtools From line 141 of rules/samtools.smk

shell:
    "samtools idxstats {input.bam} > {output.tsv} 2> {log}"

SnakeMake SAMtools From line 156 of rules/samtools.smk

shell:
    "samtools idxstats {input.cram} > {output.tsv} 2> {log}"

SnakeMake SAMtools From line 171 of rules/samtools.smk

shell:
    """
    snpEff download \
        {params.snpeff_db} \
        -dataDir {params.datadir} \
        -verbose \
    2> {log} 1>&2
    """

SnakeMake snpEff From line 12 of rules/snpeff.smk

shell:
    """
    (snpEff ann \
        {params.snpeff_db} \
        -dataDir {params.datadir} \
        -csvStats {output.csv} \
        -verbose \
        -i vcf \
        -o gatk \
        {input.vcf} \
    | bgzip \
        --compress-level 9 \
        --stdout \
    > {output.vcf} \
    ) 2> {log} 1>&2

    mv {params.html} {output.html} 2>> {log} 1>&2
    """

SnakeMake gatk From line 43 of rules/snpeff.smk

shell:
    """
    picard AddOrReplaceReadGroups \
        --INPUT {input.bam} \
        --OUTPUT {output.bam} \
        --RGLB {params.sample_library} \
        --RGPL illumina \
        --RGPU {params.sample_library} \
        --RGSM {params.sample_library} \
        --COMPRESSION_LEVEL 9 \
    2> {log} 1>&2
    """

SnakeMake Picard From line 15 of rules/swaps.smk

shell:
    """
    (bcftools mpileup \
        --output-type z9 \
        --fasta-ref {input.reference} \
        {input} \
    | bcftools call \
        --variants-only \
        --multiallelic-caller \
        --output-type z9 \
        --output {output.vcf} ) \
    2> {log} 1>&2
    """

SnakeMake BCFtools From line 50 of rules/swaps.smk

shell:
    """
    bcftools filter \
        --include 'QUAL>{params.min_qual}' \
        --output-type z9 \
        --output {output.vcf} \
        {input.vcf} \
    2> {log} 1>&2
    """

SnakeMake BCFtools From line 83 of rules/swaps.smk

shell:
    """
    bcftools concat \
        --output-type z9 \
        --output {output.vcf} \
        {input.vcf} \
    2> {log} 1>&2
    """

SnakeMake BCFtools From line 110 of rules/swaps.smk

shell:
    """
    bcftools gtcheck \
        {input.vcf} \
    > {output.tsv} 2> {log}
    """

SnakeMake BCFtools From line 130 of rules/swaps.smk

shell:
    """
    Rscript workflow/scripts/plot_gtcheck.R \
        --infile {input} \
        --outfile {output} \
    2> {log} 1>&2
    """

SnakeMake From line 148 of rules/swaps.smk

library(getopt)
library(tidyverse)

# Command-line interface of the script, one row per flag in getopt's column
# order: long name, short flag, argument mask, value type. The mask value 1 is
# what getopt documents as "argument required".
option_matrix <- rbind(
  c("infile", "i", 1, "character"),
  c("outfile", "o", 1, "character")
)

# Parsed command-line values, read further down as options$infile and
# options$outfile. NOTE: this global shadows base::options().
options <- getopt(spec = option_matrix)



#' Read the pairwise records of a gtcheck report into a tibble.
#'
#' The first `skip` lines of the file are discarded as header, the remaining
#' tab-separated rows are parsed into six named columns, and the leading
#' record-tag column (`dc`) is dropped from the result.
#'
#' @param gtcheck_filename Path to the gtcheck output file.
#' @param skip Number of leading lines to discard before the data rows.
#'   Defaults to 20, the value this script has always used; override it when
#'   the header of the report has a different length.
#'   NOTE(review): header length presumably depends on the bcftools version --
#'   confirm against the version pinned by the workflow.
#' @return A tibble with the columns `query_sample`, `genotyped_sample`,
#'   `discordance`, `log_p_hwe` and `number_of_sites`.
read_gtcheck <- function(gtcheck_filename, skip = 20) {
  read_tsv(
    file = gtcheck_filename,
    skip = skip,
    col_names = c(
      "dc", "query_sample", "genotyped_sample", "discordance", "log_p_hwe",
      "number_of_sites"
    ),
    # Pin the columns used downstream instead of letting readr guess them:
    # numeric-looking sample IDs (e.g. "001") must stay character, and both
    # sample columns must share one type so they can be combined later.
    # Columns not listed here are still type-guessed, as before.
    col_types = cols(
      query_sample = col_character(),
      genotyped_sample = col_character(),
      discordance = col_double()
    )
  ) %>%
    select(-dc)
}

#' Build a symmetric sample-by-sample discordance matrix.
#'
#' @param gtcheck_long Data frame (or tibble) with one row per sample pair and
#'   the columns `query_sample`, `genotyped_sample` and `discordance`.
#' @return A numeric matrix whose row names and column names are the same
#'   samples in the same order. The diagonal is 0, every reported pair fills
#'   both `[a, b]` and `[b, a]`, and pairs absent from the input stay `NA`.
create_distance_matrix <- function(gtcheck_long) {
  query <- as.character(gtcheck_long$query_sample)
  genotyped <- as.character(gtcheck_long$genotyped_sample)
  discordance <- gtcheck_long$discordance

  # Every sample seen on either side of a pair gets one row AND one column.
  # Radix sorting is locale-independent, so the layout is reproducible.
  samples <- sort(unique(c(query, genotyped)), method = "radix")

  discordances <- matrix(
    NA_real_,
    nrow = length(samples),
    ncol = length(samples),
    dimnames = list(samples, samples)
  )

  # A sample is never discordant with itself.
  diag(discordances) <- 0

  # Two-column character matrices index by (row name, column name); writing
  # each pair in both directions mirrors it across the diagonal, so rows and
  # columns always line up.
  discordances[cbind(query, genotyped)] <- discordance
  discordances[cbind(genotyped, query)] <- discordance

  discordances
}


# Script entry point. Skipped in interactive sessions so that the functions
# above can be sourced and explored without reading files or opening a device.
if (!interactive()) {
  # Fail early with an explicit message; otherwise a missing flag only shows
  # up later as a cryptic error from the reader or the PDF device.
  if (is.null(options$infile) || is.null(options$outfile)) {
    stop("Both --infile and --outfile are required.", call. = FALSE)
  }

  gtcheck_long <- read_gtcheck(options$infile)
  gtcheck_discordances_matrix <- create_distance_matrix(gtcheck_long)

  pdf(file = options$outfile, paper = "a4")
  # `finally` closes the PDF device even when heatmap() raises (for example
  # with fewer than two samples), so the device is never left open.
  invisible(
    tryCatch(
      heatmap(gtcheck_discordances_matrix),
      finally = dev.off()
    )
  )
}