SARS-CoV-2 Variant Calling and Lineage Assignment
A reproducible and scalable workflow for transparent and robust SARS-CoV-2 variant calling and lineage assignment with comprehensive reporting.
Usage
This workflow is written with Snakemake, and its usage is described in the Snakemake Workflow Catalog.
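For orientation, a typical catalog-style deployment looks roughly like the sketch below. The repository URL, version tag, directory name, and environment name are placeholders and assumptions; follow the catalog entry and the workflow's configuration documentation for the exact, up-to-date commands.

```sh
# Create an environment providing Snakemake and Snakedeploy (via conda/mamba)
mamba create -c conda-forge -c bioconda -n snakemake snakemake snakedeploy
conda activate snakemake

# Deploy the workflow into a fresh analysis directory
# (replace <owner>/<repo> and <version> with this repository and a release tag)
mkdir -p sars-cov-2-analysis && cd sars-cov-2-analysis
snakedeploy deploy-workflow https://github.com/<owner>/<repo> . --tag <version>

# Adjust config/config.yaml and the sample sheet, then run the workflow
snakemake --cores all --use-conda
```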
If you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this repository and its DOI (see above).
Tools, Frameworks and Packages used
This project wouldn't be possible without several open source libraries:
Tool | Link |
---|---|
ABySS | www.doi.org/10.1101/gr.214346.116 |
Altair | www.doi.org/10.21105/joss.01057 |
BAMClipper | www.doi.org/10.1038/s41598-017-01703-6 |
BCFtools | www.doi.org/10.1093/gigascience/giab008 |
BEDTools | www.doi.org/10.1093/bioinformatics/btq033 |
Biopython | www.doi.org/10.1093/bioinformatics/btp163 |
bwa | www.doi.org/10.1093/bioinformatics/btp324 |
Covariants | www.github.com/hodcroftlab/covariants |
delly | www.doi.org/10.1093/bioinformatics/bts378 |
ensembl-vep | www.doi.org/10.1186/s13059-016-0974-4 |
entrez-direct | www.ncbi.nlm.nih.gov/books/NBK179288 |
fastp | www.doi.org/10.1093/bioinformatics/bty560 |
FastQC | www.bioinformatics.babraham.ac.uk/projects/fastqc |
fgbio | www.github.com/fulcrumgenomics/fgbio |
FreeBayes | www.arxiv.org/abs/1207.3907 |
intervaltree | www.github.com/chaimleib/intervaltree |
Jupyter | www.jupyter.org |
kallisto | www.doi.org/10.1038/nbt.3519 |
Kraken2 | www.doi.org/10.1186/s13059-019-1891-0 |
Krona | www.doi.org/10.1186/1471-2105-12-385 |
mason | http://publications.imp.fu-berlin.de/962 |
MEGAHIT | www.doi.org/10.1093/bioinformatics/btv033 |
Minimap2 | www.doi.org/10.1093/bioinformatics/bty191 |
MultiQC | www.doi.org/10.1093/bioinformatics/btw354 |
pandas | pandas.pydata.org |
Picard | broadinstitute.github.io/picard |
PySAM | www.doi.org/10.11578/dc.20190903.1 |
QUAST | www.doi.org/10.1093/bioinformatics/btt086 |
RaGOO | www.doi.org/10.1186/s13059-019-1829-6 |
ruamel.yaml | www.sourceforge.net/projects/ruamel-yaml |
Rust-Bio-Tools | www.github.com/rust-bio/rust-bio-tools |
SAMtools | www.doi.org/10.1093/bioinformatics/btp352 |
Snakemake | www.doi.org/10.12688/f1000research.29032.1 |
sourmash | www.doi.org/10.21105/joss.00027 |
SPAdes | www.doi.org/10.1089/cmb.2012.0021 |
SVN | www.doi.org/10.1142/s0219720005001028 |
Tabix | www.doi.org/10.1093/bioinformatics/btq671 |
Trinity | www.doi.org/10.1038/nprot.2013.084 |
Varlociraptor | www.doi.org/10.1186/s13059-020-01993-6 |
Vega-Lite | www.doi.org/10.1109/TVCG.2016.2599030 |
Velvet | www.doi.org/10.1101/gr.074492.107 |
vembrane | www.github.com/vembrane/vembrane |
Code Snippets
16 17 | shell: '(echo "$(( $(zcat {input.fastq1} | wc -l) / 4 ))" > {output.read_count}) 2> {log}' |
39 40 41 42 | shell: "(megahit -1 {input.fastq1} -2 {input.fastq2} {params.preset} --out-dir {params.outdir} -f && " " mv {params.outdir}/final.contigs.fa {output})" " > {log} 2>&1" |
62 63 64 65 | shell: "(megahit -r {input} {params.preset} --out-dir {params.outdir} -f && " " mv {params.outdir}/final.contigs.fa {output})" " > {log} 2>&1" |
86 87 88 89 | shell: "({wildcards.spadesflavor}.py -1 {input.fastq1} -2 {input.fastq2} -o {params.outdir} -t {threads} && " " if [ -f {params.outdir}/raw_contigs.fasta ]; then mv {params.outdir}/raw_contigs.fasta {output}; else mv {params.outdir}/contigs.fasta {output}; fi )" " > {log} 2>&1" |
104 105 106 107 108 | shell: "if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && " "(spades.py --corona -s {input} -o {params.outdir} -t {threads} && " " mv {params.outdir}/raw_contigs.fasta {output})" " > {log} 2>&1" |
120 121 | script: "../scripts/check_contigs.py" |
138 139 140 141 142 | shell: "(mkdir -p {params.outdir}/{wildcards.sample} && cd {params.outdir}/{wildcards.sample} &&" " ragoo.py ../../../../../{input.contigs} ../../../../../{input.reference} &&" " cd ../../../../../ && mv {params.outdir}/{wildcards.sample}/ragoo_output/ragoo.fasta {output})" " > {log} 2>&1" |
154 155 | script: "../scripts/ragoo-remove-chr0.py" |
175 176 | shell: "bcftools consensus -f {input.fasta} {input.bcf} > {output} 2> {log}" |
199 200 | shell: "medaka_consensus -f -i {input.fasta} -o {params.outdir} -d {input.reference} -t {threads} > {log} 2>&1" |
212 213 | shell: "cp {input} {output} 2> {log}" |
225 226 | shell: "cp {input} {output} 2> {log}" |
239 240 | shell: "minimap2 -ax asm5 {input.target} {input.query} -o {output} 2> {log}" |
257 258 259 260 | shell: "quast.py --min-contig 1 --threads {threads} -o {params.outdir} -r {input.reference} " "--bam {input.bam} {input.fasta} " "> {log} 2>&1" |
85 86 87 | shell: "(zcat {input.left} > {output.left} &&" "zcat {input.right} > {output.right}) 2>{log}" |
117 118 | shell: "minimap2 --MD --eqx -ax asm5 {input} -o {output} 2> {log}" |
133 134 | script: "../scripts/assembly-benchmark-results.py" |
149 150 | script: "../scripts/summarize-non-cov2.py" |
174 175 | shell: "rbt csv-report -s '\t' {input.summary} {output}" |
187 188 | script: "../scripts/generate-mixtures.py" |
205 206 | script: "../scripts/evaluate-strain-call-error.py" |
224 225 | script: "../scripts/plot-caller-error.py" |
242 243 244 | shell: "(Trinity --left {input.fastq1} --max_memory 16G --right {input.fastq2} --CPU {threads} --seqType fq --output {params.outdir} && " "mv {params.outdir}.Trinity.fasta {output} ) > {log} 2>&1" |
261 262 263 264 265 266 | shell: """ velveth {params.outdir} 21 -fastq.gz -shortPaired {input.fastq1} {input.fastq2} > {log} 2>&1 velvetg {params.outdir} -ins_length 150 -exp_cov 10 >> {log} 2>&1 mv {params.outdir}/contigs.fa {output} >> {log} 2>&1 """ |
287 288 289 290 291 | shell: "(cd {params.outdir} &&" " ragoo.py ../../../../../{input.contigs} ../../../../../{input.reference} &&" " cd ../../../../../ && mv {params.outdir}/ragoo_output/ragoo.fasta {output})" " > {log} 2>&1" |
356 357 | script: "../scripts/plot-assembly-comparison.py" |
374 375 | script: "../scripts/get-read-statistics.py" |
393 394 | script: "../scripts/plot-dependency-of-pangolin-call.py" |
410 411 | script: "../scripts/plot-pangolin-conflict.py" |
439 440 | script: "../scripts/collect_lineage_calls.py" |
452 453 | script: "../scripts/get_largest_contig.py" |
467 468 | script: "../scripts/select_random_lineages.py" |
480 481 | script: "../scripts/aggregate_read_calls.py" |
524 525 | script: "../scripts/benchmarking/compare-vcf.py" |
540 541 | script: "../scripts/benchmarking/aggregate-test-case-variants.py" |
555 556 | script: "../scripts/benchmarking/filter-test-case-variants.py" |
575 576 | script: "../scripts/benchmarking/get-test-case-variant-paths.py" |
593 594 | script: "../scripts/benchmarking/check-presence-of-test-case-variant-in-call.py" |
614 615 616 617 618 619 | shell: "varlociraptor call variants " "--testcase-prefix {output.testcase} " "--testcase-locus {wildcards.chrom}:{wildcards.pos} " "{params.biases} generic --obs {wildcards.sample}={input.obs} " "--scenario {input.scenario} > {output.bcf} 2> {log}" |
31 32 | script: "../scripts/masking.py" |
51 52 | script: "../scripts/plot-all-coverage.py" |
73 74 | script: "../scripts/plot-all-coverage.py" |
91 92 | script: "../scripts/quality-filter.py" |
122 123 | script: "../scripts/generate-high-quality-report.py" |
162 163 | script: "../scripts/generate-overview-table.py" |
205 206 | shell: "rbt csv-report {input} --formatter {params.formatter} --pin-until {params.pin_until} {output} > {log} 2>&1" |
228 229 | script: "../scripts/generate-filter-overview.py" |
249 250 | shell: "rbt csv-report {input} --pin-until {params.pin_until} {output} > {log} 2>&1" |
275 276 | script: "../scripts/plot-lineages-over-time.py" |
308 309 | script: "../scripts/plot-variants-over-time.py" |
328 329 | script: "../scripts/aggregate-pangolin-calls-per-stage.py" |
349 350 | shell: "rbt csv-report {input} --pin-until {params.pin_until} {output} > {log} 2>&1" |
413 414 415 416 | shell: "snakemake --nolock --report-stylesheet resources/custom-stylesheet.css {input} " "--report {output} {params.for_testing} " "> {log} 2>&1" |
17 18 | script: "../scripts/collect-lineage-variants.py" |
33 34 | shell: "bcftools annotate -a {input.annotation} -c LINEAGES,SIGNATURES {input.calls} > {output} 2> {log}" |
47 48 | script: "../scripts/generate-lineage-variant-table.py" |
18 19 | shell: "nanoQC {input} -o {params.outdir} > {log} 2>&1" |
31 32 | shell: "echo $(( $(cat {input} | wc -l ) / 4)) > {output} 2> {log}" |
47 48 49 50 51 52 53 54 | shell: """ if file {input} | grep -q compressed ; then gzip -d -c {input} | NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 > {output} 2> {log} else NanoFilt --length {params.min_length} --quality {params.min_PHRED} --maxlength 700 {input} > {output} 2> {log} fi """ |
71 72 | shell: "notramp -a -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir} 2> {log}" |
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | shell: "( if [ -d {params.outdir} ]; then rm -Rf {params.outdir}; fi && " "canu -correct -nanopore {input} -p {wildcards.sample} -d {params.outdir} genomeSize=30k " "minOverlapLength=10 minReadLength=200 useGrid=false {params.for_testing} " "corMMapMerSize=10 corOutCoverage=50000 corMinCoverage=0 maxInputCoverage=20000 " "corOverlapper=minimap utgOverlapper=minimap obtOverlapper=minimap " "corConcurrency={params.concurrency} ovbConcurrency={params.concurrency} " "cormhapConcurrency={params.concurrency} cormhapThreads={params.concurrency} " "cormmapConcurrency={params.concurrency} cormmapThreads={params.concurrency} " "obtmmapConcurrency={params.concurrency} obtmmapThreads={params.concurrency} " "utgmmapConcurrency={params.concurrency} utgmmapThreads={params.concurrency} " "redConcurrency={params.concurrency} redThreads={params.concurrency} " "ovsConcurrency={params.concurrency} oeaConcurrency={params.concurrency} && " "gzip -d {params.corr_gz} --keep)" "> {log} 2>&1" |
135 136 | shell: "notramp -t --incl_prim -r {input.reads} -p {input.primer} -g {input.ref_genome} -o {params.outdir} 2> {log}" |
150 151 | shell: "bcftools consensus -f {input.fasta} {input.bcf} > {output} 2> {log}" |
168 169 170 171 | shell: "(cp {input} {output} && " ' sed -i "1s/.*/>{wildcards.sample}/" {output})' " 2> {log}" |
14 15 | script: "../scripts/update-sample-sheet.py" |
29 30 | script: "../scripts/vcf-to-fasta.py" |
43 44 | shell: "minimap2 --eqx -ax asm5 {input.assembly} {input.pseudoassembly} -o {output} 2> {log}" |
58 59 | script: "../scripts/aggregate-assembly-comparisons.py" |
15 16 | wrapper: "v1.12.2/bio/fastqc" |
46 47 | wrapper: "v1.15.1/bio/multiqc" |
72 73 | wrapper: "v1.15.1/bio/multiqc" |
83 84 | wrapper: "v1.15.1/bio/samtools/flagstat" |
98 99 100 101 | shell: "(samtools depth -aH -o {output} {input} && " " sed -i 's/{params.ref}.3/{wildcards.sample}/' {output})" " 2> {log}" |
142 143 144 145 | shell: "(kraken2 --db {input.db} --threads {threads} --unclassified-out {params.unclassified} " "--classified-out {params.classified} --report {output.report} --gzip-compressed " "--paired {input.reads} > {output.kraken_output}) 2> {log}" |
162 163 164 165 166 | shell: "(kraken2 --db {input.db} --threads {threads}" " --report {output.report} --gzip-compressed" " {input.reads} > {output.kraken_output})" "2> {log}" |
180 181 | shell: "ktImportTaxonomy -m 3 -t 5 -tax {input.taxonomy_database} -o {output} {input} 2> {log}" |
194 195 | shell: "zcat -f {input} > {output}" |
211 212 | script: "../scripts/extract-reads-of-interest.py" |
227 228 229 230 | shell: "(samtools sort -@ {threads} -n {input} -o {output.bam_sorted}; " " samtools fastq -@ {threads} {output.bam_sorted} -1 {output.fq1} -2 {output.fq2})" " > {log} 2>&1" |
244 245 246 247 | shell: "(samtools sort -@ {threads} -n {input} -o {output.bam_sorted}; " " samtools fastq -@ {threads} -0 {output.fq} {output.bam_sorted})" "> {log} 2>&1" |
265 266 267 | shell: "(kraken2 --db {input.db} --threads {threads} --report {output.report} --gzip-compressed " "--paired {input.reads} > {output.kraken_output}) 2> {log}" |
284 285 286 | shell: "(kraken2 --db {input.db} --threads {threads} --report {output.report} --gzip-compressed " "{input.reads} > {output.kraken_output}) 2> {log}" |
300 301 | shell: "ktImportTaxonomy -m 3 -t 5 -tax {input.taxonomy_database} -o {output} {input} 2> {log}" |
19 20 | wrapper: "v1.15.1/bio/samtools/sort" |
32 33 | script: "../scripts/bed-to-bedpe.py" |
55 56 57 58 59 60 | shell: "(cp {input.bam} {params.output_dir} &&" " cp {input.bai} {params.output_dir} &&" " cd {params.output_dir} &&" " bamclipper.sh -b {params.bam} -p {params.bed_path} -n {threads} -u 5 -d 5) " " > {params.cwd}/{log} 2>&1" |
78 79 | shell: "fgbio --sam-validation-stringency=LENIENT ClipBam -i {input.bam} -o {output} -H true -r {input.ref} > {log} 2>&1" |
93 94 | shell: "samtools fastq -@ {threads} {input.bam} -1 {output.fq1} -2 {output.fq2} 2> {log}" |
107 108 | shell: "samtools fastq -@ {threads} {input.bam} > {output} 2> {log}" |
139 140 | script: "../scripts/plot-primer-clipping.py" |
21 22 | wrapper: "v1.15.1/bio/bwa/index" |
39 40 | wrapper: "v1.15.1/bio/bwa/index" |
56 57 | wrapper: "v1.15.1/bio/bwa/mem" |
68 69 | wrapper: "v1.15.1/bio/picard/markduplicates" |
83 84 | wrapper: "v1.15.1/bio/samtools/calmd" |
30 31 | wrapper: "v1.15.1/bio/fastp" |
52 53 | wrapper: "v1.15.1/bio/fastp" |
14 15 | shell: "curl -sSL https://www.ncbi.nlm.nih.gov/sars-cov-2/download-nuccore-ids > {output} 2> {log}" |
33 34 35 | shell: "((esearch -db nucleotide -query '{params.accession}' | " "efetch -format fasta > {output}) && [ -s {output} ]) 2> {log}" |
58 59 60 61 | shell: "(bigBedToBed http://hgdownload.soe.ucsc.edu/gbdb/wuhCor1/uniprot/unipChainCov2.bb" " -chrom=NC_045512v2 -start=0 -end=29903 {output})" "2>{log}" |
73 74 75 76 | shell: "(cut -f1-12 {input} | sed -e 's/ /-/g' | sed -e 's/NC_045512v2/NC_045512.2/g'" " | gt bed_to_gff3 -featuretype gene -thicktype transcript -blocktype CDS -o {output} -force /dev/stdin )" "2> {log}" |
102 103 | script: "../scripts/fix-protein-gff.py" |
129 130 | shell: "bgzip -c {input} > {output} 2> {log}" |
153 154 155 | shell: "curl -sSL https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/" "master/problematic_sites_sarsCov2.vcf | bgzip -c > {output} 2> {log}" |
165 166 | shell: "mkdir {output} && curl -SL https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20220926.tar.gz | tar zxvf - -C {output} 2> {log}" |
176 177 | shell: "ktUpdateTaxonomy.sh {output} 2> {log}" |
190 191 | shell: "curl -SL -o {output} {params.human_genome} 2> {log}" |
201 202 203 204 205 | shell: "(mkdir -p {output} &&" " curl -L https://github.com/cov-lineages/pangoLEARN/archive/master.tar.gz |" " tar xvz --strip-components=1 -C {output})" " > {log} 2>&1" |
215 216 217 218 219 | shell: "(mkdir -p {output} &&" " curl -L https://github.com/cov-lineages/lineages/archive/master.tar.gz | " " tar xvz --strip-components=1 -C {output})" " > {log} 2>&1" |
229 230 231 232 | shell: "(curl -L -u $GISAID_API_TOKEN https://www.epicov.org/epi3/3p/resseq02/export/provision.json.xz |" " xz -d -T0 > {output})" " > {log} 2>&1" |
247 248 | shell: "sed -E 's/>(\S+)\\b/>{params.lineage}/;t' {input} > {output}" |
265 266 | script: "../scripts/get-strains-from-genbank.py" |
18 19 | script: "../scripts/extract-strains-from-gisaid-provision.py" |
31 32 | shell: "cat {input} > {output}" |
43 44 | wrapper: "v1.15.1/bio/kallisto/index" |
57 58 59 60 61 | shell: "awk 'BEGIN {{ t=0.0;sq=0.0; n=0; }} ;NR%4==2 {{ n++;L=length($0);t+=L;sq+=L*L; }}END{{ m=t/n;printf(\"%f\\n\",m) ; }}' {input} && " "awk 'BEGIN {{ t=0.0;sq=0.0; n=0; }} ;NR%4==2 {{ n++;L=length($0);t+=L;sq+=L*L; }}END{{ m=t/n;printf(\"%f\\n\",sq/n-m*m) ; }}' {input} && " "awk 'BEGIN {{ t=0.0;sq=0.0; n=0; }} ;NR%4==2 {{ n++;L=length($0);t+=L;sq+=L*L; }}END{{ m=t/n;printf(\"%f\\n\",m) ; }}' {input} > {output.avg_read_length} && " "awk 'BEGIN {{ t=0.0;sq=0.0; n=0; }} ;NR%4==2 {{ n++;L=length($0);t+=L;sq+=L*L; }}END{{ m=t/n;printf(\"%f\\n\",sq/n-m*m) ; }}' {input} > {output.standard_deviation}" |
74 75 | wrapper: "v1.15.1/bio/kallisto/quant" |
155 156 | shell: "pangolin {input.contigs} --data {params.pango_data_path} --outfile {output} > {log} 2>&1" |
16 17 | wrapper: "v1.15.1/bio/tabix/index" |
27 28 | wrapper: "v1.15.1/bio/samtools/index" |
40 41 | shell: "bcftools index {input} 2> {log}" |
53 54 | shell: "bcftools sort -O b {input} -o {output} 2> {log}" |
64 65 | wrapper: "v1.15.1/bio/samtools/faidx" |
77 78 | shell: "gzip --keep {input}" |
14 15 | wrapper: "v1.15.1/bio/vep/plugins" |
38 39 | wrapper: "v1.15.1/bio/vep/annotate" |
24 25 | wrapper: "v1.15.1/bio/freebayes" |
41 42 | script: "../scripts/delly.py" |
62 63 64 65 | shell: "(medaka_haploid_variant -i {input.sample} -r {input.ref} -o medaka_tmp/medaka/{wildcards.date}/{wildcards.reference}/{wildcards.sample}" " -t {threads} -m {params.model} && mv medaka_tmp/medaka/{wildcards.date}/{wildcards.reference}/{wildcards.sample}/medaka.annotated.vcf {output} &&" " rm -r medaka_tmp/medaka/{wildcards.date}/{wildcards.reference}/{wildcards.sample}) > {log} 2>&1" |
85 86 87 88 | shell: "(longshot -P 0 -F -A --no_haps --bam {input.bam} --ref {input.ref} --out {output} &&" " sed -i '2 i\##contig=<ID={params.reference_name}>' {output})" " 2> {log}" |
103 104 105 106 107 108 109 110 111 112 113 114 115 | shell: "bcftools view -Oz -o {output} {input} 2> {log}" # TODO -Oz generates a .vcf.gz!! rule render_scenario: input: local(get_resource("scenario.yaml")), output: "results/{date}/scenarios/{sample}.yaml", log: "logs/{date}/render-scenario/{sample}.log", conda: "../envs/unix.yaml" |
116 117 | shell: "sed 's/sample:/{wildcards.sample}:/' {input} > {output}" |
131 132 | shell: "varlociraptor estimate alignment-properties {input.ref} --bam {input.bam} > {output} 2> {log}" |
151 152 153 | shell: "varlociraptor preprocess variants {params.extra} --candidates {input.candidates} " "{input.ref} --bam {input.bam} --max-depth {params.depth} --output {output} 2> {log}" |
168 169 170 171 | shell: "varlociraptor " "call variants {params.biases} generic --obs {wildcards.sample}={input.obs} " "--scenario {input.scenario} > {output} 2> {log}" |
190 191 | wrapper: "v1.15.1/bio/bcftools/concat" |
19 20 | wrapper: "v1.15.1/bio/vembrane/filter" |
37 38 39 | shell: "varlociraptor filter-calls control-fdr --mode local-smart {input} --var {wildcards.vartype} " "--events {params.events} --fdr {params.fdr} > {output} 2> {log}" |
52 53 | wrapper: "v1.15.1/bio/bcftools/concat" |
37 38 39 | shell: "rbt vcf-report {input.ref} --bams {params.bams} --vcfs {params.bcfs} --formats {params.format_field} " "--infos PROB_* -d {params.max_read_depth} -l {params.js_files} -- {output} 2> {log}" |
61 62 | script: "../scripts/ucsc_vcf.py" |
81 82 | shell: "cat {input} > {output} 2> {log}" |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import sys sys.stderr = open(snakemake.log[0], "w") from collections import defaultdict from typing import List import pandas as pd import pysam def aggregate_assembly_comparisons( bam_files: List[str], samples: List[str], output: str ): data = [] for sample, bam_file_path in zip(samples, bam_files): sample_data = defaultdict() with pysam.AlignmentFile(bam_file_path) as bam_file: for record in bam_file: sample_data["Sample"] = sample try: sample_data["Edit distance"] = record.get_tag("NM") except KeyError: sample_data["Edit distance"] = "tag 'NM' not present" sample_data["Cigarstring"] = record.cigarstring data.append(sample_data) pd.DataFrame(data).to_csv(output, sep="\t", index=False) aggregate_assembly_comparisons( snakemake.input, snakemake.params.samples, snakemake.output[0] ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd pangolin_calls = [] assert len(snakemake.input) == len(snakemake.params.samples) assert len(snakemake.input) == len(snakemake.params.stages) for path, sample, stage in zip( snakemake.input, snakemake.params.samples, snakemake.params.stages ): pangolin_call = pd.read_csv(path) pangolin_call["sample"] = sample pangolin_call["stage"] = stage pangolin_calls.append(pangolin_call) pangolin_calls_by_stage = pd.concat(pangolin_calls, axis=0, ignore_index=True) failed_called = (pangolin_calls_by_stage["qc_status"] == "fail") | ( pangolin_calls_by_stage["lineage"] == "None" ) pangolin_calls_by_stage.loc[failed_called, "lineage"] = ( "Call failed. Reason: " + pangolin_calls_by_stage.loc[failed_called, "note"] ) pangolin_calls_by_stage = pangolin_calls_by_stage.pivot( index="sample", columns="stage", values="lineage" ).reset_index() only_pseudo_assembly = ["scaffold", "polished", "masked-polished", "pseudo"] only_consensus_assembly = [ "scaffold", "polished", "masked-polished", "consensus", "masked-consensus", ] both_fallbacks = list( set(only_pseudo_assembly).symmetric_difference(set(only_consensus_assembly)) ) columns = pangolin_calls_by_stage.columns.to_list() if all(stage in columns for stage in both_fallbacks): sorted_columns = [ "sample", "scaffold", "polished", "masked-polished", "consensus", "masked-consensus", "pseudo", ] elif all(stage in columns for stage in only_pseudo_assembly): sorted_columns = ["sample", "scaffold", "polished", "masked-polished", "pseudo"] elif all(stage in columns for stage in only_consensus_assembly): sorted_columns = [ "sample", "scaffold", "polished", "masked-polished", "consensus", "masked-consensus", ] pangolin_calls_by_stage = pangolin_calls_by_stage[sorted_columns] pangolin_calls_by_stage.rename( columns={ "sample": "Sample", "scaffold": "Scaffolded Seq", "polished": "Polished Seq", "masked-polished": "Masked Polished Seq", "pseudo": "Pseudo Seq", "consensus": "Consensus Seq", "masked-consensus": "Masked Consensus Seq", }, inplace=True, ) pangolin_calls_by_stage.fillna("Not called on", inplace=True) # pangolin_calls_by_stage.replace({"None": "Call failed"}, inplace=True) pangolin_calls_by_stage.sort_values(by="Sample", inplace=True) print(pangolin_calls_by_stage) pangolin_calls_by_stage.to_csv(snakemake.output[0], index=False) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd def aggregate_calls(sm_input, sm_output): all_calls = [] for i, file in enumerate(sm_input): call = pd.read_csv(file, sep="\t") call["sample"] = i all_calls.append(call) all_calls = pd.concat(all_calls, axis=0, ignore_index=True) all_calls.to_csv(sm_output, sep="\t", index=False) if __name__ == "__main__": aggregate_calls(snakemake.input, snakemake.output[0]) |
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | sys.stderr = open(snakemake.log[0], "w") import pysam # see https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.cigartuples BAM_CEQUAL = 7 BAM_CDIFF = 8 BAM_CSOFT_CLIP = 4 BAM_CHARD_CLIP = 5 BAM_CINS = 1 BAM_CDEL = 2 N_WINDOW = 100 def is_uncertain_region(record, rpos, rend, ref_fasta): refseq = ref_fasta.fetch(record.reference_name, rpos - N_WINDOW, rend + N_WINDOW) return ("N" * 10) in refseq def get_edit_dist(record, ref_fasta): edit_dist = 0 qpos = 0 rpos = record.reference_start for op, length in record.cigartuples: if op == BAM_CEQUAL: # no edit qpos += length rpos += length else: if op == BAM_CDIFF: refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length) if not is_uncertain_region(record, rpos, rpos + length, ref_fasta): # Only consider edits where the reference has a true nucleotide # because IUPAC codes lead to mason simulating an N in the reads. edit_dist += sum(base in "ACGT" for base in refseq) qpos += length rpos += length elif op == BAM_CSOFT_CLIP: edit_dist += length qpos += length elif op == BAM_CHARD_CLIP: edit_dist += length elif op == BAM_CINS: refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length) if not is_uncertain_region(record, rpos, rpos + length, ref_fasta): # only count edit distance if the region behind the insert does not # contain N stretches in the reference. Rationale: those regions are apparently # hard to assemble, and we cannot properly simulate reads for them, so # we should not count them in a benchmark. edit_dist += length qpos += length elif op == BAM_CDEL: refseq = ref_fasta.fetch(record.reference_name, rpos, rpos + length) if not is_uncertain_region(record, rpos, rpos + length, ref_fasta): # only count edit distance if the region behind the insert does not # contain N stretches in the reference. Rationale: those regions are apparently # hard to assemble, and we cannot properly simulate reads for them, so # we should not count them in a benchmark. 
edit_dist += length rpos += length else: raise ValueError(f"Unsupported CIGAR operation: {op}") return edit_dist with open(snakemake.output[0], "w") as out: print( "Accession", "Contigs", "Total contigs", "Contig length", "Reference length", "Contig frac", "Cum frac", "Relevant edit dist", "Cigar string", "Edit frac", sep="\t", file=out, ) sum_of_edit_dist = 0 largest_no_contig = 0 small_larg_cover = 1.0 small_larg_cover_name = "asd" smallest_cum_frac = 1.0 # for bam_file in snakemake_input: for bam_file, ref_fasta in zip(snakemake.input.bams, snakemake.input.refs): current_contig = 1 ref_fasta = pysam.FastaFile(ref_fasta) with pysam.AlignmentFile(bam_file, "rb") as samfile: total_contigs = samfile.count() accession = samfile.get_reference_name(0) largest_no_contig = ( total_contigs if total_contigs > largest_no_contig else largest_no_contig ) with pysam.AlignmentFile(bam_file, "rb") as samfile: ref_lengths = samfile.lengths[0] largest_coverage = 0 cum_frac = 0 for read in samfile.fetch(): query_alignment_length = read.query_alignment_length frac = round(query_alignment_length / ref_lengths, 2) # Calculate edit distance from CIGAR string, because NM tag counts matching Ns as edits. edit = get_edit_dist(read, ref_fasta) cum_frac += frac cum_frac = round(cum_frac, 2) sum_of_edit_dist = sum_of_edit_dist + edit edit_frac = round(edit / query_alignment_length, 5) largest_coverage = frac if frac > largest_coverage else largest_coverage print( accession, current_contig, total_contigs, query_alignment_length, ref_lengths, frac, cum_frac, edit, read.cigarstring, edit_frac, sep="\t", file=out, ) current_contig += 1 smallest_cum_frac = ( cum_frac if smallest_cum_frac > cum_frac else smallest_cum_frac ) small_larg_cover = ( largest_coverage if small_larg_cover > largest_coverage else small_larg_cover ) small_larg_cover_name = ( accession if small_larg_cover >= largest_coverage else small_larg_cover_name ) print("Largest number of contigs") print(largest_no_contig) print("Smallest largest coverage in " + str(small_larg_cover_name)) print(small_larg_cover) print("Sum of edit distances") print(sum_of_edit_dist) print("Smallest cum. fraction of contigs") print(smallest_cum_frac) print("Largest number of contigs", file=out) print(largest_no_contig, file=out) print("Smallest largest coverage in " + str(small_larg_cover_name), file=out) print(small_larg_cover, file=out) print("Sum of edit distances", file=out) print(sum_of_edit_dist, file=out) print("Smallest cum. fraction of contigs", file=out) print(smallest_cum_frac, file=out) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | import pandas as pd # Function to create a bedpe file from a bed file bed_list = [] with open(snakemake.input[0]) as f: line = f.readlines() for name in line: bed_list.append(name.split()) df_bed = pd.DataFrame( bed_list, columns=["chrom", "start", "end", "name", "score", "strand"] ) df_sense = df_bed.loc[df_bed["strand"] == "+"] df_antisense = df_bed.loc[df_bed["strand"] == "-"] # The dataframes for the sense and antisense strands need to be set to the same index so they can be merged again for the bedpe file df_sense.reset_index(inplace=True) df_antisense.reset_index(inplace=True) data = [ df_sense["chrom"], df_sense["start"], df_sense["end"], df_antisense["chrom"], df_antisense["start"], df_antisense["end"], ] headers = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"] df_bedpe = pd.concat(data, axis=1, keys=headers) df_bedpe.to_csv(snakemake.output[0], header=None, index=None, sep="\t", mode="a") |
6 7 8 9 10 11 12 13 14 15 16 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd all_variants = [] for path in snakemake.input: all_variants.append(pd.read_csv(path, sep="\t")) pd.concat(all_variants).to_csv(snakemake.output[0], index=False, sep="\t") |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | import sys from pysam import VariantFile # sys.stderr = open(snakemake.log[0], "w") tests_cases = [] for path, pos, variant, test_case in zip( snakemake.input, snakemake.params.poses, snakemake.params.variants, snakemake.params.test_cases, ): with VariantFile(path, "rb") as variant_file: for record in variant_file.fetch(): if int(pos) == record.pos: tests_cases.append(test_case) with open(snakemake.output[0], "w") as f: for path in tests_cases: print(f"{path}", file=f) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd from pysam import VariantFile AA_ALPHABET_TRANSLATION = { "Gly": "G", "Ala": "A", "Leu": "L", "Met": "M", "Phe": "F", "Trp": "W", "Lys": "K", "Gln": "Q", "Glu": "E", "Ser": "S", "Pro": "P", "Val": "V", "Ile": "I", "Cys": "C", "Tyr": "Y", "His": "H", "Arg": "R", "Asn": "N", "Asp": "D", "Thr": "T", } def phred_to_prob(phred): if phred is None: return 0 return 10 ** (-phred / 10) def get_AA_variant(record): for ann in record.info["ANN"]: ann = ann.split("|") hgvsp = ann[11] feature = ann[3] if hgvsp: _enssast_id, alteration = hgvsp.split(":", 1) _prefix, alteration = alteration.split(".", 1) for triplet, amino in AA_ALPHABET_TRANSLATION.items(): alteration = alteration.replace(triplet, amino) hgvsp = f"{feature}:{alteration}" return hgvsp def extract_data(variant_file: VariantFile): variants = [] for record in variant_file.fetch(): prob_not_present = phred_to_prob(record.info["PROB_ABSENT"][0]) + phred_to_prob( record.info["PROB_ARTIFACT"][0] ) variant = get_AA_variant(record) if variant != "": variants.append( { "chrom": record.chrom, "pos": record.pos, "variant": variant, # "ANN" : record.info["ANN"], "vaf": record.samples[0]["AF"][0], "prob_not_present": prob_not_present, } ) data = pd.DataFrame(variants) data["prob_present"] = 1 - data["prob_not_present"] data.drop(columns="prob_not_present", inplace=True) data.drop_duplicates(subset=["chrom", "pos", "variant"], inplace=True) return data found_variants_list = [] for illumina_bcf, ont_bcf in zip(snakemake.input.illumina_bcf, snakemake.input.ont_bcf): with VariantFile(illumina_bcf, "rb") as illumina_file: illumina_variants = extract_data(illumina_file) with VariantFile(ont_bcf, "rb") as ont_file: ont_variants = extract_data(ont_file) found_variants = pd.merge( illumina_variants, ont_variants, how="outer", left_on=["chrom", "pos", "variant"], right_on=["chrom", "pos", "variant"], suffixes=("_illumina", "_ont"), ) found_variants_list.append(found_variants) found_variants = pd.concat(found_variants_list) # found_variants.fillna({'vaf_illumina':0.0, 'vaf_ont':0.0, 'prob_present_illumina':0.0, 'prob_present_ont':0.0}, inplace=True) found_variants["difference"] = found_variants["vaf_illumina"].astype( float ) - found_variants["vaf_ont"].astype(float) found_variants.sort_values(by="difference", inplace=True, ascending=False) found_variants.drop(columns=["difference"], inplace=True) found_variants["test_case"] = snakemake.wildcards.test_case found_variants.to_csv(snakemake.output[0], index=False, sep="\t") |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd found_variants = pd.read_csv(snakemake.input[0], sep="\t") different_probs = found_variants.loc[ (found_variants["prob_present_illumina"] != found_variants["prob_present_ont"]) & (found_variants["prob_present_illumina"] > 0) & (found_variants["prob_present_ont"] > 0) ] illumina_only = found_variants.loc[ (found_variants["prob_present_illumina"].isna()) & (found_variants["prob_present_ont"] > 0) ] ont_only = found_variants.loc[ (found_variants["prob_present_illumina"] > 0) & (found_variants["prob_present_ont"].isna() > 0) ] different_probs.to_csv(snakemake.output.different_probs, sep="\t", index=False) illumina_only.to_csv(snakemake.output.illumina_only, sep="\t", index=False) ont_only.to_csv(snakemake.output.ont_only, sep="\t", index=False) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd from snakemake.io import expand test_case_path = "{pos}\t{variant}\tresults/{date}/call-test-cases/ref~main/{sample}.{{varrange}}.chrom~{chrom}.pos~{pos}.bcf\tresults/{date}/candidate-calls/ref~main/{sample}.{{varrange}}.bcf" variants = [] paths = [] for test_case in snakemake.input: variants_to_test = pd.read_csv(test_case, sep="\t") variants.append(variants_to_test) variants = pd.concat(variants) if len(variants) > 0: variants.loc[ variants["vaf_illumina"] < variants["vaf_ont"], "to_test" ] = snakemake.params.illumina variants.loc[ variants["vaf_illumina"] > variants["vaf_ont"], "to_test" ] = snakemake.params.ont sample_table = pd.DataFrame(snakemake.params.sample_table).dropna( subset=["test_case"] ) sample_table = sample_table[["test_case", "technology", "sample_name", "date"]] variants["test_case"] = variants["test_case"].astype(str) variants = pd.merge( variants, sample_table, how="left", left_on=["test_case", "to_test"], right_on=["test_case", "technology"], ) illumina_variants = variants.loc[variants["to_test"] == snakemake.params.illumina] illumina_paths = expand( test_case_path, zip, date=illumina_variants["date"], sample=illumina_variants["sample_name"], chrom=illumina_variants["chrom"], pos=illumina_variants["pos"], variant=illumina_variants["variant"], ) illumina_paths = expand(illumina_paths, varrange=snakemake.params.illumina_varrange) paths.append(illumina_paths) ont_variants = variants.loc[variants["to_test"] == snakemake.params.ont] ont_variants = expand( test_case_path, zip, date=ont_variants["date"], sample=ont_variants["sample_name"], chrom=ont_variants["chrom"], pos=ont_variants["pos"], variant=ont_variants["variant"], ) ont_variants = expand(ont_variants, varrange=snakemake.params.ont_varrange) paths.append(ont_variants) paths = sum(paths, []) with open(snakemake.output.paths, "w") as f: for path in paths: print(f"{path}", file=f) if len(variants) > 0: variants.drop(columns=["technology"]).to_csv( snakemake.output.overview, sep="\t", index=False ) else: pd.DataFrame().to_csv(snakemake.output.overview, sep="\t", index=False) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | import sys sys.stderr = open(snakemake.log[0], "w") # parameter = snakemake.params.get("parameter", "") from shutil import copyfile from Bio import SeqIO def is_fasta(filename): with open(filename, "r") as handle: fasta = SeqIO.parse(handle, "fasta") return any(fasta) # False when `fasta` is empty, i.e. wasn't a FASTA file def check_contigs(sm_input, sm_output): if is_fasta(sm_input): copyfile(sm_input, sm_output) else: with open(sm_output, "w") as write_handle: write_handle.write(f">filler-contig\nN") if __name__ == "__main__": check_contigs(snakemake.input[0], snakemake.output[0]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | sys.stderr = open(snakemake.log[0], "w") import pandas as pd def collect_calls(sm_input, sm_output, states, lineage, number, length): # agg pangolin calls all_pangolin_calls = [] for file, state in zip(sm_input.pangolin, states): call = pd.read_csv(file) call["actual_lineage"] = lineage call["state"] = state all_pangolin_calls.append(call) pangolin_calls = pd.concat(all_pangolin_calls, axis=0, ignore_index=True) pangolin_calls["read_number"] = number pangolin_calls["read_length"] = length pangolin_calls["correct_call"] = ( pangolin_calls["lineage"] == pangolin_calls["actual_lineage"] ) pangolin_calls = pangolin_calls[ [ "lineage", "actual_lineage", "read_number", "read_length", "correct_call", "state", ] ] # add kallisto calls call = pd.read_csv(sm_input.kallisto[0], sep="\t") call = call.iloc[[call["fraction"].idxmax()]] call["actual_lineage"] = lineage call["read_number"] = number call["read_length"] = length call["correct_call"] = call["target_id"] == call["actual_lineage"] call["state"] = "read" call.rename(columns={"target_id": "lineage"}, inplace=True) call = call[ [ "lineage", "actual_lineage", "read_number", "read_length", "correct_call", "state", ] ] # bring them together call = pd.concat([pangolin_calls, call]) call.to_csv(sm_output, sep="\t", index=False) if __name__ == "__main__": collect_calls( snakemake.input, snakemake.output[0], snakemake.params.get("states", ""), snakemake.wildcards.lineage.replace("-", "."), snakemake.wildcards.number, snakemake.wildcards.length, ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | import sys from collections import defaultdict, namedtuple from enum import Enum from itertools import product sys.stderr = open(snakemake.log[0], "w") import gffutils import numpy as np import requests from dnachisel.biotools import get_backtranslation_table, translate from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord from requests.models import ContentDecodingError covariants_data = requests.get( "https://raw.githubusercontent.com/hodcroftlab/covariants/master/web/data/clusters.json" ).json() translate_aa = get_backtranslation_table("Standard") gff = gffutils.create_db(snakemake.input.annotation, dbfn=":memory:") gene_start = {gene["gene_name"][0]: gene.start for gene in gff.features_of_type("gene")} gene_end = {gene["gene_name"][0]: gene.end for gene in gff.features_of_type("gene")} def aa_to_dna(aa_seq): return ( "".join(combination) for combination in product(*[translate_aa[aa] for aa in aa_seq]) ) def codon_equivalence_class(dna_seq): aa = translate(dna_seq) return aa_to_dna(aa) class VariantType(Enum): Ins = 1 Del = 2 Subst = 3 class SynonymousVariant: def __init__(self, left, pos, right): self.left = left self.pos = pos self.right = right def __eq__(self, other): return ( self.left == other.left and self.right == other.right and self.pos == other.pos ) def __hash__(self): return hash((self.left, self.pos, self.right)) def __lt__(self, other): return self.pos < other.pos def is_same_feature(self, other): return True def variant_type(self): if self.left == "-": return VariantType.Ins elif self.right == "-": return VariantType.Del else: return VariantType.Subst def genome_pos(self): return self.pos - 1 def signature(self): return f"{self.left}{self.pos}{self.right}" def __repr__(self): return repr(self.signature()) class NonSynonymousVariant(SynonymousVariant): def __init__(self, left, pos, right, gene): super().__init__(left, pos, right) self.gene = gene def __eq__(self, other): return super().__eq__(other) and self.gene == other.gene def __hash__(self): return hash((self.left, self.pos, self.right, self.gene)) def is_same_feature(self, other): return self.gene == other.gene def genome_pos(self): return gene_start[self.gene] - 1 + (self.pos - 1) * 3 def signature(self): return f"{self.gene}:{self.left}{self.pos}{self.right}" def is_in_first_codon(self): return self.pos == 1 def 
is_in_last_codon(self): aa_len = gene_end[self.gene] - gene_start[self.gene] / 3 assert self.pos <= aa_len return self.pos == aa_len with FastaFile(snakemake.input.reference) as infasta: assert infasta.nreferences == 1 contig = infasta.references[0] ref_len = infasta.lengths[0] header = VariantHeader() header.add_line(f"##contig=<ID={contig},length={ref_len}") header.add_line( '##INFO=<ID=SIGNATURES,Number=.,Type=String,Description="Variant signature as obtained from covariants.org">' ) header.add_line( '##INFO=<ID=LINEAGES,Number=.,Type=String,Description="Lineages having this variant">' ) known_non_synonymous_variants = defaultdict(set) for lineage_entry in covariants_data["clusters"]: if ( "mutations" in lineage_entry and "nonsynonymous" in lineage_entry["mutations"] ): for variant in lineage_entry["mutations"]["nonsynonymous"]: variant = NonSynonymousVariant(**variant) if variant.gene in gene_start: known_non_synonymous_variants[variant].add( lineage_entry["build_name"] ) else: print( f"Skipping variant at {variant.gene} because gene is not in given GFF annotation.", file=sys.stderr, ) known_synonymous_variants = defaultdict(set) for lineage_entry in covariants_data["clusters"]: if "mutations" in lineage_entry and "synonymous" in lineage_entry["mutations"]: for variant in lineage_entry["mutations"]["synonymous"]: known_synonymous_variants[SynonymousVariant(**variant)].add( lineage_entry["build_name"] ) with VariantFile(snakemake.output[0], "wb", header=header) as outvcf: def get_variants(all_variants, variant_type, merge=True): filtered_variants = sorted( filter( lambda item: item[0].variant_type() == variant_type, all_variants.items(), ) ) if not merge: yield from filtered_variants else: def process_batch(batch, batch_lineages): # Step 1: collect all visited lineages in batch all_lineages = np.array( list( set( lineage for lineages in batch_lineages for lineage in lineages ) ) ) # Step 2: build matrix of variants vs lineages (columns mark combinations of variants that can be merged) lineage_matrix = np.array( [ [(lineage in lineages) for lineage in all_lineages] for lineages in batch_lineages ] ) # Step 3: remove duplicate columns if len(lineage_matrix) > 0: lineage_matrix = np.unique(lineage_matrix, axis=1) # Step 4: iterate over combinations batch = np.array(batch) batch_lineages = np.array(batch_lineages) for variant_combination in lineage_matrix.T: # select variants and lineages variants = batch[variant_combination] lineages = set.intersection( *batch_lineages[variant_combination] ) # yield them in consecutive inner batches last_pos = None inner_batch_start = 0 for i, variant in enumerate(variants): if last_pos is not None and variant.pos != last_pos + 1: # yield inner batch yield variants[inner_batch_start:i], lineages inner_batch_start = i last_pos = variant.pos yield variants[inner_batch_start:], lineages batch = [] batch_lineages = [] for variant, lineages in filtered_variants: if not batch or ( variant.pos == batch[-1].pos + 1 and variant.is_same_feature(batch[-1]) ): batch.append(variant) batch_lineages.append(lineages) else: # yield and remove the last batch yield from process_batch(batch, batch_lineages) # clear and start with new batch batch = [variant] batch_lineages = [lineages] yield from process_batch(batch, batch_lineages) def write_record(pos, ref_allele, alt_allele, lineages, variants): record = outvcf.new_record() record.contig = contig record.alleles = (ref_allele, alt_allele) record.pos = pos + 1 # pysam expects 1-based positions here record.info["LINEAGES"] = 
",".join(lineages) record.info["SIGNATURES"] = ",".join( variant.signature() for variant in variants ) outvcf.write(record) for variants, lineages in get_variants( known_synonymous_variants, VariantType.Ins ): pos = variants[0].genome_pos() - 1 ref_allele = infasta.fetch(reference=contig, start=pos, end=pos + 1) alt_allele = ref_allele + "".join(variant.right for variant in variants) write_record(pos, ref_allele, alt_allele, lineages, variants) for variants, lineages in get_variants( known_synonymous_variants, VariantType.Del ): pos = variants[0].genome_pos() - 1 alt_allele = infasta.fetch(reference=contig, start=pos, end=pos + 1) ref_allele = alt_allele + "".join(variant.left for variant in variants) write_record(pos, ref_allele, alt_allele, lineages, variants) for variant, lineages in get_variants( known_synonymous_variants, VariantType.Subst, merge=False ): pos = variant.genome_pos() write_record(pos, variant.left, variant.right, lineages, [variant]) for variants, lineages in get_variants( known_non_synonymous_variants, VariantType.Ins ): pos = variants[0].genome_pos() assert not variants[ 0 ].is_in_first_codon(), "unsupported insertion: is in first codon of protein" assert not variants[ -1 ].is_in_last_codon(), "unsupported insertion: is in last codon of protein" # METHOD: add an unchanged codon before and after the actual variant ref_allele = infasta.fetch(reference=contig, start=pos - 3, end=pos + 3) pre_codons = codon_equivalence_class( infasta.fetch(reference=contig, start=pos - 3, end=pos) ) post_codons = codon_equivalence_class( infasta.fetch(reference=contig, start=pos, end=pos + 3) ) for pre_codon, post_codon in product(pre_codons, post_codons): for ins_seq in aa_to_dna( "".join(variant.right for variant in variants) ): alt_allele = pre_codon + ins_seq + post_codon write_record(pos - 3, ref_allele, alt_allele, lineages, variants) for variants, lineages in get_variants( known_non_synonymous_variants, VariantType.Del ): variant = variants[0] pos = variants[0].genome_pos() del_len = len(variants) * 3 assert not variants[ 0 ].is_in_first_codon(), "unsupported deletion: is in first codon of protein" assert not variants[ -1 ].is_in_last_codon(), "unsupported deletion: is in last codon of protein" # METHOD: add an unchanged codon before and after the actual variant # in order to capture ambiguity in the alignment # before the potential deletion pre_codons = codon_equivalence_class( infasta.fetch(reference=contig, start=pos - 3, end=pos) ) post_codons = codon_equivalence_class( infasta.fetch( reference=contig, start=pos + del_len, end=pos + del_len + 3 ) ) # ref allele including the unchanged codons ref_allele = infasta.fetch( reference=contig, start=pos - 3, end=pos + del_len + 3 ) for pre_codon, post_codon in product(pre_codons, post_codons): alt_allele = pre_codon + post_codon write_record(pos - 3, ref_allele, alt_allele, lineages, variants) for variant, lineages in get_variants( known_non_synonymous_variants, VariantType.Subst, merge=False ): pos = variant.genome_pos() ref_allele = infasta.fetch(reference=contig, start=pos, end=pos + 3) for alt_allele in aa_to_dna(variant.right): write_record(pos, ref_allele, alt_allele, lineages, [variant]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import pysam from snakemake.shell import shell exclude = ( "-x {}".format(snakemake.input.exclude) if snakemake.input.get("exclude", "") else "" ) extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) with pysam.AlignmentFile(snakemake.input.sample) as bam: if bam.mapped < 10000: # Not enough reads to perform SV calling. # Output empty BCF. header = pysam.VariantHeader() # Retrieve and record reference lengths. ref = pysam.FastaFile(snakemake.input.ref) for contig in ref.references: n = ref.get_reference_length(contig) header.add_line("##contig=<ID={},length={}>".format(contig, n)) # Write BCF. with pysam.VariantFile(snakemake.output[0], "wb", header=header) as bcf: exit(0) shell( "OMP_NUM_THREADS={snakemake.threads} delly call {extra} " "{exclude} -g {snakemake.input.ref} " "-o {snakemake.output[0]} {snakemake.input.sample} {log}" ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | import sys sys.stderr = open(snakemake.log[0], "w") from os import name import numpy as np import pandas as pd def extract_mixture_sample(path, prefix, separator, percentage, mix, max_reads): path = path.split(prefix + separator)[-1] path = path.split(".strains")[0] path = path.replace("-", ".") path = path.split(separator) path = [ele.split(percentage) for ele in path] df = pd.DataFrame(path, columns=["target_id", "true_fraction"]) df["mix"] = mix df["true_fraction"] = df["true_fraction"].str.replace(".polished", "") df["true_fraction"] = df["true_fraction"].astype(int) df["true_fraction"] = df["true_fraction"] / 100 df["true_counts"] = round(df["true_fraction"] * int(max_reads)) return df def rmse(predictions, targets): return np.sqrt(((predictions - targets) ** 2).mean()) def load_kallisto_df(i, path): kallisto_df = pd.read_csv(path, delimiter="\t") kallisto_df["mix"] = i kallisto_df = kallisto_df.rename(columns={"fraction": "est_fraction"}) return kallisto_df def load_pangolin_df(i, path): pangolin_df = pd.read_csv(path, delimiter=",") pangolin_df.rename( columns={"lineage": "target_id", "scorpio_support": "est_fraction"}, inplace=True, ) pangolin_df.drop(columns=["taxon", "qc_status", "note"], inplace=True) pangolin_df["mix"] = i return pangolin_df def eval_error(paths, sm_output, max_reads, prefix, separator, percentage, load_func): results_df = pd.DataFrame() for i, path in enumerate(paths): df = load_func(i, path) org_mix_df = extract_mixture_sample( path, prefix, separator, percentage, i, max_reads ) df = df.merge(org_mix_df, how="outer").fillna(0) results_df = pd.concat([results_df, df]) for sample in results_df["mix"].unique(): sample_rmse = rmse( results_df[results_df["mix"] == sample]["est_fraction"], results_df[results_df["mix"] == sample]["true_fraction"], ) results_df.loc[results_df["mix"] == sample, "rmse"] = sample_rmse results_df.set_index(["mix", "rmse", "target_id"], inplace=True) results_df.to_csv(sm_output, sep="\t") max_reads = snakemake.params.max_reads prefix = snakemake.params.prefix separator = snakemake.params.separator percentage = snakemake.params.percentage if snakemake.wildcards.caller == "pangolin": load_func = load_pangolin_df elif snakemake.wildcards.caller == "kallisto": load_func = load_kallisto_df else: raise ValueError("unexpected value for wildcards.caller") eval_error( snakemake.input, snakemake.output[0], max_reads, prefix, separator, percentage, load_func, ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | import sys sys.stderr = open(snakemake.log[0], "w") import pysam sars_cov2_id, _ = snakemake.params.reference_genome[0].split(".", 1) def is_sars_cov2(record, mate=False): if mate: return record.next_reference_name.startswith(sars_cov2_id) else: return record.reference_name.startswith(sars_cov2_id) with pysam.AlignmentFile(snakemake.input.bam, "rb") as inbam: with pysam.AlignmentFile(snakemake.output[0], "wb", template=inbam) as outbam: for record in inbam: if record.is_paired: if ( (record.is_unmapped and record.mate_is_unmapped) or (is_sars_cov2(record) and record.mate_is_unmapped) or (is_sars_cov2(record, mate=True) and record.is_unmapped) or (is_sars_cov2(record) and is_sars_cov2(record, mate=True)) ): outbam.write(record) else: if record.is_unmapped or is_sars_cov2(record): outbam.write(record) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | import sys sys.stderr = open(snakemake.log[0], "w") import os import numpy as np import pandas as pd def extract_strains_from_provision( path_to_provision: str, path_to_strain_summary: str, path_to_strain_genomes: str ): # select strain genomes provision = pd.DataFrame() chunks = pd.read_json(path_to_provision, lines=True, chunksize=9000) for i, chunk in enumerate(chunks): print(f"Parsing chunk {i}", file=sys.stderr) provision = pd.concat( [provision, select_oldest_strains(chunk)], ignore_index=True ) provision = select_oldest_strains(provision) # save strain genomes provision["covv_lineage"] = provision["covv_lineage"].str.replace("/", "_") provision["covv_lineage_fasta"] = provision["covv_lineage"].values + ".fasta" np.vectorize(write_sequence)( provision["covv_lineage"].values, provision["covv_lineage_fasta"].values, provision["sequence"].values, path_to_strain_genomes, ) # save strain genome summary provision["covv_lineage_fasta"] = np.vectorize(os.path.join)( path_to_strain_genomes, provision["covv_lineage_fasta"].values ) provision["covv_lineage_fasta"].to_csv( path_to_strain_summary, sep="\n", header=False, index=False ) print(provision, file=sys.stderr) def select_oldest_strains(df: pd.DataFrame): # covv_lineage -> pangolin lineage # n_content -> share of nan in seq # covv_subm_date -> submission date of seq # covv_host -> host of the seq # is_complete -> seq ist complete preselect_filter = ( (df["covv_host"] == "Human") & (df["is_complete"] == True) & (df["covv_lineage"] != "None") & (df["covv_lineage"] != "") & ~(df["covv_lineage"].str.contains("\(")) & ~(df["covv_lineage"].str.contains("\)")) ) cols_of_interesst = ["covv_lineage", "n_content", "covv_subm_date"] df = df.copy() df.dropna(subset=cols_of_interesst, inplace=True) df = df[preselect_filter] df.sort_values(by=cols_of_interesst, inplace=True) df.drop_duplicates(subset=["covv_lineage"], inplace=True) return df def write_sequence( covv_lineage: str, covv_lineage_fasta: str, sequence: str, out_path: str ): os.makedirs(out_path, exist_ok=True) print(f"{covv_lineage_fasta}", file=sys.stderr) genome_file = os.path.join(out_path, covv_lineage_fasta) if not os.path.isfile(genome_file): with open(genome_file, "w") as f: print(f">{covv_lineage}", file=f) print(f"{sequence}", file=f) extract_strains_from_provision( path_to_provision=snakemake.input[0], path_to_strain_summary=snakemake.output[0], path_to_strain_genomes=snakemake.params.save_strains_to, ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | import sys sys.stderr = open(snakemake.log[0], "w") import re import pandas as pd def set_id(info_string: str, format: str) -> str: if "ID=" in info_string: return info_string infos = dict(x.split("=") for x in info_string.split(";")) info_string = f"ID={format}-{infos['Name']};" + info_string return info_string def fix_transcipt(info_string: str, type: str, to_change: str, true_parent: str) -> str: if type != to_change: return info_string infos = dict(x.split("=") for x in info_string.split(";")) return re.sub("Parent=?.*;", f'Parent={true_parent}-{infos["Name"]};', info_string) gff = pd.read_csv(snakemake.input[0], sep="\t", header=None) info_column = gff.columns[-1] format_column = gff.columns[2] exons = gff.loc[gff[format_column] == "CDS"].copy() exons.replace(to_replace="CDS", value="exon", inplace=True) gff = pd.concat([gff, exons]) gff[info_column] = gff.apply(lambda x: set_id(x[info_column], x[format_column]), axis=1) gff[info_column] = gff.apply( lambda x: fix_transcipt( x[info_column], x[format_column], to_change="CDS", true_parent="transcript" ), axis=1, ) gff[info_column] = gff.apply( lambda x: fix_transcipt( x[info_column], x[format_column], to_change="exon", true_parent="transcript" ), axis=1, ) gff.loc[gff[format_column] == "gene", info_column] = gff.loc[ gff[format_column] == "gene", info_column ].apply(lambda x: x + ";biotype=protein_coding") gff[format_column].replace(to_replace="transcript", value="mRNA", inplace=True) gff.to_csv(snakemake.output[0], header=False, index=False, sep="\t") |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd from pandas._typing import FilePathOrBuffer summary = pd.DataFrame() def register_quality_data(path_to_type_summary: FilePathOrBuffer, assembly_type: str): if path_to_type_summary != "resources/genomes/main.fasta": global summary quality_data = pd.read_csv(path_to_type_summary, sep="\t", index_col="Sample") quality_data["filter"] = ( quality_data["identity"] > snakemake.params.min_identity ) & (quality_data["n_share"] < snakemake.params.max_n) quality_data[["identity", "n_share"]] = quality_data[ ["identity", "n_share"] ].applymap(lambda x: "{:,.2f}%".format(x * 100)) quality_data.rename( columns={ "identity": "{}: Identity".format(assembly_type), "n_share": "{}: Share N".format(assembly_type), "filter": "{}: Pass Filter".format(assembly_type), }, inplace=True, ) summary = pd.concat([summary, quality_data], axis=1) register_quality_data(snakemake.input.de_novo, "De Novo") register_quality_data(snakemake.input.pseudo, "Pseudo") register_quality_data(snakemake.input.consensus, "Consensus") summary.to_csv(snakemake.output[0]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd import pysam # No samples make it past the quality filter if snakemake.input.contigs == "resources/genomes/main.fasta": # write empty fasta file with open(snakemake.output.fasta, "w") as outfile: pass # Write empty csv file pd.DataFrame( columns=[ "SENDING_LAB", "DATE_DRAW", "SEQ_TYPE", "SEQ_REASON", "SAMPLE_TYPE", "PUBLICATION_STATUS", "OWN_FASTA_ID", ] ).to_csv(snakemake.output.table, sep=";", index=False) else: # Aggregating fasta files sequence_names = [] include_flag = [] sample_dict = {} for sample in snakemake.params.includeflag: sample_dict.update(sample) with open(snakemake.output.fasta, "w") as outfile: for file in snakemake.input.contigs: with pysam.FastxFile(file) as infile: for entry in infile: sequence_names.append(entry.name) to_include = int(sample_dict.get(entry.name)) include_flag.append(to_include) if to_include: print(f">{entry.name}", file=outfile) print(entry.sequence, file=outfile) # Creating csv-table csv_table = pd.DataFrame( { "SENDING_LAB": snakemake.params.sending_lab_number, "DATE_DRAW": snakemake.params.date_draw, "SEQ_TYPE": snakemake.params.seq_type, "SEQ_REASON": "N", "SAMPLE_TYPE": "s001", "PUBLICATION_STATUS": "N", "OWN_FASTA_ID": sequence_names, "include": include_flag, } ) # Only include samples with include flag csv_table["include"] = csv_table["include"].astype(int) csv_table = csv_table[csv_table["include"] == 1] csv_table.drop(columns=["include"], inplace=True) # Final touches csv_table.sort_values(by="OWN_FASTA_ID", inplace=True) csv_table.to_csv(snakemake.output.table, sep=";", index=False) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | import re import sys sys.stderr = open(snakemake.log[0], "w") import gffutils import numpy as np import pandas as pd import pysam def phred_to_prob(phred): if phred is None: return pd.NA return 10 ** (-phred / 10) # np.prod returns 1's as values for a pd series with NaN's. A list would return NaN's # switched for use of minimum # def prod_prob_not_present(probs): # if pd.isna(probs).any(): # return pd.NA # else: # return np.prod(probs) def min_prob_not_present(probs): if pd.isna(probs).any(): return pd.NA else: return np.min(probs) def has_numbers(inputString): return any(char.isdigit() for char in inputString) def add_number_suffix(number): number = str(number) if number.endswith("1") and number != "11": return f"{number}st" elif number.endswith("2") and number != "12": return f"{number}nd" elif number.endswith("3") and number != "13": return f"{number}rd" else: return f"{number}th" def rename_enumeration(list_length): return [add_number_suffix(x) for x in range(1, list_length + 1)] variants_df = pd.DataFrame() lineage_df = pd.DataFrame() # read generated variant file and extract all variants with pysam.VariantFile(snakemake.input.variant_file, "rb") as infile: for record in infile: if "SIGNATURES" in record.info: signatures = record.info.get("SIGNATURES", ("#ERROR0",)) vaf = record.samples[0]["AF"][0] dp = record.samples[0]["DP"] prob_not_present = phred_to_prob( record.info["PROB_ABSENT"][0] ) + phred_to_prob(record.info["PROB_ARTIFACT"][0]) if pd.isna(prob_not_present): vaf = pd.NA lineages = record.info["LINEAGES"] for signature in signatures: # generate df with all signatures + VAF and Prob_not_present from calculation variants_df = pd.concat( [ variants_df, pd.DataFrame( { "Frequency": vaf, "Mutations": signature, "Prob_not_present": prob_not_present, "ReadDepth": dp, }, index=[0], ), ], ignore_index=True, ) lineage_df = pd.concat( [ lineage_df, pd.DataFrame( { "Mutations": [signature], **{ lineage.replace(".", " "): "x" for lineage in lineages }, }, index=[0], ), ], ignore_index=True, ) # aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present variants_df = variants_df.groupby(["Mutations"]).agg( func={ "Frequency": lambda x: min(sum(x), 1.0), "Prob_not_present": min_prob_not_present, "ReadDepth": np.min, }, axis=1, ) # new column for 1-prob_not_present = prob_present variants_df["Probability"] = 1.0 - variants_df["Prob_not_present"] variants_df["Prob X VAF"] = variants_df["Probability"] * variants_df["Frequency"] lineage_df = lineage_df.drop_duplicates() # merge duplicated mutations lineage_df = lineage_df.fillna(0) lineage_df = lineage_df.replace({"x": 1}) lineage_df = ( 
lineage_df.groupby(["Mutations"]) .agg(func={column: np.max for column in lineage_df.columns}) .reset_index(drop=True) ) lineage_df = lineage_df.replace({1: "x", 0: ""}) # calculate Jaccard coefficient for each lineage # iterate over lineages in columns (mutations as index) lineage_df.set_index("Mutations", inplace=True) jaccard_coefficient = {} for lineage in lineage_df.columns: lineage_defining_variants = variants_df.index.isin( lineage_df.index[lineage_df[lineage] == "x"] ) lineage_defining_non_variants = ~lineage_defining_variants jaccard_coefficient[lineage] = round( ( variants_df[lineage_defining_variants]["Prob X VAF"].sum() + variants_df[lineage_defining_non_variants]["Prob_not_present"].sum() ) / len(variants_df), 3, ) jaccard_row = pd.DataFrame( {"Mutations": "Similarity", **jaccard_coefficient}, index=[0] ) # merge variants dataframe and lineage dataframe variants_df = variants_df.merge(lineage_df, left_index=True, right_index=True) # add feature column for sorting variants_df["Features"] = variants_df.index.to_series().str.extract(r"(.+)[:].+|\*") # position of variant for sorting and change type variants_df["Position"] = variants_df.index.to_series().str.extract( r"(.*:?[A-Z]+|\*$|-)([0-9]+)([A-Z]+$|\*$|-)$" )[1] variants_df = variants_df.astype({"Position": "int64"}) # generate sorting list from .gff with correct order of features gff = gffutils.create_db(snakemake.input.annotation, dbfn=":memory:") gene_start = {gene["gene_name"][0]: gene.start for gene in gff.features_of_type("gene")} sorter = [k[0] for k in sorted(gene_start.items(), key=lambda item: item[1])] sorterIndex = dict(zip(sorter, range(len(sorter)))) variants_df["Features_Rank"] = variants_df["Features"].map(sorterIndex) # row for lineage name after renaming columns (column names can't be formatted) lineages_row_df = pd.DataFrame( { "Mutations": "Lineage", **{x: x for x in list(lineage_df.columns) if x != "Mutations"}, }, index=[0], ) # concat row with Jaccard coefficient, drop unneccesary columns, sort with Jaccard coefficient, round variants_df.reset_index(inplace=True) variants_df = pd.concat([jaccard_row, variants_df]) variants_df = pd.concat([lineages_row_df, variants_df]) variants_df = variants_df.round({"Probability": 2, "Frequency": 2}) variants_df.set_index("Mutations", inplace=True) variants_df.sort_values( by="Similarity", axis=1, na_position="first", ascending=False, inplace=True ) # rename hits ascending variants_df.rename( columns={ x: y for x, y in zip( list(variants_df.columns)[8:], rename_enumeration(len(list(variants_df.columns)[8:])), ) }, errors="raise", inplace=True, ) # sort final DF variants_df.loc[ variants_df["1st"] == "x", "Order", ] = 1 variants_df.loc[ variants_df["1st"] != "x", "Order", ] = 2 variants_df.at["Similarity", "Order"] = 0 variants_df.at["Lineage", "Order"] = 0 variants_df["Prob X VAF"].replace([0, 0.0], np.NaN, inplace=True) variants_df.sort_values( by=["Order", "Features_Rank", "Position"], ascending=[True, True, True], na_position="last", inplace=True, ) # drop unwanted columns variants_df.drop( columns=[ "Prob_not_present", "Prob X VAF", "Features", "Position", "Features_Rank", "Order", ], inplace=True, ) all_columns = variants_df.columns first_columns = ["Probability", "Frequency", "ReadDepth"] rest_columns = [item for item in all_columns if item not in first_columns] variants_df = variants_df[[*first_columns, *rest_columns]] # drop other lineages, top 10 only variants_df.drop(variants_df.columns[13:], axis=1, inplace=True) # output variant_df 
variants_df.to_csv(snakemake.output.variant_table, index=True, sep=",") |
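The `phred_to_prob` helper above converts the PHRED-scaled posterior probabilities from the variant calls (`PROB_ABSENT`, `PROB_ARTIFACT`) into linear probabilities via p = 10^(-Q/10). A minimal sanity check of that conversion, leaving out the `None`/`pd.NA` handling of the original:

```python
import math


def phred_to_prob(phred: float) -> float:
    # A PHRED-scaled value Q encodes a probability of 10^(-Q / 10).
    return 10 ** (-phred / 10)


assert math.isclose(phred_to_prob(10), 0.1)   # Q = 10 -> 10 %
assert math.isclose(phred_to_prob(20), 0.01)  # Q = 20 -> 1 %
assert math.isclose(phred_to_prob(0), 1.0)    # Q = 0  -> certainty
```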
5 6 7 8 9 | import sys
sys.stderr = open(snakemake.log[0], "w")

with open(snakemake.output[0], "w") as out:
    print(*snakemake.params.mixtures, sep="\n", file=out) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | import sys sys.stderr = open(snakemake.log[0], "w") import json import pandas as pd import pysam KRAKEN_FILTER_KRITERIA = "D" def iter_with_samples(inputfiles): return zip(snakemake.params.samples, inputfiles) def is_patient_report(): return snakemake.params.mode == "patient" data = pd.DataFrame(index=snakemake.params.samples) # add kraken estimates species_columns = pd.DataFrame() for sample, file in iter_with_samples(snakemake.input.kraken): kraken_results = pd.read_csv( file, delimiter="\t", names=["%", "covered", "assigned", "code", "taxonomic ID", "name"], ) kraken_results["name"] = kraken_results["name"].str.strip() keep_rows = ( (kraken_results["code"] == KRAKEN_FILTER_KRITERIA) | ( kraken_results["name"] == "Severe acute respiratory syndrome-related coronavirus" ) | (kraken_results["name"] == "unclassified") ) kraken_results = kraken_results.loc[keep_rows, ["%", "name"]].set_index("name").T eukaryota = "Eukaryota (%)" bacteria = "Bacteria (%)" viruses = "Viruses (%)" sars_cov2 = "thereof SARS (%)" unclassified = "Unclassified (%)" colnames = { "Eukaryota": eukaryota, "Bacteria": bacteria, "Viruses": viruses, "Severe acute respiratory syndrome-related coronavirus": sars_cov2, "unclassified": unclassified, } kraken_results.rename(columns=colnames, inplace=True) kraken_results = kraken_results.reindex( columns=[eukaryota, bacteria, viruses, sars_cov2, unclassified] ).fillna(0) kraken_results["sample"] = sample species_columns = pd.concat([species_columns, kraken_results], ignore_index=True) data = data.join(species_columns.set_index("sample")) # add numbers of raw reads for sample, file in iter_with_samples(snakemake.input.reads_raw): if "fastq-read-counts" in file: with open(file) as infile: number_reads = infile.read().strip() else: with open(file) as infile: number_reads = json.load(infile)["summary"]["before_filtering"][ "total_reads" ] data.loc[sample, "Raw Reads (#)"] = int(int(number_reads) / 2) # add numbers of trimmed reads for sample, file in iter_with_samples(snakemake.input.reads_trimmed): if "fastq-read-counts" in file: with open(file) as infile: number_reads = infile.read().strip() else: with open(file) as infile: number_reads = json.load(infile)["summary"]["after_filtering"][ "total_reads" ] data.loc[sample, "Trimmed Reads (#)"] = int(int(number_reads) / 2) # add numbers of reads used for assembly for sample, file in iter_with_samples(snakemake.input.reads_used_for_assembly): with open(file) as infile: data.loc[sample, "Filtered Reads (#)"] = int(infile.read()) def register_contig_lengths(assemblies, name): for sample, file in 
iter_with_samples(assemblies): if file == "resources/genomes/main.fasta": data.loc[sample, name] = 0 else: with pysam.FastxFile(file) as infile: data.loc[sample, name] = max(len(contig.sequence) for contig in infile) if is_patient_report(): # add lengths of Initial contigs register_contig_lengths(snakemake.input.initial_contigs, "Largest Contig (bp)") # add lengths of polished contigs register_contig_lengths(snakemake.input.polished_contigs, "De Novo Sequence (bp)") # add lengths of pseudo assembly register_contig_lengths(snakemake.input.pseudo_contigs, "Pseudo Sequence (bp)") # add lengths of Consensus assembly register_contig_lengths( snakemake.input.consensus_contigs, "Consensus Sequence (bp)" ) # add type of assembly use: for ele in snakemake.params.assembly_used: sample, used = ele.split(",") if "pseudo" == used: data.loc[sample, "Best Quality"] = "Pseudo" elif "normal" == used: data.loc[sample, "Best Quality"] = "De Novo" elif "consensus" == used: data.loc[sample, "Best Quality"] = "Consensus" elif "not-accepted" == used: data.loc[sample, "Best Quality"] = "-" # add pangolin results for sample, file in iter_with_samples(snakemake.input.pangolin): pangolin_results = pd.read_csv(file) assert ( pangolin_results.shape[0] == 1 ), "unexpected number of rows (>1) in pangolin results" lineage = pangolin_results.loc[0, "lineage"] scorpio = pangolin_results.loc[0, "scorpio_call"] if lineage == "None": pangolin_call = "no strain called" else: # TODO parse scorpio output # match = re.match( # "((?P<varcount>\d+/\d+) .+ SNPs$)|(seq_len:\d+)$|($)", # pangolin_results.fillna("").loc[0, "note"].strip(), # ) # assert ( # match is not None # ), "unexpected pangolin note, please update above regular expression" # varcount = match.group("varcount") or "" # if varcount: # varcount = f" ({varcount})" # pangolin_call = f"{lineage}{varcount}" pangolin_call = f"{lineage}" data.loc[sample, "Pango Lineage"] = pangolin_call if scorpio == "None": scorpio_call = "-" else: scorpio_call = f"{scorpio}" data.loc[sample, "WHO Label"] = scorpio_call data["WHO Label"].fillna("-", inplace=True) data["WHO Label"].replace({"nan": "-"}, inplace=True) # add variant calls AA_ALPHABET_TRANSLATION = { "Gly": "G", "Ala": "A", "Leu": "L", "Met": "M", "Phe": "F", "Trp": "W", "Lys": "K", "Gln": "Q", "Glu": "E", "Ser": "S", "Pro": "P", "Val": "V", "Ile": "I", "Cys": "C", "Tyr": "Y", "His": "H", "Arg": "R", "Asn": "N", "Asp": "D", "Thr": "T", } for sample, file in iter_with_samples(snakemake.input.bcf): mutations_of_interest = {} other_mutations = {} def insert_entry(variants, hgvsp, vaf): prev_vaf = variants.get(hgvsp) if prev_vaf is None or prev_vaf < vaf: # Only insert if there was no entry before or it had a smaller vaf. # Such duplicate calls can occur if there are multiple genomic variants # that lead to the same protein alteration. # We just report the protein alteration here, so what matters to us is the # variant call with the highest VAF. # TODO in principle, the different alterations could even be complementary. # Hence, one could try to determine that and provide a joint vaf. 
variants[hgvsp] = vaf def fmt_variants(variants): return " ".join(sorted(f"{hgvsp}:{vaf:.3f}" for hgvsp, vaf in variants.items())) with pysam.VariantFile(file, "rb") as infile: for record in infile: vaf = record.samples[0]["AF"][0] for ann in record.info["ANN"]: ann = ann.split("|") hgvsp = ann[11] enssast_id = ann[6] feature = ann[3] if hgvsp: # TODO think about regex instead of splitting enssast_id, alteration = hgvsp.split(":", 1) _prefix, alteration = alteration.split(".", 1) for triplet, amino in AA_ALPHABET_TRANSLATION.items(): alteration = alteration.replace(triplet, amino) hgvsp = f"{feature}:{alteration}" entry = (hgvsp, f"{vaf:.3f}") if alteration in snakemake.params.mth.get(feature, {}): insert_entry(mutations_of_interest, hgvsp, vaf) else: insert_entry(other_mutations, hgvsp, vaf) data.loc[sample, "VOC Mutations"] = fmt_variants(mutations_of_interest) data.loc[sample, "Other Mutations"] = fmt_variants(other_mutations) data["Other Mutations"][ data["Other Mutations"].str.len() > 32767 ] = "Too many variants to display" int_cols = [ "Raw Reads (#)", "Trimmed Reads (#)", "Filtered Reads (#)", ] if is_patient_report(): int_cols += [ "Largest Contig (bp)", "De Novo Sequence (bp)", "Pseudo Sequence (bp)", "Consensus Sequence (bp)", ] data[int_cols] = data[int_cols].fillna("0").applymap(lambda x: "{0:,}".format(int(x))) data = data.loc[:, (data != "0").any(axis=0)] data.index.name = "Sample" data.sort_index(inplace=True) data.to_csv(snakemake.output[0], float_format="%.1f") |
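Both the report script above and the per-ORF variant plot script further below shorten HGVSp protein alterations by mapping three-letter amino-acid codes to their one-letter equivalents. A small illustration of that translation step; the example substitution is simply the well-known spike D614G, not output from any particular sample:

```python
# Subset of the AA_ALPHABET_TRANSLATION mapping used in the scripts.
AA_ALPHABET_TRANSLATION = {"Asp": "D", "Gly": "G"}

alteration = "Asp614Gly"
for triplet, amino in AA_ALPHABET_TRANSLATION.items():
    alteration = alteration.replace(triplet, amino)

print(alteration)  # -> "D614G"
```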
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | import sys
from typing import Sequence

sys.stderr = open(snakemake.log[0], "w")

from Bio import SeqIO


def get_largest_contig(sm_input, sm_output):
    contigs = []

    for seq_record in SeqIO.parse(sm_input, "fasta"):
        contigs.append([seq_record.id, str(seq_record.seq), len(seq_record)])

    contigs.sort(key=lambda x: x[2])

    try:
        name = f">{contigs[-1][0]}"
        sequence = contigs[-1][1]
    except IndexError:
        name = ">filler-contig"
        sequence = "N"

    with open(sm_output, "w") as writer:
        writer.write(f"{name}\n{sequence}")


if __name__ == "__main__":
    get_largest_contig(snakemake.input[0], snakemake.output[0]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | import sys sys.stderr = open(snakemake.log[0], "w") import pandas as pd def get_read_statistics(sm_input, sm_output): all_read_counts = [] for input in sm_input: with open(input) as in_file: all_read_counts.append(next(in_file).rstrip()) all_read_counts = pd.Series(all_read_counts, dtype=int) with open(sm_output, "w") as f: print("Length of all reads:\n", file=f) print(all_read_counts.describe().apply(lambda x: format(x, "f")), file=f) get_read_statistics(snakemake.input, snakemake.output[0]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | import sys sys.stderr = open(snakemake.log[0], "w") def get_lineage_references(): lineage_references = snakemake.params.lineage_references with open(snakemake.output[0], "w") as outfile: for _key, value in lineage_references.items(): print( "resources/genomes-renamed/{accession}.fasta".format(accession=value), file=outfile, ) get_lineage_references() |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | import sys sys.stderr = open(snakemake.log[0], "w") from collections import Counter import pysam # source: https://www.bioinformatics.org/sms/iupac.html IUPAC = { frozenset("A"): "A", frozenset("C"): "C", frozenset("G"): "G", frozenset("T"): "T", frozenset("AG"): "R", frozenset("CT"): "Y", frozenset("GC"): "S", frozenset("AT"): "W", frozenset("GT"): "K", frozenset("AC"): "M", frozenset("CGT"): "B", frozenset("AGT"): "D", frozenset("ACT"): "H", frozenset("ACG"): "V", frozenset("ACTG"): "N", } def get_sequence(): with pysam.FastxFile(snakemake.input.sequence) as fh: for entry in fh: return entry.sequence def split(word): return [char for char in word] def get_base_count(pileupcolumn): bases = [] # pileupread: representation of a read aligned to a particular position in the reference sequence. for pileupread in pileupcolumn.pileups: # TODO Check pileupread for missing bases if not pileupread.is_del and not pileupread.is_refskip: read_base = pileupread.alignment.query_sequence[pileupread.query_position] bases.append(read_base) return Counter(bases) def get_coverage(base_count): return sum(base_count.values()) def get_and_write_coverages_and_base_counts( coverage_header: str = "#CHROM\tPOS\tCoverage", ): with pysam.AlignmentFile(snakemake.input.bamfile, "rb") as bamfile, open( snakemake.output.coverage, "w" ) as coverage_manager: print(coverage_header, file=coverage_manager) coverages = {} base_counts = {} for base in bamfile.pileup(): base_count = get_base_count(base) coverage = get_coverage(base_count) print( "%s\t%s\t%s" % ( base.reference_name, base.reference_pos, coverage, ), file=coverage_manager, ) coverages[base.reference_pos] = coverage base_counts[base.reference_pos] = base_count return coverages, base_counts def get_allel_freq(correct_base, base_counts): return base_counts[correct_base] / sum(base_counts.values()) def get_UPAC_mask(base_counts): return IUPAC[frozenset("".join(base_counts.keys()))] def mask_sequence(sequence, coverages, base_counts): sequence = split(sequence) covered_postions = coverages.keys() for position, base in enumerate(sequence): if position not in covered_postions: # TODO Check why there are postions that are not covered by any reads and are not Ns # sequence[position] = "N" print( "Base %s at pos. %s not covered by any read." % ( base, position, ), file=sys.stderr, ) continue if coverages[position] < snakemake.params.min_coverage: sequence[position] = "N" print( "Coverage of base %s at pos. %s = %s. Masking with N." % ( base, position, coverages[position], ), file=sys.stderr, ) continue if ( not snakemake.params.is_ont and get_allel_freq(base, base_counts[position]) < snakemake.params.min_allele ): if "N" in base_counts[position].keys(): mask = "N" else: mask = get_UPAC_mask(base_counts[position]) print( "Coverage of base %s at pos. %s = %s with Allel frequency = %s. Bases in reads: %s. Masking with %s." 
% ( base, position, coverages[position], get_allel_freq(base, base_counts[position]), base_counts[position], mask, ), file=sys.stderr, ) sequence[position] = mask return "".join(sequence) def write_sequence(sequence): with pysam.FastxFile(snakemake.input.sequence) as infile, open( snakemake.output.masked_sequence, mode="w" ) as outfile: print(">%s" % next(infile).name.split(".")[0], file=outfile) print(sequence, file=outfile) sequence = get_sequence() assert isinstance(sequence, str), "More than one sequence in .fasta file." coverages, base_counts = get_and_write_coverages_and_base_counts() masked_sequence = mask_sequence(sequence, coverages, base_counts) write_sequence(masked_sequence) |
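When a position fails the allele-frequency threshold on non-ONT data, the masking script above collapses the bases observed in the pileup into a single IUPAC ambiguity code via the `IUPAC` lookup table. A minimal illustration of that lookup, using an abbreviated copy of the same table:

```python
from collections import Counter

# Abbreviated version of the IUPAC mapping from the script above.
IUPAC = {frozenset("AG"): "R", frozenset("CT"): "Y", frozenset("ACGT"): "N"}


def iupac_mask(base_counts: Counter) -> str:
    # Collapse the set of observed bases at a position into one ambiguity code.
    return IUPAC[frozenset("".join(base_counts.keys()))]


print(iupac_mask(Counter("AAAG")))  # A and G observed -> "R"
print(iupac_mask(Counter("ACGT")))  # all four bases   -> "N"
```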
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | import sys sys.stderr = open(snakemake.log[0], "w") import altair as alt import pandas as pd def plot_coverage(sm_input, sm_output, min_coverage): coverage = pd.DataFrame() for sample in sm_input: sample_df = pd.read_csv(sample, sep="\t") sample_df.rename( columns={sample_df.columns[2]: "Coverage", "POS": "Pos"}, inplace=True ) sample_df["Sample"] = sample_df["#CHROM"].apply(lambda x: str(x).split(".")[0]) coverage = pd.concat([coverage, sample_df], ignore_index=True) coverage["# Coverage"] = coverage.Coverage.apply( lambda x: f"< {min_coverage}" if int(x) < int(min_coverage) else f">= {min_coverage}" ) max_y_pos = 100 max_x_pos = coverage.Pos.max() coverage["Coverage"] = coverage["Coverage"].apply( lambda x: max_y_pos if x > max_y_pos else x ) if len(coverage) > 0: alt.Chart(coverage).mark_bar().encode( x=alt.X("Pos:Q", scale=alt.Scale(domain=(0, max_x_pos), nice=False)), y=alt.Y("Coverage", scale=alt.Scale(domain=[0, max_y_pos])), row=alt.Row("Sample:N"), color=alt.Color( "# Coverage", scale=alt.Scale( domain=[f"< {min_coverage}", f">= {min_coverage}"], range=["indianred", "lightgreen"], ), ), ).properties(width=1200, height=150).interactive().save(sm_output) else: alt.Chart(coverage).mark_bar().encode().properties( width="container", height=150 ).save(sm_output) plot_coverage(snakemake.input, snakemake.output[0], snakemake.params.min_coverage) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | sys.stderr = open(snakemake.log[0], "w") import sys import altair as alt import pandas as pd import pysam data = pd.DataFrame() def register_lengths(sample, file_list, state, amplicon_state, data): for file, assembler in zip(file_list, snakemake.params.assembler): if state in ("initial", "scaffolded"): with pysam.FastxFile(file) as infile: data = pd.concat( [ data, pd.DataFrame( { "Sample": sample, "Assembler": assembler, "Amplicon": amplicon_state, "length (bp)": max( len(contig.sequence) for contig in infile ), "State": state, }, index=[0], ), ], ignore_index=True, ) else: quastDf = pd.read_csv(file, sep="\t") data = pd.concat( [ data, pd.DataFrame( { "Sample": sample, "Assembler": assembler, "Amplicon": amplicon_state, "length (bp)": quastDf.loc[0, "N50"], "State": "N50", "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] if "Genome fraction (%)" in quastDf.columns else float("nan"), }, index=[0], ), ], ignore_index=True, ) return data for sample, amplicon_state in zip( snakemake.params.samples, snakemake.params.amplicon_state ): def load(inputfile, label=None): return register_lengths( sample, [x for x in snakemake.input.get(inputfile) if sample in x], label or inputfile, amplicon_state, data, ) data = load("initial") data = load("final", "scaffolded") data = load("quast", "N50") data.replace({"Amplicon": {0: "a) Shotgun", 1: "b) Amplicon"}}, inplace=True) data.replace( { "Assembler": { "megahit-std": "MEGAHIT", "megahit-meta-large": "MEGAHIT (meta-large)", "megahit-meta-sensitive": "MEGAHIT (meta-sensitive)", "trinity": "Trinty", "velvet": "Velvet", "metaspades": "SPAdes (metaSPAdes)", "spades": "SPAdes", "coronaspades": "SPAdes (coronaSPAdes)", "rnaviralspades": "SPAdes (RNAviralSPAdes)", } }, inplace=True, ) data.to_csv(snakemake.output[1]) height = 200 width = 65 plot_bp = ( alt.Chart(data) .encode( y=alt.Y( "length (bp)", scale=alt.Scale(domain=[0, 35000], clamp=True), axis=alt.Axis(tickCount=5), ), x=alt.X("State", sort=["N50", "initial", "scaffolded"]), color=alt.Color("Assembler", scale=alt.Scale(scheme="turbo"), legend=None), ) .properties(height=height, width=width) ) combined_bp = plot_bp.mark_point(opacity=1, filled=True) + plot_bp.mark_boxplot( opacity=0.8, box={"stroke": "black", "strokeWidth": 1, "fill": "none"}, median={"stroke": "black", "strokeWidth": 1}, outliers=False, ) combined_bp = ( combined_bp.facet( column=alt.Column( "Assembler:N", title="", header=alt.Header(labelAngle=-45, labelOrient="bottom", labelPadding=-5), ), row=alt.Row( "Amplicon:N", title="", sort="ascending", header=alt.Header( labelAngle=-360, labelAlign="center", labelAnchor="end", labelPadding=-90, labelOrient="left", labelFontWeight="bold", labelFontSize=12, ), ), ) .configure_axisX(labelAngle=-45) .configure_axis(labelFontSize=12, titleFontSize=12) .configure_view(stroke=None) ) plot_genome_frac = ( alt.Chart(data) 
.mark_point(opacity=0.5) .encode( x=alt.X( "jitter:Q", stack="zero", title="", axis=alt.Axis(ticks=False, grid=False, labels=False), scale=alt.Scale(), ), y=alt.Y( "Genome fraction (%)", stack=None, title="Genome fraction (%)", axis=alt.Axis(tickCount=5), ), color=alt.Color("Assembler", scale=alt.Scale(scheme="turbo"), legend=None), column=alt.Column( "Assembler:N", title="", header=alt.Header(labelAngle=-45, labelOrient="bottom", labelPadding=-5), ), row=alt.Row( "Amplicon:N", title="", sort="ascending", header=alt.Header( labelAngle=-360, labelAlign="center", labelAnchor="end", labelPadding=-90, labelOrient="left", labelFontWeight="bold", labelFontSize=12, ), ), ) .configure_axisY(grid=True) .configure_axis(labelFontSize=12, titleFontSize=12) .configure_view(stroke=None) .transform_calculate(jitter="20 * sqrt(-2*log(random()))*cos(2*PI*random())") .properties(height=height, width=50) ) combined_bp.save(snakemake.output[0]) plot_genome_frac.save(snakemake.output[2]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | import sys sys.stderr = open(snakemake.log[0], "w") import altair as alt import pandas as pd def mask(strain): return ( strain if strain in ["other", "unmapped", "B.1.1.7", "B.1.351"] else "some strain" ) def plot_error_heatmap(sm_input, sm_output, type="heatmap"): results_df = pd.read_csv(sm_input, delimiter="\t") results_df["Output"] = results_df["target_id"].apply(lambda x: mask(x)) results_df = results_df[results_df["Output"] != "unmapped"] results_df = results_df[results_df["Output"] != "other"] no_of_mixs = int(results_df["mix"].max() + 1) if type == "heatmap": plot = ( alt.Chart(results_df) .mark_rect() .encode( alt.X( "true_fraction:Q", axis=alt.Axis(format="%", title="True Fraction"), bin=alt.Bin(maxbins=35), scale=alt.Scale( domain=(0.0, 1.0), bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], ), ), alt.Y( "est_fraction:Q", axis=alt.Axis(format="%", title="Est. Fraction"), bin=alt.Bin(maxbins=35), scale=alt.Scale( domain=(0.0, 1.0), bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], ), ), alt.Color( "count(true_fraction):Q", scale=alt.Scale(type="log", scheme="greenblue"), ), ) ) # Define the degree of the polynomial fits degree_list = [1, 3] base = ( alt.Chart(results_df) .mark_circle() .encode(alt.X("true_fraction"), alt.Y("est_fraction")) ) else: plot = ( alt.Chart(results_df) .mark_point() .encode(x="true_fraction:Q", y="est_fraction:Q", color="Kallisto output:N") ) line = ( alt.Chart(pd.DataFrame({"true_fraction": [0, 1], "est_fraction": [0, 1]})) .mark_line(color="grey") .encode(alt.X("true_fraction"), alt.Y("est_fraction")) ) (plot + line).properties( title=f"{snakemake.wildcards.caller}, No. of mixtures {no_of_mixs}" ).save(sm_output) def plot_bar_of_zeros(sm_input, sm_output): results_df = pd.read_csv(sm_input, delimiter="\t") results_df["Output"] = results_df["target_id"].apply(lambda x: mask(x)) results_df = results_df[results_df["Output"] != "unmapped"] results_df = results_df[results_df["Output"] != "other"] results_df = results_df[ (results_df["true_fraction"] == 0) | (results_df["est_fraction"] == 0) ] results_df["Axis"] = results_df["true_fraction"].apply( lambda x: "True Fraction = 0" if x == 0 else "Est. Fraction = 0" ) base = alt.Chart(results_df[["true_fraction", "target_id", "Axis"]]) bar = ( base.mark_bar() .encode(x=alt.X("target_id", sort="-y"), y=alt.Y("count(target_id)")) .facet( row="Axis:N", ) ).properties( title=f"Count of records where the other fraction is 0. Shows the x and y axis of the heatmap in more detail." 
) (bar).save(sm_output) def plot_worst_predictons_content(sm_input, sm_output): plots = [] for i in range(3): try: results_df = pd.read_csv(sm_input, delimiter="\t") results_df["Output"] = results_df["target_id"].apply(lambda x: mask(x)) results_df = results_df[results_df["Output"] != "unmapped"] results_df = results_df[results_df["Output"] != "other"] results_df_false = results_df[results_df["true_fraction"] == 0] worst_predictions = results_df_false.target_id.value_counts() worst_prediction = worst_predictions.index[i] worst_predictions = results_df_false[ results_df_false["target_id"] == worst_prediction ].mix.unique() results_df = results_df[results_df.mix.isin(worst_predictions)] results_df = results_df[results_df.target_id != worst_prediction] results_df = results_df[results_df.true_fraction != 0] no_samples = len(results_df.mix.unique()) plot = ( alt.Chart(results_df) .mark_bar() .encode( x=alt.X("target_id:O", sort="-y"), y="count(target_id)", color="true_fraction:Q", ) .properties( title=f"Composition of {no_samples} mixtures, where {worst_prediction} is called, but not contained in mixture" ) ) plots.append(plot) except: # TODO make exception handling more fine-grained pass alt.vconcat(*plots).save(sm_output) if __name__ == "__main__": plot_error_heatmap(snakemake.input[0], snakemake.output[0]) plot_bar_of_zeros(snakemake.input[0], snakemake.output[1]) plot_worst_predictons_content(snakemake.input[0], snakemake.output[2]) |
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | import sys sys.stderr = open(snakemake.log[0], "w") MIXTURE_PREFIX = snakemake.params.get("prefix", "") MIXTURE_PART_INDICATOR = snakemake.params.get("separator", "") MIXTURE_PERCENTAGE_INDICATOR = snakemake.params.get("percentage", "") import re import altair as alt import pandas as pd def regex_per_line(content, pattern): match = re.search(pattern, content) if match: return int(match.group("share")) / 100 else: return 0 def plot_dependency_of_pangolin_call(sm_input, sm_output): # aggregate pangolin outputs all_sampes = pd.DataFrame() for input in sm_input: pangolin_output = pd.read_csv(input) pangolin_output["mixture_content"] = input.split(MIXTURE_PREFIX, 1)[-1].split( "." )[0] all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["mixture_content"] = all_sampes["mixture_content"].str.replace("-", ".") # get share of called lineage all_sampes["regex"] = ( MIXTURE_PART_INDICATOR + all_sampes["lineage"] + MIXTURE_PERCENTAGE_INDICATOR + "(?P<share>\d.)" ) all_sampes["share_of_called_lineage"] = all_sampes[ ["mixture_content", "regex"] ].apply(lambda x: regex_per_line(*x), axis=1) all_sampes.drop(columns=["regex"], inplace=True) all_sampes["correct_call"] = all_sampes["share_of_called_lineage"].apply( lambda x: "Yes" if x > 0 else "No" ) # plot histogram source = all_sampes.copy() source.rename( columns={ "share_of_called_lineage": "Share of called lineage in sample", "correct_call": "Lineage in sample", }, inplace=True, ) histogram = ( alt.Chart(source) .mark_bar() .encode( alt.X( "Share of called lineage in sample:Q", bin=alt.Bin(extent=[0, 1], step=0.1), axis=alt.Axis(format="%"), ), y="count()", color=alt.Color( "Lineage in sample", scale=alt.Scale(scheme="tableau20"), sort="descending", ), ) .configure_legend(orient="top") ) histogram.save(sm_output) if __name__ == "__main__": plot_dependency_of_pangolin_call(snakemake.input, snakemake.output[0]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | import sys sys.stderr = open(snakemake.log[0], "w") import altair as alt import pandas as pd def plot_lineages_over_time(sm_input, sm_output, dates, sm_output_table): pangolin_outputs = [] for call, date in zip(sm_input, dates): pangolin_call = pd.read_csv(call) pangolin_call["date"] = date pangolin_outputs.append(pangolin_call) pangolin_calls = pd.concat(pangolin_outputs, axis=0, ignore_index=True) # write out as table pangolin_calls.to_csv(sm_output_table) pangolin_calls = pangolin_calls[pangolin_calls["lineage"] != "None"] # get occurrences if len(pangolin_calls) > 0: pangolin_calls["lineage_count"] = pangolin_calls.groupby( "lineage", as_index=False )["lineage"].transform(lambda s: s.count()) else: pangolin_calls["lineage_count"] = pd.Series() # mask low occurrences print(pangolin_calls["lineage"].value_counts()) df = pd.DataFrame(pangolin_calls["lineage"].value_counts()) df.sort_values(by=["lineage"]) if len(df.index) > 10: pangolin_calls.loc[ ~pangolin_calls["lineage"].isin(df.head(10).index), "lineage" ] = "other occ." pangolin_calls.rename(columns={"lineage": "Lineage", "date": "Date"}, inplace=True) area_plot = ( alt.Chart(pangolin_calls) .mark_bar(opacity=0.8) .encode( x=alt.X("Date:O"), y=alt.Y( "count()", stack="normalize", axis=alt.Axis(format="%"), title="Fraction in Run", ), stroke="Lineage", color=alt.Color( "Lineage", scale=alt.Scale(scheme="tableau10"), legend=alt.Legend(orient="top"), ), ) ).properties(width=800) area_plot.save(sm_output) plot_lineages_over_time( snakemake.input, snakemake.output[0], snakemake.params.dates, snakemake.output[1] ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | import sys sys.stderr = open(snakemake.log[0], "w") import altair as alt import pandas as pd MIXTURE_PART_INDICATOR = snakemake.params.separator MIXTURE_PERCENTAGE_INDICATOR = snakemake.params.percentage ###################################################### # Currently only supporting mixtures with one strain # ###################################################### def plot_pangolin_conflict(sm_input, sm_output): # aggregate pangolin outputs all_sampes = pd.DataFrame() for input in sm_input: # get actual lineage _prefix, true_lineage_with_percent = input.split(MIXTURE_PART_INDICATOR) true_lineage, percent = true_lineage_with_percent.split( MIXTURE_PERCENTAGE_INDICATOR ) true_lineage = true_lineage.replace("-", ".") percent = percent.replace(".polished.strains.pangolin.csv", "") # create df for one call pangolin_output = pd.read_csv(input) pangolin_output["true_lineage"] = true_lineage pangolin_output["true_lineage_percent"] = percent all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["correct_lineage_assignment"] = ( all_sampes["lineage"] == all_sampes["true_lineage"] ) # get share of correct and incorrect calls print( all_sampes["correct_lineage_assignment"].value_counts(normalize=True), file=sys.stderr, ) wrongly_assigned = all_sampes[all_sampes["correct_lineage_assignment"] == False] # plot source = wrongly_assigned.copy() source.rename( columns={ "lineage": "Called lineage", "true_lineage": "Actual lineage", "note": "Note", }, inplace=True, ) scatter_plot = ( alt.Chart(source) .mark_circle() .encode( alt.X("Actual lineage:O"), alt.Y( "Called lineage:O", ), size="count()", color=alt.Color("count()", scale=alt.Scale(scheme="tableau20")), ) ) scatter_plot.save(sm_output[0]) wrongly_assigned.to_csv(sm_output[1]) plot_pangolin_conflict(snakemake.input, snakemake.output) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 | import sys sys.stderr = open(snakemake.log[0], "w") import altair as alt import pandas as pd import pysam from intervaltree import IntervalTree # read primer bedpe to df PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None) PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True) PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] # convert df to interval trees primer_intervals = IntervalTree() no_primer_intervals = IntervalTree() for index, row in PRIMER.iterrows(): primer_intervals[row["p1_start"] : row["p2_end"] + 1] = ( row["p1_start"], row["p2_end"] + 1, ) no_primer_intervals[row["p1_end"] + 1 : row["p2_start"]] = ( row["p1_end"] + 1, row["p2_start"], ) def iter_with_samples(inputfiles): return zip(snakemake.params.samples, inputfiles) def count_intervals(file): with pysam.AlignmentFile(file, "rb") as bam: counter_primer = 0 counter_no_primer = 0 counter_primer_within = 0 counter_no_primer_within = 0 counter_nothing = 0 mate_pair_intervals = {} for read in bam.fetch(): if not mate_pair_intervals.get(read.query_name): mate_pair_intervals[read.query_name] = [read.reference_start] else: mate_pair_intervals[read.query_name].append(read.reference_end) for pair in mate_pair_intervals: if ( len(mate_pair_intervals[pair]) > 1 and mate_pair_intervals[pair][0] != None and mate_pair_intervals[pair][1] != None ): if primer_intervals.envelop( mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1 ): if ( sorted( primer_intervals.envelop( mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1, ) )[0].begin == mate_pair_intervals[pair][0] and sorted( primer_intervals.envelop( mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1, ) )[0].end == mate_pair_intervals[pair][1] + 1 ): counter_primer += 1 else: counter_primer_within += 1 elif no_primer_intervals.envelop( mate_pair_intervals[pair][0] + 1, mate_pair_intervals[pair][1] ): if ( sorted( no_primer_intervals.envelop( mate_pair_intervals[pair][0] + 1, mate_pair_intervals[pair][1], ) )[0].begin == mate_pair_intervals[pair][0] + 1 and sorted( no_primer_intervals.envelop( mate_pair_intervals[pair][0] + 1, mate_pair_intervals[pair][1], ) )[0].end == mate_pair_intervals[pair][1] ): counter_no_primer += 1 else: counter_no_primer_within += 1 else: counter_nothing += 1 else: counter_nothing += 1 counters = pd.DataFrame( { "n_count": [ counter_primer, counter_primer_within, counter_no_primer, counter_no_primer_within, counter_nothing, ], "class": [ "uncut primer exact", "uncut primer within", "cut primer exact", "cut primer within", "no mathing win", ], } ) return counters def plot_classes(counters): bars = ( alt.Chart(counters) .mark_bar() .encode( y="class", x="n_count", row=alt.Row("sample:N"), column=alt.Column("state:N", sort="descending"), ) ) text = bars.mark_text( align="left", baseline="middle", dx=3, # Nudges text to right so it doesn't appear on top of the bar ).encode(text="n_count", row=alt.Row("sample:N"), column=alt.Column("state:N")) return bars, text all_df = 
pd.DataFrame() for sample, file in iter_with_samples(snakemake.input.unclipped): counts_before = count_intervals(file) counts_before["sample"] = sample counts_before["state"] = "before" all_df = pd.concat([all_df, counts_before], ignore_index=True) for sample, file in iter_with_samples(snakemake.input.clipped): counts_after = count_intervals(file) counts_after["sample"] = sample counts_after["state"] = "after" all_df = pd.concat([all_df, counts_after], ignore_index=True) bars, text = plot_classes(all_df) (bars).properties(title="Amplicon matching").save(snakemake.output.plot) |
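The read-pair classification above hinges on `intervaltree`'s `envelop(begin, end)`, which returns only those stored intervals that lie completely within the queried range; an exact match of the stored amplicon boundaries is then counted separately from pairs that merely fall inside an amplicon. A small sketch of that distinction with made-up coordinates:

```python
from intervaltree import IntervalTree

# Hypothetical amplicon spanning reference positions 30-130 (primers included).
primer_intervals = IntervalTree()
primer_intervals[30:131] = (30, 131)

# A read pair spanning exactly the stored interval envelops it with matching bounds.
exact = sorted(primer_intervals.envelop(30, 131))[0]
print(exact.begin == 30 and exact.end == 131)  # True -> counted as "exact"

# A read pair lying strictly inside the amplicon envelops no stored interval.
print(len(primer_intervals.envelop(50, 100)))  # 0
```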
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | import sys sys.stderr = open(snakemake.log[0], "w") import altair as alt import pandas as pd import pysam AA_ALPHABET_TRANSLATION = { "Gly": "G", "Ala": "A", "Leu": "L", "Met": "M", "Phe": "F", "Trp": "W", "Lys": "K", "Gln": "Q", "Glu": "E", "Ser": "S", "Pro": "P", "Val": "V", "Ile": "I", "Cys": "C", "Tyr": "Y", "His": "H", "Arg": "R", "Asn": "N", "Asp": "D", "Thr": "T", } def get_calls(): variants = [] for file, date, sample in zip( snakemake.input.bcf, snakemake.params.dates, snakemake.params.samples ): with pysam.VariantFile(file, "rb") as infile: for record in infile: vaf = record.samples[0]["AF"][0] for ann in record.info["ANN"]: ann = ann.split("|") hgvsp = ann[11] enssast_id = ann[6] feature = ann[3] orf = ann[3] if hgvsp: # TODO think about regex instead of splitting enssast_id, alteration = hgvsp.split(":", 1) _prefix, alteration = alteration.split(".", 1) for triplet, amino in AA_ALPHABET_TRANSLATION.items(): alteration = alteration.replace(triplet, amino) variants.append( { "feature": feature, "alteration": alteration, "vaf": vaf, "date": date, "sample": sample, "orf": orf, } ) variants_df = pd.DataFrame(variants) variants_df = variants_df[variants_df["orf"] == snakemake.wildcards.ORFNAME] return variants_df def plot_variants_over_time(sm_output, sm_output_table): calls = get_calls() # write out as table calls.to_csv(sm_output_table) if len(calls) > 0: # get occurrences calls["total occurrence"] = calls.groupby("alteration", as_index=False)[ "alteration" ].transform(lambda s: s.count()) # mask low occurrences print(calls["alteration"].value_counts()) df = pd.DataFrame(calls["alteration"].value_counts()) df.sort_values(by=["alteration"]) if len(df.index) > 10: # print(calls.loc[calls["alteration"].isin(df.head(10).index)]) calls.loc[ ~calls["alteration"].isin(df.head(10).index), "alteration" ] = "other occ." else: calls.loc[calls["total occurrence"] < 0, "alteration"] = "other occ." calls.rename(columns={"alteration": "Alteration", "date": "Date"}, inplace=True) area_plot = ( alt.Chart(calls) .mark_bar(opacity=0.8) .encode( x=alt.X("Date:O"), y=alt.Y( "count()", stack="normalize", axis=alt.Axis(format="%"), title="Fraction in Run", ), stroke="Alteration", color=alt.Color( "Alteration", scale=alt.Scale(scheme="tableau10"), legend=alt.Legend(orient="top"), ), ) ).properties(width=800) area_plot.save(sm_output) plot_variants_over_time(snakemake.output[0], snakemake.output[1]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | sys.stderr = open(snakemake.log[0], "w") min_identity = snakemake.params.get("min_identity", 0.9) max_n = snakemake.params.get("max_n", 0.05) from os import path from typing import List import pandas as pd import pysam def get_identity(quast_report_paths: List[str]) -> dict: """Extracts genome fraction form quast reports Args: quast_report_paths (List[str]): List of paths to quast reports (tsv) to be parsed Returns: dict: Dict consisting of sample name and genome fractions """ identity_dict = {} for report_path in quast_report_paths: # extract sample name sample = path.dirname(report_path).split("/")[-1] # load report report_df = pd.read_csv( report_path, delimiter="\t", index_col=0, squeeze=True, names=["value"] ) # select genome fraction (%) try: fraction = float(report_df.at["Genome fraction (%)"]) / 100 except: # no "Genome fraction (%)" in quast report. Case for not assemblable samples fraction = 0.0 # store in dict identity_dict[sample] = fraction return identity_dict def get_sequence(path: str): with pysam.FastxFile(path) as fh: for entry in fh: return entry.name.split(".")[0], entry.sequence def get_n_share(contig_paths: List[str]) -> dict: """Extracts share of Ns in given contigs. Args: contig_paths (List[str]): List of paths of to be parsed contig Returns: dict: Dict consisting of sample name and share of Ns """ n_share_dict = {} seq_dict = {} for contig_path in contig_paths: name, sequence = get_sequence(contig_path) assert isinstance(sequence, str), "More than one sequence in .fasta file." 
seq_dict[name] = sequence for key, value in seq_dict.items(): n_share_dict[key] = value.count("N") / len(value) return n_share_dict def filter_and_save( identity: dict, n_share: dict, min_identity: float, max_n: float, save_path: str, summary_path: str, ): """Filters and saves sample names Args: identity (dict): Dict consisting of sample name and genome fractions n_share (dict): Dict consisting of sample name and share of Ns min_identity (float): Min identity to virus reference genome of reconstructed genome max_n (float): Max share of N in the reconstructed genome save_path (str): Path to save the filtered sample to as .txt summary_path (str): Path to identity and n share to as .tsv """ # aggregate all result into one df agg_df = pd.DataFrame({"identity": identity, "n_share": n_share}) # print agg_df to stderr for logging print("Aggregated data of all samples", file=sys.stderr) print(agg_df, file=sys.stderr) agg_df.index.name = "Sample" agg_df.to_csv(summary_path, sep="\t") # filter this accordingly to the given params filtered_df = agg_df[ (agg_df["identity"] > min_identity) & (agg_df["n_share"] < max_n) ] # print filtered to stderr for logging print("", file=sys.stderr) print("Filtered data", file=sys.stderr) print(filtered_df, file=sys.stderr) # print accepted samples to stderr for logging print("", file=sys.stderr) print("Accepted samples", file=sys.stderr) print(filtered_df.index.values, file=sys.stderr) # save accepted samples with open(save_path, "w") as snakemake_output: for sample in filtered_df.index.values: snakemake_output.write("%s\n" % sample) identity_dict = get_identity(snakemake.input.quast) n_share_dict = get_n_share(snakemake.input.contigs) filter_and_save( identity_dict, n_share_dict, min_identity, max_n, snakemake.output.passed_filter, snakemake.output.filter_summary, ) |
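To make the acceptance criterion above concrete: a sample is only passed on if its QUAST-derived identity to the reference exceeds `min_identity` (default 0.9) and its share of `N` bases stays below `max_n` (default 0.05). An illustrative filtering step with made-up per-sample values:

```python
import pandas as pd

# Made-up values mirroring the aggregation in filter_and_save.
agg_df = pd.DataFrame(
    {
        "identity": {"sample1": 0.98, "sample2": 0.70},
        "n_share": {"sample1": 0.01, "sample2": 0.20},
    }
)

min_identity, max_n = 0.9, 0.05
passed = agg_df[(agg_df["identity"] > min_identity) & (agg_df["n_share"] < max_n)]
print(list(passed.index))  # ['sample1'] -- sample2 fails both criteria
```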
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | import sys

sys.stderr = open(snakemake.log[0], "w")

sample = snakemake.wildcards.sample

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def remove_chr0(data_path, out_path):
    """This function removes the Chr0 contig generated by raGOO.
    It also renames the id in the FASTA file to the actual sample name.
    In the case where no pseudomolecule is constructed other than the Chr0,
    it ensures that the FASTA file contains a 'filler-contig' with a sequence of 'N'.

    Args:
        data_path (string): Path to raGOO output
        out_path (string): Path to store the filtered raGOO output
    """

    valid_records = []
    with open(data_path, "r") as handle:
        i = 1
        for record in SeqIO.parse(handle, "fasta"):
            if "Chr0" not in record.name:
                # rename id from "virus-reference-genome"_RaGOO to actual sample name
                record.id = sample + ".{}".format(i)
                valid_records.append(record)
                i += 1

    # if there was no contig except the Chr0 one,
    # add a filler contig to the fasta file
    # in order to avoid failures in the following tools
    if not valid_records:
        valid_records.append(
            SeqRecord(
                Seq("N"),
                id="filler-contig",
                name="filler-contig",
                description="filler-contig",
            )
        )

    SeqIO.write(valid_records, out_path, "fasta")


remove_chr0(snakemake.input[0], snakemake.output[0]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | import sys sys.stderr = open(snakemake.log[0], "w") # parameter = snakemake.params.get("parameter", "") import random def select_random_lineages(sm_input, sm_output, number_of_samples): with open(sm_input) as f: lines = f.read().splitlines() lineages = [] for i in range(number_of_samples): rnd_strain_path = random.choice(lines) lines.remove(rnd_strain_path) strain = rnd_strain_path.replace(".fasta", "").split("/")[-1] strain = strain.replace(".", "-") lineages.append(strain) with open(sm_output, "w") as handler: for element in lineages: handler.write(element + "\n") if __name__ == "__main__": select_random_lineages( snakemake.input[0], snakemake.output[0], snakemake.params.number_of_samples ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | sys.stderr = open(snakemake.log[0], "w") import pandas as pd def analyize_pangolin(sm_input, accessions): temp_dict = {} for sample, pango_csv_path in zip(accessions, sm_input.pangolin): with open(pango_csv_path, "r") as pango_file: pango_df = pd.read_csv(pango_file) if pango_df.loc[0, "note"] == "seq_len:1": temp_dict[sample] = "assembly failed" elif pango_df.loc[0, "qc_status"] == "fail": temp_dict[sample] = "is non-sars-cov-2" elif ( pango_df.loc[0, "qc_status"] == "pass" and pango_df.loc[0, "lineage"] == "None" ): temp_dict[sample] = "is non-sars-cov-2" else: temp_dict[sample] = "is sars-cov-2" return temp_dict def analyize_kallisto(sm_input, accessions): temp_dict = {} for sample, kallisto_tsv_path in zip(accessions, sm_input.kallisto): with open(kallisto_tsv_path, "r") as kallisto_file: kallisto_df = pd.read_csv(kallisto_file, delimiter="\t") if kallisto_df.loc[0, "target_id"] == "other": temp_dict[sample] = "is non-sars-cov-2" else: temp_dict[sample] = "is sars-cov-2" return temp_dict def aggregate_and_save(pangolin_summary, kallisto_summary, sm_output): agg_df = pd.DataFrame( [pangolin_summary, kallisto_summary], index=["pangolin", "kallisto"] ).T agg_df.index.name = "non-cov-2-sample" agg_df.to_csv(sm_output, sep="\t") pangolin_summary = analyize_pangolin(snakemake.input, snakemake.params.accessions) kallisto_summary = analyize_kallisto(snakemake.input, snakemake.params.accessions) aggregate_and_save(pangolin_summary, kallisto_summary, snakemake.output[0]) |
6 7 8 9 10 11 12 13 14 15 16 17 18 | import sys import pandas as pd from snakemake.shell import shell sys.stderr = open(snakemake.log[0], "w") pangolin_results = pd.read_csv(snakemake.input.strain_call) strain = pangolin_results.loc[0]["lineage"] shell( "bcftools view -Ov {snakemake.input.bcfs} | (echo track name={snakemake.wildcards.target} description={strain}-{snakemake.wildcards.filter}; cat -) > {snakemake.output}" ) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 | import os from datetime import date, datetime from os import getcwd, listdir, mkdir, path from shutil import copy2, move import pandas as pd from ruamel import yaml # conda install -c conda-forge ruamel.yaml # define location of sample sheet and workflow configx SAMPLE_SHEET = snakemake.input[0] # "config/pep/samples.csv" def update_sample_sheet(SAMPLE_SHEET, verbose=True, dry_run=False): """ This function - copies files from the incoming data directory to the snakemake data directory and - moves files from the incoming data directory to the archive directory and - updates the sample sheet with the files copied to the snakemake data directory. The paths of these directory must be defined in the config yaml of the workflow as follows: data-handling: # path of incoming data incoming: [YOUR PATH]] # path to store data in the workflow data: [YOUR PATH] # path to archive data from incoming to archive: [YOUR PATH] Args: SAMPLE_SHEET ([type]): Path to the location of the sample sheet CONFIG_YAML ([type]): Path to the location of the config yaml verbose (bool, optional): Provide additional details. Defaults to True. dry_run (bool, optional): No files are move, copied or generated if True. Defaults to False. 
""" if dry_run: print("DRY RUN - FILES ARE NOT MOVED") # TODO Check if the cwd is the root of this repo # check if current working directory is the snakemake workflow # if not getcwd().endswith("snakemake-workflow-sars-cov2"): # raise Exception( # "Please change your working directory to 'snakemake-workflow-sars-cov2'" # ) today = date.today().strftime("%Y-%m-%d") config = snakemake.config DATA_PATH = str(config["data-handling"]["data"]) IN_PATH = str(config["data-handling"]["incoming"]) ARCHIVE_PATH = str(config["data-handling"]["archive"]) ################################## ### Check directories and data ### ################################## if verbose: print("Checking directories") # check if directory exist for given_path in [IN_PATH, ARCHIVE_PATH, DATA_PATH]: if not path.exists(given_path): raise Exception("Data directory (%s) not found" % given_path) # check if there is new data in the incoming data directory: # get files that are in incoming and do not contain 'ndetermined' and '.fastq.gz' in their name and are not under a specific filesize incoming_files = [] for f in listdir(IN_PATH): if ( path.isfile(path.join(IN_PATH, f)) and "ndetermined" not in f and ".fastq.gz" in f and os.stat(IN_PATH + f).st_size > 100 ): incoming_files.append(f) else: print(f, "not used") # add date subfolder in data path DATA_PATH += today if not path.isdir(DATA_PATH): mkdir(DATA_PATH) # get files that are in outgoing directory data_files = [f for f in listdir(DATA_PATH) if path.isfile(path.join(DATA_PATH, f))] # print prompt, which data is in incoming and in outgoing and thus is not moved files_not_to_copy = [f for f in data_files if f in incoming_files] if files_not_to_copy: if verbose: print( "Following files are already located in %s and are not moved:" % DATA_PATH ) i = 0 for f in files_not_to_copy: print("\t%s" % DATA_PATH + f) i += 1 print("\tIn total: {}".format(i)) files_to_copy = [f for f in incoming_files if f not in data_files] ################################## ######### update the csv ######### ################################## # check if there are no files to copy, thus list is empty if not files_to_copy: print("No (new) files to copy") else: if verbose: print("Updating sample sheet") # create dataframe new_files_df = pd.DataFrame(files_to_copy, columns=["file"]) # get only files, that contain .fastq.gz new_files_df = new_files_df[new_files_df["file"].str.contains(".fastq.gz")] new_files_df = new_files_df[~new_files_df["file"].str.contains("Undetermined")] # get id of sample, thus split at first '_' new_files_df["sample_name"] = new_files_df["file"].apply( lambda x: (x.split("_", 1)[0]) ) # add path of file new_files_df["path"] = DATA_PATH + "/" + new_files_df["file"] # identify R1 or R2 new_files_df["read"] = new_files_df["file"].apply( lambda x: "R1" if "R1" in x else "R2" ) # set multiindex new_files_df.set_index( [new_files_df["sample_name"], new_files_df["read"]], inplace=True ) # drop not need columns new_files_df.drop(columns=["file", "sample_name", "read"], inplace=True) # unstack multiindex new_files_df = new_files_df.unstack(1) new_files_df.sort_index(inplace=True) new_files_df.columns = ["fq1", "fq2"] new_files_df["date"] = today new_files_df["is_amplicon_data"] = 1 new_files_df.loc[ new_files_df.index.str.contains("No-RKI", case=False), ["include_in_high_genome_summary"], ] = "0" new_files_df.loc[ ~new_files_df.index.str.contains("No-RKI", case=False), ["include_in_high_genome_summary"], ] = "1" print(new_files_df) new_sample_sheet = ( pd.read_csv(SAMPLE_SHEET, 
index_col="sample_name") .append(new_files_df) .sort_values(by=["date", "sample_name"]) ) new_sample_sheet.index = new_sample_sheet.index.astype("str") # remove last line of sample.csv new_sample_sheet.drop("NAME", inplace=True, errors="ignore") # check for duplicates # TODO Generalize for more than two samples new_sample_sheet.index = new_sample_sheet.index.where( ~new_sample_sheet.index.duplicated(), new_sample_sheet.index.astype("str") + "_2", ) # save to csv if verbose: print("\t{} samples added".format(len(new_files_df))) if not dry_run: new_sample_sheet.to_csv(snakemake.input[0]) ################################## ## copying and moving the files ## ################################## if verbose: print("Copying files to " + DATA_PATH) # move all data in incoming path to data folder in snakemake # if ends with .fastq.gz and does not contain Undetermined i = 0 for file in files_to_copy: if file.endswith(".fastq.gz") and not "ndetermined" in file: # if verbose: # print("\t%s" % IN_PATH + file) if not dry_run: copy2(IN_PATH + file, DATA_PATH) i += 1 if verbose: print("\t{} files copied".format(i)) # archiving incoming data all_incoming_files = [ f for f in listdir(IN_PATH) if path.isfile(path.join(IN_PATH, f)) ] if not all_incoming_files: print("No files to move") else: if verbose: print("Moving files to " + ARCHIVE_PATH + today) if not path.isdir(ARCHIVE_PATH + today): mkdir(ARCHIVE_PATH + today) archive_files = [ f for f in listdir(ARCHIVE_PATH + today) if path.isfile(path.join(ARCHIVE_PATH + today, f)) ] timestamp = datetime.now().strftime("%H:%M:%S") # move all files from incoming to archive i = 0 for file in all_incoming_files: if not dry_run: if file in archive_files: # if file is already in the archive add a timestemp when moving move( IN_PATH + file, ARCHIVE_PATH + today + "/" + file + "-" + timestamp, ) else: move(IN_PATH + file, ARCHIVE_PATH + today) i += 1 pass if verbose: print("\t{} files moved".format(i)) # save sample sheet in archive folder as backup if not dry_run: if files_to_copy: new_sample_sheet.to_csv(ARCHIVE_PATH + today + "/samples.csv") update_sample_sheet(SAMPLE_SHEET) |
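For orientation, here is a simplified, standalone sketch of how the `update_sample_sheet` script above derives sample sheet rows from incoming FASTQ file names: the sample name is the part before the first underscore, and the mate is inferred from the presence of "R1"/"R2" in the file name. This is not the workflow script itself; the file names and data path below are hypothetical.

```python
# Minimal sketch (hypothetical file names) of the sample-sheet derivation logic
# used in update_sample_sheet(): sample name = part before the first "_",
# read mate = "R1" unless the name contains "R2".
import pandas as pd

incoming = [
    "sampleA_S1_L001_R1_001.fastq.gz",  # hypothetical
    "sampleA_S1_L001_R2_001.fastq.gz",  # hypothetical
]

df = pd.DataFrame(incoming, columns=["file"])
df["sample_name"] = df["file"].str.split("_", n=1).str[0]
df["read"] = df["file"].apply(lambda x: "R1" if "R1" in x else "R2")
df["path"] = "data/2024-01-01/" + df["file"]  # hypothetical data directory

# One row per sample with fq1/fq2 columns, mirroring the sample sheet layout
sheet = df.set_index(["sample_name", "read"])["path"].unstack("read")
sheet.columns = ["fq1", "fq2"]
print(sheet)
```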
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | import re import sys import numpy as np import pysam sys.stderr = open(snakemake.log[0], "w") IUPAC = { frozenset("AG"): "R", frozenset("CT"): "Y", frozenset("GC"): "S", frozenset("AT"): "W", frozenset("GT"): "K", frozenset("AC"): "M", frozenset("CGT"): "B", frozenset("AGT"): "D", frozenset("ACT"): "H", frozenset("ACG"): "V", frozenset("ACTG"): "N", } def phred_to_prob(phred): if phred is None: return 0 return 10 ** (-phred / 10) with pysam.FastaFile(snakemake.input.fasta) as infasta, pysam.VariantFile( snakemake.input.bcf, "rb" ) as invcf, pysam.AlignmentFile(snakemake.input.bam, "rb") as inbam: assert len(infasta.references) == 1, "expected reference with single contig" contig = infasta.references[0] ref_seq = infasta.fetch(contig) cov_a, cov_c, cov_g, cov_t = inbam.count_coverage(contig) coverage = np.add(np.add(np.add(cov_a, cov_c), cov_g), cov_t) seq = "" last_pos = -1 # last considered reference position for record in invcf: rec_pos = record.pos - 1 # convert to zero based if rec_pos > last_pos + 2: chunk_seq = np.array(list(ref_seq[last_pos + 1 : rec_pos])) # check for low coverage regions chunk_low_cov = ( coverage[last_pos + 1 : rec_pos] < snakemake.params.min_coverage ) if len(chunk_seq[chunk_low_cov]) > 0: chunk_seq[chunk_low_cov] = "N" seq += "".join(chunk_seq) elif rec_pos < last_pos: # This must be an alternative allele to the last considered record. # But the last considered record had at least VAF>=0.5. # Hence, this must be a minor allele, and can therefore be ignored. 
continue last_pos = rec_pos - 1 try: dp_sample = record.samples[0]["DP"][0] except TypeError: dp_sample = record.samples[0]["DP"] if dp_sample is None: dp_sample = 0 # ignore low coverage records (subsequent iteration will add an N for that locus then) is_low_coverage = dp_sample < snakemake.params.min_coverage if is_low_coverage: continue def get_prob(event): return phred_to_prob(record.info[f"PROB_{event.upper()}"][0]) prob_high = get_prob("clonal") + get_prob("subclonal_high") prob_major = get_prob("subclonal_major") apply = prob_high >= snakemake.params.min_prob_apply uncertain = ( prob_major >= snakemake.params.min_prob_apply or (prob_high + prob_major) >= 0.5 ) if not (apply or uncertain): # we simply ignore this record continue assert len(record.alleles) == 2 ref_allele, alt_allele = record.alleles # REF: A, ALT: <DEL> def handle_deletion(del_len): global last_pos global seq if not apply: seq += "N" * del_len last_pos += del_len + 1 if alt_allele == "<DEL>": seq += ref_allele del_len = record.info["SVLEN"][0] handle_deletion(del_len) elif alt_allele == "<DUP>": dup_seq = ref_seq[rec_pos : record.stop] seq += dup_seq * 2 last_pos += len(dup_seq) elif re.match("[A-Z]+$", alt_allele) is None: # TODO cover more variant types before publication raise ValueError(f"Unexpected alt allele: {alt_allele} not yet supported") elif len(ref_allele) == len(alt_allele): # SNV or MNV if apply: seq += alt_allele else: # store IUPAC codes for a, b in zip(*record.alleles): bases = frozenset((a.upper(), b.upper())) if len(bases) > 1: # get IUPAC representation of bases seq += IUPAC[bases] else: # add single base (base,) = bases seq += base last_pos += len(alt_allele) elif len(ref_allele) > 1 and len(alt_allele) == 1: # deletion del_len = len(ref_allele) - 1 seq += alt_allele handle_deletion(del_len) elif len(ref_allele) == 1 and len(alt_allele) > 1: # insertion ins_seq = alt_allele[1:] seq += ref_allele if apply: seq += ins_seq else: seq += "N" * len(ins_seq) last_pos += 1 elif len(ref_allele) > 1 and len(alt_allele) > 1: # replacement last_pos += len(ref_allele) if apply: seq += alt_allele else: seq += "N" * len(alt_allele) else: raise ValueError(f"Unexpected alleles: {ref_allele}, {alt_allele}") # add sequence until end seq += ref_seq[last_pos:] with open(snakemake.output[0], "w") as outfasta: print(f">{snakemake.wildcards.sample}", file=outfasta) print(seq, file=outfasta) |
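Two small pieces of logic drive the consensus script above: PHRED-scaled posterior probabilities are converted back to plain probabilities with 10^(-phred/10), and positions with uncertain substitutions are written as IUPAC ambiguity codes keyed by the set of observed bases. A minimal, self-contained illustration (not workflow code, and independent of any workflow inputs):

```python
# Standalone illustration of the two helpers used in the consensus script.

# Subset of the IUPAC table from the script above
IUPAC = {frozenset("AG"): "R", frozenset("CT"): "Y", frozenset("ACTG"): "N"}


def phred_to_prob(phred):
    # PHRED 10 -> 0.1, PHRED 20 -> 0.01; a missing value counts as probability 0
    if phred is None:
        return 0
    return 10 ** (-phred / 10)


assert abs(phred_to_prob(20) - 0.01) < 1e-12

# An uncertain SNV (REF "A", ALT "G") that is not confidently applied
# is emitted as the ambiguity code "R" instead of either base.
ref_base, alt_base = "A", "G"
print(IUPAC[frozenset((ref_base, alt_base))])  # -> "R"
```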
77 78 | shell: "tar -zcvf {output} results/{params.latest_run} > {log} 2>&1" |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | __author__ = "Julian de Ruiter" __copyright__ = "Copyright 2017, Julian de Ruiter" __email__ = "julianderuiter@gmail.com" __license__ = "MIT" from os import path import re from tempfile import TemporaryDirectory from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=True, stderr=True) def basename_without_ext(file_path): """Returns basename of file path, without the file extension.""" base = path.basename(file_path) # Remove file extension(s) (similar to the internal fastqc approach) base = re.sub("\\.gz$", "", base) base = re.sub("\\.bz2$", "", base) base = re.sub("\\.txt$", "", base) base = re.sub("\\.fastq$", "", base) base = re.sub("\\.fq$", "", base) base = re.sub("\\.sam$", "", base) base = re.sub("\\.bam$", "", base) return base # Run fastqc, since there can be race conditions if multiple jobs # use the same fastqc dir, we create a temp dir. with TemporaryDirectory() as tempdir: shell( "fastqc {snakemake.params} -t {snakemake.threads} " "--outdir {tempdir:q} {snakemake.input[0]:q}" " {log}" ) # Move outputs into proper position. output_base = basename_without_ext(snakemake.input[0]) html_path = path.join(tempdir, output_base + "_fastqc.html") zip_path = path.join(tempdir, output_base + "_fastqc.zip") if snakemake.output.html != html_path: shell("mv {html_path:q} {snakemake.output.html:q}") if snakemake.output.zip != zip_path: shell("mv {zip_path:q} {snakemake.output.zip:q}") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" from snakemake.shell import shell from snakemake_wrapper_utils.bcftools import get_bcftools_opts bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False, parse_memory=False) extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell("bcftools concat {bcftools_opts} {extra} {snakemake.input.calls} {log}") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | __author__ = "Patrik Smeds" __copyright__ = "Copyright 2016, Patrik Smeds" __email__ = "patrik.smeds@gmail.com" __license__ = "MIT" from os.path import splitext from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=False, stderr=True) # Check inputs/arguments. if len(snakemake.input) == 0: raise ValueError("A reference genome has to be provided!") elif len(snakemake.input) > 1: raise ValueError("Only one reference genome can be inputed!") # Prefix that should be used for the database prefix = snakemake.params.get("prefix", splitext(snakemake.output.idx[0])[0]) if len(prefix) > 0: prefix = "-p " + prefix # Contrunction algorithm that will be used to build the database, default is bwtsw construction_algorithm = snakemake.params.get("algorithm", "") if len(construction_algorithm) != 0: construction_algorithm = "-a " + construction_algorithm shell( "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | __author__ = "Johannes Köster, Julian de Ruiter" __copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter" __email__ = "koester@jimmy.harvard.edu, julianderuiter@gmail.com" __license__ = "MIT" import tempfile from os import path from snakemake.shell import shell from snakemake_wrapper_utils.java import get_java_opts from snakemake_wrapper_utils.samtools import get_samtools_opts # Extract arguments. extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) sort = snakemake.params.get("sorting", "none") sort_order = snakemake.params.get("sort_order", "coordinate") sort_extra = snakemake.params.get("sort_extra", "") samtools_opts = get_samtools_opts(snakemake) java_opts = get_java_opts(snakemake) index = snakemake.input.idx if isinstance(index, str): index = path.splitext(snakemake.input.idx)[0] else: index = path.splitext(snakemake.input.idx[0])[0] # Check inputs/arguments. if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in { 1, 2, }: raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements") if sort_order not in {"coordinate", "queryname"}: raise ValueError("Unexpected value for sort_order ({})".format(sort_order)) # Determine which pipe command to use for converting to bam or sorting. if sort == "none": # Simply convert to bam using samtools view. pipe_cmd = "samtools view {samtools_opts}" elif sort == "samtools": # Add name flag if needed. if sort_order == "queryname": sort_extra += " -n" # Sort alignments using samtools sort. pipe_cmd = "samtools sort {samtools_opts} {sort_extra} -T {tmpdir}" elif sort == "picard": # Sort alignments using picard SortSam. pipe_cmd = "picard SortSam {java_opts} {sort_extra} --INPUT /dev/stdin --TMP_DIR {tmpdir} --SORT_ORDER {sort_order} --OUTPUT {snakemake.output[0]}" else: raise ValueError(f"Unexpected value for params.sort ({sort})") with tempfile.TemporaryDirectory() as tmpdir: shell( "(bwa mem" " -t {snakemake.threads}" " {extra}" " {index}" " {snakemake.input.reads}" " | " + pipe_cmd + ") {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | __author__ = "Sebastian Kurscheid" __copyright__ = "Copyright 2019, Sebastian Kurscheid" __email__ = "sebastian.kurscheid@anu.edu.au" __license__ = "MIT" from snakemake.shell import shell import re extra = snakemake.params.get("extra", "") adapters = snakemake.params.get("adapters", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) # Assert input n = len(snakemake.input.sample) assert ( n == 1 or n == 2 ), "input->sample must have 1 (single-end) or 2 (paired-end) elements." # Input files if n == 1: reads = "--in1 {}".format(snakemake.input.sample) else: reads = "--in1 {} --in2 {}".format(*snakemake.input.sample) # Output files trimmed_paths = snakemake.output.get("trimmed", None) if trimmed_paths: if n == 1: trimmed = "--out1 {}".format(snakemake.output.trimmed) else: trimmed = "--out1 {} --out2 {}".format(*snakemake.output.trimmed) # Output unpaired files unpaired = snakemake.output.get("unpaired", None) if unpaired: trimmed += f" --unpaired1 {unpaired} --unpaired2 {unpaired}" else: unpaired1 = snakemake.output.get("unpaired1", None) if unpaired1: trimmed += f" --unpaired1 {unpaired1}" unpaired2 = snakemake.output.get("unpaired2", None) if unpaired2: trimmed += f" --unpaired2 {unpaired2}" # Output merged PE reads merged = snakemake.output.get("merged", None) if merged: if not re.search(r"--merge\b", extra): raise ValueError( "output.merged specified but '--merge' option missing from params.extra" ) trimmed += f" --merged_out {merged}" else: trimmed = "" # Output failed reads failed = snakemake.output.get("failed", None) if failed: trimmed += f" --failed_out {failed}" # Stats html = "--html {}".format(snakemake.output.html) json = "--json {}".format(snakemake.output.json) shell( "(fastp --thread {snakemake.threads} " "{extra} " "{adapters} " "{reads} " "{trimmed} " "{json} " "{html} ) {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | __author__ = "Johannes Köster, Felix Mölder, Christopher Schröder" __copyright__ = "Copyright 2017, Johannes Köster" __email__ = "johannes.koester@protonmail.com, felix.moelder@uni-due.de" __license__ = "MIT" from os import path from snakemake.shell import shell from tempfile import TemporaryDirectory shell.executable("bash") log = snakemake.log_fmt_shell(stdout=False, stderr=True) params = snakemake.params.get("extra", "") norm = snakemake.params.get("normalize", False) # Infer output format uncompressed_bcf = snakemake.params.get("uncompressed_bcf", False) out_name, out_ext = path.splitext(snakemake.output[0]) if out_ext == ".vcf": out_format = "v" elif out_ext == ".bcf": if uncompressed_bcf: out_format = "u" else: out_format = "b" elif out_ext == ".gz": out_name, out_ext = path.splitext(out_name) if out_ext == ".vcf": out_format = "z" else: raise ValueError("output file with invalid extension (.vcf, .vcf.gz, .bcf).") else: raise ValueError("output file with invalid extension (.vcf, .vcf.gz, .bcf).") pipe = "" if norm: pipe = f"| bcftools norm {norm} --output-type {out_format} -" else: pipe = f"| bcftools view --output-type {out_format} -" if snakemake.threads == 1: freebayes = "freebayes" else: chunksize = snakemake.params.get("chunksize", 100000) regions = ( "<(fasta_generate_regions.py {snakemake.input.ref}.fai {chunksize})".format( snakemake=snakemake, chunksize=chunksize ) ) if snakemake.input.get("regions", ""): regions = ( "<(bedtools intersect -a " r"<(sed 's/:\([0-9]*\)-\([0-9]*\)$/\t\1\t\2/' " "{regions}) -b {snakemake.input.regions} | " r"sed 's/\t\([0-9]*\)\t\([0-9]*\)$/:\1-\2/')" ).format(regions=regions, snakemake=snakemake) freebayes = ("freebayes-parallel {regions} {snakemake.threads}").format( snakemake=snakemake, regions=regions ) with TemporaryDirectory() as tempdir: shell( "({freebayes} {params} -f {snakemake.input.ref}" " {snakemake.input.samples} | bcftools sort -T {tempdir} - {pipe} > {snakemake.output[0]}) {log}" ) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | __author__ = "Joël Simoneau" __copyright__ = "Copyright 2019, Joël Simoneau" __email__ = "simoneaujoel@gmail.com" __license__ = "MIT" from snakemake.shell import shell # Creating log log = snakemake.log_fmt_shell(stdout=True, stderr=True) # Placeholder for optional parameters extra = snakemake.params.get("extra", "") # Allowing for multiple FASTA files fasta = snakemake.input.get("fasta") assert fasta is not None, "input-> a FASTA-file is required" fasta = " ".join(fasta) if isinstance(fasta, list) else fasta shell( "kallisto index " # Tool "{extra} " # Optional parameters "--index={snakemake.output.index} " # Output file "{fasta} " # Input FASTA files "{log}" # Logging ) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | __author__ = "Joël Simoneau" __copyright__ = "Copyright 2019, Joël Simoneau" __email__ = "simoneaujoel@gmail.com" __license__ = "MIT" from snakemake.shell import shell # Creating log log = snakemake.log_fmt_shell(stdout=True, stderr=True) # Placeholder for optional parameters extra = snakemake.params.get("extra", "") # Allowing for multiple FASTQ files fastq = snakemake.input.get("fastq") assert fastq is not None, "input-> a FASTQ-file is required" fastq = " ".join(fastq) if isinstance(fastq, list) else fastq shell( "kallisto quant " # Tool "{extra} " # Optional parameters "--threads={snakemake.threads} " # Number of threads "--index={snakemake.input.index} " # Input file "--output-dir={snakemake.output} " # Output directory "{fastq} " # Input FASTQ files "{log}" # Logging ) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | __author__ = "Julian de Ruiter" __copyright__ = "Copyright 2017, Julian de Ruiter" __email__ = "julianderuiter@gmail.com" __license__ = "MIT" from os import path from snakemake.shell import shell extra = snakemake.params.get("extra", "") # Set this to False if multiqc should use the actual input directly # instead of parsing the folders where the provided files are located use_input_files_only = snakemake.params.get("use_input_files_only", False) if not use_input_files_only: input_data = set(path.dirname(fp) for fp in snakemake.input) else: input_data = set(snakemake.input) output_dir = path.dirname(snakemake.output[0]) output_name = path.basename(snakemake.output[0]) log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "multiqc" " {extra}" " --force" " -o {output_dir}" " -n {output_name}" " {input_data}" " {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | __author__ = "Johannes Köster, Christopher Schröder" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" import tempfile from snakemake.shell import shell from snakemake_wrapper_utils.java import get_java_opts log = snakemake.log_fmt_shell() extra = snakemake.params.get("extra", "") java_opts = get_java_opts(snakemake) bams = snakemake.input.bams if isinstance(bams, str): bams = [bams] bams = list(map("--INPUT {}".format, bams)) if snakemake.output.bam.endswith(".cram"): output = "/dev/stdout" if snakemake.params.embed_ref: view_options = "-O cram,embed_ref" else: view_options = "-O cram" convert = f" | samtools view -@ {snakemake.threads} {view_options} --reference {snakemake.input.ref} -o {snakemake.output.bam}" else: output = snakemake.output.bam convert = "" with tempfile.TemporaryDirectory() as tmpdir: shell( "(picard MarkDuplicates" # Tool and its subcommand " {java_opts}" # Automatic java option " {extra}" # User defined parmeters " {bams}" # Input bam(s) " --TMP_DIR {tmpdir}" " --OUTPUT {output}" # Output bam " --METRICS_FILE {snakemake.output.metrics}" # Output metrics " {convert} ) {log}" # Logging ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | __author__ = "Filipe G. Vieira" __copyright__ = "Copyright 2020, Filipe G. Vieira" __license__ = "MIT" from snakemake.shell import shell from snakemake_wrapper_utils.samtools import get_samtools_opts samtools_opts = get_samtools_opts( snakemake, parse_write_index=False, parse_output=False ) extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) shell( "samtools calmd {samtools_opts} {extra} {snakemake.input.aln} {snakemake.input.ref} > {snakemake.output[0]} {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | __author__ = "Michael Chambers" __copyright__ = "Copyright 2019, Michael Chambers" __email__ = "greenkidneybean@gmail.com" __license__ = "MIT" from snakemake.shell import shell from snakemake_wrapper_utils.samtools import get_samtools_opts samtools_opts = get_samtools_opts( snakemake, parse_threads=False, parse_write_index=False, parse_output_format=False ) extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell("samtools faidx {samtools_opts} {extra} {snakemake.input[0]} {log}") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | __author__ = "Christopher Preusch" __copyright__ = "Copyright 2017, Christopher Preusch" __email__ = "cpreusch[at]ust.hk" __license__ = "MIT" from snakemake.shell import shell from snakemake_wrapper_utils.samtools import get_samtools_opts samtools_opts = get_samtools_opts( snakemake, parse_write_index=False, parse_output=False, parse_output_format=False ) extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) shell( "samtools flagstat {samtools_opts} {extra} {snakemake.input[0]} > {snakemake.output[0]} {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" from snakemake.shell import shell extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) # Samtools takes additional threads through its option -@ # One thread for samtools merge # Other threads are *additional* threads passed to the '-@' argument threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1) shell( "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" import tempfile from pathlib import Path from snakemake.shell import shell from snakemake_wrapper_utils.samtools import get_samtools_opts samtools_opts = get_samtools_opts(snakemake) extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=True, stderr=True) with tempfile.TemporaryDirectory() as tmpdir: tmp_prefix = Path(tmpdir) / "samtools_fastq.sort_" shell( "samtools sort {samtools_opts} {extra} -T {tmp_prefix} {snakemake.input[0]} {log}" ) |
1 2 3 4 5 6 7 8 9 10 11 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2016, Johannes Köster" __email__ = "koester@jimmy.harvard.edu" __license__ = "MIT" from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=False, stderr=True) shell("tabix {snakemake.params} {snakemake.input[0]} {log}") |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | __author__ = "Christopher Schröder" __copyright__ = "Copyright 2020, Christopher Schröder" __email__ = "christopher.schroeder@tu-dortmund.de" __license__ = "MIT" from snakemake.shell import shell log = snakemake.log_fmt_shell(stdout=False, stderr=True) extra = snakemake.params.get("extra", "") shell( "vembrane filter" # Tool and its subcommand " {extra}" # Extra parameters " {snakemake.params.expression:q}" " {snakemake.input}" # Path to input file " > {snakemake.output}" # Path to output file " {log}" # Logging behaviour ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2020, Johannes Köster" __email__ = "johannes.koester@uni-due.de" __license__ = "MIT" import os from pathlib import Path from snakemake.shell import shell def get_only_child_dir(path): children = [child for child in path.iterdir() if child.is_dir()] assert ( len(children) == 1 ), "Invalid VEP cache directory, only a single entry is allowed, make sure that cache was created with the snakemake VEP cache wrapper" return children[0] extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) fork = "--fork {}".format(snakemake.threads) if snakemake.threads > 1 else "" stats = snakemake.output.stats cache = snakemake.input.get("cache", "") plugins = snakemake.input.plugins plugin_aux_files = {"LoFtool": "LoFtool_scores.txt", "ExACpLI": "ExACpLI_values.txt"} load_plugins = [] for plugin in snakemake.params.plugins: if plugin in plugin_aux_files.keys(): aux_path = os.path.join(plugins, plugin_aux_files[plugin]) load_plugins.append(",".join([plugin, aux_path])) else: load_plugins.append(",".join([plugin, snakemake.input.get(plugin.lower(), "")])) load_plugins = " ".join(map("--plugin {}".format, load_plugins)) if snakemake.output.calls.endswith(".vcf.gz"): fmt = "z" elif snakemake.output.calls.endswith(".bcf"): fmt = "b" else: fmt = "v" fasta = snakemake.input.get("fasta", "") if fasta: fasta = "--fasta {}".format(fasta) gff = snakemake.input.get("gff", "") if gff: gff = "--gff {}".format(gff) if cache: entrypath = get_only_child_dir(get_only_child_dir(Path(cache))) species = ( entrypath.parent.name[:-7] if entrypath.parent.name.endswith("_refseq") else entrypath.parent.name ) release, build = entrypath.name.split("_") cache = ( "--offline --cache --dir_cache {cache} --cache_version {release} --species {species} --assembly {build}" ).format(cache=cache, release=release, build=build, species=species) shell( "(bcftools view '{snakemake.input.calls}' | " "vep {extra} {fork} " "--format vcf " "--vcf " "{cache} " "{gff} " "{fasta} " "--dir_plugins {plugins} " "{load_plugins} " "--output_file STDOUT " "--stats_file {stats} | " "bcftools view -O{fmt} > {snakemake.output.calls}) {log}" ) |
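The VEP wrapper above infers species, release, and assembly purely from the directory layout of the cache (a single species directory containing a single `{release}_{assembly}` directory, located via `get_only_child_dir`). A hedged sketch of that parsing, using a hypothetical cache path:

```python
# Hypothetical cache layout: <cache>/homo_sapiens/104_GRCh38
# (the wrapper asserts there is exactly one child directory at each level).
from pathlib import Path

entrypath = Path("resources/vep/cache/homo_sapiens/104_GRCh38")  # hypothetical

species = (
    entrypath.parent.name[:-7]
    if entrypath.parent.name.endswith("_refseq")
    else entrypath.parent.name
)
release, build = entrypath.name.split("_")

print(species, release, build)  # -> homo_sapiens 104 GRCh38
```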
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | __author__ = "Johannes Köster" __copyright__ = "Copyright 2020, Johannes Köster" __email__ = "johannes.koester@uni-due.de" __license__ = "MIT" import sys from pathlib import Path from urllib.request import urlretrieve from zipfile import ZipFile from tempfile import NamedTemporaryFile if snakemake.log: sys.stderr = open(snakemake.log[0], "w") outdir = Path(snakemake.output[0]) outdir.mkdir() with NamedTemporaryFile() as tmp: urlretrieve( "https://github.com/Ensembl/VEP_plugins/archive/release/{release}.zip".format( release=snakemake.params.release ), tmp.name, ) with ZipFile(tmp.name) as f: for member in f.infolist(): memberpath = Path(member.filename) if len(memberpath.parts) == 1: # skip root dir continue targetpath = outdir / memberpath.relative_to(memberpath.parts[0]) if member.is_dir(): targetpath.mkdir() else: with open(targetpath, "wb") as out: out.write(f.read(member.filename)) |
Maintainers: public
URL: https://ikim-essen.github.io/uncovar
Name: uncovar
Version: v0.15.1
Copyright: Public Domain
License: BSD 2-Clause "Simplified" License
Keywords: JSON, raw sequence reads, Variant calling, genetic variants, delly, nanofilt, rust-bio-tools, snakemake-wrapper-utils, tabix, BCFtools, Biopython, BWA, CANU, Consensus, fastp, FastQC, fgbio, FreeBayes, GFFutils, kallisto, Kraken, Krona, Medaka, metaspades, Minimap2, MultiQC, Pandas, Pangolin, Picard, Quant, QUAST, SAMtools, Snakemake, Variant Effect Predictor (VEP), vembrane, altair, intervaltree, numpy, pysam, requests, Varlociraptor, COVID19 Risk Mitigation