Realignment Pipeline for Increased Reproducibility and Portability

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

About

This realignment pipeline is a reimplantation of the pipeline found at https://github.com/BrenKenna/data_processing. The original pipeline is tailored to run on the grid infrastructure.

To increase reproducibility, portability and

Code Snippets

shell:
    """
    set +u
        eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
        conda activate /cvmfs/softdrive.nl/projectmine_sw/software
        set -u
        pm_copy /projectmine-nfs/Tape/User/kooyman/UploadFromBroad/{wildcards.SM}.cram  {output.cram}
        set +u
        conda deactivate
        set -u
    """

SnakeMake From line 47 of rules/optimised.smk

shell:
    """
    echo "start $(date)"
    samtools index -@ {threads} {input.cram}
    echo "indexed $(date)"
    samtools collate -@ {threads} --output-fmt BAM -uOn 128 {input.cram} {wildcards.SM}.tmp |samtools bam2fq -@ {threads} -t -s /dev/null -1 {wildcards.SM}.R1.fq.gz -2 {wildcards.SM}.R2.fq.gz - > /dev/null
     echo "collated $(date)"
    bwa-mem2 mem -K 100000000 -t {threads} -Y {params.ref} -R "@RG\\tID:{wildcards.SM}\\tLB:{wildcards.SM}\\tSM:{wildcards.SM}\\tPL:ILLUMINA" {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz 2>> {log} | samblaster -a --addMateTags | samtools view -h1 --threads {threads} -o  {wildcards.SM}.aln.bam
     echo "bwa $(date)"
    rm -f {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz
    samtools sort -T /tmp/{wildcards.SM} --write-index  -m 7G -l 1 -@ {threads} {wildcards.SM}.aln.bam -o {output.bam}##idx##{output.index}

    echo "sorted $(date)"
    """

SnakeMake SAMtools BWA Bwa-mem2 SAMBLASTER From line 80 of rules/optimised.smk

shell:
    """
    echo "start $(date)"
    picard -Djava.io.tmpdir={resources.tmpdir} MarkDuplicates I={input.bam} AS=true O={wildcards.SM}.dedup.bam METRICS_FILE={output.metrics_dedup} QUIET=true COMPRESSION_LEVEL=1 2>> {log.dedup}
    #why sort this stuff
    echo "MarkDuplicated $(date)"
    #samtools sort -m 7G  -l 1  -T /tmp/{wildcards.SM} --write-index  -@ {threads} {wildcards.SM}.dedup.bam -o {wildcards.SM}.dedup-sorted.bam##idx##{wildcards.SM}.dedup-sorted.bam.bai 
    #echo "sorted dedup.bam $(date)"
    #rm {wildcards.SM}.dedup.bam
    samtools index -@ {threads} {wildcards.SM}.dedup.bam
    echo "indexed dedup $(date)"
    gatk3 -Xmx8G -Djava.io.tmpdir={resources.tmpdir} -T BaseRecalibrator -I {wildcards.SM}.dedup.bam -R {params.ref} -o {wildcards.SM}.recal -nct {threads} --downsample_to_fraction .1 {params.bqsr_loci} -knownSites {params.dbSNP} -knownSites {params.Mills} -knownSites {params.KnownIndels} 2>> {log.bqsr}
    echo "BaseRecalibrator done $(date)"

    gatk3 -Xmx8G -Djava.io.tmpdir={resources.tmpdir} -T PrintReads -I {wildcards.SM}.dedup.bam -R {params.ref} -nct {threads} --BQSR {wildcards.SM}.recal -o {output.bam} --globalQScorePrior -1.0 --preserve_qscores_less_than 6 --static_quantized_quals 10 --static_quantized_quals 20 --static_quantized_quals 30 --disable_indel_quals 2>> {log.bqsr}
    echo "printedreads $(date)"
    """

SnakeMake SAMtools gatk Picard From line 125 of rules/optimised.smk

shell:
    """
    samtools view -C -O CRAM -T {params.ref} -@ {threads} {input.bam} -o {output.cram}
    samtools index -@ {threads} {output.cram}

    """

SnakeMake SAMtools From line 162 of rules/optimised.smk

shell:
    """
    gatk --java-options -Djava.io.tmpdir={resources.tmpdir} HaplotypeCaller -R {params.ref}  --dbsnp {params.dbSNP}  -I {input.cram} -O {output} -L {wildcards.chr} --seconds-between-progress-updates 100 -ERC GVCF --native-pair-hmm-threads {threads} 
    """

SnakeMake gatk From line 188 of rules/optimised.smk

shell:
    """
    bcftools concat -O z  -o {output.gvcf} {input}
    tabix -p vcf  {output.gvcf}
    """

SnakeMake BCFtools tabix From line 210 of rules/optimised.smk

shell:
    """
    touch {input.index}
    gatk --java-options -Djava.io.tmpdir={resources.tmpdir} HaplotypeCaller -R {params.ref}  --dbsnp {params.dbSNP}  -I {input.cram} -O {output.gvcf} -ERC GVCF -L {params.tgt} --seconds-between-progress-updates 100 --native-pair-hmm-threads {threads} &>> {log}
    mv {output.gvcf} {output.gvcf}.tmp
    zcat {output.gvcf}.tmp |bgzip -@ {threads} -l 6 -c> {output.gvcf}

    tabix -fp vcf {output.gvcf}
    """

SnakeMake gatk tabix From line 237 of rules/optimised.smk

shell:
    """
    samtools index {input.cram}
    """

SnakeMake SAMtools From line 258 of rules/optimised.smk

shell:
    """
     echo "start $(date)"
    samtools index -@ 8 {input.cram}
    echo "index $(date)"
    samtools bamshuf -@ 8 --reference {params.refold} --output-fmt BAM -uOn 128 {input.cram} {wildcards.SM}.tmp |samtools bam2fq -@ 8 -t -s /dev/null -1 {wildcards.SM}.R1.fq.gz -2 {wildcards.SM}.R2.fq.gz - > /dev/null
    echo "bamshuf $(date)"
    bwa mem -K 100000000 -t 8 -Y {params.ref} -R "@RG\\tID:{wildcards.SM}\\tLB:{wildcards.SM}\\tSM:{wildcards.SM}\\tPL:ILLUMINA" {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz 2>> {log.bwa} | samblaster -a --addMateTags | samtools view -h --threads {threads} -bS > {wildcards.SM}.aln.bam
    echo "bwa $(date)"
    rm -f {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz
    samtools sort -@ 8 {wildcards.SM}.aln.bam > {wildcards.SM}.sorted.bam
    echo "sorted $(date)"
    rm -f {wildcards.SM}.aln.bam
    samtools view -@ 8 -h -T {params.ref} -C {wildcards.SM}.sorted.bam > {output}
    rm -f {wildcards.SM}.sorted.bam
    echo "cramed $(date)"
    """

SnakeMake SAMtools BWA SAMBLASTER From line 17 of rules/original.smk

shell:
    """
    wrk="."
    echo "start $(date)"
    samtools view -h -@ 8 -T {params.ref} {input.cram} -b > {wildcards.SM}.bam
     echo "cram2bam input $(date)"
    samtools index -@ 8 {wildcards.SM}.bam
    echo "indexed input $(date)"
    picard -Djava.io.tmpdir=${{wrk}} MarkDuplicates I={wildcards.SM}.bam AS=true O={wildcards.SM}.dedup.bam METRICS_FILE={output.metrics_dedup} QUIET=true COMPRESSION_LEVEL=0 2>> {log.dedup}
    rm {wildcards.SM}.bam
    echo "MarkDuplicated $(date)"
    samtools sort -@ 8 -o {wildcards.SM}.dedup-sorted.bam {wildcards.SM}.dedup.bam
    echo "sorted dedup.bam $(date)"
    rm {wildcards.SM}.dedup.bam
    echo "removed dedup.bam $(date)"

    samtools index -@ 8 {wildcards.SM}.dedup-sorted.bam
    echo "indexed dedup-sorted.bam $(date)"

    gatk3 -Xmx8g -Djava.io.tmpdir=${{wrk}} -T BaseRecalibrator -I {wildcards.SM}.dedup-sorted.bam -R {params.ref} -o {wildcards.SM}.recal -nct {threads} --downsample_to_fraction .1 {params.bqsr_loci} -knownSites {params.dbSNP} -knownSites {params.Mills} -knownSites {params.KnownIndels} 2>> {log.bqsr}
    echo "BaseRecalibrator done $(date)"
    gatk3 -Xmx8g -Djava.io.tmpdir=${{wrk}}  -T PrintReads -I {wildcards.SM}.dedup-sorted.bam -R {params.ref} -nct {threads} --BQSR {wildcards.SM}.recal -o {wildcards.SM}.bqsr.bam --globalQScorePrior -1.0 --preserve_qscores_less_than 6 --static_quantized_quals 10 --static_quantized_quals 20 --static_quantized_quals 30 --disable_indel_quals 2>> {log.bqsr}
    echo "printedreads $(date)"
    rm {wildcards.SM}.dedup-sorted.bam
     echo "removed dedup-sorted $(date)"
    samtools sort -@ 8 {wildcards.SM}.bqsr.bam -o {wildcards.SM}.final.bam
    echo "sorted $(date)"
    rm {wildcards.SM}.bqsr.bam
    samtools view -h -@ 8 -T {params.ref} -C {wildcards.SM}.final.bam > {output.cram}
    echo "bam2cram output$(date)"
    samtools index -@ 8 {output.cram}
    echo "cramindex output$(date)"
    """

SnakeMake SAMtools gatk Picard From line 60 of rules/original.smk

shell:
    """
        set -x
    wrk=$(pwd)

        # Call variants chr1
        echo -e "\\n\\nCalling chr1\\n" >> {log}
        mkdir -p ${{wrk}}/chr1
        cd ${{wrk}}/chr1
        set -x
        gatk --java-options -Djava.io.tmpdir=. HaplotypeCaller -R {params.ref}  --dbsnp {params.dbSNP}  -I ${{wrk}}/{input.cram} -O ${{wrk}}/chr1/{wildcards.SM}.chr1.g.vcf.gz -L chr1 -ERC GVCF --native-pair-hmm-threads {threads} &>> ../{log}


        # Initialize gVCF and remove temp chrom dir
        if [ -f ${{wrk}}/chr1/{wildcards.SM}.chr1.g.vcf.gz ]
        then
            zcat ${{wrk}}/chr1/{wildcards.SM}.chr1.g.vcf.gz > ${{wrk}}/{wildcards.SM}.g.vcf
            cd ${{wrk}}
            rm -fr chr1
        else
            echo -e "\\nError during variant, exiting\\n"
            cd ..
            rm -fr  ${{wrk}}/{wildcards.SM}.g.vcf
            exit 1
        fi


        # Iteratively haplotype caller and append to gVCF
        for chrom in chr{{2..22}} chr{{X..Y}}
            do

            # Call variants
            echo -e "\\n\\nCalling ${{chrom}}\\n" >> {log}
            mkdir -p ${{wrk}}/${{chrom}}
            cd ${{wrk}}/${{chrom}}
            gatk --java-options -Djava.io.tmpdir=.  HaplotypeCaller -R {params.ref} --dbsnp {params.dbSNP} -I ${{wrk}}/{input.cram} -O ${{wrk}}/${{chrom}}/{wildcards.SM}.${{chrom}}.g.vcf.gz -L ${{chrom}} -ERC GVCF --native-pair-hmm-threads 2 &>> ../{log}

            # Append to gVCF
            zcat ${{wrk}}/${{chrom}}/{wildcards.SM}.${{chrom}}.g.vcf.gz | grep -v "#" >> ${{wrk}}/{wildcards.SM}.g.vcf
            cd ${{wrk}}
            rm -fr ${{chrom}}
        done


        bgzip -c ${{wrk}}/{wildcards.SM}.g.vcf > {output.gvcf}
        tabix -p vcf -f {output.gvcf}


    """

SnakeMake gatk tabix From line 112 of rules/original.smk

shell:
    """
    gatk --java-options -Djava.io.tmpdir=. HaplotypeCaller -R {params.ref} --dbsnp {params.dbSNP} -I {input.cram} -O {output.gvcf} -ERC GVCF  -L {params.tgt} --native-pair-hmm-threads {threads} &>> {log}
    tabix -fp vcf {output.gvcf}
    """