Twist Inherited Cancer hydra-genetics pipeline for hg38
Introduction
This pipeline runs on Illumina data from a custom Twist Inherited Cancer panel designed at Clinical Genomics Uppsala. It is implemented in Snakemake using the hydra-genetics framework.
Code Snippets
- `script: "../scripts/add_ref_to_vcf.py"`
- `script: "../scripts/exomedepth_export.R"`
- `wrapper: "v1.32.0/bio/bedtools/intersect"`
- `wrapper: "v1.32.0/bio/bedtools/intersect"`
- `script: "../scripts/export_qc_xlsx_report.py"`
- `script: "../scripts/sample_order_multiqc.py"`
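Each snippet above is attached to a rule through Snakemake's `script:` or `wrapper:` directive. A rough sketch of how such a rule might look (the rule name, inputs, and outputs here are hypothetical, not taken from the repository):

```
rule add_ref_to_vcf:
    input:
        vcf="results/{sample}.vcf.gz",
        ref="reference/hg38.fasta",
    output:
        vcf="results/{sample}.ref_annotated.vcf",
    script:
        "../scripts/add_ref_to_vcf.py"
```

Inside the script, inputs and outputs are then reachable via the injected `snakemake` object (`snakemake.input.vcf`, `snakemake.output.vcf`, and so on).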
```python
from pysam import VariantFile

# Doesn't matter if the input is bgzipped or not; pysam recognizes it automatically
vcf_in = VariantFile(snakemake.input.vcf)

# Add a reference line to the header
new_header = vcf_in.header
new_header.add_line("##reference=" + snakemake.input.ref)

# Start a new VCF with the updated header
vcf_out = VariantFile(snakemake.output.vcf, 'w', header=new_header)
for record in vcf_in.fetch():
    vcf_out.write(record)
```
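The effect of the script above can be illustrated without pysam: a `##reference=` meta line is inserted into the header, conventionally just before the `#CHROM` column line. This sketch only mimics that header edit on a list of strings (the function name is mine; real bgzipped files need pysam as in the script):

```python
def add_reference_line(header_lines, ref_path):
    """Insert a ##reference= meta line before the #CHROM column header."""
    out = []
    for line in header_lines:
        if line.startswith("#CHROM"):
            out.append("##reference=" + ref_path)
        out.append(line)
    return out

header = [
    "##fileformat=VCFv4.2",
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
]
for line in add_reference_line(header, "hg38.fasta"):
    print(line)
```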
```r
load(snakemake@input[["exon"]])
message(paste('Loaded data from', snakemake@input[["exon"]], sep = " "))

cnv_call_df <- data.frame(
  all.exons@CNV.calls[order(all.exons@CNV.calls$BF, decreasing = TRUE), ]
)

if (length(all.exons@CNV.calls) > 0) {
  # For compatibility with Alissa, remove rows with no change (reads.ratio == 1)
  cnv_call_df <- cnv_call_df[cnv_call_df$reads.ratio != 1, ]

  # Create Nexus SV format text file
  nexus <- c("id", "reads.ratio")
  nexus_df <- cnv_call_df[nexus]
  nexus_df$type[nexus_df$reads.ratio > 1] <- "CN Gain"
  nexus_df$type[nexus_df$reads.ratio < 1] <- "CN Loss"
  nexus_df$type[nexus_df$reads.ratio < 0.05] <- "Homozygous Copy Loss"
  nexus_df$reads.ratio <- log2(nexus_df$reads.ratio)

  # Add empty columns for Cytoband and Min Region
  nexus_df <- cbind(nexus_df, empty_column1 = NA, empty_column2 = NA)
  names(nexus_df) <- c("Chromosome Region", "Probe Median", "Event", "Cytoband", "Min Region")

  # Reorder columns
  nexus_df <- nexus_df[, c("Chromosome Region", "Min Region", "Event", "Cytoband", "Probe Median")]
  write.table(nexus_df, file = snakemake@output[["nexus_sv"]],
              row.names = FALSE, quote = FALSE, sep = "\t", na = "")

  # AED file
  keep <- c("chromosome", "start", "end", "gene", "nexons", "reads.ratio", "type")
  aed_df <- cnv_call_df[keep]
  aed_df$chromosome <- sub("^", "chr", aed_df$chromosome)
  aed_df$colour <- aed_df$type
  aed_df$colour[aed_df$colour == "duplication"] <- "rgb(0,0,255)"
  aed_df$colour[aed_df$colour == "deletion"] <- "rgb(255,0,0)"
  aed_df$type[aed_df$type == "duplication"] <- "copynumber/gain"
  aed_df$type[aed_df$type == "deletion"] <- "copynumber/loss"
  aed_df$new <- NA  # blank column

  names(aed_df) <- c("bio:sequence(aed:String)", "bio:start(aed:Integer)", "bio:end(aed:Integer)",
                     "aed:name(aed:String)", "bio:markerCount(aed:Integer)", "bio:state(aed:Rational)",
                     "aed:category(aed:String)", "style:color(aed:Color)", "aed:value(aed:String)")
  aed_col_order <- c("bio:sequence(aed:String)", "bio:start(aed:Integer)", "bio:end(aed:Integer)",
                     "aed:name(aed:String)", "aed:value(aed:String)", "bio:markerCount(aed:Integer)",
                     "bio:state(aed:Rational)", "aed:category(aed:String)", "style:color(aed:Color)")
  aed_df <- aed_df[, aed_col_order]

  # Prepend the two AED header rows (namespace declaration and genome build)
  header2 <- data.frame("", "", "", "affx:ucscGenomeVersion(aed:String)", "hg38",
                        "", "", "", "", stringsAsFactors = FALSE)
  names(header2) <- aed_col_order
  aed_df <- rbind(header2, aed_df)

  header1 <- data.frame("", "", "", "namespace:affx(aed:URI)", "http://affymetrix.com/ontology/",
                        "", "", "", "", stringsAsFactors = FALSE)
  names(header1) <- aed_col_order
  aed_df <- rbind(header1, aed_df)

  write.table(aed_df, file = snakemake@output[["aed"]],
              row.names = FALSE, quote = FALSE, sep = "\t")
} else {
  message("No result found")
  writeLines("Chromosome Region\tMin Region\tEvent\tCytoband\tProbe Median",
             snakemake@output[["nexus_sv"]])
  writeLines("", snakemake@output[["aed"]])
}
```
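The Nexus SV export classifies each ExomeDepth call from its `reads.ratio` and stores the log2 of that ratio as the "Probe Median". The same thresholds, sketched in Python for clarity (the function name is mine):

```python
import math

def nexus_event(reads_ratio):
    """Mirror the R script's classification; ratio == 1 means no change
    and those rows are filtered out before classification."""
    if reads_ratio < 0.05:
        return "Homozygous Copy Loss"
    if reads_ratio < 1:
        return "CN Loss"
    if reads_ratio > 1:
        return "CN Gain"
    return None

# The "Probe Median" column holds the log2 of the ratio
print(nexus_event(1.6), round(math.log2(1.6), 3))
print(nexus_event(0.52), round(math.log2(0.52), 3))
```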
```python
import subprocess
import gzip
from datetime import date
from operator import itemgetter

import xlsxwriter

sample = snakemake.input.mosdepth_summary.split("/")[-1].split(".mosdepth.summary.txt")[0]
min_cov = int(snakemake.params.coverage_thresholds.strip().split(",")[0])
med_cov = int(snakemake.params.coverage_thresholds.strip().split(",")[1])
max_cov = int(snakemake.params.coverage_thresholds.strip().split(",")[2])

# Average coverage over the design from the mosdepth summary
cmd_avg_cov = "grep total_region " + snakemake.input.mosdepth_summary + " | awk '{print $4}'"
avg_coverage = subprocess.run(cmd_avg_cov, stdout=subprocess.PIPE, shell=True).stdout.decode('utf-8').strip()

# Duplication level from the Picard duplication metrics
cmd_duplication = "grep -A1 PERCENT " + snakemake.input.picard_dup + " | tail -1 | cut -f9"
duplication = float(subprocess.run(cmd_duplication, stdout=subprocess.PIPE,
                                   shell=True).stdout.decode('utf-8').strip()) * 100

wanted_transcripts = []
with open(snakemake.input.wanted_transcripts) as wanted_file:
    for line in wanted_file:
        wanted_transcripts.append(line.split()[3])

# Average coverage per region
region_cov_table = []
bed_table = []
with gzip.open(snakemake.input.mosdepth_regions, 'rt') as regions_file:
    for lline in regions_file:
        line = lline.strip().split('\t')
        gene = line[3].split("_")[0]
        transcript = "_".join(line[3].split("_")[1:3])
        exon = str(line[3].split("_")[3])
        coverage_row = [line[0], line[1], line[2], gene, exon, transcript, float(line[4])]
        if coverage_row not in region_cov_table:
            region_cov_table.append(coverage_row)
        if line[0:5] not in bed_table:
            bed_table.append(line[0:5])

# Coverage breadth per region at each threshold
threshold_table = []
region_breadth = [0, 0, 0]
total_breadth = [0, 0, 0]
total_length = 0
with gzip.open(snakemake.input.mosdepth_thresholds, 'rt') as threshold_file:
    next(threshold_file)  # skip header line
    for lline in threshold_file:
        line = lline.strip().split("\t")
        gene = line[3].split("_")[0]
        transcript = "_".join(line[3].split("_")[1:3])
        exon = str(line[3].split("_")[3])
        length = int(line[2]) - int(line[1])
        total_length += length
        total_breadth[0] += int(line[4])
        total_breadth[1] += int(line[5])
        total_breadth[2] += int(line[6])
        region_breadth[0] = round(int(line[4]) / length, 4)
        region_breadth[1] = round(int(line[5]) / length, 4)
        region_breadth[2] = round(int(line[6]) / length, 4)
        outline = line[0:3] + region_breadth + [gene, exon, transcript]
        if outline not in threshold_table:
            threshold_table.append(outline)

# Per-base coverage, keep only low-coverage positions in any coding exon
low_cov_lines = []
with open(snakemake.input.mosdepth_perbase, 'r') as mosdepth_perbase:
    for lline in mosdepth_perbase:
        line = lline.strip().split("\t")
        if int(line[3]) <= int(min_cov) and line[0:4] not in low_cov_lines:
            low_cov_lines.append(line[0:4])
low_cov_lines = sorted(low_cov_lines, key=itemgetter(0, 1))  # sort on chr and start pos

low_cov_table = []
num_low_regions = 0
for line in low_cov_lines:
    line[3] = int(line[3])
    exons = []
    for bed_line in bed_table:  # all exons that cover this low-coverage interval
        if line[0] == bed_line[0] and int(line[1]) >= int(bed_line[1]) and int(line[2]) <= int(bed_line[2]):
            exons.append(bed_line[3])
    if len(exons) > 0:
        if any(exon in wanted_transcripts for exon in exons):
            low_cov_table.append(line + list(set(exons) & set(wanted_transcripts)) + [";".join(exons)])
            num_low_regions += 1
        else:
            low_cov_table.append(line + [""] + [";".join(exons)])

# PGRS coverage
pgrs_cov_table = []
with open(snakemake.input.pgrs_coverage) as pgrs_file:
    for lline in pgrs_file:
        line = lline.strip().split("\t")
        line[3] = int(line[3])
        pgrs_cov_table.append(line[0:4] + [line[7]])

# Create xlsx file and sheets
empty_list = ['', '', '', '', '', '']
workbook = xlsxwriter.Workbook(snakemake.output[0])
worksheet_overview = workbook.add_worksheet('Overview')
worksheet_low_cov = workbook.add_worksheet('Low Coverage')
worksheet_cov = workbook.add_worksheet('Coverage')
worksheet_threshold = workbook.add_worksheet('Thresholds')
worksheet_pgrs_cov = workbook.add_worksheet('PGRS Coverage')

format_heading = workbook.add_format({'bold': True, 'font_size': 18})
format_line = workbook.add_format({'top': 1})
format_table_heading = workbook.add_format({'bold': True, 'text_wrap': True})

# Overview sheet
worksheet_overview.write(0, 0, sample, format_heading)
worksheet_overview.write(1, 0, "RunID: " + snakemake.params.sequenceid)
worksheet_overview.write(2, 0, "Processing date: " + date.today().strftime("%B %d, %Y"))
worksheet_overview.write_row(3, 0, empty_list, format_line)
worksheet_overview.write(4, 0, "Created by: ")
worksheet_overview.write(4, 4, "Valid from: ")
worksheet_overview.write(5, 0, "Signed by: ")
worksheet_overview.write(5, 4, "Document nr: ")
worksheet_overview.write_row(6, 0, empty_list, format_line)
worksheet_overview.write(7, 0, "Sheets:", format_table_heading)
worksheet_overview.write_url(8, 0, "internal:'Low Coverage'!A1",
                             string='Positions with coverage lower than ' + str(min_cov) + 'x')
worksheet_overview.write_url(9, 0, "internal:'Coverage'!A1",
                             string='Average coverage of all regions in bed')
worksheet_overview.write_url(10, 0, "internal:'Thresholds'!A1", string='Coverage Thresholds')
worksheet_overview.write_url(11, 0, "internal:'PGRS Coverage'!A1", string='PGRS Coverage')
worksheet_overview.write_row(12, 0, empty_list, format_line)

worksheet_overview.write_row(15, 0, ['RunID', 'DNAnr', 'Avg. coverage [x]', 'Duplication level [%]',
                                     str(min_cov) + 'x', str(med_cov) + 'x', str(max_cov) + 'x'],
                             format_table_heading)
worksheet_overview.write_row(16, 0, [snakemake.params.sequenceid, sample, avg_coverage,
                                     str(round(duplication, 2)),
                                     str(round(total_breadth[0] / total_length, 4)),
                                     str(round(total_breadth[1] / total_length, 4)),
                                     str(round(total_breadth[2] / total_length, 4))])
# TODO: add average PGRS coverage?
worksheet_overview.write(18, 0, 'Number of regions not covered by at least ' + str(min_cov) + 'x: ')
worksheet_overview.write(19, 0, str(num_low_regions))
worksheet_overview.write(22, 0, "Bedfile used: " + snakemake.input.design_bed)
worksheet_overview.write(23, 0, "PGRS-bedfile used: " + snakemake.input.pgrs_bed)

# Low Coverage sheet
worksheet_low_cov.set_column(1, 2, 10)
worksheet_low_cov.set_column(4, 5, 25)
worksheet_low_cov.write(0, 0, 'Mosdepth low coverage analysis', format_heading)
worksheet_low_cov.write_row(1, 0, empty_list, format_line)
worksheet_low_cov.write(2, 0, "Sample: " + str(sample))
worksheet_low_cov.write(3, 0, "Gene regions with coverage lower than " + str(min_cov) + "x.")
table_area = 'A6:F' + str(len(low_cov_table) + 6)
header_dict = [{'header': 'Chr'}, {'header': 'Start'}, {'header': 'Stop'}, {'header': 'Mean Coverage'},
               {'header': 'Preferred transcript'}, {'header': 'All transcripts'}]
worksheet_low_cov.add_table(table_area, {'data': low_cov_table, 'columns': header_dict,
                                         'style': 'Table Style Light 1'})

# Coverage sheet
worksheet_cov.set_column(1, 2, 10)
worksheet_cov.set_column(5, 5, 15)
worksheet_cov.write(0, 0, 'Average Coverage per Exon', format_heading)
worksheet_cov.write_row(1, 0, empty_list, format_line)
worksheet_cov.write(2, 0, 'Sample: ' + str(sample))
worksheet_cov.write(3, 0, 'Average coverage of each region in exon-bedfile')
table_area = 'A6:G' + str(len(region_cov_table) + 6)
header_dict = [{'header': 'Chr'}, {'header': 'Start'}, {'header': 'Stop'}, {'header': 'Gene'},
               {'header': 'Exon'}, {'header': 'Transcript'}, {'header': 'Avg Coverage'}]
worksheet_cov.add_table(table_area, {'data': region_cov_table, 'columns': header_dict,
                                     'style': 'Table Style Light 1'})

# Thresholds sheet
worksheet_threshold.set_column(1, 2, 10)
worksheet_threshold.set_column(8, 8, 15)
worksheet_threshold.write(0, 0, 'Coverage breadth per exon', format_heading)
worksheet_threshold.write_row(1, 0, empty_list, format_line)
worksheet_threshold.write(2, 0, 'Sample: ' + str(sample))
worksheet_threshold.write(3, 0, 'Coverage breadth of each region in exon-bedfile')
table_area = 'A6:I' + str(len(threshold_table) + 6)
header_dict = [{'header': 'Chr'}, {'header': 'Start'}, {'header': 'Stop'},
               {'header': str(min_cov) + 'x'}, {'header': str(med_cov) + 'x'}, {'header': str(max_cov) + 'x'},
               {'header': 'Gene'}, {'header': 'Exon'}, {'header': 'Transcript'}]
worksheet_threshold.add_table(table_area, {'data': threshold_table, 'columns': header_dict,
                                           'style': 'Table Style Light 1'})

# PGRS Coverage sheet
worksheet_pgrs_cov.set_column(1, 2, 10)
worksheet_pgrs_cov.set_column(4, 4, 15)
worksheet_pgrs_cov.write(0, 0, 'Coverage of PGRS positions', format_heading)
worksheet_pgrs_cov.write_row(1, 0, empty_list, format_line)
worksheet_pgrs_cov.write(2, 0, 'Sample: ' + str(sample))
worksheet_pgrs_cov.write(3, 0, 'Average coverage of pgrs-bedfile')
table_area = 'A6:E' + str(len(pgrs_cov_table) + 6)  # +6 so the table area spans all data rows
header_dict = [{'header': 'Chr'}, {'header': 'Start'}, {'header': 'End'},
               {'header': 'Coverage'}, {'header': 'Hg19 coord/Comment'}]
worksheet_pgrs_cov.add_table(table_area, {'data': pgrs_cov_table, 'columns': header_dict,
                                          'style': 'Table Style Light 1'})

workbook.close()
```
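The Thresholds sheet is driven by the mosdepth `thresholds.bed.gz` rows, which report how many bases of each region reach each coverage threshold; the script divides those counts by the region length to get a breadth fraction. A minimal sketch of that arithmetic (function name and numbers are illustrative):

```python
def breadth_at_thresholds(start, end, bases_at_thresholds):
    """Fraction of a region covered at each threshold, rounded to 4 decimals
    as in the report script."""
    length = end - start
    return [round(b / length, 4) for b in bases_at_thresholds]

# Hypothetical 200 bp region with 200/150/60 bases at the low/medium/high thresholds
print(breadth_at_thresholds(1000, 1200, [200, 150, 60]))  # → [1.0, 0.75, 0.3]
```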
```python
import csv

samples = []
header = False
with open(snakemake.input.sample_sheet, 'r') as samplesheet:
    for lline in samplesheet:
        line = lline.strip()
        if header:
            samples.append(line.split(",")[1])
        if line == "Sample_ID,Sample_Name,Description,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project":
            header = True

if len(samples) == 0:
    raise Exception("No samples found, has the header in SampleSheet changed?")

# Replacement table: original sample name -> anonymized sample_NNN
with open(snakemake.output.replacement, "w+") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    for i, sample in enumerate(samples, start=1):
        tsv_writer.writerow([sample, f"sample_{i:03}"])

# Order table: anonymized name in sequencing order -> original sample name
with open(snakemake.output.order, "w+") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["Sample Order", "Sample Name"])
    for i, sample in enumerate(samples, start=1):
        tsv_writer.writerow([f"sample_{i:03}", sample])
```
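The script assigns each sample a zero-padded alias in SampleSheet order, which MultiQC then uses in place of the real names. The mapping it produces looks like this (sample names here are made up):

```python
samples = ["DNA0001", "DNA0002", "DNA0003"]

# Same naming scheme as the script: 1-based index, zero-padded to three digits
replacement = {s: f"sample_{i:03}" for i, s in enumerate(samples, start=1)}
print(replacement)  # → {'DNA0001': 'sample_001', 'DNA0002': 'sample_002', 'DNA0003': 'sample_003'}
```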
```python
__author__ = "Jan Forster"
__copyright__ = "Copyright 2019, Jan Forster"
__email__ = "j.forster@dkfz.de"
__license__ = "MIT"

from snakemake.shell import shell

# Extract arguments
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "(bedtools intersect"
    " {extra}"
    " -a {snakemake.input.left}"
    " -b {snakemake.input.right}"
    " > {snakemake.output})"
    " {log}"
)
```
Created: 1yr ago
Updated: 1yr ago
Maintainers:
public
URL:
https://github.com/clinical-genomics-uppsala/marple-rd-tc
Name:
marple-rd-tc
Version:
v0.1.1
Downloaded:
0
Copyright:
Public Domain
License:
GNU General Public License v3.0