Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation, topic
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
CNV tool benchmarking pipeline
This pipeline was set up to test a variety of CNV calling tools on WES and WGS samples sequenced using Illumina short-read sequencing technology.
Installation and configuration
This pipeline is set up as a Snakemake workflow. More information on how to install and run Snakemake can be found at the wiki .
After installation of Snakemake, clone this repository and fill in the config.yaml.
This pipeline is set up to work with the
Environment Modules
. It likely needs customisation or removal of the module statements in the rules (
tools/*/rules.smk
) for compatilibility with a given computational environment.
Some of the tools in this pipeline requires a panel of normals, which the pipeline does not generate automatically. One such tool is GATK GermlineCNVCaller . More info on generating a panel of normals for this tool can be found at the GATKs tool documentation
Running the pipeline
Once the pipeline has been configured, a given tool can be run using the command
snakemake $tool
.
A list of available Snakemake targets can be generated by running
snakemake --list-target-rules
.
By default, Snakemake runs all the tools specified in the config.yaml by just running:
snakemake
.
Many options are available for customizing snakemake behaviour. These are all documented at the Snakemake Wiki .
Sample data from project
The alignment file GB-WGS-NA12878.bam used in this project is available at the NCBI Sequencing Read Archive (SRA) .
Code Snippets
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | get_script_dir <- function() { initial.options <- commandArgs(trailingOnly = FALSE) file.arg.name <- "--file=" script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) return(dirname(script.name)) } setwd(paste(get_script_dir(), "..", sep ="/")) suppressPackageStartupMessages(library("rtracklayer", warn.conflicts = TRUE)) library("yaml") RESULTSDIR <- "../results" TRUTHDIR <- "truth_set" OUTDIR <- "results_filtered" TOOLS <- yaml::yaml.load_file("../tools.yaml") # MIN_OVERLAP <- 0.10 ## Fraction, i.e. 0.10 == "10 %" MIN_OVERLAP <- 0 ## Disable overlap filtering BED <- list( WGS = import("../resources/hg19/regions/WGS_mappable_regions.bed"), WES = import("../resources/hg19/regions/Broad_exomes.bed") ) ## Check tool results --------------------------------- callset <- list() IGNORED_TOOLS <- c("alignments", "TIDDIT", "SvABA") tool_dirs <- list.dirs(RESULTSDIR, recursive=FALSE, full.names = FALSE) tool_dirs <- subset(tool_dirs, tool_dirs %in% c(TOOLS$WES, TOOLS$WGS)) for (tool in tool_dirs) { tool_files <- list.files(paste(RESULTSDIR, tool, sep = "/"), pattern = ".bed$") for (bed_file in tool_files) { message("NOW: ", paste(tool, bed_file, sep = "/")) bed <- import(paste(RESULTSDIR, tool, bed_file, sep = "/"), genome = "GRCh37") bed$name <- as.numeric(bed$name) bed$name <- ifelse(bed$name > 2, "DUP", ifelse(bed$name == 2, "REF", "DEL")) bed$pval <- bed$score bed$score <- 0 ## Filter called CNVs # bed <- subset(bed, width(bed) >= 450 & bed$name != "REF") bed <- subset(bed, bed$name != "REF" & !seqnames(bed) %in% c("X", "Y", "MT")) ## Subset to library region if (startsWith(bed_file, "GB-WGS")) {library = "WGS"} else {library = "WES"} bed <- subsetByOverlaps(bed, BED[[library]], minoverlap = 1, ignore.strand = TRUE) ## Compare with truth set ----------------------------- truth_file <- paste(TRUTHDIR, bed_file, sep = "/") if (file.exists(truth_file)) { truth <- import(truth_file, genome = "GRCh37") truth$pval <- -1 hits <- findOverlaps(bed, truth) ## Check that CNV type is the same for call set and truth (truth$name == "CNV" is for NA12878) hits <- hits[bed$name[queryHits(hits)] == truth$name[subjectHits(hits)] | truth$name[subjectHits(hits)] == "CNV"] ## Count called CNVs that overlap a true CNV by at least MIN_OVERLAP overlaps <- pintersect(bed[queryHits(hits)], truth[subjectHits(hits)]) percentOverlap <- width(overlaps) / width(bed[queryHits(hits)]) hits <- hits[percentOverlap >= MIN_OVERLAP] bed[queryHits(hits)]$score <- percentOverlap[percentOverlap >= MIN_OVERLAP] ## Append true positives for later counting truth$score <- Inf truth_called <- truth[unique(subjectHits(hits))] bed <- c(bed, truth_called) ## Append false negative calls for later counting truth$score <- -Inf truth_not_called <- truth[setdiff(seq_along(truth), subjectHits(hits))] hits_not_called <- findOverlaps(truth_not_called, BED[[library]], minoverlap = 1, ignore.strand = TRUE) bed <- c(bed, truth_not_called[unique(queryHits(hits_not_called))]) ## Add calls to callset list sample <- sub("\\.bed$", "", bed_file) if (! sample %in% names(callset)) { callset[[sample]] <- list( cnv = paste(as.character(truth), truth$name, sep = "_") ) } # calls <- rep(0, length(truth)) pvals <- rep(NA, length(truth)) pvals[subjectHits(hits)] <- bed$pval[queryHits(hits)] # calls[subjectHits(hits)] <- 1 # names(calls) <- paste(as.character(truth), truth$name, sep = "_") callset[[sample]][[tool]] <- pvals } dir.create(paste(OUTDIR, tool, sep = "/"), showWarnings = FALSE) bed_string <- export(bed, format = "bed") outfile <- paste(OUTDIR, tool, bed_file, sep = "/") writeLines(paste(bed_string, bed$pval, sep = "\t"), outfile) } } callset.df <- lapply(callset, as.data.frame) callset.melted <- lapply(callset.df, reshape::melt, id.vars = "cnv", variable_name = "tool") for (i in names(callset.melted)) { callset.melted[[i]]$sample <- i } callset.melted$make.row.names = FALSE callset.final <- do.call("rbind", callset.melted) # callset.final <- callset.final[, c(4, 1, 2, 3)] write.table(callset.final, file = "callset.txt", sep = "\t", quote = FALSE, row.names = FALSE) |
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | get_script_dir <- function() { initial.options <- commandArgs(trailingOnly = FALSE) file.arg.name <- "--file=" script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) return(dirname(script.name)) } setwd(paste(get_script_dir(), "..", sep ="/")) suppressPackageStartupMessages(library("data.table")) library("yaml") library("lubridate", warn.conflicts = FALSE) TRUTHDIR <- "truth_set" RESULTSDIR <- "results_filtered" BENCHMARKDIR <- "../benchmarks" TOOLS <- yaml::yaml.load_file("../tools.yaml") ## Functions ----------------------------- flatten_mapply_results <- function(mapply_out.list) { mapply_out.df <- lapply(mapply_out.list, as.data.frame) mapply_out.df$make.row.names = FALSE mapply_out <- do.call("rbind", mapply_out.df) return(mapply_out) } extract_results <- function(sample, tool, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) { res <- list( sample = sample, tool = tool, library = NA, TP = NA, FP = NA, FN = NA, TP_FN = NA, N_truth = NA, N_DEL = NA, N_DUP = NA ) if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"} if (tool == "Cytoscan") { bed_file <- sprintf("%s/%s.bed", truthdir, sample) } else { bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample) } if (file.exists(bed_file)) { bed <- fread(bed_file, colClasses=list(character = 1)) if (tool == "Cytoscan") { res$N_DEL <- sum(bed$V4 == "DEL") res$N_DUP <- sum(bed$V4 == "DUP") } else { res$N_DEL <- sum(bed$V4 == "DEL" & Inf > bed$V5 & bed$V5 >= 0) res$N_DUP <- sum(bed$V4 == "DUP" & Inf > bed$V5 & bed$V5 >= 0) truth_file <- sprintf("%s/%s.bed", truthdir, sample) if (file.exists(truth_file)) { # res$TP <- sum(0 < bed$V5 & bed$V5 <= 1) + sum(bed$V5[1 < bed$V5 & bed$V5 < Inf]) # res$FN <- sum(bed$V5 == -1) # res$FP <- sum(bed$V5 == 0) res$TP <- sum(bed$V5 == Inf) res$FN <- sum(bed$V5 == -Inf) res$FP <- with(res, N_DUP + N_DEL - TP) res$TP_FN <- with(res, TP + FN) res$N_truth <- R.utils::countLines(truth_file)[1] } } } return(res) } extract_hist_data <- function(sample, tool, breaks, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) { res <- list( sample = sample, tool = tool, library = NA, bin = NA, counts = NA ) if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"} if (tool == "Cytoscan") { bed_file <- sprintf("%s/%s.bed", truthdir, sample) } else { bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample) } if (file.exists(bed_file)) { bed <- fread(bed_file, colClasses=list(character = 1)) if (tool != "Cytoscan") { # bed <- subset(bed, bed$V5 >= 0) bed <- subset(bed, Inf > bed$V5 & bed$V5 >= 0) } bed$width <- bed$V3 - bed$V2 h <- hist(bed$width, breaks, plot = FALSE) res$bin <- sprintf("%.0f-%.0f", head(breaks, -1), tail(breaks, -1)) res$counts <- h$counts } return(res) } extract_mlpa_data <- function(sample, tool, resultsdir = RESULTSDIR, truthdir = TRUTHDIR) { res <- list( sample = sample, tool = tool, library = NA, overlap = NA ) if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"} bed_file <- sprintf("%s/%s/%s.bed", resultsdir, tool, sample) if (file.exists(bed_file)) { bed <- fread(bed_file, colClasses=list(character = 1)) # if (any(bed$V5 == -1) || all(bed$V5) == 0) { # res$overlap = 0 # } else { # res$overlap <- max(bed$V5) # } bed <- subset(bed, bed$V5 <= 1) res$overlap <- max(bed$V5, 0) } return(res) } extract_benchmarks <- function(sample, tool, benchdir = BENCHMARKDIR) { res <- list( sample = sample, tool = tool, library = NA, cputime = NA, peak_memory = NA ) if (startsWith(sample, "GB-WGS")) {res$library = "WGS"} else {res$library = "WES"} if (tool == "CLC") { input <- sprintf("%s/%s/cpu_time.tsv", benchdir, tool) if (file.exists(input)) { df <- fread(input) df <- subset(df, df$sample == res$sample) res$cputime <- as.numeric(hm(df$`elapsed time`)) } } else { if (tool %in% c("CNVkit", "CODEX2") || (tool == "cn.MOPS" && res$library == "WES")) { input <- Sys.glob(sprintf("%s/%s*/*%s*.txt", benchdir, tool, res$library)) } else { input <- Sys.glob(sprintf("%s/%s*/*%s*.txt", benchdir, tool, sample)) } if (length(input) > 0) { df <- do.call("rbind", lapply(input, fread)) res$cputime <- sum(df$s) res$peak_memory <- max(df$max_rss) } } return(res) } ## Initialize --------------------------- # tool_dirs <- list.dirs(RESULTSDIR, recursive=FALSE, full.names = FALSE) tool_dirs <- union(TOOLS$WGS, TOOLS$WES) cohorts <- fread("../cohorts.txt") samples <- list( all = subset(cohorts, cohorts$cohort != "mlpa")$sample ) samples$WES <- subset(samples$all, grepl("WES", samples$all)) samples$WGS <- subset(samples$all, grepl("WGS", samples$all)) ## Summarize results ----------------------------- message("Creating summary.txt") l <- ifelse(tool_dirs %in% TOOLS$WGS & tool_dirs %in% TOOLS$WES, list(c(samples$WGS, samples$WES)), ifelse(tool_dirs %in% TOOLS$WES, list(samples$WES), list(samples$WGS))) map_args <- list( sample = do.call("c", l), tool = unlist(mapply(rep, tool_dirs, sapply(l, length)), use.names = FALSE) ) summary <- flatten_mapply_results(mapply(extract_results, map_args$sample, map_args$tool, SIMPLIFY = FALSE)) summary$precision <- with(summary, TP / (TP + FP)) summary$recall <- with(summary, TP / (TP + FN)) fwrite(summary, file = "summary.txt", sep = "\t", na = ".", quote = FALSE) ## Make histogram data --------------------------- message("Creating hist_data.txt") hist_breaks <- c(0L, 500L, 1000L, 10000L, 100000L, Inf) hist_data <- flatten_mapply_results(mapply(extract_hist_data, map_args$sample, map_args$tool, MoreArgs = list(breaks = hist_breaks), SIMPLIFY = FALSE)) fwrite(hist_data, file = "hist_data.txt", sep = "\t", na = ".", quote = FALSE) ## Summarize MLPA results ---------------------------- message("Creating mlpa_overlap.txt") MLPATRUTHDIR <- "truth_set/mlpa" mlpa_samples <- list.files(MLPATRUTHDIR) samples$MLPA = subset(cohorts, cohorts$cohort == "mlpa")$sample samples$MLPA_WES <- subset(samples$MLPA, grepl("WES", samples$MLPA)) samples$MLPA_WGS <- subset(samples$MLPA, grepl("WGS", samples$MLPA)) tool_dirs <- subset(tool_dirs, tool_dirs != "Cytoscan") l <- ifelse(tool_dirs %in% TOOLS$WGS & tool_dirs %in% TOOLS$WES, list(c(samples$MLPA_WGS, samples$MLPA_WES)), ifelse(tool_dirs %in% TOOLS$WES, list(samples$MLPA_WES), list(samples$MLPA_WGS))) mlpa_args <- list( sample = do.call("c", l), tool = unlist(mapply(rep, tool_dirs, sapply(l, length)), use.names = FALSE) ) mlpa_overlap <- flatten_mapply_results(mapply(extract_mlpa_data, mlpa_args$sample, mlpa_args$tool, SIMPLIFY = FALSE)) fwrite(mlpa_overlap, file = "mlpa_overlap.txt", sep = "\t", na = ".", quote = FALSE) ## Summarize computational benchmark results -------------------------- message("Creating benchmarks.txt") samples$REF = subset(cohorts, cohorts$cohort == "reference")$sample samples$REF_WES <- subset(samples$REF, grepl("WES", samples$REF)) samples$REF_WGS <- subset(samples$REF, grepl("WGS", samples$REF)) benchmark_args <- list( sample = c(rep(samples$REF_WES, length(TOOLS$WES)), rep(samples$REF_WGS, length(TOOLS$WGS))), tool = c(TOOLS$WES, TOOLS$WGS) ) benchmarks <- flatten_mapply_results(mapply(extract_benchmarks, benchmark_args$sample, benchmark_args$tool, SIMPLIFY = FALSE)) fwrite(benchmarks, file = "benchmarks.txt", sep = "\t", na = ".", quote = FALSE) |
24 25 26 27 | shell: "(module load tools samtools/1.9 bwa/{version}; " "bwa mem -M -t {threads} -R {params.rg} {input.ref} {input.reads} > {output.sam}; " ") 2> {log}" |
44 45 | shell: "(module load tools perl/5.24.0 samtools/1.9 biobambam2/{version}; " |
62 63 | shell: "(module load tools java/1.8.0 gatk/{version}; " |
97 98 99 100 101 102 103 104 105 106 107 | shell: "(module load tools java/1.8.0 samtools/1.9 gatk/{version}; " "gatk BaseRecalibrator " "--known-sites {input.dbsnp} " "--known-sites {input.indels} " "--known-sites {input.snps1k} " "--known-sites {input.indels1k} " "-R {input.ref} " "-I {input.bam} " "-O {output.bqsr} " ") 2> {log} " |
126 127 128 129 130 131 132 133 | shell: "(module load tools java/1.8.0 samtools/1.9 gatk/{version}; " "gatk ApplyBQSR -R {input.ref} " "-bqsr {input.bqsr} " "-I {input.bam} " "-O {output.bam}; " "samtools index -@ {threads} {output.bam} {output.bai}" ") 2> {log}" |
149 150 151 152 153 | shell: "(module load tools samtools/{version}; " "samtools view -C -T {input.ref} -@ {threads} {input.bam} > {output.cram}; " "samtools index -@ {threads} {output.cram} {output.crai}; " ") 2> {log}" |
163 164 165 166 | shell: "cat {input.wes} {input.wgs} | sort -k1,1 -k2,2 -V | uniq > {output.bed}; " "cat {input.wes} | sort -k1,1 -k2,2 -V | uniq > {output.wes}; " "cat {input.wgs} | sort -k1,1 -k2,2 -V | uniq > {output.wgs}; " |
174 175 176 | shell: "module load bedtools/2.27.1; " "bedtools subtract -a {input.truth} -b {input.region} > {output.bed}" |
192 193 194 195 | shell: "(module load tools htslib/1.9 mosdepth/0.2.4; " "MOSDEPTH_PRECISION=5 mosdepth -f {input.ref} -n --by {input.bed} {params.prefix} {input.cram}; " ") 2> {log}" |
9 10 11 12 | shell: "(module load intel/redist/2019_update2 intel/compiler/64/2019_update2 R/3.5.0; " "Rscript concordance/scripts/01_filter_results.R" ") 2> {log}" |
20 21 22 23 | shell: "(module load intel/redist/2019_update2 intel/compiler/64/2019_update2 R/3.5.0; " "Rscript concordance/scripts/02_summarize_results.R" ") 2> {log}" |
Support
- Future updates
Related Workflows





