StructuralVariants Workflow
This workflow is a CNV (copy number variation) detection pipeline: a set of bioinformatics tools, wrapped as CWL CommandLineTools, for detecting and analyzing variations in the number of copies of DNA segments between individuals or populations. It combines read-depth callers (CODEX2, ExomeDepth) with breakpoint-based callers (GRIDSS, Manta), then filters, sorts, and merges their calls into per-sample BED files.
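The snippets below are CWL CommandLineTool bodies from the repository. As a rough sketch of how any one of them can be exercised on its own, assuming the cwltool reference runner (the .cwl file name and data paths here are illustrative, not taken from the repository):

```bash
# Hypothetical standalone run of one tool from this workflow with cwltool.
# 'samtools_sort.cwl' and the BAM path are illustrative assumptions.
pip install cwltool
cwltool samtools_sort.cwl --input sample.bam --threads 4
# cwltool stages the input, runs `samtools sort --threads 4 -o sample.sorted.bam`,
# and collects the output file in the working directory.
```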
Code Snippets
batch_parser.R: group input BAMs into per-batch CSVs

```yaml
baseCommand: [Rscript]
inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 2
      itemSeparator: ','
  samples:
    type: File
    inputBinding:
      position: 3
  script:
    type: File
    inputBinding:
      position: 1
    default:
      class: File
      basename: "batch_parser.R"
      contents: |-
        args <- commandArgs(TRUE)

        bamsList <- as.vector(strsplit(args[1], ",")[[1]])
        bamsList <- sapply(strsplit(bamsList, "[/]"), "[[", 3)
        samplesFile <- read.delim(args[2], header = T, sep = " ")

        bams <- data.frame(unlist(bamsList))
        colnames(bams)[1] <- "file"
        samples <- samplesFile

        indx <- sapply(samples$sample_id, grep, bams$file)
        temp_df <- cbind(samples[unlist(indx), , drop = F],
                         bams[unlist(indx), , drop = F])
        temp_df$index_file <- paste(temp_df$file, ".bai", sep = "")

        output <- split(temp_df, temp_df$batch)
        sapply(names(output), function(x)
          write.table(output[[x]],
                      file = paste0("bams_x_batch_", x, ".csv"),
                      sep = ",", row.names = F))
```
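The embedded R script reads a space-delimited samples sheet (args[2]) and requires at least `sample_id` and `batch` columns; the downstream filter steps additionally join on a case ID in column 1. A minimal, hypothetical sheet:

```bash
# Hypothetical samples sheet; batch_parser.R needs 'sample_id' and 'batch',
# and the filter steps later join case_id (column 1) to sample_id (column 2).
cat > samples.txt <<'EOF'
case_id sample_id batch
case01 NA06985 1
case02 NA07037 1
case03 NA11830 2
EOF
# batch_parser.R then emits one bams_x_batch_<batch>.csv per batch, pairing
# each sample with its BAM file and the matching .bai index.
```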
bedops -u: union of BED files

```yaml
baseCommand: [bedops, -u]
inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 1
```
bwa index (skipped via echo when index files are already provided)

```yaml
baseCommand: []
arguments:
  - position: 0
    shellQuote: false
    valueFrom: >-
      $(inputs.input_amb && inputs.input_ann && inputs.input_bwt
        && inputs.input_pac && inputs.input_sa
        ? 'echo bwa'
        : inputs.generate_bwa_indexes ? 'bwa index' : 'echo bwa')
inputs:
  generate_bwa_indexes:
    type: boolean?
  input_fasta:
    type: File
    inputBinding:
      position: 2
  input_amb:
    type: File?
  input_ann:
    type: File?
  input_bwt:
    type: File?
  input_pac:
    type: File?
  input_sa:
    type: File?
  algoType:
    type: string?
    inputBinding:
      position: 1
      prefix: '-a'
```
bwa mem, paired-end reads

```yaml
baseCommand: [bwa, mem]
inputs:
  reads:
    type: 'File[]'
    inputBinding:
      position: 3
  reference_genome:
    type: File
    inputBinding:
      position: 2
    secondaryFiles:
      - .fai
      - .amb
      - .ann
      - .bwt
      - .pac
      - .sa
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '-t'
  read_group:
    type: string?
    default: '@RG\tID:SRR709972\tSM:NA06985\tPL:ILLUMINA\tCN:CBRA\tLB:Fragment'
    inputBinding:
      position: 4
      prefix: '-R'
```
bwa mem, single-end reads

```yaml
baseCommand: [bwa, mem]
inputs:
  reads:
    type: File
    inputBinding:
      position: 3
  reference_genome:
    type: File
    inputBinding:
      position: 2
    secondaryFiles:
      - .fai
      - .amb
      - .ann
      - .bwt
      - .pac
      - .sa
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '-t'
  read_group:
    type: string?
    default: '@RG\tID:SRR709972\tSM:NA06985\tPL:ILLUMINA\tCN:CBRA\tLB:Fragment'
    inputBinding:
      position: 4
      prefix: '-R'
```
CODEX2.R: read-depth CNV calling per batch (the final write now combines all chromosomes; the original wrote only the last chromosome iterated)

```yaml
baseCommand: [Rscript]
inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 2
      itemSeparator: ','
  mapping:
    type: File
    inputBinding:
      position: 3
  bed:
    type: File
    inputBinding:
      position: 4
  script:
    type: File
    inputBinding:
      position: 1
    default:
      class: File
      basename: "CODEX2.R"
      contents: |-
        # load packages
        library(CODEX2)

        # parse arguments
        args <- commandArgs(TRUE)
        args2 <- args[2]
        bamsList <- as.vector(strsplit(args[1], ",")[[1]])
        bamsList <- sapply(strsplit(bamsList, "[/]"), "[[", 3)
        bams <- read.csv(args2, header = TRUE, sep = ",")  # mapping file
        bedFile <- args[3]

        # mapping
        dirPath <- dirname(args2)
        bamsMap <- bams[bams$file %in% bamsList,]
        batch <- bamsMap$batch[[1]]
        batchname <- paste0("batch_", batch)

        # set global variables
        bamFile <- as.vector(bamsMap$file)
        bamdir <- file.path(dirPath, bamFile)
        sampname <- sapply(strsplit(bamFile, "[.]"), "[[", 1)
        bambedObj <- getbambed(bamdir = bamdir, bedFile = bedFile,
                               sampname = sampname, projectname = batchname)
        bamdir <- bambedObj$bamdir
        sampname <- bambedObj$sampname
        ref <- bambedObj$ref
        projectname <- bambedObj$projectname
        nsamples <- length(bamFile)

        ##########################################################
        # Getting GC content and mappability
        ##########################################################
        gc <- getgc(ref)
        mapp <- getmapp(ref)

        ##########################################################
        # Getting gene names, needed for targeted sequencing,
        # here generating gene names in silico
        ##########################################################
        gene <- rep(NA, length(ref))
        for (chr in as.matrix(unique(seqnames(ref)))) {
          chr.index <- which(seqnames(ref) == chr)
          gene[chr.index] <- paste0(chr, "_gene_", ceiling(chr.index / 30))
        }
        values(ref) <- cbind(values(ref), DataFrame(gc, mapp, gene))

        ##########################################################
        # Getting depth of coverage
        ##########################################################
        coverageObj <- getcoverage(bambedObj, mapqthres = 20)
        Y <- coverageObj$Y
        write.csv(Y, file = paste0(projectname, "_coverage.csv"), quote = FALSE)
        head(Y[, 1:nsamples])

        ##########################################################
        # Quality control
        ##########################################################
        qcObj <- qc(Y, sampname, ref, cov_thresh = c(20, Inf),
                    length_thresh = c(20, Inf), mapp_thresh = 0.9,
                    gc_thresh = c(20, 80))
        Y_qc <- qcObj$Y_qc
        sampname_qc <- qcObj$sampname_qc
        ref_qc <- qcObj$ref_qc
        qcmat <- qcObj$qcmat
        gc_qc <- ref_qc$gc
        write.table(qcmat, file = paste0(projectname, "_qcmat", ".txt"),
                    sep = "\t", quote = FALSE, row.names = FALSE)

        ##########################################################
        # Estimating library size factor for each sample
        ##########################################################
        Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) { !any(x == 0) }), ]
        pseudo.sample <- apply(Y.nonzero, 1, function(x) { prod(x)^(1 / length(x)) })
        N <- apply(apply(Y.nonzero, 2, function(x) { x / pseudo.sample }), 2, median)
        plot(N, apply(Y, 2, sum), xlab = "Estimated library size factor",
             ylab = "Total sum of reads")

        ##########################################################
        # Genome-wide normalization using normalize_null
        ##########################################################
        # If there are negative control samples, use normalize_codex2_ns()
        # If there are negative control regions, use normalize_codex2_nr()
        normObj.null <- normalize_null(Y_qc = Y_qc, gc_qc = gc_qc,
                                       K = 1:nsamples, N = N)
        Yhat <- normObj.null$Yhat
        AIC <- normObj.null$AIC
        BIC <- normObj.null$BIC
        RSS <- normObj.null$RSS

        ##########################################################
        # Number of latent factors
        ##########################################################
        choiceofK(AIC, BIC, RSS, K = 1:nsamples,
                  filename = "codex2_null_choiceofK.pdf")
        par(mfrow = c(1, 3))
        plot(1:nsamples, RSS, type = "b", xlab = "Number of latent variables", pch = 20)
        plot(1:nsamples, AIC, type = "b", xlab = "Number of latent variables", pch = 20)
        plot(1:nsamples, BIC, type = "b", xlab = "Number of latent variables", pch = 20)
        par(mfrow = c(1, 1))

        ##########################################################
        # CBS segmentation per chromosome: optimal for WGS and WES
        ##########################################################
        finalcall.CBS <- list()
        optK <- which.max(BIC)
        for (chr in unique(seqnames(ref_qc))) {
          chr.index <- which(seqnames(ref_qc) == chr)
          Yhat.null.chr <- lapply(Yhat, function(x) x[chr.index,])
          finalcall.CBS[[chr]] <- segmentCBS(Y_qc[chr.index,], Yhat = Yhat.null.chr,
                                             optK = optK, K = 1:nsamples,
                                             sampname_qc = sampname_qc,
                                             ref_qc = ranges(ref_qc)[chr.index],
                                             chr = chr, lmax = 400, mode = "integer")
        }

        # write results for all chromosomes (rbind, not just the last chr iterated)
        write.table(do.call(rbind, finalcall.CBS),
                    file = paste0(projectname, "_", optK, ".CODEX_Chrom.txt"),
                    sep = "\t", quote = F, row.names = F)
```
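For debugging outside CWL, the embedded script can be run directly; the argument order follows the inputBinding positions (comma-joined BAM list, the per-batch CSV from batch_parser, then the targets BED). Note the script keeps only the third '/'-separated component of each BAM path as the file name, so the illustrative paths below have exactly two leading directories:

```bash
# Hypothetical direct invocation of the embedded CODEX2.R (file names illustrative).
Rscript CODEX2.R \
  data/bams/NA06985.sorted.bam,data/bams/NA07037.sorted.bam \
  bams_x_batch_1.csv \
  targets.bed
```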
Filter CODEX2 calls and map case_id to sample_id

```yaml
baseCommand: [bash, -c]
inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_LRATIO=$4

      TEMP_NAME=${INPUT_FILE/_Chrom.txt/.filtered.txt}
      TEMP_FILE=$(echo "$TEMP_NAME" | sed "s/.*\///")
      OUTPUT_NAME=${TEMP_FILE/txt/bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      tail -n +2 $INPUT_FILE | awk -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minLRatio=$MIN_LRATIO '{ \
        chr=$2; \
        start=$4; \
        end=$5; \
        sample=$1; \
        len=$6; \
        lratio=$12; \
        st_exon=$7; \
        ed_exon=$8; \
        type=$3=="del"?"DEL":"DUP"; \
        tool="codex"; \
        if(len>minLen && \
           len<maxLen && \
           len/(ed_exon-st_exon)<50 && \
           lratio>=minLRatio){ print chr"\t"start"\t"end"\t"sample"\t"type"\t"lratio"\t"tool}}' > ${TEMP_FILE}

      # mapping case_id with sample_id
      join -1 4 -2 2 -o 1.1,1.2,1.3,2.1,1.5,1.6,1.7 <(sort -k 4 $TEMP_FILE) <(sort -k 2 $SAMPLE_FILE) > ${OUTPUT_FILE}
  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_lratio:
    type: string?
    inputBinding:
      position: 6
```
Sort and merge calls per sample and SV type (bedtools merge -o max,distinct)

```yaml
baseCommand: [bash, -c]
inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      OUTPUT_NAME=${INPUT_FILE/bed/sorted.merged.bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      # FIXME: remove sed -e "s/[[:space:]]\+/\t/g"; try a better solution
      for i in $(cat $INPUT_FILE | sed -e "s/[[:space:]]\+/\t/g" | cut -f4 | sort -u)
      do
        for j in "DEL" "DUP"
        do
          grep $i $INPUT_FILE | grep $j | sed -e "s/[[:space:]]\+/\t/g" | bedtools sort | bedtools merge -c 6,7 -o max,distinct | awk -v sample=$i -v type=$j '{ \
            print $1,$2,$3,sample,type,$4,$5}' OFS="\t"
        done >> ${OUTPUT_FILE}
      done
  input:
    type: File
    inputBinding:
      position: 2
```
exomeDepth.R: read-depth CNV calling per batch

```yaml
baseCommand: [Rscript]
inputs:
  input:
    type: 'File[]'
    inputBinding:
      position: 2
      itemSeparator: ','
  mapping:
    type: File
    inputBinding:
      position: 3
  reference_genome:
    type: File
    inputBinding:
      position: 4
  regions:
    type: File
    inputBinding:
      position: 5
  script:
    type: File
    inputBinding:
      position: 1
    default:
      class: File
      basename: "exomeDepth.R"
      contents: |-
        # load packages
        library(ExomeDepth)

        # parse arguments
        args <- commandArgs(TRUE)
        args2 <- args[2]
        bamsList <- as.vector(strsplit(args[1], ",")[[1]])
        bamsList <- sapply(strsplit(bamsList, "[/]"), "[[", 3)
        bams <- read.csv(args2, header = TRUE, sep = ",")  # mapping file
        bed <- read.csv(args[4], sep = "\t")
        refgen <- args[3]

        # mapping
        dirPath <- dirname(args2)
        bamsMap <- bams[bams$file %in% bamsList,]
        batch <- bamsMap$batch[[1]]

        # set global variables
        bamFiles <- as.vector(bamsMap$file)
        baiFiles <- as.vector(bamsMap$index_file)
        bamdir <- file.path(dirPath, bamFiles)
        baidir <- file.path(dirPath, baiFiles)

        # prepare bed file (4th column is the annotation for each region)
        bed <- bed[, 1:4]
        colnames(bed) <- c("chromosome", "start", "end", "name")

        # get counts per bed region
        message('\n[INFO] Creating counts matrix')
        my.counts <- getBamCounts(bed.frame = bed,
                                  bam.files = bamdir,
                                  include.chr = FALSE,
                                  min.mapq = 20,
                                  referenceFasta = refgen,
                                  index.files = baidir)
        ExomeCount.dafr <- as(my.counts, "data.frame")

        # prepare the main matrix of read count data
        ExomeCount.mat <- as.matrix(ExomeCount.dafr[, grep(names(ExomeCount.dafr), pattern = '*.bam')])
        nsamples <- ncol(ExomeCount.mat)
        results <- data.frame(NULL)  # to write results

        # start looping over each sample
        for (i in 1:nsamples) {
          sampname <- sapply(strsplit(bamsList[[i]], "[.]"), "[[", 1)  # get sample name

          message(paste0("\n[INFO] Creating reference set for ", bamsList[[i]]))
          # Create the aggregate reference set for this sample
          my.choice <- select.reference.set(test.counts = ExomeCount.mat[, i],
                                            reference.counts = ExomeCount.mat[, -i],
                                            bin.length = (ExomeCount.dafr$end - ExomeCount.dafr$start) / 1000,
                                            n.bins.reduced = 10000)
          my.reference.selected <- apply(X = ExomeCount.mat[, my.choice$reference.choice, drop = FALSE],
                                         MAR = 1, FUN = sum)

          message('\n[INFO] Computing correlation between sample and references...\n')
          my.test = ExomeCount.mat[, i, drop = FALSE]
          my.ref.counts = ExomeCount.mat[, my.choice$reference.choice, drop = FALSE]
          correlation = cor(my.test[, 1], apply(my.ref.counts, 1, mean))
          message(paste("\n[INFO] Correlation between reference and tests count is", round(correlation, 4)))

          message('\n[INFO] Creating the ExomeDepth object')
          all.exons <- new('ExomeDepth',
                           test = ExomeCount.mat[, i],
                           reference = my.reference.selected,
                           formula = 'cbind(test, reference) ~ 1')

          # Call the CNVs
          message('\n[INFO] Calling CNVs')
          all.exons <- CallCNVs(x = all.exons,
                                transition.probability = 10^-4,
                                chromosome = ExomeCount.dafr$space,
                                start = ExomeCount.dafr$start,
                                end = ExomeCount.dafr$end,
                                name = bed$name)
          all.exons@CNV.calls$sample_id <- sampname  # set sample name
          results <- rbind(results, all.exons@CNV.calls)  # append to results
        }

        message('\n[INFO] Writing results')
        output.file <- paste0("batch_", batch, ".exomeDepth.csv")
        write.csv(file = output.file, x = results, row.names = FALSE)
```
Filter ExomeDepth calls and map case_id to sample_id

```yaml
baseCommand: [bash, -c]
inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_BF=$4

      TEMP_NAME=${INPUT_FILE/csv/filtered.csv}
      TEMP_FILE=$(echo "$TEMP_NAME" | sed "s/.*\///")
      OUTPUT_NAME=${TEMP_FILE/csv/bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      tail -n +2 $INPUT_FILE | sed 's/"//g' | awk -F"," -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minBf=$MIN_BF '{ \
        chr=$7; \
        start=$5; \
        end=$6; \
        sample=$13; \
        bf=$9; \
        len=(end-start)/1000; \
        type=$3=="deletion"?"DEL":"DUP"; \
        tool="exomeDepth"; \
        if(len>minLen && \
           len<maxLen && \
           bf>=minBf){ print chr"\t"start"\t"end"\t"sample"\t"type"\t"bf"\t"tool}}' > ${TEMP_FILE}

      # mapping case_id with sample_id
      join -1 4 -2 2 -o 1.1,1.2,1.3,2.1,1.5,1.6,1.7 <(sort -k 4 $TEMP_FILE) <(sort -k 2 $SAMPLE_FILE) > ${OUTPUT_FILE}
  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_bf:
    type: string?
    inputBinding:
      position: 6
```
fastp: adapter/quality trimming (fastq2 is declared optional here, matching the expression guards on inputs.fastq2)

```yaml
baseCommand: [fastp]
arguments:
  - prefix: '-o'
    valueFrom: $(inputs.fastq1.nameroot.replace(/\b.fastq\b/g, '')).trimmed.fastq
  - |
    ${
      if (inputs.fastq2){
        return '-O';
      } else {
        return '';
      }
    }
  - |
    ${
      if (inputs.fastq2){
        return inputs.fastq2.nameroot.replace(/\b.fastq\b/g, '') + '.trimmed.fastq';
      } else {
        return '';
      }
    }
  - prefix: '--unpaired1'
    valueFrom: $(inputs.fastq1.nameroot.replace(/\b.fastq\b/g, '')).unpaired.trimmed.fastq
  - |
    ${
      if (inputs.fastq2){
        return '--unpaired2';
      } else {
        return '';
      }
    }
  - |
    ${
      if (inputs.fastq2){
        return inputs.fastq2.nameroot.replace(/\b.fastq\b/g, '') + ".unpaired.trimmed.fastq";
      } else {
        return '';
      }
    }
inputs:
  fastq1:
    type: File
    inputBinding:
      prefix: -i
  fastq2:
    type: File?
    inputBinding:
      prefix: -I
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--thread'
  cut_right:
    type: boolean?
    default: true
    inputBinding:
      prefix: '--cut_right'
  cut_right_window_size:
    type: int?
    default: 5
    inputBinding:
      prefix: '--cut_right_window_size'
  cut_right_mean_quality:
    type: int?
    default: 24
    inputBinding:
      prefix: '--cut_right_mean_quality'
  trim_tail1:
    type: int?
    default: 1
    inputBinding:
      prefix: '--trim_tail1'
  length_required:
    type: int?
    default: 70
    inputBinding:
      prefix: '--length_required'
```
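With the defaults in the snippet, a paired-end run expands to roughly the following command line (file names illustrative):

```bash
# Roughly the fastp command the tool definition above builds for paired-end
# input; sample file names are illustrative assumptions.
fastp \
  -i sample_R1.fastq -I sample_R2.fastq \
  --thread 1 \
  -o sample_R1.trimmed.fastq -O sample_R2.trimmed.fastq \
  --unpaired1 sample_R1.unpaired.trimmed.fastq \
  --unpaired2 sample_R2.unpaired.trimmed.fastq \
  --cut_right --cut_right_window_size 5 --cut_right_mean_quality 24 \
  --trim_tail1 1 --length_required 70
```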
FastQC

```yaml
baseCommand: [fastqc]
arguments:
  - valueFrom: $(runtime.outdir)
    prefix: '-o'
  - valueFrom: '--noextract'
inputs:
  fastq1:
    type: 'File[]'
    inputBinding:
      position: 2
  fastq2:
    type: 'File[]'
    inputBinding:
      position: 3
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
```
GRIDSS (the redundant union of string? with "null" has been simplified to string?)

```yaml
baseCommand: [gridss]
arguments:
  - position: 2
    prefix: '--output'
    valueFrom: $(inputs.input.nameroot.split('.')[0]).gridss.raw.vcf.gz
  - position: 5
    prefix: '--jar'
    valueFrom: "/usr/local/share/gridss-2.9.3-0/gridss.jar"
inputs:
  input:
    type: File
    secondaryFiles:
      - .bai
    inputBinding:
      position: 7
  reference_genome:
    type: File
    inputBinding:
      prefix: '--reference'
      position: 1
  assemblyFilename:
    type: string?
    default: "gridss.assembly.bam"
    inputBinding:
      prefix: '--assembly'
      position: 3
  threads:
    type: int?
    default: 1
    inputBinding:
      prefix: '--threads'
      position: 4
  blacklist:
    type: File?
    inputBinding:
      prefix: '--blacklist'
      position: 6
```
Filter GRIDSS calls

```yaml
baseCommand: [bash, -c]
inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_Q=$4

      OUTPUT_NAME=${INPUT_FILE/raw/filtered}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      # mapping case_id with sample_id
      SAMPLE_ID=(${OUTPUT_FILE//./ })
      CASE_ID=$(awk -v search="$SAMPLE_ID" '$0 ~ search{print $1}' "$SAMPLE_FILE")

      tail -n +2 $INPUT_FILE | awk -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minQ=$MIN_Q '{ \
        chr=$1; \
        start=$2; \
        end=$6; \
        sample="'$CASE_ID'"; \
        q=$11; \
        len=(end-start)/1000; \
        type=$7; \
        tool="gridss"; \
        if(len>minLen && \
           len<maxLen && \
           q>=minQ && \
           (type=="DEL" || type=="DUP")){ print chr"\t"start"\t"end"\t"sample"\t"type"\t"q"\t"tool}}' > ${OUTPUT_FILE}
  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_q:
    type: string?
    inputBinding:
      position: 6
```
gunzip (no-op via echo when the FASTA is not gzipped)

```yaml
baseCommand: []
arguments:
  - position: 0
    shellQuote: false
    valueFrom: >-
      $(inputs.input_fasta.nameext == '.gz' ? 'gunzip -c' : 'echo gunzip')
inputs:
  input_fasta:
    type: File
    inputBinding:
      position: 2
```
Manta: configManta.py followed by runWorkflow.py

```yaml
arguments:
  - position: 0
    valueFrom: "/usr/local/share/manta-1.6.0-0/bin/configManta.py"
    shellQuote: false
  - position: 2
    valueFrom: $(";{runDir}/runWorkflow.py".replace(/\{runDir\}/g, inputs.runDir))
    shellQuote: false
inputs:
  input:
    type: File
    secondaryFiles:
      - .bai
    inputBinding:
      prefix: '--bam'
      position: 1
      shellQuote: false
  reference_genome:
    type: File
    secondaryFiles:
      - .fai
      - .amb
      - .ann
      - .bwt
      - .pac
      - .sa
    inputBinding:
      prefix: '--referenceFasta'
      position: 1
      shellQuote: false
  exome:
    type: boolean?
    inputBinding:
      prefix: "--exome"
      position: 1
      shellQuote: false
  regions:
    type: File
    secondaryFiles:
      - .tbi
    inputBinding:
      prefix: '--callRegions'
      position: 1
      shellQuote: false
  runDir:
    type: string?
    default: "generated"
    inputBinding:
      prefix: '--runDir'
      position: 1
      shellQuote: false
```
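The second argument splices a `;`-separated command into the shell line, so configuration and execution run in one step; roughly equivalent to the following (file names illustrative):

```bash
# Roughly what the Manta tool definition above executes (paths illustrative).
/usr/local/share/manta-1.6.0-0/bin/configManta.py \
  --bam sample.bam \
  --referenceFasta hg19.fasta \
  --callRegions regions.bed.gz \
  --exome \
  --runDir generated
generated/runWorkflow.py   # second command, spliced in via the ";{runDir}" argument
```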
Filter Manta calls

```yaml
baseCommand: [bash, -c]
inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      SAMPLE_FILE=$1
      MIN_LEN=$2
      MAX_LEN=$3
      MIN_Q=$4

      OUTPUT_NAME=${INPUT_FILE/raw/filtered}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      # mapping case_id with sample_id
      SAMPLE_ID=(${OUTPUT_FILE//./ })
      CASE_ID=$(awk -v search="$SAMPLE_ID" '$0 ~ search{print $1}' "$SAMPLE_FILE")

      tail -n +2 $INPUT_FILE | awk -v maxLen=$MAX_LEN -v minLen=$MIN_LEN -v minQ=$MIN_Q '{ \
        chr=$1; \
        start=$2; \
        end=$6; \
        sample="'$CASE_ID'"; \
        q=$8; \
        len=(end-start)/1000; \
        type=$11; \
        tool="manta"; \
        if(len>minLen && \
           len<maxLen && \
           q>=minQ && \
           (type=="DEL" || type=="DUP")){ print chr"\t"start"\t"end"\t"sample"\t"type"\t"q"\t"tool}}' > ${OUTPUT_FILE}
  input:
    type: File
    inputBinding:
      position: 2
  samples:
    type: File
    inputBinding:
      position: 3
  min_len:
    type: string?
    inputBinding:
      position: 4
  max_len:
    type: string?
    inputBinding:
      position: 5
  min_q:
    type: string?
    inputBinding:
      position: 6
```
Sort and merge calls per sample and SV type (bedtools merge -o collapse)

```yaml
baseCommand: [bash, -c]
inputs:
  script:
    type: string?
    inputBinding:
      position: 1
    default: |
      INPUT_FILE=$0
      OUTPUT_NAME=${INPUT_FILE/bed/sorted.merged.bed}
      OUTPUT_FILE=$(echo "$OUTPUT_NAME" | sed "s/.*\///")

      # FIXME: remove sed -e "s/[[:space:]]\+/\t/g"; try a better solution
      for i in $(cat $INPUT_FILE | sed -e "s/[[:space:]]\+/\t/g" | cut -f4 | sort -u)
      do
        for j in "DEL" "DUP"
        do
          grep $i $INPUT_FILE | grep $j | sed -e "s/[[:space:]]\+/\t/g" | bedtools sort | bedtools merge -c 6,7 -o collapse | awk -v sample=$i -v type=$j '{ \
            print $1,$2,$3,sample,type,$4,$5}' OFS="\t"
        done >> ${OUTPUT_FILE}
      done
  input:
    type: File
    inputBinding:
      position: 2
```
Picard MarkDuplicates

```yaml
baseCommand: [picard, MarkDuplicates]
arguments:
  - position: 2
    valueFrom: OUTPUT=$(inputs.input.nameroot).dedup.bam
  - position: 3
    valueFrom: METRICS_FILE=$(inputs.input.nameroot).dedup.metrics.txt
  - position: 4
    valueFrom: ASSUME_SORTED=TRUE
inputs:
  input:
    type: File
    inputBinding:
      position: 1
      prefix: 'INPUT='
      separate: false
```
samtools faidx (skipped via echo when an index is already provided)

```yaml
baseCommand: []
arguments:
  - position: 0
    shellQuote: false
    valueFrom: >-
      $(inputs.input_index ? 'echo samtools faidx' : 'samtools faidx')
inputs:
  input_fasta:
    type: File
    inputBinding:
      position: 1
  input_index:
    type: File?
```
samtools index

```yaml
baseCommand: [samtools, index]
arguments:
  - position: 1
    valueFrom: -b
inputs:
  input:
    type: File
    inputBinding:
      position: 2
```
samtools merge: paired and unpaired alignments

```yaml
baseCommand: [samtools, merge, -f]
arguments:
  - position: 2
    valueFrom: $(inputs.paired.nameroot.split('.')[0]).sorted.bam
inputs:
  paired:
    type: File
    inputBinding:
      position: 3
  unpaired_R1:
    type: File
    inputBinding:
      position: 4
  unpaired_R2:
    type: File
    inputBinding:
      position: 5
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
```
samtools sort

```yaml
baseCommand: [samtools, sort]
arguments:
  - position: 3
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot).sorted.bam
inputs:
  input:
    type: File
    inputBinding:
      position: 2
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
```
samtools view: filter by mapping quality and flags

```yaml
baseCommand: [samtools, view]
arguments:
  - position: 5
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot).filtered.bam
inputs:
  input:
    type: File
    inputBinding:
      position: 4
  min_mapping_quality:
    type: int
    inputBinding:
      position: 2
      prefix: '-q'
  bits_set:
    type: int
    inputBinding:
      position: 3
      prefix: '-F'
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
```
samtools view: SAM to BAM

```yaml
baseCommand: [samtools, view]
arguments:
  - position: 3
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot).bam
inputs:
  input:
    type: File
    inputBinding:
      position: 2
      prefix: '-bS'
  threads:
    type: int?
    default: 1
    inputBinding:
      position: 1
      prefix: '--threads'
```
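Taken together, the samtools wrappers above implement a conventional post-alignment cleanup. A hedged sketch of the equivalent plain shell; the -q and -F values are workflow inputs with no defaults, so the numbers below are illustrative:

```bash
# Illustrative equivalent of the samtools steps (file names and -q/-F values assumed).
samtools view --threads 4 -bS -o sample.bam sample.sam          # SAM -> BAM
samtools sort --threads 4 -o sample.sorted.bam sample.bam       # coordinate sort
samtools view --threads 4 -q 20 -F 1024 -o sample.filtered.bam sample.sorted.bam
samtools index -b sample.filtered.bam                           # writes sample.filtered.bam.bai
```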
gridss.R: convert GRIDSS VCF breakpoints to BEDPE

```yaml
baseCommand: [Rscript]
inputs:
  input:
    type: File
    inputBinding:
      position: 2
  script:
    type: File
    inputBinding:
      position: 1
    default:
      class: File
      basename: "gridss.R"
      contents: |-
        library(stringr)
        library("VariantAnnotation")
        library("StructuralVariantAnnotation")

        args <- commandArgs(TRUE)
        vcfFile <- args[1]
        bedFile <- paste(sub('\\.vcf.gz$', '', vcfFile), ".bed", sep = "")

        simpleEventType <- function(gr) {
          return(
            ifelse(seqnames(gr) != seqnames(partner(gr)), "Translocation",
            ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "Insertion",
            ifelse(strand(gr) == strand(partner(gr)), "Inversion",
            ifelse(xor(start(gr) < start(partner(gr)), strand(gr) == "-"),
                   "DEL", "DUP")))))
        }

        options(scipen = 999)
        full_vcf <- readVcf(vcfFile)
        gr <- breakpointRanges(full_vcf)

        bedpe <- data.frame(
          chrom1 = seqnames(gr),
          start1 = start(gr) - 1,
          end1 = end(gr),
          chrom2 = seqnames(partner(gr)),
          start2 = start(partner(gr)) - 1,
          end2 = end(partner(gr)),
          type = simpleEventType(gr),
          svLen = gr$svLen,
          insLen = gr$insLen,
          name = names(gr),
          score = gr$QUAL,
          strand1 = strand(gr),
          strand2 = strand(partner(gr))
        )
        bedpe <- bedpe[str_detect(bedpe$name, "gridss.+o"),]
        write.table(bedpe, bedFile, quote = FALSE, sep = '\t',
                    row.names = FALSE, col.names = FALSE)
```
svtools vcftobedpe

```yaml
baseCommand: [svtools, vcftobedpe]
arguments:
  - position: 2
    prefix: '-o'
    valueFrom: $(inputs.input.nameroot.replace('vcf', 'bed'))
inputs:
  input:
    type: File
    inputBinding:
      position: 1
      prefix: '-i'
```
Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://gitlab.bsc.es/lrodrig1/structuralvariants_poc/-/tree/1.1.3/structuralvariants/cwl
Name: cnv_pipeline
Version: 1.1.3
Downloaded: 0
Copyright: Public Domain
License: Boost Software License 1.0