BioWorkflows: Bioinformatic pipeline for identifying dsDNA breaks by marker based incorporation, such as breaks induced by designer nucleases like Cas9.

# Align the query sequences to the genome with blat, sending blat's
# stdout/stderr to the log. When the query file contains zero lines, blat is
# skipped: an empty output file is created and a notice is written to the log.
shell:
  """
  if [[ $(cat {input.seq} | wc -l) -eq 0 ]]
      then
          touch {output}
          echo 'Empty input sequences for {input.seq}.' > {log} 2>&1
      else
          blat {input.genome} {input.seq} {output} \
              {params} > {log} 2>&1
  fi
  """

SnakeMake From line 16 of rules/align.blat.rules

# Compress the rule's input file(s) with gzip.
shell: "gzip {input}"

SnakeMake From line 33 of rules/align.blat.rules

# Run generate_ref_genome.R from ROOT_DIR/tools/rscripts, passing the rule's
# params followed by the output path.
shell: 
  "Rscript {ROOT_DIR}/tools/rscripts/generate_ref_genome.R {params} {output}"

SnakeMake From line 41 of rules/align.blat.rules

# Run generate_ref_genome.R from ROOT_DIR/tools/rscripts, passing the rule's
# params followed by the output path.
shell: 
  "Rscript {ROOT_DIR}/tools/rscripts/generate_ref_genome.R {params} {output}"

SnakeMake From line 10 of rules/align.bwa.rules

# Build a BWA index of the input with the bwtsw algorithm; {params} supplies
# the index prefix given to -p.
shell:
  """
  bwa index -p {params} -a bwtsw {input}
  """

SnakeMake BWA From line 24 of rules/align.bwa.rules

# Align the read pair with bwa mem (R2 is passed before R1) and pipe the result
# into samtools view, which writes BAM to {output}. The "> {log} 2>&1"
# redirection is attached to the samtools command only, so the log receives
# samtools' stdout/stderr but not bwa's stderr.
shell:
  """
  bwa mem {params.bwa} {params.index} {input.R2} {input.R1} | \
      samtools view -b -o {output} > {log} 2>&1
  """

SnakeMake SAMtools BWA From line 48 of rules/align.bwa.rules

# Run generate_ref_genome.R from ROOT_DIR/tools/rscripts, passing the rule's
# params followed by the output path.
shell: 
  "Rscript {ROOT_DIR}/tools/rscripts/generate_ref_genome.R {params} {output}"

SnakeMake From line 10 of rules/align.bwa.umi_alt_method.rules

# Build a BWA index of the input with the bwtsw algorithm; {params} supplies
# the index prefix given to -p.
shell:
  """
  bwa index -p {params} -a bwtsw {input}
  """

SnakeMake BWA From line 24 of rules/align.bwa.umi_alt_method.rules

# Align the read pair with bwa mem (R1 is passed before R2) and pipe the result
# into samtools view, which writes BAM to {output}. The "> {log} 2>&1"
# redirection is attached to the samtools command only, so the log receives
# samtools' stdout/stderr but not bwa's stderr.
shell:
  """
  bwa mem {params.bwa} {params.index} {input.R1} {input.R2} | \
      samtools view -b -o {output} > {log} 2>&1
  """

SnakeMake SAMtools BWA From line 48 of rules/align.bwa.umi_alt_method.rules

# Run the R tool named in params with the data directory (-d) and the output
# path (-o).
shell: 
  """
  Rscript {params.tool} -d {params.data_dir} -o {output}
  """

SnakeMake From line 46 of rules/arch.rules

# Run the R tool named in params on the input (-f), writing to the output (-o).
shell: "Rscript {params.tool} -f {input} -o {output}"

SnakeMake From line 58 of rules/arch.rules

# Run the R tool named in params with the core (-r), eval (-e) and site (-i)
# inputs plus the config file (-c); the tool's stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} -r {input.core} -e {input.eval} -i {input.site} \
      -o {output} -c {params.config} > {log} 2>&1
  """

SnakeMake From line 72 of rules/arch.rules

# Run the R tool named in params with the data directory (-d) and the output
# path (-o).
shell: 
  """
  Rscript {params.tool} -d {params.data_dir} -o {output}
  """

SnakeMake From line 46 of rules/arch.umi_alt_method.rules

# Run the R tool named in params on the input (-f), writing to the output (-o).
shell: "Rscript {params.tool} -f {input} -o {output}"

SnakeMake From line 58 of rules/arch.umi_alt_method.rules

# Run the R tool named in params with the core (-r), eval (-e) and site (-i)
# inputs plus the config file (-c); the tool's stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} -r {input.core} -e {input.eval} -i {input.site} \
      -o {output} -c {params.config} > {log} 2>&1
  """

SnakeMake From line 72 of rules/arch.umi_alt_method.rules

# Run the binning R tool on the input, passing the output directory (-o), the
# number of bins (-b), the fill level (-l), --compress and the read-name
# pattern; the tool's stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {params.outdir} \
    -b {params.bins} -l {params.level} --compress \
    --readNamePattern {params.readNamePatternArg} > {log} 2>&1
  """

SnakeMake From line 25 of rules/binning.rules

# Run the R tool on the input, writing the consolidated output (-o), the key
# file (-k) and the stat file (--stat); stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output.consol} -k {output.key} \
    --stat {output.stat} > {log} 2>&1
  """

SnakeMake From line 17 of rules/consol.rules

# Stub step: only create (or update the timestamp of) the stat output file.
shell: "touch {output.stat}"

SnakeMake From line 9 of rules/consol_stub.rules

# Run the demultiplexing R tool with the sample info (-m), the two read files,
# the two index files and the barcode settings (bc1/bc2 with their lengths and
# mismatch allowances, plus maxN). Output is written under
# RUN_DIR/process_data/demulti, using {threads} cores (-c), with compressed
# output; the tool's stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} -m {input.sampleInfo} \
    --read1 {input.R1} --read2 {input.R2} \
    --idx1 {input.I1} --idx2 {input.I2} \
    --bc1 {params.bc1} --bc1Len {params.bc1Len} \
    --bc2 {params.bc2} --bc2Len {params.bc2Len} \
    --bc1Mis {params.bc1Mis} --bc2Mis {params.bc2Mis} --maxN {params.maxN} \
    -o {RUN_DIR}/process_data/demulti --stat {params.statName} -c {threads} \
    --readNamePattern {params.readNamePatternArg} --compress > {log} 2>&1
  """

SnakeMake From line 42 of rules/demulti.rules

# Move the input file to the output path.
shell: "mv {input} {output}"

SnakeMake From line 59 of rules/demulti.rules

# Run the R tool on the R1/R2 pair, writing the paired outputs (-o) and a stat
# file with compressed output; the tool's stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input.R1} {input.R2} -o {output.R1} {output.R2} \
    --readNamePattern {params.readNamePatternArg} \
    --stat {output.stat} --compress > {log} 2>&1
  """

SnakeMake From line 19 of rules/filt.rules

# Concatenate the unique-site CSV files found in the {params} directory into a
# single output: first the header line (collapsed with uniq), then the data
# rows (everything after line 1) of each file.
# The loop iterates over the glob directly instead of parsing `ls` output, and
# quotes the file name, so paths containing whitespace are handled correctly.
shell:
  """
  head -n 1 -q {params}/*.uniq.csv | uniq > {output}
  for UNIQ in {params}/*uniq.csv; do
      tail -n +2 "$UNIQ" >> {output}
  done
  """

SnakeMake From line 15 of rules/process.rules

# Run the R tool with a directory (-d) and a file pattern (-p), writing the
# hits (-o) and stat (-s) outputs; the tool's stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} -d {params.dir} -p {params.pattern} \
    -o {output.hits} -s {output.stat} > {log} 2>&1
  """

SnakeMake From line 39 of rules/process.rules

# Assemble the Rscript command piece by piece: the UMItag (-u) and multihit
# (-m) directories are only added when the matching config switches are set.
run:
  cmd_parts = ["Rscript {params.tool} {input.sites} -o {output.incorp} "]
  if config["UMItags"]:
      cmd_parts.append("-u {params.umitagDir} ")
  if config["recoverMultihits"]:
      cmd_parts.append("-m {params.multiDir} ")
  cmd_parts.append("-c {params.config} --stat {output.stat} ")
  cmd_parts.append("--readNamePattern {params.readNamePatternArg} > {log} 2>&1")
  shell("".join(cmd_parts))

SnakeMake From line 80 of rules/process.rules

# Assemble the Rscript command; the supplemental file (-s) is only added when
# config["suppFile"] is set.
run:
  cmd_parts = ["Rscript {params.tool} {params.config} -o {output.eval}"]
  if config["suppFile"]:
      cmd_parts.append(" -s " + str(SUPPINFO_PATH))
  cmd_parts.append(" --stat {output.stat} > {log} 2>&1")
  shell("".join(cmd_parts))

SnakeMake From line 102 of rules/process.rules

# Run the R tool on the input, writing to the output (-o) and appending the
# extra arguments held in params.supp; stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output} {params.supp} > {log} 2>&1
  """

SnakeMake From line 132 of rules/process.rules

# Run the R tool on the input, writing to the output (-o); stdout/stderr go to
# the log.
shell:
  """
  Rscript {params.tool} {input} -o {output} > {log} 2>&1
  """

SnakeMake From line 147 of rules/process.rules

# Run the R tool on the sample alignments and their key files (R2 is passed
# before R1 in both lists). Writes the unique (-o), chimera, multihit and stat
# outputs, and forwards the reference (-g), alignment-start, percent-identity
# and template-length limits from params; stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input.sampleR2} {input.sampleR1} \
    -k {input.keyR2} {input.keyR1} -o {output.uniq} \
    --chimera {output.chimera} --multihit {output.multihit} -g {params.ref} \
    --maxAlignStart {params.start} --minPercentIdentity {params.pct} \
    --minTempLength {params.minLen} --maxTempLength {params.maxLen} \
    --readNamePattern {params.readNamePatternArg} \
    --stat {output.stat} > {log} 2>&1
  """

SnakeMake From line 27 of rules/quality.blat.rules

# Run the R tool on the sample alignments and their key files (R1 is passed
# before R2 in both lists). Writes the unique (-o), chimera, multihit and stat
# outputs, and forwards the reference (-g), alignment-start, percent-identity
# and template-length limits from params; stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input.sampleR1} {input.sampleR2} \
    -k {input.keyR1} {input.keyR2} -o {output.uniq} \
    --chimera {output.chimera} --multihit {output.multihit} -g {params.ref} \
    --maxAlignStart {params.start} --minPercentIdentity {params.pct} \
    --minTempLength {params.minLen} --maxTempLength {params.maxLen} \
    --readNamePattern {params.readNamePatternArg} \
    --stat {output.stat} > {log} 2>&1
  """

SnakeMake From line 27 of rules/quality.blat.umi_alt_method.rules

# Sort the input alignment file with samtools, writing to the output path.
shell: "samtools sort {input} -o {output}"

SnakeMake SAMtools From line 9 of rules/quality.sam.rules

# Create a BAI-format index (-b) for the input, written to the output path.
shell: "samtools index -b {input} {output}"

SnakeMake SAMtools From line 16 of rules/quality.sam.rules

# Run the R tool on the BAM file and its index. Writes the unique (-o),
# chimera, multihit and stat outputs, and forwards the reference (-g),
# alignment-start, percent-identity and template-length limits from params;
# stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input.bam} {input.bai} \
    -o {output.uniq} --chimera {output.chimera} --multihit {output.multihit} \
    -g {params.ref} --maxAlignStart {params.start} \
    --minPercentIdentity {params.pct} --minTempLength {params.minLen} \
    --maxTempLength {params.maxLen} \
    --readNamePattern {params.readNamePatternArg} \
    --stat {output.stat} > {log} 2>&1
  """

SnakeMake From line 39 of rules/quality.sam.rules

# Run the R tool on the reads input, writing the data (--output) and stat
# outputs with compression enabled; stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input.reads} --output {output.data} \
    --stat {output.stat} --readNamePattern {params.readNamePatternArg} \
    --compress > {log} 2>&1
  """

SnakeMake From line 23 of rules/skip_demulti.rules

# Keep comma-separated fields 2 and 3 of the input, collapse adjacent duplicate
# lines, and replace the first "reads" on each line with
# "<sample>.demulti,reads" before writing the result to the output.
shell: 
  """
  cat {input} | cut -d , -f 2,3 | uniq | sed 's/reads/{wildcards.sample}.demulti,reads/' > {output}
  """

SnakeMake From line 35 of rules/skip_demulti.rules

# Concatenate the input file(s) into the output file.
shell: 
  """
  cat {input} > {output}
  """

SnakeMake From line 48 of rules/skip_demulti.rules

# Run the trimming R tool on the input, passing the lead (-l) and over (-r)
# params with their mismatch allowances and the over max length; writes the
# trimmed output (-o) and stat file, compressed. stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output.trim} \
    -l {params.lead} --leadMismatch {params.leadMis} \
    -r {params.over} --overMismatch {params.overMis} \
    --overMaxLength {params.overLen} --stat {output.stat} \
    --compress > {log} 2>&1
  """

SnakeMake From line 21 of rules/trim.rules

# Run the trimming R tool on the input, passing the lead (-l) and over (-r)
# params with their mismatch allowances and the over max length; writes the
# trimmed output (-o) and stat file, compressed. stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output.trim} \
    -l {params.lead} --leadMismatch {params.leadMis} \
    -r {params.over} --overMismatch {params.overMis} \
    --overMaxLength {params.overLen} --stat {output.stat} \
    --compress > {log} 2>&1
  """

SnakeMake From line 47 of rules/trim.rules

# Run the trimming R tool on the input with only the lead (-l) param and its
# mismatch allowance, and with --noQualTrimming set; writes the trimmed output
# (-o) and stat file, compressed. stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output.trim} \
    -l {params.lead} --leadMismatch {params.leadMis} \
    --noQualTrimming --stat {output.stat} --compress > {log} 2>&1
  """

SnakeMake From line 70 of rules/trim.rules

# Run the trimming R tool on the input, passing the lead (-l) and over (-r)
# params with their mismatch allowances and the over max length; writes the
# trimmed output (-o) and stat file, compressed. stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output.trim} \
    -l {params.lead} --leadMismatch {params.leadMis} \
    -r {params.over} --overMismatch {params.overMis} \
    --overMaxLength {params.overLen} --stat {output.stat} \
    --compress > {log} 2>&1
  """

SnakeMake From line 21 of rules/trim.umi_alt_method.rules

# Run the trimming R tool on the input, passing the lead (-l) and over (-r)
# params with their mismatch allowances and the over max length; writes the
# trimmed output (-o) and stat file, compressed. stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output.trim} \
    -l {params.lead} --leadMismatch {params.leadMis} \
    -r {params.over} --overMismatch {params.overMis} \
    --overMaxLength {params.overLen} --stat {output.stat} \
    --compress > {log} 2>&1
  """

SnakeMake From line 47 of rules/trim.umi_alt_method.rules

# Run the trimming R tool on the input with only the lead (-l) param and its
# mismatch allowance, and with --noQualTrimming set; writes the trimmed output
# (-o) and stat file, compressed. stdout/stderr go to the log.
shell:
  """
  Rscript {params.tool} {input} -o {output.trim} \
    -l {params.lead} --leadMismatch {params.leadMis} \
    --noQualTrimming --stat {output.stat} --compress > {log} 2>&1
  """

SnakeMake From line 70 of rules/trim.umi_alt_method.rules

# Run the R tool on the input with the params.seq lead sequence (-l) and its
# mismatch allowance, no quality trimming and a minimum sequence length of 0.
# Writes the sequence output (-o), the collected random IDs
# (--collectRandomIDs) and the stat file, compressed; stdout/stderr go to the
# log.
shell:
  """
  Rscript {params.tool} {input} -o {output.seq} \
    -l {params.seq} --leadMismatch {params.mis} --noQualTrimming \
    --minSeqLength 0 --collectRandomIDs {output.umi} --stat {output.stat} \
    --compress > {log} 2>&1
  """

SnakeMake From line 19 of rules/umitag.rules

# Stub step: only create (or update the timestamp of) the stat output file.
shell: "touch {output.stat}"

SnakeMake From line 9 of rules/umitag_stub.rules

# Run the R tool on the input with the params.seq lead sequence (-l) and its
# mismatch allowance, no quality trimming and a minimum sequence length of 0.
# Writes the sequence output (-o), the collected random IDs
# (--collectRandomIDs) and the stat file, compressed; stdout/stderr go to the
# log.
shell:
  """
  Rscript {params.tool} {input} -o {output.seq} \
    -l {params.seq} --leadMismatch {params.mis} --noQualTrimming \
    --minSeqLength 0 --collectRandomIDs {output.umi} --stat {output.stat} \
    --compress > {log} 2>&1
  """

SnakeMake From line 19 of rules/umitag.umi_alt_method.rules

options(stringsAsFactors = FALSE, scipen = 99, width = 180)
suppressMessages(library("magrittr"))
suppressMessages(library("iguideSupport"))

# Set up and gather command line arguments ----
# Declares the command line interface of the script. One positional input
# (uniqSites), two required options (--output, --config) and several optional
# ones. Help-text typos ("couts", "expresion") are corrected here.
parser <- argparse::ArgumentParser(
  description = "Assimilate incorporation data from iGUIDE pipeline.",
  usage = paste(
    "Rscript assimilate_incorp_data.R <uniqSites> -o <output> -c <config>",
    "[-h/--help, -v/--version] [optional args]"
  )
)

parser$add_argument(
  "uniqSites", nargs = 1, type = "character",
  help = paste(
    "Unique sites output from blatCoupleR. The output from an entire run can",
    "be concatenated together as a single input."
  )
)

parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", required = TRUE,
  help = "Output file name in .rds format."
)

parser$add_argument(
  "-c", "--config", nargs = 1, type = "character", required = TRUE,
  help = "Run specific config file in yaml format."
)

# Optional: when omitted the value is NULL and the UMItag section is skipped.
parser$add_argument(
  "-u", "--umitags", nargs = 1, type = "character",
  help = paste(
    "Path to directory with associated fasta files containing read specific",
    "random captured sequences. Directory should contain files with file names",
    "like *.umitags.fasta."
  )
)

# Optional: when omitted the value is NULL and multihits are not recovered.
parser$add_argument(
  "-m", "--multihits", nargs = 1, type = "character",
  help = paste(
    "Path to directory with associated multihit files (*.multihits.rds) as",
    "produced by coupling alignment output files."
  )
)

# The FALSE default is the sentinel later tested with `args$stat != FALSE`.
parser$add_argument(
  "--stat", nargs = 1, type = "character", default = FALSE,
  help = paste(
    "File name to be written in output directory of read counts for each",
    "sample. CSV file format. ie. test.stat.csv."
  )
)

parser$add_argument(
  "--readNamePattern", nargs = 1, type = "character",
  default = "[\\w\\:\\-\\+]+",
  help = "Regular expression capturing the read name for a given sequence."
)

# The default is the NAME of an environment variable, not a path; it is
# resolved after parsing.
parser$add_argument(
  "--iguide_dir", nargs = 1, type = "character", default = "IGUIDE_DIR",
  help = "iGUIDE install directory path, do not change for normal applications."
)


# Set arguments with parser ----
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# --iguide_dir is either an existing directory or the name of an environment
# variable that holds the install path.
root_dir <- if( dir.exists(args$iguide_dir) ){
  args$iguide_dir
}else{
  Sys.getenv(args$iguide_dir)
}

# Abort when neither interpretation yields a real directory.
if( !dir.exists(root_dir) ){
  stop(paste0("\n  Cannot find install path to iGUIDE: ", root_dir, ".\n"))
}

args$iguide_dir <- root_dir


# Summarize the parsed arguments as a Variables / Values table. Multi-valued
# arguments are collapsed into one comma separated string.
arg_values <- vapply(
  args,
  function(arg) paste(arg, collapse = ", "),
  character(1),
  USE.NAMES = FALSE
)

input_table <- data.frame(
  "Variables" = paste0(names(args), " :"),
  "Values" = arg_values
)

# Keep only the rows to be logged, in a fixed display order.
logged_rows <- c(
  "uniqSites :", "output :", "config :", "umitags :", "multihits :", "stat :",
  "iguide_dir :"
)

input_table <- input_table[match(logged_rows, input_table$Variables), ]

## Remove output file(s) if existing
output_files <- c(args$output)

if( args$stat != FALSE ){
  output_files <- c(output_files, args$stat)
}

if( any(file.exists(output_files)) ){
  null <- lapply(output_files, unlink)
}

# Log inputs
cat("\nAssimilate Inputs:\n")
print(x = data.frame(input_table), right = FALSE, row.names = FALSE)


# Get versioning ----
# The software version is read from the ".version" file in the install
# directory (read without a header row and coerced to character).
soft_version <- as.character(read.delim(
  file = file.path(root_dir, ".version"), header = FALSE))

# The build version is taken from file names in <root_dir>/etc: names matching
# the "build.b..." pattern are kept and the "b<major>.<minor>.<patch>" part is
# extracted from each.
build_version <- list.files(file.path(root_dir, "etc")) %>%
  grep(pattern = "build.b[0-9\\.]+.*", x = ., value = TRUE) %>%
  stringr::str_extract(pattern = "b[0-9]+\\.[0-9]+\\.[0-9]+")


# Inputs and parameters ----
# Run parameters and sample parameters, loaded from the yaml file given with
# -c/--config.
config <- yaml::yaml.load_file(args$config)

## These parameters dictate part of the following analysis if multihit
## alignments are to be considered in the analysis.
## NOTE(review): none of the three values is referenced again in the visible
## part of this script - confirm whether they are still needed.

upstream_dist <- config$upstreamDist
downstream_dist <- config$downstreamDist
pile_up_min <- config$pileUpMin

# Load reference genome ----
## Load a reference genome from a fasta file or a BSGenome reference.
## Script stops if ref genome is not available
##
## A value containing a literal ".fa" (this also covers ".fasta" and ".fastq")
## is treated as a sequence file; anything else is matched against the
## installed BSgenome packages. The dot is escaped so that package names which
## merely contain "fa" after some other character (for example
## "BSgenome.Mfascicularis...") are not mistaken for file paths.

if( grepl("\\.fa", config$Ref_Genome) ){

  # Look in the install directory first, then at the path exactly as given.
  ref_candidates <- c(
    file.path(args$iguide_dir, config$Ref_Genome),
    config$Ref_Genome
  )

  ref_path <- ref_candidates[file.exists(ref_candidates)][1]

  if( is.na(ref_path) ){
    stop(
      "\n  Specified reference genome file not found: ", config$Ref_Genome, "\n"
    )
  }

  ref_file_type <- ifelse(
    grepl("\\.fastq", config$Ref_Genome), "fastq", "fasta"
  )

  ref_genome <- Biostrings::readDNAStringSet(
    filepath = ref_path,
    format = ref_file_type
  )

}else{

  # The configured name is used as a regular expression against the installed
  # BSgenome package names; exactly one match is required.
  genome <- grep(
    pattern = config$Ref_Genome,
    x = unique(BSgenome::installed.genomes()),
    value = TRUE
  )

  if( length(genome) == 0 ){

    cat("\nInstalled genomes include:\n")
    print(unique(BSgenome::installed.genomes()))
    cat("\n  Selected reference genome not in list.")
    stop("\n  Genome not available.\n")

  }else if( length(genome) > 1 ){

    cat("\nInstalled genomes include:\n")
    print(unique(BSgenome::installed.genomes()))
    cat(
      "\n  Please be more specific about reference genome.",
      "Multiple matches to input."
    )
    stop("\n  Multiple genomes requested.\n")

  }

  # Attach the package, then fetch the genome object that shares its name.
  suppressMessages(library(genome, character.only = TRUE))

  ref_genome <- get(genome)

}


# Load input data ----
## Unique sites ----
## This object is the alignment positions for the sequences / reads that only 
## aligned to a single location on the reference genome.
## The file is read as a plain data.frame (not a data.table), with strings
## kept as character.

reads <- data.table::fread(
  input = args$uniqSites, data.table = FALSE, stringsAsFactors = FALSE
)


# Multihits if requested ----
## Multihits are alignments that legitimately appear in multiple locations
## across the reference genome. These can be more difficult to interpret but are
## an option for this software. The user should be familiar and cautious of
## alignment artifacts if using multihit data.
##
## When -m/--multihits is given, multihit alignments that pile up with a unique
## alignment of the same sample are appended to `reads`. Each recovered
## alignment carries a fractional `contrib` of 1 / (alignments sharing its read
## ID); unique alignments always contribute 1.

if( all(!is.null(args$multihits)) ){

  uniq_reads <- GenomicRanges::makeGRangesFromDataFrame(
    df = reads,
    keep.extra.columns = TRUE,
    seqinfo = GenomeInfoDb::seqinfo(ref_genome)
  )

  # List the directory, then keep only the multihit files. The original code
  # filtered through a misspelled, never-assigned variable, so this branch
  # stopped with "object not found".
  multihit_files <- list.files(path = args$multihits, full.names = TRUE)

  multihit_files <- multihit_files[
    stringr::str_detect(multihit_files, ".multihits.rds")
  ]

  multi_reads <- unlist(GenomicRanges::GRangesList(lapply(
    multihit_files,
    function(x){

      multi <- readRDS(x)
      GenomeInfoDb::seqinfo(multi$unclustered_multihits) <-
        GenomeInfoDb::seqinfo(ref_genome)

      # Give the multihits the same metadata columns as the unique reads so
      # the two sets can be concatenated.
      if( length(multi$unclustered_multihits) > 0 ){

        GenomicRanges::mcols(multi$unclustered_multihits) <-
          GenomicRanges::mcols(multi$unclustered_multihits)[
            ,c(names(GenomicRanges::mcols(uniq_reads)))
        ]

      }else{

        # No multihits in this file: borrow a zero-row copy of the columns.
        GenomicRanges::mcols(multi$unclustered_multihits) <-
          GenomicRanges::mcols(uniq_reads)[
            0, c(names(GenomicRanges::mcols(uniq_reads)))
        ]

      }

      multi$unclustered_multihits

    }
  )))

  comb_reads <- c(uniq_reads, multi_reads)

  GenomicRanges::mcols(comb_reads)$type <- rep(
    c("uniq", "multi"), c(length(uniq_reads), length(multi_reads))
  )

  GenomicRanges::mcols(comb_reads)$clus.id <- pileupCluster(
    gr = comb_reads,
    grouping = "sampleName",
    return = "ID"
  )

  filt_multi_reads <- dplyr::bind_rows(lapply(
    split(comb_reads, comb_reads$sampleName),
    function(x){

      # Keep only multihits whose cluster also contains a unique alignment.
      uniq_id <- unique(x$clus.id[x$type == "uniq"])
      multi_id <- unique(x$clus.id[x$type == "multi"])
      y <- x[x$type == "multi" & x$clus.id %in% intersect(uniq_id, multi_id)]
      GenomicRanges::mcols(y)$clus.id <- NULL

      if( length(y) > 0 ){

        contrib_amt <- 1 / table(GenomicRanges::mcols(y)$ID)
        GenomicRanges::mcols(y)$contrib <-
          as.numeric(contrib_amt[GenomicRanges::mcols(y)$ID])

      }

      as.data.frame(y, row.names = NULL) %>%
        dplyr::mutate(
          seqnames = as.character(seqnames),
          strand = as.character(strand)
        )

    }
  ))

  reads <- dplyr::mutate(reads, type = "uniq", contrib = 1) %>%
    dplyr::bind_rows(., filt_multi_reads)

}else{

  reads <- dplyr::mutate(reads, type = "uniq", contrib = 1)

}

# Report, for the log, how many aligned reads belong to each specimen. The
# specimen is the leading run of word characters in the sample name.
cat("\nTabulation of aligned reads per specimen:\n")
specimen_counts <- table(stringr::str_extract(reads$sampleName, "[\\w]+"))

specimen_summary <- data.frame(
  "Specimen" = names(specimen_counts),
  "Aligned_Reads" = format(as.numeric(specimen_counts), big.mark = ",")
)

print(specimen_summary, right = FALSE, row.names = FALSE)

# Both objects are only needed for the printout.
rm(specimen_counts, specimen_summary)

# Umitags or captured random sequences ----
## Unique molecular index tags, or UMItags, are random sequences appended to the
## index 2 read. They are 8 or so nucleotides and are combined with the terminal
## breakpoint sequence to be potentially used for a higher dynamic range
## abundance measure. While ideal in theory, practice has identified these
## sequences skewing with read counts and an overabundance of sharing of the
## random sequence between different breakpoints. UMItag based abundances
## should be interpreted with caution as they are prone / susceptible to PCR
## artifacts.

# Runs only when -u/--umitags was supplied (the argument is NULL otherwise).
if( !is.null(args$umitags) ){

  umitag_files <- list.files(path = args$umitags, full.names = TRUE)

  # Keep only the UMItag fasta files from the directory listing.
  umitag_files <- umitag_files[
    stringr::str_detect(umitag_files, ".umitags.fasta")
  ]

  # Read every file and combine the results into a single object.
  umitags <- lapply(umitag_files, ShortRead::readFasta)
  umitags <- serialAppendS4(umitags)
  umitag_read_ids <- stringr::str_extract(
    as.character(ShortRead::id(umitags)),
    args$readNamePattern
  )

  # Attach each read's UMItag sequence by read ID; reads without a matching
  # ID get NA.
  reads$umitag <- as.character(ShortRead::sread(umitags))[
    match(reads$ID, umitag_read_ids)
  ]

}

# Generate stats if requested ----
## If requested, generate stats from the analysis for qc.
## Per sample, three counts are written: distinct read IDs ("reads"), distinct
## (seqnames, start, end, strand) combinations ("aligns") and distinct
## (seqnames, strand, position) combinations ("loci"), where position is the
## start for "+" strand alignments and the end otherwise.

if( args$stat != FALSE ){

  stat <- reads %>% 
    dplyr::group_by(sampleName) %>% 
    dplyr::summarise(
      reads = dplyr::n_distinct(ID),
      aligns = dplyr::n_distinct(seqnames, start, end, strand),
      loci = dplyr::n_distinct(
        seqnames, strand, ifelse(strand == "+", start, end)
      )
    ) %>% 
    # Reshape to long format: one row per sample and count type.
    tidyr::gather(key = "type", value = "value", -sampleName)

  # Written as comma separated values without a header row and without quotes.
  write.table(
    x = stat, file = args$stat, 
    sep = ",", row.names = FALSE, 
    col.names = FALSE, quote = FALSE
  )

}

# Output data ----
## rds file that can be read into evaluation or reports or loaded into a
## database with some additional scripting.
## The two pairing key columns are dropped from the reads before saving.
fmt_reads <- reads %>%
  dplyr::select(-lociPairKey, -readPairKey)

# Bundle the versions and the run config with the reads so the output file is
# self-describing.
output_file <- list(
  "soft_version" = soft_version,
  "build_version" = build_version,
  "config" = config,
  "reads" = fmt_reads
)

saveRDS(output_file, file = args$output)

# Confirm that every expected output file (the rds, plus the stat file when
# requested) exists before reporting success.
if( all(sapply(output_files, file.exists)) ){
  message("Successfully completed script.")
}else{
  stop("Check output, it was not detected after assimilating.")
}

q(status = 0)

R magrittr From line 10 of rscripts/assimilate_incorp_data.R

options(stringsAsFactors = FALSE, scipen = 99, width = 120)

# Set up and gather command line arguments ----
# Declares the command line interface: one or more positional sequence files
# plus options for the output directory, bin count, fill level, compression
# and the read name pattern. The parsed values end up in `args`.
parser <- argparse::ArgumentParser(
  description = "Separate sequence files into bins of appropriate size.",
  usage = paste(
    "Rscript bin_seqs.R <seqs> -o <outputDir> [optional args] [-h/--help]"
  )
)

parser$add_argument(
  "seqs", nargs = "+", type = "character",
  help = paste(
    "Path(s) to sequence files to separate into bins. Only read names in first",
    "file will be used for indexing and splitting. Make sure all files have",
    "the same content! Read order will be set by first file. Fasta or Fastq",
    "formats allowed, as well as gzipped compression."
  )
)

parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", default = ".",
  help = "Directory for output files to be written. Default: '.'"
)

parser$add_argument(
  "-b", "--bins", nargs = 1, type = "integer", default = 2L,
  help = "The number of bins to separate files into, default is 2."
)

parser$add_argument(
  "-l", "--level", nargs = 1, type = "integer", default = 0L, 
  help = paste(
    "Fill level for each bin. If specified, then script will fill files to the",
    "specified level with reads before filling the next file, sequentially.",
    "If the total number of reads would fill all bins to their level, then",
    "reads will be evenly distributed across all bins, which is the default",
    "behavior. Default value: 0."
  )
)

# Flag: TRUE only when --compress is present on the command line.
parser$add_argument(
  "--compress", action = "store_true", 
  help = paste(
    "Output sequence file(s) in gzipped format. Otherwise this relies on the",
    "input format."
  )
)

parser$add_argument(
  "--readNamePattern", nargs = 1, type = "character", 
  default = "[\\w\\:\\-\\+]+", 
  help = paste(
    "Regular expression for pattern matching read names. Should not contain", 
    "R1/R2/I1/I2 specific components. Default is [\\w\\:\\-\\+]+"
  )
)


args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# Create output directory if not currently available ----
# A freshly created directory is also converted to its normalized path; a
# directory that already exists is used exactly as given.
output_dir_missing <- !dir.exists(args$output)

if( output_dir_missing ){

  dir.create(args$output)

  if( !dir.exists(args$output) ){
    stop("Cannot create output folder.\n")
  }

  args$output <- normalizePath(args$output)

}

# Load sequence files
# The reader is chosen from the file name: names matching the fastq / fq
# patterns are read as fastq, everything else as fasta.
seq_files <- lapply(args$seqs, function(seq_path){

  looks_like_fastq <- stringr::str_detect(seq_path, ".fastq") |
    stringr::str_detect(seq_path, ".fq")

  read_seqs <- if( looks_like_fastq ){
    ShortRead::readFastq
  }else{
    ShortRead::readFasta
  }

  read_seqs(seq_path)

})


# Score indices from first sequence for binning input sequences
# Result: `seq_idx`, a list of integer index vectors (one element per bin)
# into the first sequence file.
if( length(seq_files[[1]]) <= args$bins * args$level ){

  # The reads fit within bins * level: fill bins sequentially, `level` reads
  # at a time.
  seq_idx <- split(
      seq_along(seq_files[[1]]),
      ceiling(seq_along(seq_files[[1]]) / args$level)
  )

  # Pad with empty index vectors so there is one element per requested bin.
  if( length(seq_idx) < args$bins ){
    seq_idx <- c(
      seq_idx, 
      split(integer(), seq(length(seq_idx)+1, args$bins, 1))
    )
  }

}else{

  # More reads than the fill level allows (always the case for a non-empty
  # input with the default level of 0): spread the reads evenly over all bins.
  seq_idx <- split(
    seq_along(seq_files[[1]]), 
    ceiling(
      seq_along(seq_files[[1]])/(length(seq_files[[1]])/as.numeric(args$bins))
    )
  )

}

# Extract the read name of every sequence in the first file using the
# user-supplied pattern.
seq_names <- stringr::str_extract(
  as.character(ShortRead::id(seq_files[[1]])),
  args$readNamePattern
)

# Turn the per-bin index vectors into per-bin vectors of read names.
seq_name_list <- lapply(seq_idx, function(i) seq_names[i])

# Split and write sequences to output directory
# Reduce every input path to its final "/"-separated component.
path_components <- strsplit(args$seqs, "/")

output_files <- unlist(lapply(
  path_components,
  function(components) components[length(components)]
))

# Compress when asked to, or when any input name ends with the gz pattern.
args$compress <- any(stringr::str_detect(output_files, ".gz$")) | args$compress

# For each input file build one output name per bin:
# <lead>.bin<zero padded bin number>.<ext>, plus ".gz" when compressing.
expanded_output_file_names <- lapply(output_files, function(file_name){

  uncompressed_name <- stringr::str_remove(file_name, ".gz$")

  name_parts <- unlist(strsplit(uncompressed_name, "\\."))
  n_parts <- length(name_parts)
  lead <- paste(name_parts[-n_parts], collapse = ".")
  ext <- name_parts[n_parts]

  bin_labels <- stringr::str_pad(
    seq_len(args$bins), nchar(args$bins), pad = 0
  )

  bin_file_names <- paste0(lead, ".bin", bin_labels, ".", ext)

  if( args$compress ){
    bin_file_names <- paste0(bin_file_names, ".gz")
  }

  bin_file_names

})

# Write output files
# Outer loop: one iteration per input file (paired with its list of bin file
# names). Inner loop: one iteration per bin (paired with the read names that
# belong in it). Reads are located in each file by read name, so every file is
# split by the names taken from the first file.
null <- mapply(
  function(seqs, outputs, idx_names){

    # The read names of this file do not change between bins; extract once.
    seq_read_names <- stringr::str_extract(
      as.character(ShortRead::id(seqs)), args$readNamePattern
    )

    null <- mapply(
      function(idx, outfile){

        matched_idx <- match(idx, seq_read_names)

        # Two names of the bin resolving to the same read means the pattern
        # does not identify reads uniquely. The comparison has to happen
        # inside any(): `any(x) > 1` compares a single TRUE/FALSE with 1 and
        # can never be TRUE.
        if( any(table(matched_idx) > 1) ){
          stop("\n  ReadNamePattern is ambiguous, please refine.")
        }

        # Start from a clean file.
        if( file.exists(file.path(args$output, outfile)) ){
          unlink(file.path(args$output, outfile))
        }

        if( stringr::str_detect(outfile, ".fastq") |
            stringr::str_detect(outfile, ".fq") ){

          ShortRead::writeFastq(
            object = seqs[matched_idx],
            file = file.path(args$output, outfile),
            compress = args$compress
          )

        }else{

          ShortRead::writeFasta(
            object = seqs[matched_idx],
            file = file.path(args$output, outfile),
            compress = args$compress
          )

        }

      },
      idx = idx_names,
      outfile = outputs
    )

  },
  seqs = seq_files,
  outputs = expanded_output_file_names,
  MoreArgs = list(idx_names = seq_name_list)
)

# Check for output files
# Every expected bin file must exist; otherwise the script stops with an error.
expected_paths <- file.path(args$output, unlist(expanded_output_file_names))

if( !all(file.exists(expected_paths)) ){

  stop("\n  Could not confirm existance of all output files.\n")

}

cat(
  "\nAll files written to output directory:\n ",
  paste(expected_paths, collapse = "\n  "),
  "\n"
)

q(save = "no", status = 0)

R From line 9 of rscripts/bin_seqs.R

options(stringsAsFactors = FALSE, scipen = 99, warn = -1, window = 999)
suppressMessages(library("magrittr"))

# Set up and gather command line arguments ----
# Declares the command line interface: a positional yaml file plus options for
# an output file, verbose logging and the install path. The help-text typo
# ("diagnositc") is corrected.
parser <- argparse::ArgumentParser(
  description = "Test checksums of files from a yaml input.",
  usage = "Rscript tools/rscripts/check_file_digests.R <yaml.input> <options>"
)

parser$add_argument(
  "yaml", nargs = 1, type = "character",
  help = "Yaml containing file paths and checksums (md5). ie. sim.test.yml"
)

# The FALSE default is the sentinel later tested with `args$output != FALSE`.
parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", default = FALSE,
  help = "Output file name .csv, .tsv, or .rds format."
)

parser$add_argument(
  "-v", "--verbose", action = "store_true",
  help = "Turns on diagnostic-based messages."
)

parser$add_argument(
  "--install_path", nargs = 1, type = "character", default = "IGUIDE_DIR",
  help = "iGUIDE install directory path, do not change for normal applications."
)

# Set arguments with parser ----
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# Resolve the iGUIDE install directory. --install_path is honoured instead of
# being overwritten unconditionally: an existing directory is used as given,
# anything else is treated as the name of an environment variable. With the
# default value ("IGUIDE_DIR") this is the same lookup as before; when the
# named variable is unset the IGUIDE_DIR variable is used as a fallback.
if( dir.exists(args$install_path) ){

  root_dir <- args$install_path

}else{

  root_dir <- Sys.getenv(args$install_path)

  if( !nzchar(root_dir) ){
    root_dir <- Sys.getenv("IGUIDE_DIR")
  }

}

args$install_path <- root_dir

# Directory containing this script, derived from the --file= argument that
# Rscript places on the full command line.
code_dir <- dirname(sub(
  pattern = "--file=",
  replacement = "",
  x = grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
))

# Summarize the parsed arguments as a Variables / Values table; multi-valued
# arguments are collapsed into one comma separated string.
input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = sapply(
    seq_along(args), 
    function(i) paste(args[[i]], collapse = ", ")
  )
)

# Put the rows into a fixed display order.
input_table <- input_table[
  match(
    c("yaml :", "output :", "verbose :", "install_path :"), 
    input_table$Variables
  ),
]

## Log inputs
# Only printed when -v/--verbose was given.
if( args$verbose ){

  cat("List Inputs")
  pander::pandoc.table(
    data.frame(input_table),
    justify = "left", 
    row.names = FALSE,
    style = "simple",
    split.table = Inf
  )

}


# Additional functions ----

#' Read a data file into an R object, choosing the reader from its extension.
#'
#' @param path Path to the file. If it does not exist as given, it is retried
#'   relative to `root`.
#' @param root Directory used as a fallback prefix for `path`.
#' @return A data.frame for tsv/csv files (optionally gzipped), a DNAStringSet
#'   for fasta/fastq files, or the stored object for rds files. When an rds
#'   file holds a plain list, the first data.frame element of that list is
#'   returned instead.
#' @details Stops when the file cannot be found, when the extension (or the
#'   extension underneath ".gz") is not supported, or when an rds list holds
#'   no data.frame.
readFile <- function(path, root){

  if( !file.exists(path) ){
    root_path <- file.path(root, path)
    if( !file.exists(root_path) ){
      stop("Cannot find file:", path)
    }else{
      path <- root_path
    }
  }

  # Read extension from path
  ext <- stringr::str_extract(path, "[\\w]+$")
  supported_ext <- c("tsv", "csv", "gz", "fasta", "fastq", "rds")

  stopifnot( ext %in% supported_ext )

  # Check additional extension if compressed
  if( ext == "gz" ){
    ext2 <- stringr::str_extract(path, "[\\w]+.gz")
    ext2 <- gsub(".gz", "", ext2)
    stopifnot( ext2 %in% supported_ext )
  }else{
    ext2 <- NA
  }

  exts <- c(ext, ext2)
  exts <- exts[!is.na(exts)]

  # Read in methods based on inputs.
  if( any(exts %in% c("tsv", "csv")) ){

    if( any(exts == "csv") ){
      delim <- ","
    }else{
      delim <- "\t"
    }

    if( ext == "gz" ){
      return(read.table(gzfile(path), header = TRUE, sep = delim))
    }else{
      return(read.table(path, header = TRUE, sep = delim))
    }

  }else if( any(stringr::str_detect(exts, "fast")) ){

    return(Biostrings::readDNAStringSet(path))

  }else{

    rds_import <- readRDS(path)

    # inherits() yields a single TRUE/FALSE even for objects with several
    # classes, where `class(x) == "list"` would hand `if` a vector.
    if( inherits(rds_import, "list") ){

      is_df <- vapply(rds_import, is.data.frame, logical(1))

      if( !any(is_df) ){
        stop("No data.frame found in list stored in: ", path)
      }

      # Select one element explicitly; indexing `[[` with several positions
      # would descend recursively into the first match.
      return(rds_import[[which(is_df)[1]]])

    }else{

      return(rds_import)

    }

  }

}

# Load inputs ----
# Each top-level yaml entry is read for its "path", "md5" and "name" fields.
config <- yaml::yaml.load_file(args$yaml)
paths <- lapply(config, "[[", "path")
data_objs <- lapply(paths, readFile, root = args$install_path)

# Check digests ----
# The digest is computed on the loaded R object (as returned by readFile), not
# on the raw bytes of the file.
test_digests <- sapply(data_objs, digest::digest)
check_digests <- sapply(config, "[[", "md5")

# One row per file: expected digest, computed digest and pass / FAIL outcome.
df <- data.frame(
  "file_name" = sapply(config, "[[", "name"),
  "md5_standard" = check_digests,
  "md5_tested" = test_digests,
  "outcome" = ifelse(test_digests == check_digests, "pass", "FAIL")
)

# Log output if requested ----
# The outcome table is only printed when -v/--verbose was given.
if( args$verbose ){

  cat("\nList of Outcomes")
  pander::pandoc.table(
    df,
    justify = "left", 
    row.names = FALSE,
    style = "simple",
    split.table = Inf
  )

}

# Write output file if requested ----
# The format follows the output file name: tab separated for tsv, comma
# separated for csv, serialized for rds / RData. The tsv and csv branches were
# identical and both wrote write.table's default single-space separator; each
# now sets the separator its extension implies. Names matching none of the
# patterns are silently not written.
if( args$output != FALSE ){
  if( stringr::str_detect(args$output, ".tsv$") ){
    write.table(
      df, file = args$output, sep = "\t", quote = FALSE, row.names = FALSE
    )
  }else if( stringr::str_detect(args$output, ".csv$") ){
    write.table(
      df, file = args$output, sep = ",", quote = FALSE, row.names = FALSE
    )
  }else if( stringr::str_detect(args$output, ".rds$") ){
    saveRDS(df, file = args$output)
  }else if( stringr::str_detect(args$output, ".RData$") ){
    save(df, file = args$output)
  }
}

# Finish up and close out ----
# Exit status 0 when every digest matched, 1 otherwise.
exit_status <- if( all(df$outcome == "pass") ) 0 else 1
q(save = "no", status = exit_status)

R magrittr From line 3 of rscripts/check_file_digests.R

options(stringsAsFactors = FALSE, unzip = "internal")

# Split every command line argument on spaces so that several flags passed as
# one quoted string are still seen individually.
cmd_args <- unlist(strsplit(c("", commandArgs(trailingOnly = TRUE)), " "))


# Set arguments ----
# Each flag is detected by pattern matching against the split arguments.
cran_install <- any(grepl("--cran$", cmd_args, perl = TRUE))

mirror_install <- any(grepl("--cran_mirror$", cmd_args, perl = TRUE))

# The mirror URL is the argument that directly follows --cran_mirror.
mirror_url <- cmd_args[
  which(grepl("--cran_mirror$", cmd_args, perl = TRUE)) + 1
]

within_conda <- any(grepl("--conda$", cmd_args, perl = TRUE)) 

# NOTE(review): unanchored pattern - any argument that contains "-q" anywhere
# switches quiet mode on.
quiet <- any(grepl("-q", cmd_args))


# Check installed packages for dependencies ----
r_packs <- c(
  "argparse", "data.table", "devtools", "digest", "igraph", "ggforce",
  "knitr", "magrittr", "Matrix", "pander", "RColorBrewer", "rmarkdown",
  "scales", "tidyverse", "yaml")

bioc_packs <- c(
  "BiocGenerics", "Biostrings", "BSgenome", "BSgenome.Hsapiens.UCSC.hg38",
  "GenomicRanges", "hiAnnotator", "IRanges", "Rsamtools", "ShortRead"
)

packs <- c(r_packs, bioc_packs)

present <- packs %in% row.names(installed.packages())

if( !quiet ){
  print(data.frame(row.names = packs, "Installed" = present))
}

# Check-only mode: when neither --cran nor --cran_mirror was given, verify that
# everything is installed and quit. The condition previously used `|`, which
# required BOTH flags before anything was installed and made the
# getOption("repos") fallback further down unreachable.
if( !cran_install & !mirror_install ){
  stopifnot(all(present))
  q()
}


# Install from CRAN or from CRAN mirror ----

# With --conda, restrict the library search path to the entries that match the
# conda environment pattern.
if( within_conda ){

  .libPaths(
    new = grep(
      pattern = "conda./envs/", x = .libPaths(), 
      perl = TRUE, value = TRUE
    )
  )

}

# Use the mirror URL when one was requested, otherwise the session's
# configured repositories.
if( mirror_install ){
  repo <- mirror_url
}else{
  repo <- getOption("repos")
}

# Only CRAN packages that are still missing are installed.
r_packs_to_get <- r_packs[!r_packs %in% row.names(installed.packages())]

if( length(r_packs_to_get) > 0 ){

  install.packages(
    r_packs_to_get, 
    repos = repo, 
    dependencies = c("Depends", "Imports"),
    quiet = TRUE
  )

}


# Install from BioConductor ----
# Bioconductor packages still missing at this point.
installed_after_cran <- row.names(installed.packages())
bioc_packs_to_get <- bioc_packs[!(bioc_packs %in% installed_after_cran)]

if( length(bioc_packs_to_get) != 0 ){

  # Sourcing the remote script defines biocLite() in this session.
  suppressMessages(source("https://bioconductor.org/biocLite.R"))

  biocLite(
    bioc_packs_to_get,
    suppressUpdates = TRUE, 
    ask = FALSE,
    siteRepos = repo
  )

}


# Check for installed packages again and close out
# Re-query the installed packages so the report and the final assertion
# reflect whatever the installation steps above just added, rather than the
# state recorded before installing.
present <- packs %in% row.names(installed.packages())

if( !quiet ){
  print(data.frame(row.names = packs, "Installed" = present))
}

# Fail (non-zero exit under Rscript) if any required package is still missing.
stopifnot(all(present))

q()

R Biostrings From line 12 of rscripts/check_for_required_packages.R

# Run the iguideSupport package tests and exit with the number of tests
# whose first recorded result message is not "success".
test_output <- devtools::test(pkg = "tools/iguideSupport")

first_result_is_success <- sapply(
  seq_along(test_output), 
  function(idx){
    test_output[[idx]]$results[[1]]$message
  }
) == "success"

num_success <- sum(first_result_is_success)

num_failed <- length(test_output) - num_success

q(save = "no", status = num_failed)

R From line 3 of rscripts/check_iguideSupport.R

options(stringsAsFactors = FALSE, scipen = 99)

# Capture commandline files
# One or more package names are taken as positional arguments.
parser <- argparse::ArgumentParser(
  description = "Script to check for an installed package.",
  usage = "Rscript tools/rscripts/check_pkgs.R <pkgs>"
)

parser$add_argument(
  "pkg", 
  nargs = "+", 
  type = "character", 
  default = "NA",
  help = "Package(s) name."
)

args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# Requested packages that are absent from the installed set.
requested_pkgs <- args$pkg
missing_pkgs <- requested_pkgs[
  !requested_pkgs %in% rownames(installed.packages())
]

# Everything present: exit cleanly.
if( length(missing_pkgs) == 0 ){
  q(save = "no", status = 0)
}

# Otherwise list the missing packages and exit with status 1.
cat(
  " Packages not installed:\n  ", 
  paste(missing_pkgs, collapse = "\n   "), 
  "\n"
)

q(save = "no", status = 1)

R From line 4 of rscripts/check_pkgs.R

# Session options: keep strings as characters, avoid scientific notation,
# ignore warnings, and use a wide output width. The width option was
# previously spelled `window`, which is not an option R reads.
options(stringsAsFactors = FALSE, scipen = 99, warn = -1, width = 999)
suppressMessages(library("magrittr"))

# Set up and gather command line arguments ----
# Two positional inputs (run config and truth table) plus optional output,
# verbosity, and install directory settings.
parser <- argparse::ArgumentParser(
  description = "Check accuracy in processing test data set.",
  usage = paste(
    "Rscript tools/rscripts/check_test_accuracy.R <run.config> <test.truth>",
    "<options>"
  )
)

parser$add_argument(
  "run.config", nargs = 1, type = "character",
  help = paste(
    "Yaml config file used to process the run.", 
    "i.e. simulation.config.yml"
  )
)

parser$add_argument(
  "test.truth", nargs = 1, type = "character",
  help = paste(
    "CSV file with the original 'true' data for testing accuracy.",
    "i.e. truth.csv"
  )
)

# Defaults to FALSE, which the script treats as "do not write an output file".
parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", default = FALSE,
  help = "Output file name in an .rds format."
)

parser$add_argument(
  "-v", "--verbose", action = "store_true", 
  help = "Turns on diagnostic-based messages."
)

# Either a directory path or the name of an environment variable holding one.
parser$add_argument(
  "--iguide_dir", nargs = 1, type = "character", default = "IGUIDE_DIR",
  help = "iGUIDE install directory path, do not change for normal applications."
)

# Set arguments with parser ----
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# Resolve the install directory: use the argument directly when it is an
# existing directory, otherwise treat it as an environment variable name.
root_dir <- if( dir.exists(args$iguide_dir) ){
  args$iguide_dir
}else{
  Sys.getenv(args$iguide_dir)
}

if( !dir.exists(root_dir) ){
  stop(paste0("\n  Cannot find install path to iGUIDE: ", root_dir, ".\n"))
}

args$iguide_dir <- root_dir

# Directory containing this script, taken from Rscript's --file= argument.
script_file_arg <- grep(
  "--file=", commandArgs(trailingOnly = FALSE), value = TRUE
)

code_dir <- dirname(
  sub(pattern = "--file=", replacement = "", x = script_file_arg)
)

# Two-column table of argument names and their (comma-joined) values.
arg_values <- vapply(
  seq_along(args), 
  function(idx) paste(args[[idx]], collapse = ", "),
  character(1)
)

input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = arg_values
)

# Present the rows in a fixed order.
display_order <- c(
  "run.config :", "test.truth :", "output :", "verbose :", "iguide_dir :"
)

input_table <- input_table[match(display_order, input_table$Variables), ]

## Log inputs
if( args$verbose ){

  cat("List Inputs")
  pander::pandoc.table(
    data.frame(input_table),
    style = "simple",
    justify = "left", 
    split.table = Inf,
    row.names = FALSE
  )

}


# Additional functions ----
#' Read an input file, choosing the reader from the file extension.
#'
#' @param path character, path to the file. If it does not exist as given, it
#' is looked for relative to `root`.
#' @param root character, directory used as a fallback prefix for `path`.
#'
#' @return Depends on the extension: a data.frame for csv/tsv, a DNAStringSet
#' for fasta/fastq, a list for yaml/yml, and for rds either the stored object
#' or, when a plain list was stored, its first data.frame element (or its
#' first element coerced to a data.frame). A trailing ".gz" is accepted when
#' the extension before it is supported.
#'
#' Stops if the file cannot be found or the extension is unsupported.
readFile <- function(path, root){

  if( !file.exists(path) ){
    root_path <- file.path(root, path)
    if( !file.exists(root_path) ){
      stop("Cannot find file:", path)
    }else{
      path <- root_path
    }
  }

  # Read extension from path
  ext <- stringr::str_extract(path, "[\\w]+$")
  supported_ext <- c("tsv", "csv", "gz", "fasta", "fastq", "rds", "yaml", "yml")

  stopifnot( ext %in% supported_ext )

  # Check additional extension if compressed. The pattern is anchored to the
  # end of the path so a "gz" earlier in the path cannot be picked up instead.
  if( ext == "gz" ){
    ext2 <- stringr::str_extract(path, "[\\w]+\\.gz$")
    ext2 <- sub("\\.gz$", "", ext2)
    stopifnot( ext2 %in% supported_ext )
  }else{
    ext2 <- NA
  }

  exts <- c(ext, ext2)
  exts <- exts[!is.na(exts)]

  # Read in methods based on inputs.
  if( any(exts %in% c("tsv", "csv")) ){

    # Tab-separated for tsv, comma-separated for csv.
    field_sep <- if( "tsv" %in% exts ) "\t" else ","

    if( ext == "gz" ){
      return(read.table(gzfile(path), header = TRUE, sep = field_sep))
    }else{
      return(read.table(path, header = TRUE, sep = field_sep))
    }

  }else if( any(stringr::str_detect(exts, "fast")) ){

    # readDNAStringSet() assumes fasta unless told otherwise.
    seq_format <- if( "fastq" %in% exts ) "fastq" else "fasta"
    return(Biostrings::readDNAStringSet(path, format = seq_format))

  }else if( any(exts %in% c("yaml", "yml")) ){

    return(yaml::yaml.load_file(path))

  }else if( any(exts %in% c("rds")) ){

    rds_import <- readRDS(path)

    # inherits() is safe for objects carrying more than one class, where
    # comparing class() to a string gives a multi-element condition.
    if( inherits(rds_import, "list") ){

      is_df <- vapply(rds_import, is.data.frame, logical(1))

      if( any(is_df) ){
        return(rds_import[[which(is_df)[1]]])
      }else{
        return(as.data.frame(rds_import[[1]], row.names = NULL))
      }

    }else{
      return(rds_import)
    }

  }else{

    stop("\n  Unsupported input file type.\n")

  }

}

# Load inputs ----
run_config <- readFile(args$run.config, args$iguide_dir)
test_truth <- readFile(args$test.truth, args$iguide_dir)
sample_info <- readFile(run_config$Sample_Info, args$iguide_dir)

# Files to check ----
# Incorporation site output of the run named in the config.
run_name <- run_config$Run_Name

check_files <- paste0(
  "analysis/", run_name, "/output/incorp_sites.", run_name, ".rds"
)

check_data <- lapply(check_files, readFile, root = args$iguide_dir)
names(check_data) <- c("uniq_sites")

# Per-sample multihit files, row-bound with a "specimen" id column.
check_data$multihits <- suppressMessages({

  multihit_tables <- lapply(
    sample_info$sampleName, 
    function(sample_name){
      multihit_path <- paste0(
        "analysis/", run_config$Run_Name, "/process_data/multihits/", 
        sample_name, ".multihits.rds"
      )
      readFile(multihit_path, args$iguide_dir)
    }
  )

  dplyr::bind_rows(multihit_tables, .id = "specimen")

})

## Check for content ----
total_reads <- length(test_truth$read.name)

# Group read names by their leading three dash-separated fields, keeping the
# groups in order of first appearance.
read_group_keys <- stringr::str_extract(
  test_truth$read.name, "[\\w]+\\-[\\w]+\\-[\\w]+"
)

total_read_ids <- split(test_truth$read.name, read_group_keys)[
  unique(read_group_keys)
]

# Per read-group recovery and accuracy counts. Each element of
# `total_read_ids` yields one row; the group name is carried in through
# `.id = "type"` and then split on "-" into specimen, target, and gRNA.
collected_stats <- dplyr::bind_rows(lapply(total_read_ids, function(x){

      # Counts of this group's reads found among the unique sites, the
      # multihits, both combined, and the group size.
      ret <- c(
        "uniq" = sum(x %in% check_data$uniq_sites$ID), 
        "multi" = sum(x %in% check_data$multihits$ID), 
        "comb" = sum(x %in% check_data$uniq_sites$ID) + 
          sum(x %in% check_data$multihits$ID),
        "total" = length(x)
      )

      # Restrict to reads present in the unique sites, then line up the truth
      # rows and the observed rows in the same read order so they can be
      # compared element by element.
      x <- x[x %in% check_data$uniq_sites$ID]
      spec_truth <- test_truth[match(x, test_truth$read.name),]
      uniq_sites <- check_data$uniq_sites[match(x, check_data$uniq_sites$ID),]
      # Number of reads with matching seqnames / matching strand.
      seq_cnt <- sum(spec_truth$seqnames == uniq_sites$seqnames)
      std_cnt <- sum(spec_truth$strand == uniq_sites$strand)
      # Summed absolute offset of start and end coordinates from the truth.
      cum_dis <- sum(abs(spec_truth$start - uniq_sites$start)) + 
        sum(abs(spec_truth$end - uniq_sites$end))
      # A read is correct only when seqnames, strand, start, and end all agree.
      correct <- sum(
        spec_truth$seqnames == uniq_sites$seqnames & 
          spec_truth$strand == uniq_sites$strand &
          abs(spec_truth$start - uniq_sites$start) + 
          abs(spec_truth$end - uniq_sites$end) == 0
      )

      ret <- c(
        ret, 
        c(
          "seqs" = seq_cnt, "strand" = std_cnt, "dis" = cum_dis, 
          "cor" = correct
        )
      )

      # One-row data.frame with a column per named count.
      data.frame(t(ret))

    }),
    .id = "type"
  ) %>%
  tidyr::separate(type, into = c("specimen", "target", "gRNA"), sep = "-")

# Summary of reads recovered neither as unique sites nor as multihits, one
# row per read group. Groups with no missing reads get zeros in place of the
# min/max values.
missing_data <- total_read_ids %>%
  lapply(function(read_ids){

      was_recovered <- read_ids %in% check_data$uniq_sites$ID | 
        read_ids %in% check_data$multihits$ID

      lost_ids <- read_ids[!was_recovered]
      spec_truth <- test_truth[match(lost_ids, test_truth$read.name),]

      # Distance between the trailing numbers of posid and incorp.
      posid_num <- as.numeric(stringr::str_extract(spec_truth$posid, "[0-9]+$"))
      incorp_num <- as.numeric(
        stringr::str_extract(spec_truth$incorp, "[0-9]+$")
      )
      dist <- abs(posid_num - incorp_num)

      summary_row <- c(
        "count" = nrow(spec_truth), 
        "min_dist" = min(dist), 
        "max_dist" = max(dist), 
        "min_width" = min(spec_truth$width), 
        "max_width" = max(spec_truth$width)
      )

      data.frame(t(summary_row))

  }) %>%
  dplyr::bind_rows(.id = "type") %>%
  tidyr::separate(type, into = c("specimen", "target", "gRNA"), sep = "-") %>%
  dplyr::mutate(
    min_dist = ifelse(count == 0, 0, min_dist),
    max_dist = ifelse(count == 0, 0, max_dist),
    min_width = ifelse(count == 0, 0, min_width),
    max_width = ifelse(count == 0, 0, max_width)
  )

# Overall percentages: reads recovered (unique or multihit) out of all reads,
# and fully correct reads out of those recovered as unique sites.
recovered_reads <- sum(collected_stats$comb)
pct_retention <- 100 * recovered_reads / sum(collected_stats$total)

correct_reads <- sum(collected_stats$cor)
uniq_accuracy <- 100 * correct_reads / sum(collected_stats$uniq)

# Log output if requested ----
if( args$verbose ){

  # Shared table layout for the logged summaries.
  logTable <- function(tbl){
    pander::pandoc.table(
      tbl,
      justify = "left", 
      row.names = FALSE,
      style = "simple",
      split.table = Inf
    )
  }

  cat("\nCollected Stats:")
  logTable(collected_stats)

  cat("\nRead retention:", round(pct_retention, digits = 1), "%\n")
  cat("Unique accuracy:", round(uniq_accuracy, digits = 1), "%\n")

  cat("\nMissing data:")
  logTable(missing_data)

}

# Write output file if requested ----
# Only the .rds format is supported. The extension is matched with a literal
# dot so that names merely ending in the letters "rds" are rejected.
if( args$output != FALSE ){

  if( !stringr::str_detect(args$output, "\\.rds$") ){
    stop("\n  Output data object must be a .rds format.\n")
  }

  # Bundle the summaries together with the data they were computed from.
  saveRDS(
    list(
      "collected_stats" = collected_stats,
      "missing_data" = missing_data,
      "test_truth" = test_truth,
      "checked_data" = check_data
    ), 
    file = args$output
  )

}

# Finish up and close out ----
# Pass (status 0) requires at least 95% read retention and at least 99%
# accuracy among unique sites. isTRUE() makes an undefined percentage (for
# example 0/0 when no unique sites were recovered) count as a failure rather
# than aborting on a missing value in the condition.
passed <- isTRUE(pct_retention >= 95) && isTRUE(uniq_accuracy >= 99)

q(save = "no", status = if( passed ) 0 else 1)

R magrittr From line 3 of rscripts/check_test_accuracy.R

options(stringsAsFactors = FALSE, scipen = 99)
suppressMessages(library("magrittr"))

# Capture commandline files
# Inputs are given either as explicit files (-f) or as directories to scan
# (-d); both default to the string "NA" when not supplied.
parser <- argparse::ArgumentParser(
  description = "Script to consolidate .stat files."
)

parser$add_argument(
  "-f", "--file", 
  nargs = "+", 
  type = "character", 
  default = "NA",
  help = "Path to files with *.stat files (long, csv format). "
)

parser$add_argument(
  "-d", "--dir", 
  nargs = "+", 
  type = "character", 
  default = "NA",
  help = "Path to directory with *.stat files (long, csv format). "
)

parser$add_argument(
  "-o", "--output", 
  nargs = 1, 
  type = "character", 
  help = "Output file path and name, csv format. ie. path/to/file.csv"
)

cli_args <- commandArgs(trailingOnly = TRUE)
args <- parser$parse_args(cli_args)

# Both --file and --dir accept several values, so every check below works on
# whole vectors rather than assuming a single path.
use_files <- !identical(args$file, "NA")
use_dirs <- !identical(args$dir, "NA")

# At least one of --file / --dir must be supplied.
stopifnot( use_files || use_dirs )

# Manipulate file paths to determine stat types
# Explicit files take precedence over directories when both are given.
input_paths <- if( use_files ) args$file else args$dir
is_present <- file.exists(input_paths)

if( !all(is_present) ){
  stop(
    "\n  Cannot find the following file(s) or directory: ", 
    paste(input_paths[!is_present], collapse = ", ")
  )
}

if( use_files ){
  file_names <- stringr::str_extract(args$file, "[\\w\\.\\-\\_]+$")
  file_paths <- args$file
}else{
  # full.names keeps each file paired with the directory it was found in,
  # which matters when more than one directory is scanned.
  file_paths <- list.files(
    path = args$dir, pattern = "\\.stat$", full.names = TRUE
  )
  file_names <- basename(file_paths)
}

# The stat type is the file name without its leading sample name and without
# the trailing ".stat" extension.
file_types <- sub("[\\w\\-\\_]+.", "", file_names, perl = TRUE)
file_types <- sub("\\.stat$", "", file_types)

# Read in data in a long format
# Read one headerless three-column stat file. A file that cannot be read
# contributes an empty table instead of stopping the script.
readStatFile <- function(file){

  stat_rows <- try(
    expr = read.csv(file = file, header = FALSE), 
    silent = TRUE
  )

  if( inherits(stat_rows, "try-error") ){
    return(data.frame(
      sampleName = character(), 
      metric = character(), 
      count = numeric()
    ))
  }

  names(stat_rows) <- c("sampleName", "metric", "count")

  # Keep only the leading word/dash characters of the sample name.
  dplyr::mutate(
    stat_rows,
    sampleName = stringr::str_extract(sampleName, "[\\w\\-\\_]+")
  )

}

# Files are named by stat type so the type is carried into the "type" column.
typed_paths <- structure(file_paths, names = file_types)

long_data <- dplyr::bind_rows(lapply(typed_paths, readStatFile), .id = "type")

# Collapse the long table to one count per sample, type, metric, and read.
fmt_long_data <- long_data %>%
  # Drop exact duplicate rows before summing.
  dplyr::distinct(type, sampleName, metric, count) %>%
  dplyr::mutate(
    # Bin label embedded in the type, if any (not used in the grouping below).
    bin = stringr::str_extract(type, "bin[0-9]+"),
    # Read label (R1 / R2) when the type contains one, otherwise NA.
    read = ifelse(
      stringr::str_detect(type, "R[12]."),
      ifelse(stringr::str_detect(type, "R1."), "R1", "R2"),
      NA
    ),
    # Remove the bin label from the type so counts from different bins share
    # a type and are summed together.
    type = stringr::str_remove(type, "bin[0-9]+.")
  ) %>%
  dplyr::group_by(sampleName, type, metric, read) %>%
  dplyr::summarise(count = sum(count)) %>%
  dplyr::ungroup() %>%
  # Keep "multihit" metrics only when they come from a "multihits" type;
  # all other metrics are kept regardless of type.
  dplyr::filter(
    (stringr::str_detect(metric, "multihit") & 
      stringr::str_detect(type, "multihits")) | 
      !stringr::str_detect(metric, "multihit")
  ) %>%
  # Relabel the "multihits" type as "align".
  dplyr::mutate(type = ifelse(type == "multihits", "align", type)) %>%
  dplyr::ungroup()

# Transform data into a wide format
# One column per "<type>.<metric>" combination, one row per sample.
wide_data <- fmt_long_data %>%
  dplyr::mutate(
    type = paste0(type, ".", metric),
    type = factor(type, levels = unique(type))
  ) %>%
  dplyr::select(-metric, -read) %>%
  tidyr::spread(type, count)

# Sample name first, remaining columns in alphabetical order.
all_cols <- names(wide_data)
metric_cols <- all_cols[-match("sampleName", all_cols)]

wide_data <- wide_data[, c("sampleName", sort(metric_cols))]

# Write data to output
write.csv(wide_data, file = args$output, quote = FALSE, row.names = FALSE)

q()

R magrittr From line 3 of rscripts/collect_stats.R

options(stringsAsFactors = FALSE, scipen = 99, width = 120)
suppressMessages(library(magrittr))

# Directory containing this script, taken from Rscript's --file= argument.
script_file_arg <- grep(
  "--file=", commandArgs(trailingOnly = FALSE), value = TRUE
)

code_dir <- dirname(
  sub(pattern = "--file=", replacement = "", x = script_file_arg)
)

# Set up and gather command line arguments
# Input directory and file pattern, required rds output, optional stat file.
parser <- argparse::ArgumentParser(
  description = "Script for combining multihit objects together.",
  usage = paste(
    "Rscript combine_multihits.R -d <directory> -p <pattern>",
    "[-h/--help, -v/--version] [optional args]"
  )
)

parser$add_argument(
  "-d", "--dir", 
  nargs = 1, 
  type = "character", 
  help = paste(
    "Directory where to look for multihit files. Combine with 'pattern'",
    "to select specific files."
  )
)

parser$add_argument(
  "-p", "--pattern", 
  nargs = 1, 
  type = "character", 
  default = ".",
  help = paste(
    "Pattern to identify files within the directory specified to combine.",
    "Regex patterns supported through R. Default: '.'"
  )
)

parser$add_argument(
  "-o", "--output", 
  nargs = 1, 
  type = "character", 
  required = TRUE,
  help = "Output file name. Output format only supports R-based rds format."
)

# Defaults to FALSE, which the script treats as "no stat file requested".
parser$add_argument(
  "-s", "--stat", 
  nargs = 1, 
  type = "character", 
  default = FALSE, 
  help = "Stat output name. Stats output in long csv file format."
)


cli_args <- commandArgs(trailingOnly = TRUE)
args <- parser$parse_args(cli_args)


# Check output file name
# The output must carry a literal ".rds" extension. The dot is escaped so a
# name that merely ends in the letters "rds" does not pass the check.
if( !stringr::str_detect(args$output, "\\.rds$") ){

  stop(paste(
    "\n  Output file name must be in rds format.",
    "\n  Please change name to have the proper extension (*.rds).\n"
  ))

}


# Print Inputs to terminal
# Two-column table of argument names and their (comma-joined) values, shown
# in a fixed order.
arg_values <- vapply(
  seq_along(args), 
  function(idx) paste(args[[idx]], collapse = ", "),
  character(1)
)

input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = arg_values
)

display_order <- c("dir :", "pattern :", "output :", "stat :")

input_table <- input_table[match(display_order, input_table$Variables), ]

cat("\nCombine Multihit Inputs:\n")
print(data.frame(input_table), right = FALSE, row.names = FALSE)

# Clear output file and prep output path
# An empty placeholder is written at the output path, the path is converted
# with normalizePath(), and the placeholder is then removed again.
# NOTE(review): presumably the placeholder exists so normalizePath() has a
# real file to resolve - confirm.
write(c(), file = args$output)
args$output <- normalizePath(args$output)
unlink(args$output)

# Check for input files
# List the directory and, unless the pattern is the default ".", keep only
# the file names matching it.
input_files <- list.files(path = args$dir)

if( args$pattern != "." ){
  matches_pattern <- stringr::str_detect(input_files, args$pattern)
  input_files <- input_files[matches_pattern]
}

if( length(input_files) > 0 ){

  # Show the first few files that will be combined.
  cat(paste(
    "\n  A few multihit files to join together:\n ", 
    paste(head(file.path(args$dir, input_files)), collapse = "\n  ")
  ))

}else{

  cat("\nWarning:\n  No input files identified, writing empty output files.\n")

  # Empty placeholders for each component of the output object.
  empty_multihits <- list(
    "unclustered_multihits" = GenomicRanges::GRanges(),
    "clustered_multihit_positions" = GenomicRanges::GRangesList(),
    "clustered_multihit_lengths" = IRanges::RleList()
  )

  saveRDS(object = empty_multihits, file = args$output)

  if( args$stat != FALSE ){
    write.table(
      x = data.frame(), file = args$stat, 
      sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE
    )
  }

}


# Load supporting scripts
source(file.path(code_dir, "supporting_scripts", "printHead.R"))
source(file.path(code_dir, "supporting_scripts", "writeOutputFile.R"))

## Set up stat object
if( args$stat != FALSE ){

  # Sample name: the output file name up to its first ".".
  output_path_parts <- unlist(strsplit(args$output, "/"))
  output_file_name <- output_path_parts[length(output_path_parts)]

  sampleName <- unlist(strsplit(output_file_name, ".", fixed = TRUE))[1]

  # Empty long-format stat table to append metrics to.
  stat <- data.frame(
    sampleName = character(),
    metric = character(),
    count = character()
  )

}


# Read in files ----
multihit_input <- lapply(file.path(args$dir, input_files), readRDS)

# Pool the "unclustered_multihits" component of every input into one object.
unclustered_inputs <- lapply(multihit_input, "[[", "unclustered_multihits")
multihits <- unlist(GenomicRanges::GRangesList(unclustered_inputs))

num_alignments <- length(multihits)
num_reads <- length(unique(names(multihits)))

# Message
cat(
  "\nA total of", 
  format(num_alignments, big.mark = ","), 
  "alignments will be clustered from", 
  format(num_reads, big.mark = ","), 
  "reads.\n"
)


# Group and characterize multihits 
# Multihits are reads that align to multiple locations in the reference 
# genome. There are bound to always be a certain proportion of reads aligning
# to repeated sequence due to the high level degree of repeated DNA elements
# within genomes. The final object generated, "multihitData", is a list of 
# three objects. "unclustered_multihits" is a GRanges object where every 
# alignment for every multihit read is present in rows. 
# "clustered_multihit_positions" returns all the possible integration site 
# positions for the multihit. Lastly, "clustered_multihit_lengths" contains the
# length of the templates mapping to the multihit clusters, used for
# abundance calculations.

# Empty defaults, kept as-is when there are no multihit alignments.
unclustered_multihits <- GenomicRanges::GRanges()
clustered_multihit_positions <- GenomicRanges::GRangesList()
clustered_multihit_lengths <- list()

if( length(multihits) > 0 ){

  #' As the loci are expanded from the coupled_loci object, unique templates 
  #' and readPairKeys are present in the readPairKeys unlisted from the 
  #' paired_loci object.
  multihit_templates <- multihits

  # One row per distinct sampleName / ID / readPairKey combination.
  multihit_keys <- multihits %>%
    as.data.frame(row.names = NULL) %>%
    dplyr::distinct(sampleName, ID, readPairKey) %>%
    dplyr::select(sampleName, ID, readPairKey)

  #' Medians are based on all the potential sites for a given read, which will
  #' be identical for all reads associated with a readPairKey.
  multihit_medians <- round(
    IRanges::median(GenomicRanges::width(GenomicRanges::GRangesList(split(
      x = multihit_templates, 
      f = multihit_templates$readPairKey
    ))))
  )

  # Look up each key's median width by readPairKey name.
  multihit_keys$medians <- multihit_medians[
    as.character(multihit_keys$readPairKey)
  ]

  # Width-1 ranges taken at the start of each alignment.
  multihits_pos <- GenomicRanges::flank(
    x = multihit_templates, width = -1, start = TRUE
  )

  # Merge positions separated by gaps smaller than min.gapwidth; revmap
  # records which input positions were merged into each reduced range.
  multihits_red <- GenomicRanges::reduce(
    x = multihits_pos, min.gapwidth = 5L, with.revmap = TRUE
  )  #! Should make min.gapwidth a option

  revmap <- multihits_red$revmap

  # For every reduced range, repeat the readPairKey of its lowest-index
  # member once per member ...
  axil_nodes <- as.character(S4Vectors::Rle(
    values = multihit_templates$readPairKey[min(revmap)], 
    lengths = lengths(revmap)
  ))

  # ... and pair it with each member's own readPairKey to form graph edges.
  nodes <- multihit_templates$readPairKey[unlist(revmap)]
  edgelist <- unique(matrix( c(axil_nodes, nodes), ncol = 2 ))

  # Cluster membership of each readPairKey in the undirected graph.
  multihits_cluster_data <- igraph::clusters(
    igraph::graph.edgelist(el = edgelist, directed = FALSE)
  )

  # Lookup table from readPairKey (row name) to cluster id.
  clus_key <- data.frame(
    row.names = unique(as.character(t(edgelist))),
    "clusID" = multihits_cluster_data$membership
  )

  multihits_pos$clusID <- clus_key[
    as.character(multihits_pos$readPairKey), "clusID"
  ]

  multihits_pos <- multihits_pos[order(multihits_pos$clusID)]

  clustered_multihit_index <- as.data.frame(
    GenomicRanges::mcols(multihits_pos)
  )

  # lociPairKey values as a run-length encoded factor, split by cluster id.
  multihit_loci_rle <- S4Vectors::Rle(factor(
    x = clustered_multihit_index$lociPairKey, 
    levels = unique(clustered_multihit_index$lociPairKey)
  ))

  multihit_loci_intL <- S4Vectors::split(
    multihit_loci_rle, clustered_multihit_index$clusID
  )

  # One position per run value, taken from the first row with a matching
  # lociPairKey.
  clustered_multihit_positions <- GenomicRanges::granges(
    x = multihits_pos[
      match(
        x = BiocGenerics::unlist(S4Vectors::runValue(multihit_loci_intL)), 
        table = clustered_multihit_index$lociPairKey
      )
    ]
  )

  # Regroup those positions by cluster.
  clustered_multihit_positions <- GenomicRanges::split(
    x = clustered_multihit_positions,
    f = S4Vectors::Rle(
      values = seq_along(multihit_loci_intL), 
      lengths = S4Vectors::width(S4Vectors::runValue(
        multihit_loci_intL
      )@partitioning)
    )
  )

  readPairKey_cluster_index <- unique(
    clustered_multihit_index[,c("readPairKey", "clusID")]
  )

  # Attach the cluster id to every key row.
  multihit_keys$clusID <- readPairKey_cluster_index$clusID[
    match(
      as.character(multihit_keys$readPairKey), 
      readPairKey_cluster_index$readPairKey
    )
    ]

  # Median widths, sorted ascending, then split by cluster.
  multihit_keys <- multihit_keys[order(multihit_keys$medians),]

  clustered_multihit_lengths <- split(
    x = S4Vectors::Rle(multihit_keys$medians), 
    f = multihit_keys$clusID
  )

  #' Expand the multihit_templates object from readPairKey specific to read
  #' specific.
  multihit_keys <- multihit_keys[order(multihit_keys$readPairKey),]

  # Row indices of multihit_keys grouped by readPairKey.
  multihit_readPair_read_exp <- IRanges::IntegerList(
    split(x = seq_len(nrow(multihit_keys)), f = multihit_keys$readPairKey)
  )

  unclustered_multihits <- multihit_templates

  multihit_readPair_read_exp <- multihit_readPair_read_exp[
    as.character(unclustered_multihits$readPairKey)
  ]

  # Repeat each alignment once per key row sharing its readPairKey.
  unclustered_multihits <- unclustered_multihits[S4Vectors::Rle(
    values = seq_along(unclustered_multihits),
    lengths = S4Vectors::width(multihit_readPair_read_exp@partitioning)
  )]

  # Label the expanded alignments with the ID and sample of their key row.
  names(unclustered_multihits) <- multihit_keys$ID[
    BiocGenerics::unlist(multihit_readPair_read_exp)
  ]

  unclustered_multihits$ID <- multihit_keys$ID[
    BiocGenerics::unlist(multihit_readPair_read_exp)
  ]

  unclustered_multihits$sampleName <- multihit_keys$sampleName[
    BiocGenerics::unlist(multihit_readPair_read_exp)
  ]

}

# Every cluster must have both a positions entry and a lengths entry.
num_position_clusters <- length(clustered_multihit_positions)
num_length_clusters <- length(clustered_multihit_lengths)

stopifnot( num_position_clusters == num_length_clusters )

# Assemble and write the combined output object.
multihitData <- list(
  "unclustered_multihits" = unclustered_multihits, 
  "clustered_multihit_positions" = clustered_multihit_positions, 
  "clustered_multihit_lengths" = clustered_multihit_lengths
)

writeOutputFile(multihitData, file = args$output, format = "rds")

# Report summary counts to the terminal.
multihit_metrics <- data.frame(
  "multihit_reads" = length(unique(names(unclustered_multihits))),
  "multihit_alignments" = length(unique(unclustered_multihits)),
  "multihit_clusters" = length(clustered_multihit_positions),
  "multihit_lengths" = sum(lengths(clustered_multihit_lengths))
)

printHead(
  multihit_metrics,
  title = "Multihit metrics", 
  caption = "Metrics highlighting the observation of multiple aligning reads."
)

# Append the multihit metrics to the stat table and write it out as a
# headerless csv, when a stat file was requested.
if( args$stat != FALSE ){

  multihit_counts <- c(
    length(unique(names(unclustered_multihits))), 
    sum(lengths(clustered_multihit_lengths)), 
    length(clustered_multihit_positions)
  )

  add_stat <- data.frame(
    sampleName = sampleName,
    metric = c("multihit.reads", "multihit.lengths", "multihit.clusters"),
    count = multihit_counts
  )

  stat <- rbind(stat, add_stat)

  write.table(
    x = stat, file = args$stat, 
    sep = ",", row.names = FALSE, 
    col.names = FALSE, quote = FALSE
  )

}

# Confirm the output file was written before exiting successfully.
if( !file.exists(args$output) ){
  stop("\n  Could not verify existance of output file:\n  ", args$output, "\n")
}

cat("\n  Output file generated :", args$output, "\n")
q(save = "no", status = 0)

R magrittr From line 10 of rscripts/combine_multihits.R

options(stringsAsFactors = FALSE, scipen = 99, width = 999)

# Directory containing this script, taken from Rscript's --file= argument.
script_file_arg <- grep(
  "--file=", commandArgs(trailingOnly = FALSE), value = TRUE
)

code_dir <- dirname(
  sub(pattern = "--file=", replacement = "", x = script_file_arg)
)

# Help and message text for this script, loaded from its description file.
desc_path <- file.path(code_dir, "descriptions/consol.desc.yml")
desc <- yaml::yaml.load_file(desc_path)

#' Set up and gather command line arguments
#' Help text for every argument comes from the loaded description file.
parser <- argparse::ArgumentParser(
  description = desc$program_short_description,
  usage = "nuc consol <seqFile> [-h/--help, -v/--version] [optional args]"
)

parser$add_argument(
  "seqFile", 
  nargs = 1, type = "character", default = "NA", help = desc$seqFile
)

parser$add_argument(
  "-o", "--output", 
  nargs = 1, type = "character", default = "NA", help = desc$output
)

parser$add_argument(
  "-k", "--keyFile", 
  nargs = 1, type = "character", default = "NA", help = desc$keyFile
)

parser$add_argument(
  "-l", "--seqName", 
  nargs = 1, type = "character", default = "NA", help = desc$seqName
)

#' Defaults to FALSE, which the script treats as "no stat file requested".
parser$add_argument(
  "--stat", 
  nargs = 1, type = "character", default = FALSE, help = desc$stat
)

parser$add_argument(
  "--compress", 
  action = "store_true", help = desc$compress
) 

cli_args <- commandArgs(trailingOnly = TRUE)
args <- parser$parse_args(cli_args)

if( args$seqFile == "NA" ){
  stop("\n  No sequence file specified. Please provide.\n")
}

# Check I/O file types
# The input type is read from the first "fa..." run in the file name (the
# last path element), so a trailing compression suffix does not interfere.
seq_type <- unlist(strsplit(args$seqFile, "/"))
seq_type <- seq_type[length(seq_type)]
seq_type <- stringr::str_extract(seq_type, "fa[\\w]*")

if( !seq_type %in% c("fa", "fasta", "fastq") ){
  stop(desc$unrecognized_file_type, " ", desc$compression_note)
}

seq_type <- ifelse(seq_type %in% c("fa", "fasta"), "fasta", "fastq")

if( args$output != "NA" ){

  # Separate the directory part from the file name so the extension is only
  # ever searched for in the file name. Splitting the whole path on the
  # extension text would truncate outputs placed under a directory whose name
  # contains it (e.g. "fasta_out/sample.fasta").
  out_name <- sub("^.*/", "", args$output)
  out_dir <- substr(args$output, 1, nchar(args$output) - nchar(out_name))
  outType <- stringr::str_extract(out_name, "fa[\\w]*")

  # Validate the type before it is used to rebuild the output name.
  if( !outType %in% c("fa", "fasta", "fastq") ){
    stop(desc$unrecognized_file_type)
  }

  # Keep everything before the extension; anything after it is rebuilt below.
  ext_start <- unname(stringr::str_locate(out_name, "fa[\\w]*")[1, "start"])
  out_stem <- paste0(out_dir, substr(out_name, 1, ext_start - 1))

  outType <- ifelse(outType %in% c("fa", "fasta"), "fasta", "fastq")

  # Output is always written as fasta, even when fastq was requested.
  if( outType == "fastq" ){
    message(desc$fastq_input)
    outType <- "fasta"
  }

  args$output <- paste0(out_stem, outType)

  # The rebuilt name ends in the sequence extension, so the compression
  # suffix is added whenever compression is requested.
  if( args$compress ){
    args$output <- paste0(args$output, ".gz")
  }

}

# A key file name is required, and its extension selects the key format.
if( args$keyFile == "NA" ){
  stop("\n  No key file name given. Please provide.\n")
}

key_type <- stringr::str_extract(args$keyFile, "[\\w]+$")

if( !key_type %in% c("csv", "tsv", "rds", "RData") ){
  stop(desc$output_keyfile_type)
}

# Check sequence name lead
# Without an explicit name, use the part of the input file name that precedes
# its sequence-type extension.
if( args$seqName == "NA" ){

  seq_path_parts <- unlist(strsplit(args$seqFile, "/"))
  parsedName <- seq_path_parts[length(seq_path_parts)]

  args$seqName <- unlist(strsplit(parsedName, "fa[\\w]*"))[1]

}

# Print inputs to table
# Two-column table of argument names and their (comma-joined) values, shown
# in a fixed order.
arg_values <- vapply(
  seq_along(args), 
  function(idx) paste(args[[idx]], collapse = ", "),
  character(1)
)

input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = arg_values
)

display_order <- c(
  "seqFile :", "output :", "keyFile :", "seqName :", "stat :"
)

input_table <- input_table[match(display_order, input_table$Variables), ]

cat("\nConsolidate Inputs:\n")
print(
  data.frame(input_table, row.names = NULL), 
  right = FALSE, row.names = FALSE
)


# Read sequence file
# Choose the reader that matches the detected input type.
seq_reader <- if( seq_type == "fasta" ){
  ShortRead::readFasta
}else{
  ShortRead::readFastq
}

seq_pointer <- seq_reader(args$seqFile)

# Sequences named by their read identifiers.
seqs <- ShortRead::sread(seq_pointer)
names(seqs) <- ShortRead::id(seq_pointer)

# Generate blank files if inputs are empty
# With no input sequences, write an empty sequence file, an empty key, and
# (if requested) a stat row, then quit.
if( length(seqs) == 0 ){

  Biostrings::writeXStringSet(
    x = Biostrings::DNAStringSet(),
    filepath = args$output,
    format = "fasta",
    compress = args$compress
  )

  if( !is.null(args$keyFile) ){

    key <- data.frame("readNames" = c(), "seqID" = c())

    # Writer chosen by the key file extension.
    switch(
      key_type,
      "csv" = write.csv(
        key, file = args$keyFile, row.names = FALSE, quote = FALSE
      ),
      "tsv" = write.table(
        key, file = args$keyFile, sep = "\t", row.names = FALSE, quote = FALSE
      ),
      "rds" = saveRDS(key, file = args$keyFile),
      "RData" = save(key, file = args$keyFile)
    )

  }


  if( args$stat != FALSE ){

    # Sample name: the output file name up to its first ".fa".
    output_path_parts <- unlist(strsplit(args$output, "/"))
    output_file_name <- output_path_parts[length(output_path_parts)]

    sampleName <- unlist(strsplit(output_file_name, ".fa", fixed = TRUE))[1]

    empty_stat <- data.frame(
      sampleName = sampleName,
      metric = "reads",
      count = length(seqs)
    )

    write.table(
      x = empty_stat,
      file = args$stat,
      sep = ",", 
      row.names = FALSE, 
      col.names = FALSE, 
      quote = FALSE
    )

  }

  q()

}


# Treat each distinct sequence as a factor level; the integer code of a read
# then identifies which consolidated sequence it belongs to.
factor_seqs <- factor(as.character(seqs))
distinct_seqs <- levels(factor_seqs)

# Key linking every read name to the id of its consolidated sequence.
key <- data.frame(
  "readNames" = names(factor_seqs),
  "seqID" = paste0(args$seqName, as.integer(factor_seqs))
)

# One entry per distinct sequence, named <seqName><level index>.
consolidated_seqs <- Biostrings::DNAStringSet(distinct_seqs)
names(consolidated_seqs) <- paste0(args$seqName, seq_along(distinct_seqs))

# Stats if requested
if( args$stat != FALSE ){

  # Sample name: the output file name up to its first ".fa".
  output_path_parts <- unlist(strsplit(args$output, "/"))
  output_file_name <- output_path_parts[length(output_path_parts)]
  sampleName <- unlist(strsplit(output_file_name, ".fa", fixed = TRUE))[1]

  read_stat <- data.frame(
    sampleName = sampleName,
    metric = "reads",
    count = length(consolidated_seqs)
  )

  write.table(
    x = read_stat,
    file = args$stat,
    sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE
  )

}

# Write output and key files
# Output
# With an output name, replace any existing file with the consolidated
# sequences; without one, print them to the terminal.
if( args$output != "NA" ){

  unlink(args$output)

  ShortRead::writeFasta(
    consolidated_seqs, 
    file = args$output,
    width = max(Biostrings::width(consolidated_seqs)),
    compress = args$compress
  )

}else{

  seq_listing <- data.frame(
    "seqID" = names(consolidated_seqs),
    "sequence" = as.character(consolidated_seqs)
  )

  print(seq_listing, row.names = FALSE)

}

# Key file
# Writer chosen by the key file extension.
if( !is.null(args$keyFile) ){

  switch(
    key_type,
    "csv" = write.csv(
      key, file = args$keyFile, row.names = FALSE, quote = FALSE
    ),
    "tsv" = write.table(
      key, file = args$keyFile, sep = "\t", row.names = FALSE, quote = FALSE
    ),
    "rds" = saveRDS(key, file = args$keyFile),
    "RData" = save(key, file = args$keyFile)
  )

}

q()

R From line 2 of rscripts/consol.R

options(stringsAsFactors = FALSE, scipen = 99, width = 120)

# Directory containing this script, taken from Rscript's --file= argument.
script_file_arg <- grep(
  "--file=", commandArgs(trailingOnly = FALSE), value = TRUE
)

code_dir <- dirname(
  sub(pattern = "--file=", replacement = "", x = script_file_arg)
)

# Help and message text for this script, loaded from its description file.
desc_path <- file.path(code_dir, "descriptions/couple.desc.yml")
desc <- yaml::yaml.load_file(desc_path)

# Set up and gather command line arguments
# Help text for every argument comes from the loaded description file.
parser <- argparse::ArgumentParser(
  description = desc$program_short_description,
  usage = "Rscript couple.R <anchorPSL> <adriftPSL> [-h/--help, -v/--version] [optional args]"
)

# Positional inputs: the two PSL alignment files.
parser$add_argument(
  "anchorPSL", 
  nargs = 1, type = "character", help = desc$anchorPSL
)

parser$add_argument(
  "adriftPSL", 
  nargs = 1, type = "character", help = desc$adriftPSL
)

# Key files and output destinations.
parser$add_argument(
  "-k", "--keys", 
  nargs = "*", type = "character", help = desc$keys
)

parser$add_argument(
  "-o", "--uniqOutput", 
  nargs = 1, type = "character", help = desc$uniqOutput
)

parser$add_argument(
  "--condSites", 
  nargs = 1, type = "character", help = desc$condSites
)

parser$add_argument(
  "--chimeras", 
  nargs = 1, type = "character", help = desc$chimeras
)

parser$add_argument(
  "--multihits", 
  nargs = 1, type = "character", help = desc$multihits
)

parser$add_argument(
  "--stat", 
  nargs = 1, type = "character", default = FALSE, help = desc$stat
)

# Reference genome and numeric settings, each with a default.
parser$add_argument(
  "-g", "--refGenome", 
  nargs = 1, type = "character", default = "hg38", help = desc$refGenome
)

parser$add_argument(
  "--maxAlignStart", 
  nargs = 1, type = "integer", default = 5L, help = desc$maxAlignStart
)

parser$add_argument(
  "--minPercentIdentity", 
  nargs = 1, type = "integer", default = 95L, 
  help = desc$minPercentIdentity
)

parser$add_argument(
  "--minTempLength", 
  nargs = 1, type = "integer", default = 30L, help = desc$minTempLength
)

parser$add_argument(
  "--maxTempLength", 
  nargs = 1, type = "integer", default = 2500L, help = desc$maxTempLength
)

# Remaining switches and settings.
parser$add_argument(
  "--keepAltChr", 
  action = "store_true", help = desc$keepAltChr
)

parser$add_argument(
  "--readNamePattern", 
  nargs = 1, type = "character", default = "[\\w\\:\\-\\+]+", 
  help = desc$readNamePattern
)

parser$add_argument(
  "--saveImage", 
  nargs = 1, type = "character", help = desc$saveImage
)


cli_args <- commandArgs(trailingOnly = TRUE)
args <- parser$parse_args(cli_args)

# Argument Conditionals
# Both PSL inputs and the unique-sites output name must be present before any
# work is done.
psl_missing <- is.null(args$anchorPSL) || is.null(args$adriftPSL)

if( psl_missing ){
  stop("\n  Anchor and adrift PSL files not found. Please provide.\n")
}

output_missing <- is.null(args$uniqOutput)

if( output_missing ){
  stop("\n  Please provide an output file name.\n")
}

# Print Inputs to terminal
# One row per parsed argument; multi-valued arguments are shown comma-joined.
input_table <- data.frame(
  "Variables" = paste0(names(args), " :"),
  "Values" = vapply(
    args,
    function(value) paste(value, collapse = ", "),
    character(1),
    USE.NAMES = FALSE
  )
)

# Keep only the rows listed here, in this display order.
shown_rows <- match(
  c("anchorPSL :", "adriftPSL :", "keys :", "uniqOutput :", "condSites :", 
    "chimeras :", "multihits :", "stat :", "refGenome :", 
    "maxAlignStart :", "minPercentIdentity :", "minTempLength :", 
    "maxTempLength :", "readNamePattern :"),
  input_table$Variables
)

input_table <- input_table[shown_rows, ]

cat("\nCoupler Inputs:\n")
print(data.frame(input_table), right = FALSE, row.names = FALSE)

# Load supporting scripts
# Each helper lives in supporting_scripts/<name>.R and is expected to define
# an object called <name> in the workspace.
support_fns <- c(
  "printHead", "readKeyFile", "readPSL", "qualityFilter", 
  "processBLATData", "condenseSites", "writeOutputFile"
)

for( fn_name in support_fns ){
  source(file.path(code_dir, "supporting_scripts", paste0(fn_name, ".R")))
}

# Confirm every expected helper is now present.
helpers_loaded <- all(support_fns %in% ls())

if( !helpers_loaded ){
  stop(
    "\n  Cannot load supporting scripts. ",
    "You may need to clone from github again.\n"
  )
}

# Load reference genome
# `--refGenome` is treated as a sequence file path when it contains ".fa"
# (which also covers ".fasta" and ".fastq"); otherwise it is used as a pattern
# that must match exactly one installed BSgenome package.
if( grepl("\\.fa", args$refGenome) ){

  if( !file.exists(args$refGenome) ){
    stop("\n  Specified reference genome file not found.\n")
  }

  # The dots are escaped so that only a literal ".fa" / ".fastq" is matched.
  if( grepl("\\.fastq", args$refGenome) ){
    ref_file_type <- "fastq"
  }else{
    ref_file_type <- "fasta"
  }

  ref_genome <- Biostrings::readDNAStringSet(
    args$refGenome, format = ref_file_type
  )

}else{

  installed_genomes <- unique(BSgenome::installed.genomes())

  genome <- grep(args$refGenome, installed_genomes, value = TRUE)

  if( length(genome) == 0 ){

    # List the installed genomes one per line so the user can pick one.
    cat("\nInstalled genomes include:\n")
    cat(installed_genomes, sep = "\n")
    stop("\n  Selected reference '", args$refGenome, "' genome not in list.\n")

  }else if( length(genome) > 1 ){

    cat("\nInstalled genomes include:\n")
    cat(installed_genomes, sep = "\n")
    stop(
        "\n  Please be more specific about reference genome. ", 
        "Multiple matches to input.\n"
    )

  }

  # Attach the matched package and fetch the object that shares its name.
  suppressMessages(library(genome, character.only = TRUE))
  ref_genome <- get(genome)

}

## Set up stat object
# Only when a stat file was requested (the --stat default is FALSE).
if( args$stat != FALSE ){

  # Sample name: the final "/"-separated piece of the output path, truncated
  # at its first ".".
  output_path_parts <- unlist(strsplit(args$uniqOutput, "/"))
  output_file_name <- utils::tail(output_path_parts, n = 1)

  sampleName <- unlist(strsplit(output_file_name, ".", fixed = TRUE))[1]

  # Empty accumulator; rows are appended with rbind() further down.
  stat <- data.frame(
    sampleName = character(0),
    metric = character(0),
    count = character(0)
  )

}

## Load and process alignment data ##
# Build `keys`, the table relating read names to anchor / adrift sequence IDs.
#   - two key files: one per alignment file (anchor first, adrift second),
#     merged here on the read names they have in common.
#   - one key file : must already hold readNames, anchorSeqID and adriftSeqID.
#   - no key file  : nothing is done here; `keys` is derived from the
#     alignments after the PSL files have been read.
# More than two key files is an error. That test comes first because a
# `length > 1` test placed ahead of it would also match three or more files,
# leaving the error unreachable.
if( length(args$keys) > 2 ){

  stop("\n  Cannot have more key files than sequence alignment files.\n")

}else if( length(args$keys) == 2 ){

  # The trailing run of word characters (the file extension) picks the format.
  anchor_key_type <- stringr::str_extract(args$keys[1], "[\\w]+$")

  if( !anchor_key_type %in% c("csv", "tsv", "rds", "RData") ){
    stop(
      "\n  Output key file type not supported. ",
      "Please use csv, tsv, rds, or RData.\n"
    )
  }

  anchor_keys <- readKeyFile(args$keys[1], format = anchor_key_type)

  # Use the full argument name rather than relying on partial `$` matching.
  adrift_key_type <- stringr::str_extract(args$keys[2], "[\\w]+$")

  if( !adrift_key_type %in% c("csv", "tsv", "rds", "RData") ){
    stop(
      "\n  Output key file type not supported. ",
      "Please use csv, tsv, rds, or RData.\n"
    )
  }

  adrift_keys <- readKeyFile(args$keys[2], format = adrift_key_type)

  stopifnot(all(c("readNames", "seqID") %in% names(anchor_keys)))
  stopifnot(all(c("readNames", "seqID") %in% names(adrift_keys)))

  # Check input for data, if none, write files and exit
  if( nrow(anchor_keys) == 0 || nrow(adrift_keys) == 0 ){

    cat("\nNo sequences identified in at least one key file.\n")
    writeNullOutput(args)
    q()

  }

  # Verify readNames are in the same format.
  anchor_keys$readNames <- stringr::str_extract(
    anchor_keys$readNames, args$readNamePattern
  )

  adrift_keys$readNames <- stringr::str_extract(
    adrift_keys$readNames, args$readNamePattern
  )

  # Only interested in reads in common between the two.
  common_names <- intersect(anchor_keys$readNames, adrift_keys$readNames)

  # Check intersection is not 0
  if( is.null(common_names) || length(common_names) == 0 ){

    cat("\nNo sequences in common between key files.\n")
    writeNullOutput(args)
    q()

  }

  # Filter names in key files.
  anchor_keys <- anchor_keys[anchor_keys$readNames %in% common_names,]
  adrift_keys <- adrift_keys[adrift_keys$readNames %in% common_names,]

  # Create a common key: put the adrift rows in the anchor rows' read order.
  adrift_keys <- adrift_keys[
    match(anchor_keys$readNames, adrift_keys$readNames),
  ]

  keys <- data.frame(
    "readNames" = anchor_keys$readNames,
    "anchorSeqID" = factor(anchor_keys$seqID),
    "adriftSeqID" = factor(adrift_keys$seqID)
  )

  # Integer keys are the factor codes; readPairKey is "anchorKey:adriftKey".
  keys$anchorKey <- as.integer(keys$anchorSeqID)
  keys$adriftKey <- as.integer(keys$adriftSeqID)
  keys$readPairKey <- paste0(keys$anchorKey, ":", keys$adriftKey)

  # Print beginning of keys
  printHead(
    keys, 
    title = "Beginning of Key for relating reads to sequences",
    caption = paste0(
      "\tReads: ", length(unique(keys$readNames)), 
      "\n\tUnique Pairings: ", length(unique(keys$readPairKey))
    )
  )

}else if( length(args$keys) == 1 ){

  key_type <- stringr::str_extract(args$keys, "[\\w]+$")

  if( !key_type %in% c("csv", "tsv", "rds", "RData") ){
    stop(
      "\n  Output key file type not supported. ",
      "Please use csv, tsv, rds, or RData.\n"
    )
  }

  keys <- readKeyFile(args$keys, format = key_type)
  stopifnot(all(c("readNames", "anchorSeqID", "adriftSeqID") %in% names(keys)))

  if( nrow(keys) == 0 ){

    cat("\nNo sequences identified in key file.\n")
    writeNullOutput(args)
    q()

  }

  # Same key derivation as in the two-file case.
  keys$anchorSeqID <- factor(keys$anchorSeqID)
  keys$adriftSeqID <- factor(keys$adriftSeqID)
  keys$anchorKey <- as.integer(keys$anchorSeqID)
  keys$adriftKey <- as.integer(keys$adriftSeqID)
  keys$readPairKey <- paste0(keys$anchorKey, ":", keys$adriftKey)

  # Print beginning of keys
  printHead(
    keys, 
    title = "Beginning of Key for relating reads to sequences.",
    caption = paste0(
      "\n  Reads          : ", format(length(unique(keys$readNames)), big.mark = ","), 
      "\n  Unique Pairings: ", format(length(unique(keys$readPairKey)), big.mark = ",")
    )
  )

}

# Read the anchor and adrift PSL alignment files.
anchor_hits <- readPSL(args$anchorPSL)
adrift_hits <- readPSL(args$adriftPSL)

# Unless --keepAltChr was given, discard alignments whose target name contains
# an underscore. Dropping alternate chromosomes keeps a location from being
# reported on both the standard and the alternate sequence, which would turn
# it into a multihit.
if( !args$keepAltChr ){

  drop_alt_targets <- function(hits){
    on_alt <- stringr::str_detect(hits$tName, stringr::fixed("_"))
    hits[!on_alt, ]
  }

  anchor_hits <- drop_alt_targets(anchor_hits)
  adrift_hits <- drop_alt_targets(adrift_hits)

}


# With no key file supplied, derive `keys` from the alignments themselves:
# every identifier extracted (via --readNamePattern) from both the anchor and
# the adrift query names becomes a read that is its own sequence ID.
if( is.null(args$keys) ){

  shared_ids <- intersect(
    stringr::str_extract(unique(anchor_hits$qName), args$readNamePattern),
    stringr::str_extract(unique(adrift_hits$qName), args$readNamePattern)
  )

  keys <- data.frame(
    readNames = shared_ids,
    anchorSeqID = factor(shared_ids),
    adriftSeqID = factor(shared_ids)
  )

  # Integer keys are the factor codes; readPairKey is "anchorKey:adriftKey".
  keys$anchorKey <- as.integer(keys$anchorSeqID)
  keys$adriftKey <- as.integer(keys$adriftSeqID)
  keys$readPairKey <- paste0(keys$anchorKey, ":", keys$adriftKey)

  # Show the top of the derived key together with its read / pairing counts.
  n_reads <- format(length(unique(keys$readNames)), big.mark = ",")
  n_pairings <- format(length(unique(keys$readPairKey)), big.mark = ",")

  printHead(
    keys, 
    title = "Beginning of Key for relating reads to sequences",
    caption = paste0(
      "\n  Reads          :", n_reads, 
      "\n  Unique Pairings:", n_pairings
    )
  )

}


# Report how many alignments, from how many distinct query sequences, were
# read for each side.
n_anchor_algns <- nrow(anchor_hits)
n_anchor_seqs <- length(unique(anchor_hits$qName))

cat(sprintf(
  "\nAnchor Alignments: %1$s from %2$s sequences\n", 
  n_anchor_algns, n_anchor_seqs
))

n_adrift_algns <- nrow(adrift_hits)
n_adrift_seqs <- length(unique(adrift_hits$qName))

cat(sprintf(
  "\nAdrift Alignments: %1$s from %2$s sequences\n\n", 
  n_adrift_algns, n_adrift_seqs
))

# With either side empty there is nothing to couple: write null output, quit.
if( n_anchor_algns == 0 || n_adrift_algns == 0 ){

  cat("\nNo sequences aligned for at least one of the sequence pairs.\n")
  writeNullOutput(args)
  q()

}

# Keep only alignments whose query name is a known sequence ID in `keys`
# (reads without a partner were already removed from the keys).
known_anchor_ids <- levels(keys$anchorSeqID)
known_adrift_ids <- levels(keys$adriftSeqID)

anchor_hits <- anchor_hits[anchor_hits$qName %in% known_anchor_ids, ]
adrift_hits <- adrift_hits[adrift_hits$qName %in% known_adrift_ids, ]

# --- Anchor side ---
# Quality filter, exit early if nothing is left, then hand the surviving
# alignments to processBLATData() (conversion from data.frame to GRanges).
anchor_hits <- qualityFilter(
  alignments = anchor_hits, 
  global.identity.min = args$minPercentIdentity,
  q.start.max = args$maxAlignStart
)

if( nrow(anchor_hits) == 0 ){

  cat("\nNo alignments remaining after quality filtering anchor reads.\n")
  writeNullOutput(args)
  q()

}

anchor_hits <- processBLATData(
  algns = anchor_hits, from = "anchor", ref.genome = ref_genome
)

# Tag each alignment with the integer key of its sequence ID.
anchor_hits$anchorKey <- match(anchor_hits$qName, known_anchor_ids)

# --- Adrift side (same steps) ---
adrift_hits <- qualityFilter(
  alignments = adrift_hits, 
  global.identity.min = args$minPercentIdentity,
  q.start.max = args$maxAlignStart
)

if( nrow(adrift_hits) == 0 ){

  cat("\nNo alignments remaining after quality filtering adrift reads.\n")
  writeNullOutput(args)
  q()

}

adrift_hits <- processBLATData(
  algns = adrift_hits, from = "adrift", ref.genome = ref_genome
)

adrift_hits$adriftKey <- match(adrift_hits$qName, known_adrift_ids)

# Show the top of each filtered alignment set with alignment / read counts.
anchor_caption <- sprintf(
  "Alignments: %1$s from %2$s reads", 
  length(anchor_hits), length(unique(anchor_hits$qName))
)

printHead(
  anchor_hits,
  title = "Head of filtered anchor alignments",
  caption = anchor_caption
)

adrift_caption <- sprintf(
  "Alignments: %1$s from %2$s reads", 
  length(adrift_hits), length(unique(adrift_hits$qName))
)

printHead(
  adrift_hits,
  title = "Head of filtered adrift alignments",
  caption = adrift_caption
)

# If either side ended up empty, write null output and quit.
if( length(anchor_hits) == 0 || length(adrift_hits) == 0 ){

  cat(
    "\nNo alignments remaining after quality filtering",
    "for at least one of the sequence pairs.\n"
  )
  writeNullOutput(args)
  q()

}

# All alignments should be either "+" or "-" strand.
# strand() is called through its namespace, matching the explicit
# GenomicRanges:: calls used for the loci strands further down the script.
stopifnot(all(
  GenomicRanges::strand(anchor_hits) == "+" | 
    GenomicRanges::strand(anchor_hits) == "-"
))

stopifnot(all(
  GenomicRanges::strand(adrift_hits) == "+" | 
    GenomicRanges::strand(adrift_hits) == "-"
))

# Identify all combinations of unique anchor and adrift sequences present in the 
# data
unique_key_pairs <- unique(keys[,c("anchorKey", "adriftKey", "readPairKey")])

#' Collapse alignments to distinct genomic loci.
#' Each alignment is first shrunk to the single position at its start
#' (flank of width -1), then identical positions are merged. Adrift loci are 
#' the breakpoint positions and anchor loci the integration site positions. 
#' `revmap` records which alignments were merged into each locus.
#' Levels: Reads --> Unique Sequences --> Alignments --> Unique Genomic Loci
anchor_start_pos <- GenomicRanges::flank(anchor_hits, -1, start = TRUE)

red_anchor_hits <- GenomicRanges::reduce(
  anchor_start_pos, with.revmap = TRUE, min.gapwidth = 0L
)

adrift_start_pos <- GenomicRanges::flank(adrift_hits, -1, start = TRUE)

red_adrift_hits <- GenomicRanges::reduce(
  adrift_start_pos, with.revmap = TRUE, min.gapwidth = 0L
)

#' Find candidate anchor / adrift locus pairs.
#' This step only requires the two loci to lie within --maxTempLength of each
#' other, ignoring strand. Because findOverlaps() always matches one query
#' (anchor) locus to one subject (adrift) locus, every pairing has one of
#' each. The remaining criteria (opposite strands, and the adrift locus lying
#' downstream of the anchor relative to the anchor strand) are applied to the
#' `pairs` object afterwards.
pairs <- GenomicRanges::findOverlaps(
  red_anchor_hits, 
  red_adrift_hits,
  ignore.strand = TRUE,
  maxgap = args$maxTempLength
)

# No candidate pairs at all: write null output and quit.
n_candidate_pairs <- length(pairs)

if( n_candidate_pairs == 0 ){

  cat("\nNo alignments coupled based on input criteria.\n")
  writeNullOutput(args)
  q()

}

# Check isDownstream and isOppositeStrand
# For every candidate pair, pull the start coordinate and strand of the adrift
# locus (subject side of `pairs`) and of the anchor locus (query side), so the
# four vectors below are parallel to `pairs`.
adrift_loci_starts <- GenomicRanges::start(red_adrift_hits)[
  S4Vectors::subjectHits(pairs)
]

anchor_loci_starts <- GenomicRanges::start(red_anchor_hits)[
  S4Vectors::queryHits(pairs)
]

adrift_loci_strand <- GenomicRanges::strand(red_adrift_hits)[
  S4Vectors::subjectHits(pairs)
]

anchor_loci_strand <- GenomicRanges::strand(red_anchor_hits)[
  S4Vectors::queryHits(pairs)
]

# Orientation test, selected per pair by the anchor strand: for a "+" anchor
# the adrift start must be greater than the anchor start, otherwise it must be
# smaller. In both cases the two strands must differ.
keep_loci <- ifelse(
  anchor_loci_strand == "+", 
  as.vector(
    (adrift_loci_starts > anchor_loci_starts) & 
      (adrift_loci_strand != anchor_loci_strand)
  ), 
  as.vector(
    (adrift_loci_starts < anchor_loci_starts) & 
      (adrift_loci_strand != anchor_loci_strand)
  )
)

# Additionally reject any pair in which either locus carries the "*" strand.
keep_loci <- as.vector(
  (keep_loci & anchor_loci_strand != "*") & (adrift_loci_strand != "*")
)

pairs <- pairs[keep_loci]

# Stop if no loci were properly paired
if( length(pairs) == 0 ){

  cat("\nNo genomic loci from alignments were properly paired.\n")  
  writeNullOutput(args)
  q()

}

#' Below, the code constructs a genomic loci key which links genomic loci to
#' the various anchor and adrift sequences that were aligned. The technique used
#' below first matches the unique loci back to multiple alignments, then uses 
#' the indices of the unique_key_pairs data.frame (which matches alignments to 
#' unique sequence identifiers) as a GRanges object to match many alignments to 
#' many read identifiers with findOverlaps. For some reason, this method did not
#' work as anticipated with IRanges, and therefore objects were moved to GRanges
#' and GRangesLists.
# One row per retained pair: the index of the anchor locus in red_anchor_hits
# and of the adrift locus in red_adrift_hits.
loci_key <- data.frame(
  "anchorLoci" = S4Vectors::queryHits(pairs),
  "adriftLoci" = S4Vectors::subjectHits(pairs)
)

# "anchorLoci:adriftLoci" string identifying the locus pair.
loci_key$lociPairKey <- paste0(loci_key$anchorLoci, ":", loci_key$adriftLoci)

# Append *Loci ids to the anchor and adrift alignments
# revmap lists, for each reduced locus, the alignments merged into it, so
# unlisting it for the retained loci gives the alignment indices to annotate.
idx_passing_anchors <- unlist(red_anchor_hits$revmap[
  unique(loci_key$anchorLoci)
])

# Alignments not belonging to a retained locus keep NA. The Rle repeats each
# locus id once per alignment in its revmap entry, which lines the values up
# with idx_passing_anchors.
anchor_hits$anchorLoci <- NA
anchor_hits$anchorLoci[idx_passing_anchors] <- as.numeric(S4Vectors::Rle(
  values = unique(loci_key$anchorLoci), 
  lengths = lengths(red_anchor_hits$revmap[unique(loci_key$anchorLoci)])
))

# Same annotation for the adrift alignments.
idx_passing_adrifts <- unlist(red_adrift_hits$revmap[
  unique(loci_key$adriftLoci)
])

adrift_hits$adriftLoci <- NA
adrift_hits$adriftLoci[idx_passing_adrifts] <- as.numeric(S4Vectors::Rle(
  values = unique(loci_key$adriftLoci), 
  lengths = lengths(red_adrift_hits$revmap[unique(loci_key$adriftLoci)])
))

# Join the loci idx information up to the keys file
# Identify aligning keys
# i.e. the sequence keys with at least one alignment assigned to a retained
# locus (non-NA *Loci value).
aligned_anchor_keys <- unique(
  anchor_hits$anchorKey[!is.na(anchor_hits$anchorLoci)]
)

aligned_adrift_keys <- unique(
  adrift_hits$adriftKey[!is.na(adrift_hits$adriftLoci)]
)

# Construct an anchor/adrift key to loci IntegerList with indices
# List names are the sequence keys; each element holds the locus ids of that
# sequence's alignments.
anchor_key_to_loci <- with(
  as.data.frame(anchor_hits)[
    anchor_hits$anchorKey %in% aligned_anchor_keys &
      !is.na(anchor_hits$anchorLoci), 
    c("anchorKey", "anchorLoci")
  ],
  IRanges::IntegerList(split(anchorLoci, anchorKey))
)

adrift_key_to_loci <- with(
  as.data.frame(adrift_hits)[
    adrift_hits$adriftKey %in% aligned_adrift_keys &
      !is.na(adrift_hits$adriftLoci), 
    c("adriftKey", "adriftLoci")
    ],
  IRanges::IntegerList(split(adriftLoci, adriftKey))
)

# Construct readPairKey to lociKey object
unique_read_pair_keys <- unique(keys$readPairKey)

# readPairKey is "anchorKey:adriftKey": the first run of digits is the anchor
# key and the trailing run is the adrift key. Keep only the pairs where both
# sides have loci.
unique_read_pair_keys <- unique_read_pair_keys[
  stringr::str_extract(unique_read_pair_keys, "[\\d]+") %in% names(anchor_key_to_loci) &
    stringr::str_extract(unique_read_pair_keys, "[\\d]+$") %in% names(adrift_key_to_loci)
]


# For each locus id, the row indices of loci_key in which it occurs.
loci_key_anchor_idx <- IRanges::IntegerList(split(
  seq_along(loci_key$anchorLoci), loci_key$anchorLoci
))

loci_key_adrift_idx <- IRanges::IntegerList(split(
  seq_along(loci_key$adriftLoci), loci_key$adriftLoci
))

# Time sink -- warning
# Per read pair: loci_key rows reachable through the anchor sequence's loci.
rpk_anchor_loci_idx <- IRanges::IntegerList(lapply(
  anchor_key_to_loci[stringr::str_extract(unique_read_pair_keys, "[\\d]+")],
  function(x) unlist(loci_key_anchor_idx[as.character(x)], use.names = FALSE)
))

# Time sink -- warning
# Per read pair: loci_key rows reachable through the adrift sequence's loci.
rpk_adrift_loci_idx <- IRanges::IntegerList(lapply(
  adrift_key_to_loci[stringr::str_extract(unique_read_pair_keys, "[\\d]+$")],
  function(x) unlist(loci_key_adrift_idx[as.character(x)], use.names = FALSE)
))

# Rows reachable from both sides are the locus pairs supported by the read
# pair.
rpk_loci_idx <- IRanges::intersect(rpk_anchor_loci_idx, rpk_adrift_loci_idx)
names(rpk_loci_idx) <- unique_read_pair_keys

# Translate the row indices into lociPairKey strings, grouped by readPairKey.
rpk_loci_key <- IRanges::CharacterList(split(
  loci_key$lociPairKey[unlist(rpk_loci_idx)], S4Vectors::Rle(
    values = names(rpk_loci_idx), lengths = lengths(rpk_loci_idx)
  )
))

# Request a garbage collection after building the intermediate lists above.
gc()

# Classify readPairKeys by the number of locus pairs they map to: exactly one
# is unique, more than one is a multihit. Every readPairKey entry in `keys`
# that falls in neither group is treated as a chimera candidate.
loci_per_rpk <- lengths(rpk_loci_key)

unique_rpks <- names(rpk_loci_key)[loci_per_rpk == 1]
multihit_rpks <- names(rpk_loci_key)[loci_per_rpk > 1]

is_mapped_rpk <- keys$readPairKey %in% c(unique_rpks, multihit_rpks)
chimera_rpks <- keys$readPairKey[!is_mapped_rpk]

cat(
  "\nUnique sequences associated with types of alignments:\n",
  "  unique alignments  : ", format(length(unique_rpks), big.mark = ","), "\n",
  "  multihit alignments: ", format(length(multihit_rpks), big.mark = ","), "\n",
  "  chimera artifacts  : ", format(length(chimera_rpks), big.mark = ","), "\n"
)

# Build one genomic range per retained locus pair.
# The range spans from the anchor locus (integration site position) to the
# adrift locus (breakpoint). Its strand is the anchor locus strand, since the
# direction of sequencing is considered to be from the host-junction found at
# the 3' end of the integrated element. For "+" anchors the anchor start is
# the range start and the adrift start the range end; otherwise the two are
# swapped.
anchor_idx <- loci_key$anchorLoci
adrift_idx <- loci_key$adriftLoci

anchor_strand <- GenomicRanges::strand(red_anchor_hits[anchor_idx])
anchor_is_plus <- anchor_strand == "+"

anchor_pos <- GenomicRanges::start(red_anchor_hits)[anchor_idx]
adrift_pos <- GenomicRanges::start(red_adrift_hits)[adrift_idx]

coupled_loci <- GenomicRanges::GRanges(
  seqnames = GenomicRanges::seqnames(red_anchor_hits)[anchor_idx],
  ranges = IRanges::IRanges(
    start = ifelse(anchor_is_plus, anchor_pos, adrift_pos),
    end = ifelse(anchor_is_plus, adrift_pos, anchor_pos)
  ),
  strand = anchor_strand,
  seqinfo = GenomeInfoDb::seqinfo(ref_genome),
  lociPairKey = loci_key$lociPairKey
)

#' Show up to six randomly chosen coupled loci, sorted, with the total count.
n_coupled <- length(coupled_loci)

shown_loci <- sample.int(n_coupled, size = min(6, n_coupled), replace = FALSE)

printHead(
  sort(coupled_loci[shown_loci]),
  title = "Randomly sampled coupled loci present in the data.",
  caption = sprintf("Genomic loci: %s", n_coupled)
)

#' Without any coupled loci, write null output and quit.
if( length(coupled_loci) == 0 ){

  cat(
    "\nNo valid coupled genomic loci were found within", 
    "the data given input criteria.\n"
  )
  writeNullOutput(args)
  q()

}

#' Bin reads that would map to different loci on the same read (chimeras)
#' All unique and multihit templates are mapped successfully to 
#' genomic loci, yet some templates are sequenced but do not make it through
#' the selection criteria. These templates either do not have alignments to the
#' reference genome (anchor or adrift did not align) or map to two distant 
#' genomic loci. The latter are termed chimeras and are considered to be 
#' artifacts of PCR amplification.
#' Only runs when --chimeras was given; the result is written as an rds file.
if( !is.null(args$chimeras) ){

  # Every read whose readPairKey was neither unique nor multihit.
  failed_reads <- keys[keys$readPairKey %in% chimera_rpks,]

  # Chimeras are the failed reads for which both the anchor and the adrift
  # sequence still have filtered alignments.
  chimera_reads <- failed_reads[
    failed_reads$anchorKey %in% anchor_hits$anchorKey & 
      failed_reads$adriftKey %in% adrift_hits$adriftKey,
  ]

  # Stays an empty list when there are no chimera reads.
  chimera_alignments <- GenomicRanges::GRangesList()

  if( nrow(chimera_reads) > 0 ){

    chim_anchor <- anchor_hits[
      anchor_hits$anchorKey %in% chimera_reads$anchorKey,
    ]

    # Group the alignments by query name, select one group per chimera read
    # using its sequence ID, relabel the groups with the read names, then
    # flatten back to a single set of alignments.
    chim_anchor <- split(x = chim_anchor, f = chim_anchor$qName)
    chim_anchor <- chim_anchor[chimera_reads$anchorSeqID]
    names(chim_anchor) <- chimera_reads$readNames
    chim_anchor <- unlist(chim_anchor)

    chim_adrift <- adrift_hits[
      adrift_hits$adriftKey %in% chimera_reads$adriftKey,
    ]

    chim_adrift <- split(x = chim_adrift, f = chim_adrift$qName)
    chim_adrift <- chim_adrift[chimera_reads$adriftSeqID]
    names(chim_adrift) <- chimera_reads$readNames
    chim_adrift <- unlist(chim_adrift)

    # Metadata columns retained in the chimera output.
    keepCols <- c(
      "from", "qName", "matches", "repMatches", "misMatches", "qStart", "qEnd", 
      "qSize", "tBaseInsert"
    )

    # mcols() is called through its namespace, in line with the other
    # explicit S4Vectors:: calls in this script.
    S4Vectors::mcols(chim_anchor) <- S4Vectors::mcols(chim_anchor)[,keepCols]
    S4Vectors::mcols(chim_adrift) <- S4Vectors::mcols(chim_adrift)[,keepCols]

    # Combine both sides and regroup by read name.
    chimera_alignments <- c(chim_anchor, chim_adrift)
    chimera_alignments <- split(chimera_alignments, names(chimera_alignments))

  }

  if( args$stat != FALSE ){

    add_stat <- data.frame(
      sampleName = sampleName,
      metric = "chimera.reads",
      count = length(unique(chimera_reads$readNames))
    )

    stat <- rbind(stat, add_stat)

  }

  chimeraData <- list(
    "read_info" = chimera_reads, 
    "alignments" = chimera_alignments,
    "failed_reads" = failed_reads
  )

  writeOutputFile(chimeraData, file = args$chimeras, format = "rds")

}

#' Expand out uniquely mapped reads or unique sites
#' Below, the coupled_loci object is expanded to create the genomic alignments
#' for each read that mapped to a single genomic loci. The result, uniq_sites,
#' is a GRanges object where each row is a single read; it is written to the
#' --uniqOutput file. (The condensed per-site form is produced afterwards,
#' only when --condSites is given.)
# One range per unique readPairKey, in unique_rpks order.
uniq_templates <- coupled_loci[
  match(unlist(rpk_loci_key[unique_rpks]), coupled_loci$lociPairKey)
]

uniq_templates$readPairKey <- unique_rpks

# Expand from one row per readPairKey to one row per read.
uniq_keys <- keys[keys$readPairKey %in% unique_rpks,]

uniq_reads <- uniq_templates[
  match(uniq_keys$readPairKey, uniq_templates$readPairKey)
]

names(uniq_reads) <- as.character(uniq_keys$readNames)

# sampleName is the leading run of word characters / hyphens of the anchor
# sequence ID belonging to the read pair.
uniq_reads$sampleName <- stringr::str_extract(
  string = as.character(keys$anchorSeqID[
    match(uniq_reads$readPairKey, keys$readPairKey)
  ]), 
  pattern = "^[\\w-]+"
)

uniq_reads$ID <- names(uniq_reads)

# The read names now live in the ID column, so the range names are dropped
# before writing.
uniq_sites <- uniq_reads
names(uniq_sites) <- NULL
writeOutputFile(uniq_sites, file = args$uniqOutput)

# Print out head of uniq_sites for reference.
# reduce() / flank() are called through their namespace here, as they are in
# the stat computation below.
printHead(
  uniq_sites,
  title = "Head of uniquely mapped genomic loci",
  caption = sprintf(
    paste(
      "Alignments yeilded %1$s unique anchor sites from %2$s", 
      "properly-paired and aligned reads."
    ),
    length(GenomicRanges::reduce(
      x = GenomicRanges::flank(uniq_sites, width = -1, start = TRUE), 
      min.gapwidth = 0L
    )),
    length(uniq_sites)
  )
)

if( args$stat != FALSE ){

  # Distinct read IDs, distinct ranges, and distinct start positions.
  add_stat <- data.frame(
    sampleName = sampleName,
    metric = c("unique.reads", "unique.algns", "unique.loci"),
    count = c(
      length(unique(uniq_sites$ID)), 
      length(unique(uniq_sites)),
      length(GenomicRanges::reduce(
        x = GenomicRanges::flank(uniq_sites, width = -1, start = TRUE), 
        min.gapwidth = 0L
      ))
    )
  )

  stat <- rbind(stat, add_stat)

}

# Condensed sites are produced and written only when --condSites was given.
if( !is.null(args$condSites) ){

  cond_sites <- condenseSites(
    uniq_sites, list.bp.counts = TRUE, keep.cols = "sampleName"
  )

  writeOutputFile(cond_sites, file = args$condSites)

  # Summarise the condensed sites: site count, summed fragLengths and summed
  # counts columns.
  cond_caption <- sprintf(
    paste(
      "There were %1$s unique anchor sites identified with a total", 
      "of %2$s unique template lengths and %3$s read counts."
    ),
    length(cond_sites),
    sum(cond_sites$fragLengths),
    sum(cond_sites$counts)
  )

  printHead(
    cond_sites,
    title = "Head of unique anchor sites",
    caption = cond_caption
  )

}

# Expansion and clustering of multihits

# Group and characterize multihits 
# Multihits are reads that align to multiple locations in the reference 
# genome. There are bound to always be a certain proportion of reads aligning
# to repeated sequence due to the high level degree of repeated DNA elements
# within genomes. The final object generated, "multihitData", is a list of 
# three objects. "unclustered_multihits" is a GRanges object where every 
# alignment for every multihit read is present in rows. 
# "clustered_multihit_positions" returns all the possible integration site 
# positions for the multihit. Lastly, "clustered_multihit_lengths" contains the
# length of the templates mapping to the multihit clusters, used for
# abundance calculations.
# Only runs when --multihits was given; the result is written as an rds file.
if( !is.null(args$multihits) ){

  # Empty defaults, written as-is when there are no multihit read pairs.
  unclustered_multihits <- GenomicRanges::GRanges()
  clustered_multihit_positions <- GenomicRanges::GRangesList()
  clustered_multihit_lengths <- list()

  if( length(multihit_rpks) > 0 ){

    #' Only consider readPairKeys that aligned to multiple genomic loci
    multihit_templates <- coupled_loci[
      coupled_loci$lociPairKey %in% unlist(rpk_loci_key[multihit_rpks])
    ]

    # One row per (readPairKey, locus pair) combination, in rpk order.
    multihit_templates <- multihit_templates[
      match(unlist(rpk_loci_key[multihit_rpks]), multihit_templates$lociPairKey)
    ]

    multihit_templates$readPairKey <- as.character(S4Vectors::Rle(
      values = multihit_rpks, lengths = lengths(rpk_loci_key[multihit_rpks])
    ))

    #' As the loci are expanded from the coupled_loci object, unique templates 
    #' and readPairKeys are present in the readPairKeys unlisted from the 
    #' paired_loci object.
    multihit_keys <- keys[keys$readPairKey %in% multihit_rpks,]

    multihit_keys$sampleName <- stringr::str_extract(
      string = as.character(multihit_keys$anchorSeqID), pattern = "^[\\w-]+"
    )

    multihit_keys$ID <- multihit_keys$readNames

    #' Medians are based on all the potential sites for a given read, which will
    #' be identical for all reads associated with a readPairKey.
    multihit_medians <- round(
      median(GenomicRanges::width(split(
        x = multihit_templates, 
        f = multihit_templates$readPairKey
      )))
    )

    multihit_keys$medians <- multihit_medians[multihit_keys$readPairKey]

    # Reduce every template to the single position at its start, then merge
    # positions lying within the gap width of each other.
    multihits_pos <- GenomicRanges::flank(
      x = multihit_templates, width = -1, start = TRUE
    )

    multihits_red <- GenomicRanges::reduce(
      x = multihits_pos, min.gapwidth = 5L, with.revmap = TRUE
    )  #! Should make min.gapwidth a option

    revmap <- multihits_red$revmap

    # Build an edge list linking, for each merged position, the readPairKey of
    # its lowest-indexed template to the readPairKey of every template merged
    # into that position.
    axil_nodes <- as.character(S4Vectors::Rle(
      values = multihit_templates$readPairKey[min(revmap)], 
      lengths = lengths(revmap)
    ))

    nodes <- multihit_templates$readPairKey[unlist(revmap)]
    edgelist <- unique(matrix( c(axil_nodes, nodes), ncol = 2 ))

    # Connected components of that graph are the multihit clusters.
    multihits_cluster_data <- igraph::clusters(
      igraph::graph.edgelist(el = edgelist, directed = FALSE)
    )

    # NOTE(review): assumes the membership vector follows the order in which
    # vertices first appear in the row-wise edge list -- confirm against
    # igraph.
    clus_key <- data.frame(
      row.names = unique(as.character(t(edgelist))),
      "clusID" = multihits_cluster_data$membership
    )

    multihits_pos$clusID <- clus_key[multihits_pos$readPairKey, "clusID"]
    multihits_pos <- multihits_pos[order(multihits_pos$clusID)]
    # mcols() is called through its namespace, in line with the other
    # explicit S4Vectors:: calls in this script.
    clustered_multihit_index <- as.data.frame(S4Vectors::mcols(multihits_pos))

    multihit_loci_rle <- S4Vectors::Rle(factor(
      x = clustered_multihit_index$lociPairKey, 
      levels = unique(clustered_multihit_index$lociPairKey)
    ))

    multihit_loci_intL <- split(
      multihit_loci_rle, clustered_multihit_index$clusID
    )

    # Positions (without metadata) of the run values of each cluster's
    # lociPairKey Rle.
    clustered_multihit_positions <- GenomicRanges::granges(
      x = multihits_pos[
        match(
          x = unlist(S4Vectors::runValue(multihit_loci_intL)), 
          table = clustered_multihit_index$lociPairKey)
      ]
    )

    # Regroup those positions by cluster.
    clustered_multihit_positions <- split(
      x = clustered_multihit_positions,
      f = S4Vectors::Rle(
        values = seq_along(multihit_loci_intL), 
        lengths = S4Vectors::width(S4Vectors::runValue(
          multihit_loci_intL
        )@partitioning)
      )
    )

    readPairKey_cluster_index <- unique(
      clustered_multihit_index[,c("readPairKey", "clusID")]
    )

    multihit_keys$clusID <- readPairKey_cluster_index$clusID[
      match(multihit_keys$readPairKey, readPairKey_cluster_index$readPairKey)
    ]

    multihit_keys <- multihit_keys[order(multihit_keys$medians),]

    # Median template lengths of the reads in each cluster, stored as Rle.
    clustered_multihit_lengths <- split(
      x = S4Vectors::Rle(multihit_keys$medians), 
      f = multihit_keys$clusID
    )

    #' Expand the multihit_templates object from readPairKey specific to read
    #' specific.
    multihit_keys <- multihit_keys[order(multihit_keys$readPairKey),]

    # For each readPairKey, the rows of multihit_keys (reads) carrying it.
    multihit_readPair_read_exp <- IRanges::IntegerList(
      split(x = seq_len(nrow(multihit_keys)), f = multihit_keys$readPairKey)
    )

    unclustered_multihits <- multihit_templates

    multihit_readPair_read_exp <- multihit_readPair_read_exp[
      as.character(unclustered_multihits$readPairKey)
    ]

    # Repeat each template once per read sharing its readPairKey.
    unclustered_multihits <- unclustered_multihits[S4Vectors::Rle(
      values = seq_along(unclustered_multihits),
      lengths = S4Vectors::width(multihit_readPair_read_exp@partitioning)
    )]

    names(unclustered_multihits) <- multihit_keys$ID[
      unlist(multihit_readPair_read_exp)
    ]

    unclustered_multihits$ID <- multihit_keys$ID[
      unlist(multihit_readPair_read_exp)
    ]

    unclustered_multihits$sampleName <- multihit_keys$sampleName[
      unlist(multihit_readPair_read_exp)
    ]

  }

  # Positions and lengths must describe the same set of clusters.
  stopifnot(
    length(clustered_multihit_positions) == length(clustered_multihit_lengths)
  )

  multihitData <- list(
    unclustered_multihits, 
    clustered_multihit_positions, 
    clustered_multihit_lengths
  )

  names(multihitData) <- c(
    "unclustered_multihits", 
    "clustered_multihit_positions", 
    "clustered_multihit_lengths"
  )

  writeOutputFile(multihitData, file = args$multihits, format = "rds")

  printHead(
    data.frame(
      "multihit_reads" = length(unique(names(unclustered_multihits))),
      "multihit_alignments" = length(unique(unclustered_multihits)),
      "multihit_clusters" = length(clustered_multihit_positions),
      "multihit_lengths" = sum(lengths(clustered_multihit_lengths))
    ),
    title = "Multihit metrics", 
    caption = "Metrics highlighting the observation of multiple aligning reads."
  )

  if( args$stat != FALSE ){

    add_stat <- data.frame(
      sampleName = sampleName,
      metric = c("multihit.reads", "multihit.lengths", "multihit.clusters"),
      count = c(
        length(unique(names(unclustered_multihits))), 
        sum(lengths(clustered_multihit_lengths)), 
        length(clustered_multihit_positions))
    )

    stat <- rbind(stat, add_stat)

  }

}

# Write the accumulated stat rows as comma-separated text, with no header row,
# no row names and no quoting.
if( args$stat != FALSE ){

  utils::write.table(
    stat, 
    file = args$stat, 
    quote = FALSE, 
    sep = ",", 
    col.names = FALSE, 
    row.names = FALSE
  )

}

# Optionally save the whole workspace to the --saveImage path.
if( !is.null(args$saveImage) ){
  save.image(args$saveImage)
}

# End the R session.
q()

R From line 10 of rscripts/couple.R

options(stringsAsFactors = FALSE, scipen = 99, width = 999)

# Directory of this script, taken from the "--file=" entry of the full 
# command line. The argument descriptions are read from a yaml file below it.
script_arg <- grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
code_dir <- dirname(sub(pattern = "--file=", replacement = "", x = script_arg))

desc_path <- file.path(code_dir, "descriptions/demulti.desc.yml")
desc <- yaml::yaml.load_file(desc_path)

# Set up and gather command line arguments ----
## Argument parser ----
parser <- argparse::ArgumentParser(
  description = desc$program_short_description,
  usage = "Rscript demulti.R [-h/--help, -v/--version] [optional args]"
)

## Argument specifications ----
## Each entry holds the exact arguments of one parser$add_argument() call. 
## Entries are registered in the order listed; help text comes from `desc`.
arg_specs <- list(
  list("-m", "--manifest", type = "character", help = desc$manifest),
  list("--read1", type = "character", default = "NA", help = desc$read1),
  list("--read2", type = "character", default = "NA", help = desc$read2),
  list("--idx1", type = "character", default = "NA", help = desc$idx1),
  list("--idx2", type = "character", default = "NA", help = desc$idx2),
  list(
    "-o", "--outfolder", nargs = 1, type = "character", help = desc$outfolder
  ),
  list(
    "--bc1", nargs = 1, type = "character", default = "I1", help = desc$bc1
  ),
  list(
    "--bc2", nargs = 1, type = "character", default = "I2", help = desc$bc2
  ),
  list(
    "--bc1Man", nargs = 1, type = "character", default = "barcode1", 
    help = desc$bc1Man
  ),
  list(
    "--bc2Man", nargs = 1, type = "character", default = "barcode2", 
    help = desc$bc2Man
  ),
  list(
    "--bc1Len", nargs = 1, type = "integer", default = 8, help = desc$bc1Len
  ),
  list(
    "--bc2Len", nargs = 1, type = "integer", default = 8, help = desc$bc2Len
  ),
  list("--maxMis", nargs = 1, type = "integer", help = desc$maxMis),
  list(
    "--bc1Mis", nargs = 1, type = "integer", default = 0, help = desc$bc1Mis
  ),
  list(
    "--bc2Mis", nargs = 1, type = "integer", default = 0, help = desc$bc2Mis
  ),
  list("--maxN", nargs = 1, type = "integer", default = 1, help = desc$maxN),
  list(
    "--stat", nargs = 1, type = "character", default = FALSE, help = desc$stat
  ),
  list(
    "-c", "--cores", nargs = 1, default = 1, type = "integer", 
    help = desc$cores
  ),
  list("--compress", action = "store_true", help = desc$compress),
  list("-p", "--poolreps", action = "store_true", help = desc$poolreps),
  list("--singleBarcode", action = "store_true", help = desc$singleBarcode),
  list(
    "--readNamePattern", nargs = 1, type = "character", 
    default = "[\\w\\:\\-\\+]+", 
    help = desc$readNamePattern
  )
)

invisible(lapply(arg_specs, function(spec){
  do.call(parser$add_argument, spec)
}))


args <- parser$parse_args(commandArgs(trailingOnly = TRUE))


# Table of read types and their input files. A file that was not supplied on 
# the command line keeps the argument default, the string "NA".
demulti <- data.frame(
  "readType" = c("R1", "R2", "I1", "I2"),
  "path" = c(args$read1, args$read2, args$idx1, args$idx2)
)

# Flag which read type carries barcode 1 and which carries barcode 2.
demulti$bc1 <- grepl(args$bc1, demulti$readType)
demulti$bc2 <- grepl(args$bc2, demulti$readType)

# The checks below index with these flags and need exactly one match, 
# otherwise the `if` conditions would be empty or longer than one.
if( sum(demulti$bc1) != 1 ){
  stop("Barcode 1 must match exactly one read type: R1, R2, I1, or I2.\n")
}

# Availability is tested on the path column: a read type is never "NA", only
# the path of a file that was not provided is.
if( demulti$path[demulti$bc1] == "NA" ){
  stop("Barcode 1 is set to a read type that is not provided.\n")
}

if( args$singleBarcode ){

  # Barcode 2 is unused in single barcode mode, so it is not validated.
  demulti$bc2 <- FALSE

}else{

  if( sum(demulti$bc2) != 1 ){
    stop("Barcode 2 must match exactly one read type: R1, R2, I1, or I2.\n")
  }

  if( demulti$readType[demulti$bc1] == demulti$readType[demulti$bc2] ){
    stop("Please select different read types for barcodes 1 and 2.\n")
  }

  if( demulti$path[demulti$bc2] == "NA" ){
    stop("Barcode 2 is set to a read type that is not provided.\n")
  }

}

# A value given for maxMis replaces both barcode specific mismatch settings.
if( !is.null(args$maxMis) ){
  args$bc1Mis <- args$maxMis
  args$bc2Mis <- args$maxMis
}


# Tabulate the parsed arguments (multi-valued arguments are comma separated)
# and print the selected ones, in a fixed order, to the log.
arg_values <- vapply(
  seq_along(args), 
  function(i) paste(args[[i]], collapse = ", "), 
  character(1)
)

input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = arg_values
)

display_order <- c(
  "manifest :", "idx1 :", "idx2 :", "read1 :", "read2 :", 
  "outfolder :", "stat :", "bc1 :", "bc2 :", "bc1Man :", "bc2Man :", 
  "bc1Len :", "bc2Len :", "bc1Mis :", "bc2Mis :", "cores :", "compress :", 
  "poolreps :", "singleBarcode :",  "readNamePattern :"
)

input_table <- input_table[match(display_order, input_table$Variables), ]

cat("Demultiplex Inputs:\n")
print(
  x = data.frame(input_table, row.names = NULL), 
  row.names = FALSE,
  right = FALSE
)

# Create output directory if not currently available ----
# dir.create() is used instead of a shell `mkdir` call so that paths with 
# spaces or shell metacharacters are taken literally; missing parent 
# directories are created as well.
if( !file.exists(args$outfolder) ){

  created <- dir.create(
    args$outfolder, recursive = TRUE, showWarnings = FALSE
  )

  # dir.create() also returns FALSE when the directory appeared in the 
  # meantime, which is not a failure here.
  if( !created && !dir.exists(args$outfolder) ){
    stop("Cannot create output folder.\n")
  }

}

# Check for required packages ----
# Packages are only checked for installation here, they are not attached.
required_packs <- c("stringr", "ShortRead", "Biostrings")
present_packs <- required_packs %in% row.names(installed.packages())

if( any(!present_packs) ){

  pack_status <- data.frame(
    "R-Packages" = required_packs, 
    "Installed" = present_packs, 
    row.names = NULL
  )

  cat("Missing required r-packages:\n")
  print(pack_status, right = FALSE, row.names = FALSE)

  stop("Check dependancies.\n")

}

# Operating functions ----
# Bin reads by the sample barcode found at the start of each read.
#
# Arguments:
#   barcode.seqs    Barcode sequences to look for; duplicates are dropped.
#   reads           Reads carrying the barcode. They are passed to 
#                   ShortRead::narrow() and ShortRead::sread().
#   indices         Identifiers returned for the reads, one per read and in 
#                   the same order. Defaults to the read positions.
#   barcode.length  Number of leading bases of each read (and of each barcode)
#                   that are compared. Defaults to the longest read width.
#   max.mismatch    Passed to Biostrings::vmatchPattern() as max.mismatch.
#   max.N.count     Sequences with more than this many "N" characters are 
#                   binned as "degenerate".
#
# Returns:
#   `indices` split into a list with one element per unique barcode, followed
#   by the elements "ambiguous" (matched more than one barcode), "degenerate" 
#   and "unassigned" (matched no barcode). 
parseIndexReads <- function(barcode.seqs, reads, indices = NULL, 
                            barcode.length = NULL, max.mismatch = 1L, 
                            max.N.count = 1L){

  if( is.null(indices) ) indices <- seq_along(reads)
  # NOTE(review): width() is called without a namespace prefix, unlike the 
  # other package calls in this function - confirm it resolves when 
  # barcode.length is left NULL.
  if( is.null(barcode.length) ) barcode.length <- max(width(reads))

  # Trim reads to the barcode length and work on the unique sequences only
  n_reads <- ShortRead::narrow(reads, start = 1, end = barcode.length)
  unique_index_seqs <- unique(ShortRead::sread(n_reads))

  # Trim barcode if necessary
  barcode_seqs <- as.character(
    Biostrings::DNAStringSet(
      unique(barcode.seqs), 
      start = 1, 
      end = barcode.length
    )
  )

  # For each barcode, positions (within unique_index_seqs) of the sequences 
  # that the barcode matches exactly once within the allowed mismatches
  bc_to_unique_idxs <- lapply(
    barcode_seqs, 
    function(x){

      vmp <- Biostrings::vmatchPattern(
        pattern = x,
        subject = unique_index_seqs, 
        max.mismatch = max.mismatch, 
        fixed = FALSE
      )

      which(lengths(vmp) == 1)

    }
  )

  # Lookup frame to match barcode sequences to index sequences
  # Sequence variability accounted for and ambiguous, degenerate, and unassigned
  # sequences identified
  degenerate_idxs <- which(
    stringr::str_count(unique_index_seqs, "N") > max.N.count
  )

  # Sequences matched by more than one barcode
  ambiguous_idxs <- as.numeric(names(table(unlist(bc_to_unique_idxs)))[
    table(unlist(bc_to_unique_idxs)) > 1
    ])

  # Degenerate takes precedence over ambiguous
  ambiguous_idxs <- ambiguous_idxs[!ambiguous_idxs %in% degenerate_idxs]

  # Sequences matched by no barcode
  unassigned_idxs <- seq_along(unique_index_seqs)[
    !seq_along(unique_index_seqs) %in% unlist(bc_to_unique_idxs)
    ]

  # Degenerate takes precedence over unassigned
  unassigned_idxs <- unassigned_idxs[!unassigned_idxs %in% degenerate_idxs]

  # Leave only the cleanly assigned sequences with each barcode
  bc_to_unique_idxs <- lapply(bc_to_unique_idxs, function(x){
    x[!x %in% c(ambiguous_idxs, unassigned_idxs, degenerate_idxs)]
  })

  # One row per unique sequence: the bin it belongs to and the sequence. The 
  # bin labels are repeated by bin size, in the same order the sequences are
  # concatenated below.
  lookup_frame <- data.frame(
    bc_seqs = factor(S4Vectors::Rle(
      values = c(unique(barcode.seqs), "ambiguous", "degenerate", "unassigned"),
      lengths = c(
        lengths(bc_to_unique_idxs), length(ambiguous_idxs), 
        length(degenerate_idxs), length(unassigned_idxs)
      )
    ), 
    levels = c(unique(barcode.seqs), "ambiguous", "degenerate", "unassigned")
    ),
    index_seqs = unique_index_seqs[
      c(unlist(bc_to_unique_idxs), ambiguous_idxs, 
        degenerate_idxs, unassigned_idxs)
      ]
  )

  # Look up the bin of every read by its trimmed sequence and split the read
  # identifiers by bin
  return(split(
    indices, 
    lookup_frame$bc_seqs[
      match(as.character(ShortRead::sread(n_reads)), lookup_frame$index_seqs)
    ]
  ))

}

# Write the reads of one sample and read type to a fastq file.
#
# The file is named <samplename>.<type>.fastq inside `outfolder`, with a .gz
# suffix when `compress` is TRUE. An existing file of that name is removed
# first. Returns list(file path, type, outfolder).
writeDemultiplexedSequences <- function(reads, quals, samplename, type, 
                                        outfolder, compress){

  file_name <- paste0(samplename, ".", type, ".fastq")
  if( compress ) file_name <- paste0(file_name, ".gz")
  file_path <- file.path(outfolder, file_name)

  if( file.exists(file_path) ) unlink(file_path)

  Biostrings::writeXStringSet(
    x = reads, 
    filepath = file_path, 
    format = "fastq", 
    qualities = quals,
    compress = compress
  )

  cat(
    paste0("Wrote ", length(reads), " reads to:\n  ", file_path, ".\n")
  )

  return(list(file_path, type, outfolder))

}

# Load manifest / sample mapping file ----
# The format is chosen from the file extension: yaml/yml, csv or tsv. The
# result, samples_df, has the columns sampleName, bc1 and (unless running in
# single barcode mode) bc2.
if( is.null(args$manifest) ){
  stop("A manifest file is required (-m/--manifest).\n")
}

file_ext <- unlist(strsplit(args$manifest, "\\."))
file_ext <- file_ext[length(file_ext)]

if( file_ext %in% c("yaml", "yml") ){

  if( !"yaml" %in% row.names(installed.packages()) ){
    stop("Package:yaml not loaded or installed.\n")
  }

  manifest <- yaml::yaml.load_file(args$manifest)

  # Barcodes are read from each entry of `samples`, using the manifest field
  # names given by bc1Man / bc2Man
  if( args$singleBarcode ){

    samples_df <- data.frame(
      "sampleName" = names(manifest$samples),
      "bc1" = sapply( manifest$samples, function(x) x[args$bc1Man] ),
      row.names = NULL
    )

  }else{

    samples_df <- data.frame(
      "sampleName" = names(manifest$samples),
      "bc1" = sapply( manifest$samples, function(x) x[args$bc1Man] ),
      "bc2" = sapply( manifest$samples, function(x) x[args$bc2Man] ),
      row.names = NULL
    )

  }

}else if( file_ext %in% c("csv", "tsv") ){

  if( file_ext == "csv" ){
    manifest <- read.csv(args$manifest)
  }else{
    manifest <- read.delim(args$manifest)
  }

  if( args$singleBarcode ){
    samples_df <- manifest[, c("sampleName", args$bc1Man)]
    names(samples_df) <- c("sampleName", "bc1")
  }else{
    samples_df <- manifest[, c("sampleName", args$bc1Man, args$bc2Man)]
    names(samples_df) <- c("sampleName", "bc1", "bc2")
  }

}else{

  # Without this branch `manifest` would be left undefined and the script 
  # would fail further down with an unrelated error.
  stop(
    "Unsupported manifest file extension '", file_ext, "'. ",
    "Please provide a yaml, yml, csv, or tsv file.\n"
  )

}


# Every sample needs its own barcode (single barcode mode) or its own 
# combination of barcodes (dual barcode mode).
if( args$singleBarcode ){
  unique_samples <- anyDuplicated(samples_df[, "bc1"]) == 0
}else{
  unique_samples <- anyDuplicated(samples_df[, c("bc1", "bc2")]) == 0
}

if( !unique_samples ) stop("Ambiguous barcoding of samples. Please correct.\n")

# Read in barcode sequences ----
bc1_reads <- ShortRead::readFastq(demulti$path[demulti$bc1])

# Read names, as captured by readNamePattern, are used to identify reads
bc1_read_ids <- as.character(ShortRead::id(bc1_reads))
all_indices <- stringr::str_extract(bc1_read_ids, args$readNamePattern)

if( any(table(all_indices) != 1) ){
  stop(
    "\n  Read names are not unique, check input sequence files or ",
    "adjust readNamePattern parameter.\n")
}

cat(paste("\nReads to demultiplex : ", length(bc1_reads), "\n"))

# Bin reads by barcode ----
# With more than one core the reads are cut into chunks that are parsed on a
# cluster and the per-chunk bins are concatenated; otherwise all reads are
# parsed in a single call.

# Parse one set (or chunk) of reads against the barcodes in column `bc` 
# ("bc1" or "bc2") of samples_df. Kept at the top level and given everything 
# it needs as arguments so that it can be run on cluster nodes.
parseBarcodeChunk <- function(reads, idx, parseIndexReads, samples_df, args, 
                              bc){
  parseIndexReads(
    barcode.seqs = samples_df[[bc]],
    reads = reads,
    indices = idx,
    barcode.length = args[[paste0(bc, "Len")]],
    max.mismatch = args[[paste0(bc, "Mis")]],
    max.N.count = args$maxN
  )
}

# Parse all reads for one barcode, on `cluster` when one is given.
parseBarcodeReads <- function(reads, idx, bc, cluster = NULL){

  if( is.null(cluster) ){
    return(
      parseBarcodeChunk(reads, idx, parseIndexReads, samples_df, args, bc)
    )
  }

  # Same chunk assignment for the reads and their identifiers
  chunk_ids <- ceiling( seq_along(reads) / (length(reads)/args$cores) )

  parsed_chunks <- parallel::clusterMap(
    cluster,
    parseBarcodeChunk,
    reads = split(reads, chunk_ids),
    idx = split(idx, chunk_ids),
    MoreArgs = list(
      parseIndexReads = parseIndexReads, 
      samples_df = samples_df,
      args = args,
      bc = bc
    ),
    SIMPLIFY = FALSE
  )

  # Concatenate each bin across chunks, bins taken from the first chunk
  bins <- names(parsed_chunks[[1]])

  merged <- lapply(bins, function(bin){
    unlist(lapply(seq_along(parsed_chunks), function(i){
      parsed_chunks[[i]][[bin]]
    }))
  })

  names(merged) <- bins
  merged

}

if( args$cores > 1 ){
  cluster <- parallel::makeCluster(min(c(parallel::detectCores(), args$cores)))
}else{
  cluster <- NULL
}

BC1_parsed <- parseBarcodeReads(bc1_reads, all_indices, "bc1", cluster)

cat("\nbc1 breakdown:\n")
print(
  data.frame(
    "bc1" = names(BC1_parsed),
    "Read Counts" = lengths(BC1_parsed)
  ),
  right = TRUE, 
  row.names = FALSE
)

if( !args$singleBarcode ){

  bc2_reads <- ShortRead::readFastq(demulti$path[demulti$bc2])

  bc2_indices <- stringr::str_extract(
    as.character(ShortRead::id(bc2_reads)), 
    args$readNamePattern
  )

  if( !all(bc2_indices == all_indices) ){
    warning(
      "  Index reads are not in the same order. Sequencing files should ",
      "always be kept in order across read types.\n")
  }

  BC2_parsed <- parseBarcodeReads(bc2_reads, bc2_indices, "bc2", cluster)

}

if( !is.null(cluster) ) parallel::stopCluster(cluster)

# Report the number of reads in each barcode 2 bin (dual barcode mode only)
if( !args$singleBarcode ){

  bc2_breakdown <- data.frame(
    "bc2" = names(BC2_parsed),
    "Read Counts" = lengths(BC2_parsed)
  )

  cat("\nbc2 breakdown:\n")
  print(bc2_breakdown, right = TRUE, row.names = FALSE)

}

# Assign reads to samples ----
# As there is some flexibility in the barcode matching, some reads may be 
# assigned to multiple samples (ambiguous). Additionally, uncalled bases can
# lead to degenerate sequences (a cause of ambiguous matching), or many 
# sequences will be unassigned.
if( args$singleBarcode ){

  # One barcode: the bins from barcode 1 are used as they are
  demultiplexed_indices <- BC1_parsed[samples_df$bc1]

  degenerate_indices <- BC1_parsed$degenerate
  ambiguous_indices <- BC1_parsed$ambiguous
  unassigned_indices <- BC1_parsed$unassigned

}else{

  # Two barcodes: a read belongs to a sample when it is in both of the 
  # sample's barcode bins
  demultiplexed_indices <- mapply(
    function(bc1, bc2){
      Biostrings::intersect(BC1_parsed[[bc1]], BC2_parsed[[bc2]])
    },
    bc1 = samples_df$bc1,
    bc2 = samples_df$bc2,
    SIMPLIFY = FALSE
  )

  names(demultiplexed_indices) <- paste0(
    samples_df$bc1, samples_df$bc2
  )

  # Pool the special bins of both barcodes, in the order of precedence
  # degenerate, ambiguous, unassigned
  degenerate_indices <- unique(c(BC1_parsed$degenerate, BC2_parsed$degenerate))

  ambiguous_indices <- unique(c(BC1_parsed$ambiguous, BC2_parsed$ambiguous))
  is_degenerate <- ambiguous_indices %in% degenerate_indices
  ambiguous_indices <- ambiguous_indices[!is_degenerate]

  unassigned_indices <- unique(c(BC1_parsed$unassigned, BC2_parsed$unassigned))
  is_binned <- unassigned_indices %in% c(degenerate_indices, ambiguous_indices)
  unassigned_indices <- unassigned_indices[!is_binned]

  # Reads in any special bin are removed from the samples
  excluded_indices <- c(
    unassigned_indices, ambiguous_indices, degenerate_indices
  )

  demultiplexed_indices <- lapply(demultiplexed_indices, function(x){
    x[!x %in% excluded_indices]
  })

  # Reads left in no bin at all are added to the unassigned reads
  binned_indices <- c(
    unlist(demultiplexed_indices), degenerate_indices, 
    ambiguous_indices, unassigned_indices
  )

  unassigned_indices <- c(
    unassigned_indices, all_indices[!all_indices %in% binned_indices]
  )

}

# Reads by sample
samples_df$read_counts <- lengths(demultiplexed_indices)
cat("\nRead counts for each sample.\n")
print(samples_df, split.tables = Inf)

# Ambiguous, degenerate, and unassigned reads
n_ambiguous <- length(ambiguous_indices)
n_degenerate <- length(degenerate_indices)
n_unassigned <- length(unassigned_indices)

cat(paste0("\nAmbiguous reads: ", n_ambiguous, "\n"))
cat(paste0("Degenerate reads: ", n_degenerate, "\n"))
cat(paste0("Unassigned reads: ", n_unassigned, "\n"))

# Write the counts to the stat file in the output folder, if requested. Rows
# are "<name>.demulti,reads,<count>" without a header.
if( args$stat != FALSE ){

  stat_names <- c(
    samples_df$sampleName, "ambiguous_reads", 
    "degenerate_reads", "unassigned_reads"
  )

  demulti_stat <- data.frame(
    sampleName = paste0(stat_names, ".demulti"),
    metric = "reads",
    count = c(
      samples_df$read_counts, n_ambiguous, n_degenerate, n_unassigned
    )
  )

  write.table(
    demulti_stat,
    file = file.path(args$outfolder, args$stat),
    sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE
  )

}

# Create multiplex dataframe for subseting sequencing files ----
# One row per read: the bin it is written to (a sample name, or "ambiguous",
# "degenerate", "unassigned") and its read name.

# Rows for all reads of one special bin
binFrame <- function(bin, indices){
  data.frame(
    "sampleName" = rep(bin, length(indices)),
    "index" = indices,
    row.names = NULL
  )
}

multiplexed_data <- rbind(
  data.frame(
    "sampleName" = S4Vectors::Rle(
      values = samples_df$sampleName, 
      length = lengths(demultiplexed_indices)
    ),
    "index" = unlist(demultiplexed_indices),
    row.names = NULL
  ),
  binFrame("ambiguous", ambiguous_indices),
  binFrame("degenerate", degenerate_indices),
  binFrame("unassigned", unassigned_indices)
)

bin_levels <- c(
  samples_df$sampleName, "ambiguous", "degenerate", "unassigned"
)

multiplexed_data$sampleName <- factor(
  multiplexed_data$sampleName, levels = bin_levels
)

stopifnot( all(multiplexed_data$index %in% all_indices) )

# Pooling replicates strips a trailing "-<number>" from the sample names
if( args$poolreps ){
  multiplexed_data$sampleName <- gsub("-\\d+$", "", multiplexed_data$sampleName)
}

cat(paste0("Reads to be written to files: ", nrow(multiplexed_data), "\n"))

# Write files to read files to outfolder directory ----
# Every provided sequence file is read, split by the bin of each read and 
# written as one file per bin. Files are handled one after another; with 
# more than one core the bins of a file are written on a cluster.

# Split one sequence file by bin and write the parts. `cluster` is NULL for
# serial writing.
splitAndWriteReads <- function(read.file.path, read.type, cluster, args,
                               multiplexed.data, writeDemultiplexedSequences){

  reads <- ShortRead::readFastq(read.file.path)

  ids <- Biostrings::BStringSet(
    stringr::str_extract(
      as.character(reads@id), args$readNamePattern
    )
  )

  seqs <- reads@sread
  names(seqs) <- ids

  quals <- reads@quality@quality

  # Position in this file of every read to be written
  read_order <- match(multiplexed.data$index, as.character(ids))

  seqs <- split(seqs[read_order], multiplexed.data$sampleName)
  quals <- split(quals[read_order], multiplexed.data$sampleName)

  write_args <- list(
    type = read.type,
    outfolder = args$outfolder,
    compress = args$compress
  )

  if( is.null(cluster) ){

    demultiplex <- mapply(
      writeDemultiplexedSequences,
      reads = seqs,
      quals = quals,
      samplename = names(seqs),
      MoreArgs = write_args
    )

  }else{

    demultiplex <- parallel::clusterMap(
      cluster,
      writeDemultiplexedSequences,
      reads = seqs,
      quals = quals,
      samplename = names(seqs),
      MoreArgs = write_args
    )

  }

}

if( args$cores > 1 ){
  cluster <- parallel::makeCluster(min(c(parallel::detectCores(), args$cores)))
  write_cluster <- cluster
}else{
  write_cluster <- NULL
}

# Read types for which a file was provided, and their paths
read_list <- demulti$readType[demulti$path != "NA"]
read_paths <- demulti$path[match(read_list, demulti$readType)]

written_seq_files <- mapply(
  splitAndWriteReads,
  read.file.path = read_paths,
  read.type = read_list,
  MoreArgs = list(
    cluster = write_cluster,
    multiplexed.data = multiplexed_data,
    writeDemultiplexedSequences = writeDemultiplexedSequences,
    args = args
  ),
  SIMPLIFY = FALSE
)

if( !is.null(write_cluster) ) parallel::stopCluster(write_cluster)

cat("Demultiplexing complete.\n")
q()

R Biostrings From line 9 of rscripts/demulti.R

options(stringsAsFactors = FALSE, scipen = 99, width = 180)


# Set up and gather command line arguments ----
parser <- argparse::ArgumentParser(
  description = "Evaluation of iGUIDE data from input run(s).",
  usage = paste(
    "iguide eval <config(s)> -o <output> [-h/--help, -v/--version]",
    "[optional args]"
  )
)

## Each entry holds the exact arguments of one parser$add_argument() call; 
## entries are registered in the order listed.
arg_specs <- list(
  list(
    "config", nargs = "+", type = "character",
    help = paste(
      "Run specific config file(s) in yaml format. Can specify more than",
      "one to combine several runs together for evaluation."
    )
  ),
  list(
    "-o", "--output", nargs = 1, type = "character", required = TRUE,
    help = "Output eval file, .rds format. i.e. output.rds or output"
  ),
  list(
    "-s", "--support", nargs = 1, type = "character",
    help = paste(
      "Supplementary data input, csv or tsv format. Only one file. Must have",
      "'specimen' column and only specimens matching data in this column will",
      "be considered for evaluation."
    )
  ),
  list(
    "--stat", nargs = 1, type = "character", default = FALSE, 
    help = paste(
      "File name to be written in output directory of read couts for each",
      "sample. CSV file format. ie. test.stat.csv."
    )
  ),
  list(
    "--override", action = "store_true", 
    help = "Override software and build version control checks."
  ),
  list(
    "-q", "--quiet", action = "store_true", 
    help = "Hide standard output messages."
  ),
  list(
    "--iguide_dir", nargs = 1, type = "character", default = "IGUIDE_DIR",
    help = paste0(
      "iGUIDE install directory path, do not change for normal applications."
    )
  )
)

invisible(lapply(arg_specs, function(spec){
  do.call(parser$add_argument, spec)
}))


args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# Resolve the iGUIDE install directory. The argument is used as a path when 
# it is an existing directory, otherwise it is taken as the name of an 
# environment variable holding the path.
if( dir.exists(args$iguide_dir) ){
  root_dir <- args$iguide_dir
}else{
  root_dir <- Sys.getenv(args$iguide_dir)
}

if( !dir.exists(root_dir) ){
  stop(paste0("\n  Cannot find install path to iGUIDE: ", root_dir, ".\n"))
}else{
  args$iguide_dir <- root_dir
}

## Determine output file name and path
## The dot is escaped so that only a literal ".rds" suffix counts as the 
## extension; an unescaped dot would also accept names such as "outputrds".
if( !stringr::str_detect(args$output, "\\.rds$") ){
  args$output <- paste0(args$output, ".rds")
}

## The file is created briefly so that normalizePath() can resolve it, then
## removed again.
write(c(), file = args$output)
args$output <- normalizePath(args$output)
unlink(args$output)

## Construct input table and print to terminal
## Multi-valued arguments are shown comma separated; only the listed 
## arguments are kept, in the order given.
arg_values <- vapply(
  seq_along(args), 
  function(i) paste(args[[i]], collapse = ", "), 
  character(1)
)

input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = arg_values
)

display_order <- c("config :", "output :", "support :", "iguide_dir :")
input_table <- input_table[match(display_order, input_table$Variables), ]

if( !args$quiet ){

  cat("\niGUIDE Evaluation Inputs:\n")
  print(data.frame(input_table), row.names = FALSE, right = FALSE)

}

# Load dependancies ----
if( !args$quiet ) cat("\nLoading dependencies.\n")

add_packs <- c("magrittr", "knitr", "iguideSupport")

# Attach each package, recording by name whether that succeeded
add_packs_loaded <- suppressMessages(
  vapply(add_packs, require, logical(1), character.only = TRUE)
)

if( any(!add_packs_loaded) ){

  load_status <- data.frame(
    "R-Packages" = names(add_packs_loaded), 
    "Loaded" = add_packs_loaded
  )

  print(load_status, right = FALSE, row.names = FALSE)

  stop("Check dependancies.\n")

}


# Import metadata and consolidate objects ----
if( !args$quiet ) cat("Importing experimental data and configurations.\n\n")

## Load config files
## Each path is tried relative to the install directory first, then as given.
configs <- lapply(args$config, function(x){

  candidates <- c(file.path(root_dir, x), x)
  found <- candidates[file.exists(candidates)]

  if( length(found) == 0 ){
    stop("\n  Cannot find config file: ", x, ".\n")
  }

  yaml::yaml.load_file(found[1])

})

names(configs) <- sapply(configs, "[[", "Run_Name")

## Load reference genome 
## A Ref_Genome entry containing ".fa" is read as a sequence file (fastq when 
## it contains ".fastq", fasta otherwise); anything else is looked up among 
## the installed BSgenome packages. The extensions are matched as fixed 
## strings: as regular expressions the dot would match any character and a
## genome name such as "Sscrofa" would be mistaken for a file.
ref_genome_name <- unique(sapply(configs, "[[", "Ref_Genome"))

if( grepl(".fa", ref_genome_name, fixed = TRUE) ){

  # The file may be given relative to the install directory or as is
  root_ref_path <- file.path(root_dir, ref_genome_name)

  if( !(file.exists(root_ref_path) | file.exists(ref_genome_name)) ){
    stop("\n  Specified reference genome file not found.\n")
  }

  ref_file_type <- ifelse(
    grepl(".fastq", ref_genome_name, fixed = TRUE), 
    "fastq", 
    "fasta"
  )

  if( file.exists(root_ref_path) ){

    ref_genome <- Biostrings::readDNAStringSet(
      filepath = root_ref_path,
      format = ref_file_type
    )

  }else{

    ref_genome <- Biostrings::readDNAStringSet(
      filepath = ref_genome_name, 
      format = ref_file_type
    )

  }

}else{

  # The name has to match exactly one installed genome package
  genome <- grep(
    pattern = ref_genome_name, 
    x = unique(BSgenome::installed.genomes()), 
    value = TRUE
  )

  if( length(genome) == 0 ){

    cat("\nInstalled genomes include:")
    print(unique(BSgenome::installed.genomes()))
    cat("\n  Selected reference genome not in list.\n")
    stop("\n  Genome not available.\n")

  }else if( length(genome) > 1 ){

    cat("\nInstalled genomes include:\n")
    print(unique(BSgenome::installed.genomes()))
    cat(
      "\n  Please be more specific about reference genome.",
      "Multiple matches to input."
    )
    stop("\n  Multiple genomes requested.\n")

  }

  suppressMessages(library(genome, character.only = TRUE))

  # The genome object carries the same name as its package
  ref_genome <- get(genome)

}

## Get versioning
## Software version is read from the .version file of the install directory,
## build version(s) are extracted from the names of build files under etc/.
version_file <- file.path(root_dir, ".version")
soft_version <- as.character(read.delim(file = version_file, header = FALSE))

etc_files <- list.files(file.path(root_dir, "etc"))
build_files <- grep(pattern = "build.b[0-9\\.]+.*", x = etc_files, value = TRUE)

build_version <- stringr::str_extract(
  build_files, pattern = "b[0-9]+\\.[0-9]+\\.[0-9]+"
)


## Load reference files
## All reference data is taken from the first (primary) config file. 
ref_genes <- suppressMessages(loadRefFiles(
  configs[[1]]$refGenes, 
  type = "GRanges", 
  freeze = configs[[1]]$Ref_Genome,
  root = root_dir
))

onco_genes <- suppressMessages(loadRefFiles(
  configs[[1]]$oncoGeneList, 
  type = "gene.list", 
  freeze = configs[[1]]$Ref_Genome,
  root = root_dir
))

## `freeze` previously referenced `config`, an object that is not defined in 
## this script; it is read from `configs` like the other two calls.
special_genes <- suppressMessages(loadRefFiles(
  configs[[1]]$specialGeneList, 
  type = "gene.list", 
  freeze = configs[[1]]$Ref_Genome,
  root = root_dir
))

submat <- banmat()

## Determine processing parameters
## Some parameters follow an "all or nothing" approach across runs:
##   - UMItags
##   - recoverMultihits
##   - Abundance_Method [Fragment, UMI, or Read based]
## Depending on these, other parameters (upstream/downstream_dist, ...) may 
## need to be consistent between runs; otherwise the primary config file (the
## first one) is used for parameterization.

## Values of one config entry across all config files
configValues <- function(key) unlist(lapply(configs, "[[", key))

umitag_option <- all(configValues("UMItags"))
multihit_option <- all(configValues("recoverMultihits"))

## First of the (lower-cased) abundance methods, with a fallback when no
## config file specifies one
abundance_option <- unique(tolower(configValues("Abundance_Method")))[1]

if( is.na(abundance_option) ) abundance_option <- "Fragment"

if( abundance_option == "umi" & !umitag_option ){
  stop(
    "\n  Abundance method has been set to use UMItags, yet the current",
    "\n  configuration does not capture UMItag data (UMItags : FALSE).",
    "\n  Please correct this inconsistency before continuing analysis."
  )
}

if( multihit_option ){

  ## With multihit recovery, all runs have to agree on these settings
  upstream_dist <- unique(sapply(configs, function(x) x$upstreamDist))
  downstream_dist <- unique(sapply(configs, function(x) x$downstreamDist))
  pile_up_min <- unique(sapply(configs, function(x) x$pileUpMin))

  setting_counts <- c(
    length(upstream_dist), length(downstream_dist), length(pile_up_min)
  )

  if( any(setting_counts > 1) ){

    stop(
      "\n  Inconsistant upstream or downstream distances between config files.",
      "\n  Comparisons between groups with different run specific criteria", 
      "\n  is not recommended when considering the recover multihit option.\n"
    )

  }

}else{

  primary_config <- configs[[1]]
  upstream_dist <- primary_config$upstreamDist
  downstream_dist <- primary_config$downstreamDist
  pile_up_min <- primary_config$pileUpMin

}

max_target_mismatch <- configs[[1]]$maxTargetMismatch


## Combine sampleInfo files
## Sample info of all runs is stacked, with the run recorded in "run_set".
sample_info <- dplyr::bind_rows(
  lapply(configs, loadSampleInfo, root_dir), 
  .id = "run_set"
)

sample_name_col <- unique(sapply(configs, "[[", "Sample_Name_Column"))

if( length(sample_name_col) != 1 ){
  stop("\n  Sample_Info files not in same format.\n")
}

## The specimen is the leading run of word characters of the sample name;
## specimens are ordered by first appearance.
specimen_ids <- stringr::str_extract(
  string = sample_info[,sample_name_col], 
  pattern = "[\\w]+"
)

specimen_levels <- unique(specimen_ids)

sample_info$specimen <- factor(specimen_ids, levels = specimen_levels)


## Load in supporting information ----
## When a support file is given, only specimens listed in its 'specimen' 
## column are kept, and they are ordered as in that file.
if( length(args$support) > 0 ){

  ## The path is tried relative to the install directory first, then as given
  support_candidates <- c(file.path(root_dir, args$support), args$support)
  support_found <- support_candidates[file.exists(support_candidates)]

  if( length(support_found) == 0 ){
    stop("\n  Cannot find supporting data file: ", args$support, ".\n")
  }

  support_path <- support_found[1]

  supp_data <- data.table::fread(support_path, data.table = FALSE)

  specimen_levels <- supp_data$specimen[supp_data$specimen %in% specimen_levels]

  supp_data <- dplyr::filter(supp_data, specimen %in% specimen_levels)
  supp_data <- dplyr::mutate(
    supp_data, specimen = factor(specimen, levels = specimen_levels)
  )

  sample_info <- dplyr::filter(sample_info, specimen %in% specimen_levels)
  sample_info <- dplyr::mutate(
    sample_info,
    specimen = factor(as.character(specimen), levels = specimen_levels)
  )
  sample_info <- dplyr::arrange(sample_info, specimen)

}else{

  supp_data <- data.frame()

}


## Identify on-target edit sites from config files
## The result is a vector of targets named by their identifier, or NULL when
## no config file lists on-target sites.
on_target_sites <- lapply(configs, "[[", "On_Target_Sites")

if( any(lengths(on_target_sites) > 0) ){

  target_seqs <- unlist(on_target_sites)

  on_target_df <- data.frame(
    id = names(target_seqs), target = target_seqs, row.names = NULL
  )

  ## Reduce the flattened names to the identifier: the first "." becomes ":",
  ## the trailing run of allowed characters is kept, and of that the leading 
  ## run without apostrophes.
  on_target_df <- dplyr::mutate(
    on_target_df,
    id = stringr::str_replace(
      string = id, pattern = stringr::fixed("."), replacement = ":"
    ),
    id = stringr::str_extract(string = id, pattern = "[\\w\\_\\-\\'\\.]+$"),
    id = stringr::str_extract(string = id, pattern = "[\\w\\_\\-\\.]+")
  )

  on_target_df <- dplyr::distinct(on_target_df)

  on_targets <- structure(on_target_df$target, names = on_target_df$id)

}else{

  on_targets <- NULL

}

## Identify nuclease profiles used
## Profiles of all config files are pooled; for a name that occurs more than
## once the first profile is kept. NULL when no config file has profiles.
config_profiles <- lapply(configs, "[[", "Nuclease_Profiles")

if( any(lengths(config_profiles) > 0) ){

  nuc_profiles <- unlist(unname(config_profiles), recursive = FALSE)
  nuc_profiles <- nuc_profiles[!duplicated(names(nuc_profiles))]

}else{

  nuc_profiles <- NULL

}

## Create reference tables for nuclease, treatment, and combinations (combos)
## Both tables are limited to the specimens under evaluation and ordered by
## specimen. Entries of "mock", "none" or "control" (in any letter case) are 
## relabelled as "Mock".
mock_labels <- c("mock", "none", "control")

nuclease_df <- lapply(configs, getNucleaseInfo, root_dir) %>%
  dplyr::bind_rows(.id = "run_set") %>%
  dplyr::filter(specimen %in% specimen_levels) %>%
  dplyr::mutate(
    specimen = factor(specimen, levels = specimen_levels),
    is_mock = tolower(nuclease) %in% mock_labels,
    nuclease = ifelse(is_mock, "Mock", nuclease)
  ) %>%
  dplyr::select(-is_mock) %>%
  dplyr::arrange(specimen)

treatment_df <- lapply(configs, getTreatmentInfo, root_dir) %>%
  dplyr::bind_rows(.id = "run_set") %>%
  dplyr::filter(specimen %in% specimen_levels) %>%
  dplyr::mutate(
    specimen = factor(specimen, levels = specimen_levels),
    is_mock = tolower(treatment) %in% mock_labels,
    treatment = ifelse(is_mock, "Mock", treatment)
  ) %>%
  dplyr::select(-is_mock) %>%
  dplyr::arrange(specimen)

nuclease_treatment_unmod_df <- dplyr::left_join(
  nuclease_df, treatment_df, by = c("run_set", "specimen")
)

# Distinct non-mock nuclease/treatment pairs, each labelled with a symbol from
# combo_symbols().
combos_tbl <- nuclease_treatment_unmod_df %>%
  dplyr::filter(
    tolower(nuclease) != "mock" & tolower(treatment) != "mock"
  ) %>%
  dplyr::distinct(nuclease, treatment) %>%
  dplyr::mutate(combo = combo_symbols(seq_len(dplyr::n()))) %>%
  dplyr::select(combo, nuclease, treatment)

# Fall back to a single all-mock combination when no non-mock pair exists.
if( nrow(combos_tbl) == 0 ){
  combos_tbl <- data.frame(combo = "A", nuclease = "Mock", treatment = "Mock")
}

# Same pairs as above but kept per run_set, with the combo symbol joined on.
combos_set_tbl <- nuclease_treatment_unmod_df %>%
  dplyr::filter(
    tolower(nuclease) != "mock" & tolower(treatment) != "mock"
  ) %>%
  dplyr::distinct(run_set, nuclease, treatment) %>%
  dplyr::left_join(combos_tbl, by = c("nuclease", "treatment")) %>%
  dplyr::select(run_set, combo, nuclease, treatment)

# Fall back to one all-mock row per run_set found in `sample_info`.
if( nrow(combos_set_tbl) == 0){
  combos_set_tbl <- data.frame(
    run_set = unique(sample_info$run_set), 
    combo = "A",
    nuclease = "Mock", 
    treatment = "Mock"
  )
}

## Mock analyses should be compared against all combos to get an understanding
## of the background signal that was captured. To do this, Mock samples are 
## duplicated and analyzed against the different combinations. Combinations are
## indicated by a letter at the end of annotations, i.e. (A).
## 
# Look-up lists of combos keyed by nuclease and by treatment. The "Mock" entry
# of each list holds the complete combo table.
nuclease_combos_list <- split(combos_tbl, combos_tbl$nuclease)
treatment_combos_list <- split(combos_tbl, combos_tbl$treatment)
nuclease_combos_list$Mock <- combos_tbl
treatment_combos_list$Mock <- combos_tbl

# Split key: one group per run_set / specimen pair, in order of appearance.
nuc_treat_split_vec <-  paste(
    nuclease_treatment_unmod_df$run_set, nuclease_treatment_unmod_df$specimen
  ) %>%
  factor(levels = unique(.))

nuclease_treatment_df <- split(
    nuclease_treatment_unmod_df, nuc_treat_split_vec
  ) %>%
  lapply(function(x){

    # NOTE(review): the scalar `if` conditions below assume each split holds a
    # single row -- confirm against getNucleaseInfo / getTreatmentInfo.
    if( tolower(x$treatment) == "mock" ){

      # Mock treatment: take the combos listed under this specimen's nuclease
      # (every combo when the nuclease is also "Mock").
      nuclease_combos_list[[
          match(unique(x$nuclease), names(nuclease_combos_list))
        ]] %>%
        dplyr::mutate(
          run_set = unique(x$run_set),
          specimen = unique(x$specimen)
        ) %>%
        return(.)

    }else if( tolower(x$nuclease) == "mock"){

      # Mock nuclease: take the combos listed under this specimen's treatment.
      treatment_combos_list[[
          match(unique(x$treatment), names(treatment_combos_list))
        ]] %>%
        dplyr::mutate(
          run_set = unique(x$run_set),
          specimen = unique(x$specimen)
        ) %>%
        return(.)

    }else{

      # Fully treated specimen: attach its own combo symbol.
      dplyr::left_join(x, combos_tbl, by = c("nuclease", "treatment")) %>%
        return(.)

    }

  }) %>%
  dplyr::bind_rows() %>%
  dplyr::group_by(run_set) %>%
  dplyr::mutate(
    # alt_specimen is the specimen name suffixed with its combo, e.g. "x(A)"
    alt_specimen = paste0(as.character(specimen), "(", combo, ")"),
    alt_specimen = factor(alt_specimen, levels = unique(alt_specimen))
  ) %>%
  dplyr::ungroup()

alt_specimen_levels <- levels(nuclease_treatment_df$alt_specimen)


## Create vector objects for treatment and nuclease for later processing
## `nuclease` and `treatment` are lists keyed by alt_specimen; each element is
## the character vector obtained by splitting the stored value on ";".
nuclease <- stats::setNames(
  object = strsplit(nuclease_treatment_df$nuclease, ";"),
  nm = as.character(nuclease_treatment_df$alt_specimen)
)

treatment <- stats::setNames(
  object = strsplit(nuclease_treatment_df$treatment, ";"),
  nm = as.character(nuclease_treatment_df$alt_specimen)
)

## Map every specimen to the alt_specimen name(s) derived from it
combos_exp_specimen_list <- split(
  x = as.character(nuclease_treatment_df$alt_specimen),
  f = as.character(nuclease_treatment_df$specimen)
)



## Identify all target sequences used from config files
## Sequences are upper-cased and concatenated across configs; the concatenated
## names are then split into a leading group part and the remaining target name.
target_seqs <- lapply(
  do.call(c, lapply(configs, "[[", "Target_Sequences")), 
  toupper
)

# Leading run of word, "-" and "_" characters of each name, used as the group
# (it is stored as `run_set` in the table below).
target_grps <- stringr::str_extract(
  string = names(target_seqs), 
  pattern = "[\\w\\-\\_]+"
)

# Strip the group prefix plus one following character from each name.
# NOTE(review): the trailing "." in the pattern is unescaped, so it matches any
# character -- presumably a literal "." separator is intended; confirm.
names(target_seqs) <- sub("[\\w\\-\\_]+.", "", names(target_seqs), perl = TRUE)
target_seqs <- split(target_seqs, target_grps)

# Long-format table: one row per group / target with its sequence.
target_seqs_df <- data.frame(
  run_set = as.character(
    S4Vectors::Rle(names(target_seqs), lengths(target_seqs))
  ),
  target = as.character(unlist(lapply(target_seqs, names))),
  sequence = as.character(unlist(target_seqs))
)


## Identify PAM sequences associated with nucleases
## The upper-cased `PAM` entry of every nuclease profile is collected per
## config, then regrouped using the same name-splitting scheme as the target
## sequences.
pam_seqs <- do.call(c, lapply(configs, function(x){
  toupper(unlist(lapply(x$Nuclease_Profiles, "[[", "PAM")))
}))

# Leading run of word, "-" and "_" characters of each name, used as the group.
pam_grps <- stringr::str_extract(
  string = names(pam_seqs), 
  pattern = "[\\w\\-\\_]+"
)

# Strip the group prefix plus one following character from each name.
# NOTE(review): the trailing "." in the pattern is unescaped, so it matches any
# character -- presumably a literal "." separator is intended; confirm.
names(pam_seqs) <- sub("[\\w\\-\\_]+.", "", names(pam_seqs), perl = TRUE)
pam_seqs <- split(pam_seqs, pam_grps)

# Long-format table: one row per group / nuclease with its PAM.
pam_seqs_df <- data.frame(
  run_set = as.character(S4Vectors::Rle(names(pam_seqs), lengths(pam_seqs))),
  nuclease = as.character(unlist(lapply(pam_seqs, names))),
  PAM = as.character(unlist(pam_seqs))
)


## Combine into a single table for output
# Every treatment (target) and nuclease name that is actually in use.
considered_target_seqs <- unique(unlist(treatment))
considered_nucleases <- unique(unlist(nuclease))

# Keep only on-target sites whose name is among the targets in use.
on_targets <- on_targets[names(on_targets) %in% considered_target_seqs]

# Expand each run_set / nuclease / treatment combination into one row per
# individual target (treatments may list several targets separated by ";"),
# then discard duplicates and mock targets.
target_tbl <- combos_set_tbl %>%
  split(paste(.$run_set, .$nuclease, .$treatment)) %>%
  lapply(function(x){
    data.frame(
      run_set = x$run_set,
      nuclease = x$nuclease,
      target = unlist(strsplit(x$treatment, ";"))
    )
  }) %>%
  dplyr::bind_rows() %>%
  dplyr::distinct() %>%
  dplyr::filter(tolower(target) != "mock")

if( nrow(target_tbl) > 0 ){

  # Attach the target sequence and the nuclease PAM to each row.
  target_tbl <- target_tbl %>%
    dplyr::left_join(target_seqs_df, by = c("run_set", "target")) %>%
    dplyr::left_join(pam_seqs_df, by = c("run_set", "nuclease")) %>%
    dplyr::filter(
      target %in% considered_target_seqs & nuclease %in% considered_nucleases
    )

}else{

  # Zero-row table with the same columns as the joined result above.
  target_tbl <- data.frame(
    run_set = vector(mode = "character"),
    nuclease = vector(mode = "character"),
    target = vector(mode = "character"),
    sequence = vector(mode = "character"),
    PAM = vector(mode = "character")
  )

}

# Unique target / sequence / PAM rows, and the sequences as a named
# DNAStringSet.
uniq_target_df <- target_tbl %>%
  dplyr::distinct(target, sequence, PAM)

uniq_target_seqs <- Biostrings::DNAStringSet(
  structure(uniq_target_df$sequence, names = uniq_target_df$target),
  use.names = TRUE
)

### Log combination treatment table
if( !args$quiet ){
  cat("\nNuclease and Treatment Combination Table:\n")
  print(combos_set_tbl, right = FALSE, row.names = FALSE)

  cat("\nTarget Sequence Table:\n")
  print(target_tbl, right = FALSE, row.names = FALSE)
}


## Consolidate supplementary data ----
# Without a supporting data file, the specimen overview is the nuclease /
# treatment table (with capitalised column names); otherwise it is the
# supplementary table tagged with a constant run_set of "supp_data".
if( is.null(args$support) ){
  spec_overview <- nuclease_treatment_unmod_df %>%
    dplyr::rename("Nuclease" = nuclease, "Treatment" = treatment)
}else{
  spec_overview <- supp_data %>%
    dplyr::mutate(run_set = "supp_data")
}

# One annotation string per specimen, built by collapsing every column other
# than run_set and specimen with " - ". "Mock" is always included as a level.
annot_overview <- spec_overview %>%
  dplyr::mutate(
    annotation = vcollapse(
      d = dplyr::select(spec_overview, -run_set, -specimen), 
      sep = " - ", 
      fill = "NA"
    ),
    annotation = factor(annotation, levels = c(unique(c(annotation, "Mock"))))
  ) %>%
  dplyr::select(specimen, annotation)

# Per-alt_specimen overview; the annotation gains the combo suffix " (X)".
combo_overview <- nuclease_treatment_df %>%
  dplyr::left_join(annot_overview, by = "specimen") %>%
  dplyr::mutate(
    annotation = paste0(as.character(annotation), " (", combo, ")"),
    annotation = factor(annotation, levels = unique(annotation))
  )


# Beginning analysis ----
if( !args$quiet ) cat("\nStarting analysis...\n")

## Read in experimental data and concatenate different sets
## For every config the run's incorp_sites RDS file is looked up first under
## `root_dir` and then relative to the working directory. The `reads` table of
## each file is tagged with the software and build version it was produced
## with, the tables are stacked with the config name as `run_set`, and only
## specimens present in `spec_overview` are kept.
input_data <- lapply(configs, function(x){
    name <- x$Run_Name

    path <- file.path(
      "analysis", name, paste0("output/incorp_sites.", name ,".rds")
    )

    if( file.exists(file.path(root_dir, path)) ){
      y <- readRDS(file.path(root_dir, path))
    }else if( file.exists(path) ){
      y <- readRDS(path)
    }else{
      # Report the file path that was searched for, not the config object,
      # so the error message is readable.
      stop("\n  Cannot find incorp_sites file: ", path, ".\n")
    }

    y$reads %>%
      dplyr::mutate(
        soft.version = y$soft_version,
        build.version = y$build_version
      )

  }) %>%
  dplyr::bind_rows(.id = "run_set") %>%
  dplyr::mutate(
    # The specimen is the leading run of word characters of the sample name
    specimen = stringr::str_extract(sampleName, pattern = "[\\w]+")
  ) %>%
  dplyr::filter(specimen %in% spec_overview$specimen)

# Unless multihits are requested, keep only rows typed as "uniq"
if( !multihit_option ){
  input_data <- dplyr::filter(input_data, type == "uniq")
}

## Check versioning for imported data ----
## All run sets are expected to share one software version and one build
## version. A mismatch stops the analysis unless `args$override` is set, in
## which case only a warning is raised.
vc_check <- input_data %>%
  dplyr::distinct(run_set, soft.version, build.version)

# The version columns are only needed for this check
input_data <- dplyr::select(input_data, -soft.version, -build.version)

cat("\nVersioning:\n")
print(vc_check, right = FALSE, row.names = FALSE)

# Both operands are scalars, so use the short-circuit `||` in the condition
if( dplyr::n_distinct(vc_check$soft.version) > 1 ||
      dplyr::n_distinct(vc_check$build.version) > 1 ){

  if( args$override ){
    warning("\n  Data processed under different software versions.")
  }else{
    stop("\n  Data processed with inconsistent software versions.")
  }

}

## Format input alignments ----
## Determine abundance metrics, with or without UMItags
# Quosures spliced into summarise() below:
#   count   - sum of `contrib` within the group
#   umitag  - sum of `contrib` over the first occurrence of each non-NA umitag
#             (a constant 0 when UMItags are not in use)
#   contrib - maximum `contrib` within the group
algnmts_summaries <- list(
  count = dplyr::quo(sum(contrib)),
  umitag = if( umitag_option ){
    dplyr::quo(sum(
      as.integer(!duplicated(umitag[!is.na(umitag)])) * contrib[!is.na(umitag)]
    ))
  }else{
    dplyr::quo(0)
  },
  contrib = dplyr::quo(max(contrib))
)

# Drop any NULL entries before splicing
algnmts_summaries <- algnmts_summaries[!sapply(algnmts_summaries, is.null)]

# One row per unique alignment (position, strand, specimen, sample). `abund`
# is the metric selected by `abundance_option`: "umi" -> umitag,
# "read" -> count, anything else -> contrib.
algnmts_unmod <- input_data %>%
  dplyr::arrange(desc(contrib)) %>%
  dplyr::group_by(seqnames, start, end, strand, specimen, sampleName) %>%
  dplyr::summarise(!!! algnmts_summaries) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    abund = dplyr::case_when(
      abundance_option == "umi" ~ umitag,
      abundance_option == "read" ~ count,
      TRUE ~ contrib
    )
  ) %>%
  as.data.frame()

## Attach alt_specimen names to the alignments. A specimen that maps to several
## alt_specimen names has its alignments repeated once per name; a specimen
## without any mapping keeps its own name as alt_specimen.
algnmts <- algnmts_unmod %>%
  split(.$specimen) %>%
  lapply(function(spec_algns){

    alt_ids <- combos_exp_specimen_list[[unique(spec_algns$specimen)]]
    n_alt <- length(alt_ids)

    # No mapping available: fall back to the specimen name itself
    if( n_alt == 0 ){
      spec_algns$alt_specimen <- spec_algns$specimen
      return(spec_algns)
    }

    # Exactly one alt name: label the rows in place
    if( n_alt == 1 ){
      spec_algns$alt_specimen <- alt_ids
      return(spec_algns)
    }

    # Several alt names: stack one full copy of the rows per name
    n_rows <- nrow(spec_algns)
    stacked <- spec_algns[rep(seq_len(n_rows), n_alt),]
    stacked$alt_specimen <- rep(alt_ids, each = n_rows)
    stacked

  }) %>%
  dplyr::bind_rows()


## Generate a sample table of the data for log purposes
## Up to ten randomly chosen alignment rows are printed, followed by the total
## number of alignments.
sample_size <- min(nrow(algnmts), 10)

sample_rows <- sample(
  seq_len(nrow(algnmts)), size = sample_size, replace = FALSE
)

cat("\nSample of aligned templates:\n")

print(data.frame(algnmts[sample_rows,]), right = FALSE, row.names = FALSE)

cat(paste0("\nNumber of alignments: ", nrow(algnmts), "\n"))

# The sampling helpers are not needed beyond this point
rm(sample_size, sample_rows)

## Transform the data into a GRanges object
## Coordinates come from the alignment table; sequence information comes from
## the reference genome.
algnmts_gr <- GenomicRanges::GRanges(
  seqnames = algnmts$seqnames,
  ranges = IRanges::IRanges(start = algnmts$start, end = algnmts$end),
  strand = algnmts$strand,
  seqinfo = GenomeInfoDb::seqinfo(ref_genome)
)

# Metadata columns; "umitag" is only included when UMItags are in use.
GenomicRanges::mcols(algnmts_gr) <- dplyr::select(algnmts, c(
  "alt_specimen", "sampleName", "count", if( umitag_option ) "umitag", 
  "contrib", "abund"
))

# Analyze alignments ----
## Identify groups of alignments or pileups of aligned fragments
## These pileups give strong experimental evidence of directed incorporation of
## the dsODN into a region. Initially, pileups are identified and then checked 
## for pairing, or if there is another pileup on the opposite strand in close 
## proximity.
# Cluster origin of each alignment, computed per alt_specimen with no gap
# allowed.
algnmts_gr$clus.ori <- pileupCluster(
  gr = algnmts_gr, 
  grouping = "alt_specimen", 
  maxgap = 0L, 
  return = "simple"
)

# Paired-alignment id of each alignment, computed per alt_specimen with a gap
# of up to twice `upstream_dist`.
algnmts_gr$paired.algn <- identifyPairedAlgnmts(
  gr = algnmts_gr, 
  grouping = "alt_specimen", 
  maxgap = upstream_dist * 2
)

# Split the alignments by the nuclease recorded for their alt_specimen so that
# each group can be compared against its own nuclease profile.
algnmts_grl <- split(algnmts_gr, unlist(nuclease)[algnmts_gr$alt_specimen])

# For every nuclease group: build one range per unique cluster origin, fetch
# the flanking reference sequence, and compare it to the target sequences.
# The result is one row per alt_specimen / cluster origin.
annot_clust_info <- dplyr::bind_rows(lapply(
    seq_along(algnmts_grl), 
    function(i, grl){

      gr <- grl[[i]]
      nuc <- names(grl)[i]

      # A nuclease without a known profile gets no target comparison.
      if( !nuc %in% names(nuc_profiles) ){
        nuc_profile <- NULL
      }else{
        nuc_profile <- nuc_profiles[[nuc]]
      }

      ## Create a GRange with only the unique cluster origins
      # Each id is "alt_specimen:<clus.ori>"; after splitting on ":" the
      # columns are read as 1 = specimen, 2 = seqname, 3 = strand,
      # 4 = position.
      split_clus_id <- stringr::str_split(
        string = unique(paste0(gr$alt_specimen, ":", gr$clus.ori)), 
        pattern = ":", 
        simplify = TRUE
      )

      algn_clusters <- GenomicRanges::GRanges(
        seqnames = split_clus_id[,2],
        ranges = IRanges::IRanges(
          start = as.numeric(split_clus_id[,4]), width = 1
        ),
        strand = split_clus_id[,3],
        seqinfo = GenomeInfoDb::seqinfo(ref_genome)
      )

      algn_clusters$specimen <- split_clus_id[,1]
      algn_clusters$clus.ori <- vcollapse(split_clus_id[, 2:4], sep = ":")

      # Reference sequence around each cluster origin
      algn_clusters$clus.seq <- getSiteSeqs(
        gr = algn_clusters, 
        upstream.flank = upstream_dist, 
        downstream.flank = downstream_dist, 
        ref.genome = ref_genome
      )

      ## Identify which target sequences bind near clusters
      if( !is.null(nuc_profile) ){

        algn_clusters <- compareTargetSeqs(
          gr.with.sequences = algn_clusters, 
          seq.col = "clus.seq", 
          target.seqs = uniq_target_seqs,
          tolerance = max_target_mismatch,
          nuc.profile = nuc_profile,
          submat = submat, 
          upstream.flank = upstream_dist, 
          downstream.flank = downstream_dist
        )

      }else{

        # Placeholder columns so every group yields the same set of columns
        algn_clusters$target.match <- "No_valid_match"
        algn_clusters$target.mismatch <- NA
        algn_clusters$target.score <- NA
        algn_clusters$aligned.sequence <- NA
        algn_clusters$edit.site <- NA

      }

      as.data.frame(GenomicRanges::mcols(algn_clusters))

    },
    grl = algnmts_grl
  )) %>%
  dplyr::rename("alt_specimen" = specimen)


## Merge the target sequence alignment information from the clusters back to all
## unique alignments
## The cluster sequence itself is dropped before merging; rows are matched on
## alt_specimen and cluster origin. alt_specimen is then converted to a factor
## with the reference level order.
algnmts <- as.data.frame(merge(
    x = as.data.frame(algnmts_gr), 
    y = dplyr::select(annot_clust_info, -clus.seq),
    by = c("alt_specimen", "clus.ori")
  )) %>%
  dplyr::mutate(
    # Spell the argument out in full rather than relying on partial matching
    alt_specimen = factor(alt_specimen, levels = alt_specimen_levels)
  )

## Change target.match to No_valid_match if an inappropriate target sequence is
## annotated for the specimen's treatment
algnmts$target.match <- filterInappropriateComparisons(
  guideRNA.match = algnmts$target.match, 
  specimen = algnmts$alt_specimen, 
  treatment = treatment
)

## Fragment pileups, paired clustering, and guideRNA alignments have been used 
## to characterize the incorporation sites analyzed here. Each metric will be 
## used to create a list of incorporation sites that may be nuclease cut sites. 
## The following identifies which alignments are associated with each of these 
## criteria.
# Pileups: cluster origins with at least `pile_up_min` alignments within one
# alt_specimen.
tbl_clus_ori <- algnmts %>% 
  dplyr::group_by(alt_specimen, clus.ori) %>%
  dplyr::filter(dplyr::n() >= pile_up_min) %>%
  dplyr::ungroup() %$%
  table(clus.ori)

idx_clus_ori <- which(algnmts$clus.ori %in% names(tbl_clus_ori))

# Paired: alignments carrying a non-NA paired-alignment id.
tbl_paired_algn <- algnmts %>%
  dplyr::filter(!is.na(paired.algn)) %$%
  table(paired.algn)

idx_paired_algn <- which(algnmts$paired.algn %in% names(tbl_paired_algn))

# Matched: alignments whose cluster matched a target sequence.
idx_matched <- which(algnmts$target.match != "No_valid_match")

# Union of the three criteria.
idx_combined <- sort(unique(c(idx_clus_ori, idx_paired_algn, idx_matched)))

idx_df <- data.frame(
  "Type" = c("PileUp", "Paired", "Target_Matched", "Combined"),
  "Counts" = sapply(
    list(idx_clus_ori, idx_paired_algn, idx_matched, idx_combined), 
    length
  )
)

cat("\nTable of uniquely aligned template counts:\n")
print(idx_df, right = FALSE, row.names = FALSE) 
cat(paste0("\nTotal number of alignments: ", nrow(algnmts), "\n"))

# Alignments meeting at least one criterion.
probable_algns <- algnmts[idx_combined,]

# An alignment is on-target when its edit site is among the expanded on-target
# positions.
probable_algns$on.off.target <- ifelse(
  probable_algns$edit.site %in% expandPosStr(on_targets), 
  "On-target", 
  "Off-target"
)

cat("\nOn / Off target alignment counts:\n")
print(table(probable_algns$on.off.target))


## Create summary and output formatted object related to each of the criteria
## for edited site detection.

## Matched alignments
## Probable alignments whose cluster matched a target sequence, summarised per
## alt_specimen / edit site / aligned sequence / target / mismatch count.
matched_algns <- probable_algns[
  probable_algns$target.match != "No_valid_match",
]

# Quosures spliced into summarise(); `umitag` is NULL (and dropped below) when
# UMItags are not in use.
matched_summaries <- list(
  on.off.target = dplyr::quo(
    paste(sort(unique(on.off.target)), collapse = ";")
  ),
  paired.algn = dplyr::quo(paste(sort(unique(paired.algn)), collapse = ";")),
  count = dplyr::quo(sum(count)), 
  umitag = if( umitag_option ) dplyr::quo(sum(umitag)),
  algns = dplyr::quo(sum(contrib)),
  abund = dplyr::quo(sum(abund)),
  orient = dplyr::quo(paste(sort(unique(as.character(strand))), collapse = ";"))
)

matched_summaries <- matched_summaries[!sapply(matched_summaries, is.null)]

if( nrow(matched_algns) > 0 ){

  matched_summary <- matched_algns %>%
    dplyr::mutate(
      # Remove a trailing ":(...)" suffix from the target match label
      target.match = stringr::str_replace(
        string = target.match, 
        pattern = "\\:\\([\\w]+\\)$",
        replacement = ""
      )
    ) %>%
    dplyr::group_by(
      alt_specimen, edit.site, aligned.sequence, target.match, target.mismatch
    ) %>%
    dplyr::summarise(!!! matched_summaries) %>%
    dplyr::ungroup() %>% 
    dplyr::arrange(alt_specimen, target.match, desc(abund)) %>%
    as.data.frame()

}else{

  # Zero-row placeholder. Column names mirror the summarised branch above
  # (`paired.algn`, `algns`) so consumers see one schema in both cases.
  matched_summary <- data.frame(
    alt_specimen = factor(character(), levels = alt_specimen_levels),
    edit.site = character(),
    aligned.sequence = character(),
    target.match = character(),
    target.mismatch = numeric(),
    on.off.target = character(),
    paired.algn = character(),
    count = numeric(),
    umitag = if(umitag_option) numeric(),
    algns = numeric(),
    abund = numeric(),
    orient = character()
  )

}

if( nrow(matched_algns) == 0 ) matched_summary <- matched_summary[0,]

## Paired alignments
## Probable alignments that belong to a paired-alignment group, summarised
## into one region per alt_specimen / paired.algn.
paired_algns <- probable_algns[
  probable_algns$paired.algn %in% names(tbl_paired_algn),
]

if( nrow(paired_algns) > 0 ){

  # Quosures spliced into summarise(). Later entries (mid, width) refer to the
  # `start` and `end` values computed by earlier entries, so order matters.
  paired_summaries <- list(
    seqnames = dplyr::quo(unique(seqnames)),
    start = dplyr::quo(min(pos)), 
    end = dplyr::quo(max(pos)), 
    mid = dplyr::quo(start + (end-start)/2),
    strand = dplyr::quo("*"), 
    width = dplyr::quo(end - start), 
    count = dplyr::quo(sum(count)), 
    umitag = if( umitag_option ) dplyr::quo(sum(umitag)), 
    algns = dplyr::quo(sum(contrib)),
    abund = dplyr::quo(sum(abund))
  )

  paired_summaries <- paired_summaries[!sapply(paired_summaries, is.null)]

  paired_regions <- paired_algns %>%
    dplyr::group_by(alt_specimen, paired.algn, strand) %>%
    # Per strand within a pair: the smallest start for "+" alignments,
    # otherwise the largest end.
    dplyr::mutate(pos = ifelse(strand == "+", min(start), max(end))) %>%
    dplyr::group_by(alt_specimen, paired.algn) %>%
    dplyr::summarise(!!! paired_summaries) %>%
    dplyr::ungroup()

}else{

  # Zero-row placeholder. Column names mirror the summarised branch above
  # (`paired.algn`, `algns`) so consumers see one schema in both cases.
  paired_regions <- data.frame(
    alt_specimen = factor(character(), levels = alt_specimen_levels),
    paired.algn = logical(),
    seqnames = character(),
    start = numeric(),
    end = numeric(),
    mid = numeric(),
    strand = character(),
    width = numeric(),
    count = numeric(),
    umitag = if(umitag_option) numeric(),
    algns = numeric(),
    abund = numeric()
  )

}

## Label each paired region as on- or off-target.
## A region is on-target when any on-target position listed for one of the
## specimen's treatments lies on the same sequence and within
## `downstream_dist` of the region's start..end span.
if( nrow(paired_regions) > 0 && length(on_targets) > 0 ){

  paired_regions <- paired_regions %>%
    dplyr::group_by(alt_specimen, paired.algn) %>%
    dplyr::mutate(
      on.off.target = ifelse(
        any(sapply(
          expandPosStr(unlist(on_targets[
            which(
              stringr::str_extract(
                names(on_targets), "[\\w\\-\\_\\.]+") %in% 
                # alt_specimen is a factor; convert it so `treatment` is
                # looked up by name and not by the factor's integer code.
                treatment[[as.character(alt_specimen)]]
            )
            ])),
          function(x, seq, st, en){

            # x is a position string: leading word run = sequence name,
            # trailing word run = coordinate.
            match_seq <- seq == stringr::str_extract(x, "[\\w]+")

            within_start <- st <= 
              as.numeric(stringr::str_extract(x, "[\\w]+$")) + downstream_dist

            within_end <- en >= 
              as.numeric(stringr::str_extract(x, "[\\w]+$")) - downstream_dist

            match_seq & within_start & within_end

          }, 
          seq = seqnames, 
          st = start, 
          en = end
        )), 
        "On-target", 
        "Off-target"
      )
    ) %>%
    dplyr::ungroup() %>% 
    as.data.frame()

}else if( nrow(paired_regions) > 0 && length(on_targets) == 0 ){

  # No on-target sites defined: every region is off-target
  paired_regions <- dplyr::mutate(
    paired_regions,
    on.off.target = "Off-target"
  )

}else{

  # No regions: add an empty character column to keep the schema
  paired_regions <- dplyr::mutate(
    paired_regions,
    on.off.target = vector(mode = "character")
  )

}

## Pile up alignments
## Probable alignments whose cluster origin qualified as a pileup, summarised
## per alt_specimen / cluster origin.
pile_up_algns <- probable_algns[
  probable_algns$clus.ori %in% names(tbl_clus_ori),
]

# Quosures spliced into summarise(); `umitag` is NULL (and dropped below) when
# UMItags are not in use.
pile_up_summaries <- list(
  on.off.target = dplyr::quo(
    paste(sort(unique(on.off.target)), collapse = ";")
  ),
  paired.algn = dplyr::quo(paste(sort(unique(paired.algn)), collapse = ";")),
  count = dplyr::quo(sum(count)), 
  umitag = if( umitag_option ) dplyr::quo(sum(umitag)),
  algns = dplyr::quo(sum(contrib)),
  abund = dplyr::quo(sum(abund))
)

pile_up_summaries <- pile_up_summaries[!sapply(pile_up_summaries, is.null)]

if( nrow(pile_up_algns) > 0){
  pile_up_summary <- pile_up_algns %>%
    dplyr::mutate(
      # Remove a trailing ":(...)" suffix from the target match label
      target.match = stringr::str_replace(
        string = target.match, 
        pattern = "\\:\\([\\w]+\\)$",
        replacement = ""
      )
    ) %>%
    dplyr::group_by(alt_specimen, clus.ori) %>%
    dplyr::summarise(!!! pile_up_summaries) %>%
    dplyr::ungroup() %>% 
    dplyr::arrange(alt_specimen, desc(abund)) %>%
    as.data.frame()

}else{

  # Zero-row placeholder. Column names mirror the summarised branch above
  # (`paired.algn`, `algns`) so consumers see one schema in both cases.
  pile_up_summary <- data.frame(
    alt_specimen = factor(character(), levels = alt_specimen_levels),
    clus.ori = character(),
    on.off.target = character(),
    paired.algn = character(),
    count = numeric(),
    umitag = if(umitag_option) numeric(),
    algns = numeric(),
    abund = numeric()
  )

}

# Generate stats if requested ----
## If requested, generate stats from the analysis for qc.
## `args$stat` is compared against FALSE; any other value is used as the path
## of the CSV file written at the end of this section.

if( args$stat != FALSE ){

  # Summarise the abundance of an alignment table per sample.
  #   x                 alignment table with alt_specimen, sampleName, abund
  #   y                 metric label recorded in the `metric` column
  #   remove.multi.mock when TRUE, drop specimens that were duplicated across
  #                     several combos before counting
  # Returns a table with columns sampleName, metric and count.
  stat_summary <- function(x, y, remove.multi.mock = FALSE){

    if(remove.multi.mock){

      # Specimens that map to more than one alt_specimen
      multi_combo_specimens <- names(combos_exp_specimen_list)[
        lengths(combos_exp_specimen_list) > 1
      ]

      # Compare the specimen name (alt_specimen without its "(combo)" suffix)
      # against that list. Testing str_detect()'s TRUE/FALSE result against
      # the names could never match, so nothing was ever removed.
      x <- x %>%
        dplyr::filter(
          !stringr::str_remove(
            as.character(alt_specimen), "\\([\\w]+\\)$"
          ) %in% multi_combo_specimens
        )

    }

    x %>%
      dplyr::mutate(
        metric = y,
        specimen = stringr::str_remove(
          as.character(alt_specimen), "\\([\\w]+\\)$"
        )
      ) %>%
      # Dropping alt_specimen and de-duplicating collapses rows that differ
      # only by their combo label.
      dplyr::select(-alt_specimen) %>%
      dplyr::distinct() %>%
      dplyr::group_by(sampleName, metric) %>%
      dplyr::summarize(count = sum(abund)) %>%
      dplyr::ungroup()

  }

  # NOTE(review): the 1:12 column subsets presumably drop the target
  # annotation columns so duplicated rows can collapse -- confirm the column
  # layout, which depends on whether the umitag column is present.
  total_stat <- stat_summary(algnmts, "total.algns")
  combined_stat <- stat_summary(probable_algns[, 1:12], "combined.algns")
  pile_up_stat <- stat_summary(pile_up_algns[, 1:12], "pileup.algns")
  paired_stat <- stat_summary(paired_algns[, 1:12], "paired.algns")
  matched_stat <- stat_summary(matched_algns[, 1:12], "matched.algns", TRUE)

  on_tar_stat <- dplyr::filter(
      matched_algns, on.off.target == "On-target"
    ) %>%
    stat_summary("ontarget.algns", TRUE)

  off_tar_stat <- dplyr::filter(
      matched_algns, on.off.target == "Off-target"
    ) %>%
    stat_summary("offtarget.algns", TRUE)

  # Every metric produced above must be listed here; a label missing from the
  # levels would be turned into NA by factor().
  metric_levels <- c(
    "total.algns", "combined.algns", 
    "pileup.algns", "paired.algns", "matched.algns",
    "ontarget.algns", "offtarget.algns"
  )

  stat <- dplyr::bind_rows(
      total_stat, combined_stat, pile_up_stat, paired_stat, 
      matched_stat, on_tar_stat, off_tar_stat
    ) %>%
    dplyr::mutate(
      metric = factor(metric, levels = metric_levels),
      sampleName = factor(sampleName, levels = unique(sample_info$sampleName))
    ) %>%
    # Fill in a zero count for every sample / metric pair that is absent
    tidyr::complete(sampleName, metric, fill = list("count" = 0))

  # Headerless, unquoted CSV
  write.table(
    x = stat, file = args$stat, 
    sep = ",", row.names = FALSE, 
    col.names = FALSE, quote = FALSE
  )

}


## Specimen summary ----
# Summarize components and append to specimen table
# Quosures spliced into summarise(); `UMItags` is NULL (and dropped below)
# when UMItags are not in use.
tbl_algn_summaries <- list(
  Reads = dplyr::quo(sum(count)), 
  UMItags = if( umitag_option ) dplyr::quo(sum(umitag)), 
  Alignments = dplyr::quo(sum(contrib))
)

tbl_algn_summaries <- tbl_algn_summaries[!sapply(tbl_algn_summaries, is.null)]

# Totals per alt_specimen, mapped back to the specimen by stripping the
# "(combo)" suffix. Only the first alt_specimen of every specimen is kept, so
# a specimen duplicated across combos is reported once.
tbl_algn_counts <- algnmts %>% 
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(!!! tbl_algn_summaries) %>%
  dplyr::mutate(
    specimen = stringr::str_remove(as.character(alt_specimen), "\\([\\w]+\\)$"),
    specimen = factor(specimen, levels = specimen_levels)
  ) %>%
  dplyr::filter(alt_specimen %in% sapply(combos_exp_specimen_list, "[[", 1)) %>%
  dplyr::select(specimen, dplyr::everything(), -alt_specimen) %>%
  dplyr::arrange(specimen)

spec_overview <- dplyr::left_join(
  spec_overview, tbl_algn_counts, by = "specimen"
) 


## Annotate incorporation data ----
## Each summary table gains a `gene_id` column from assignGeneID(). Messages
## emitted during annotation are suppressed.
# Matched sites: sequence name and coordinate are the leading and trailing
# word runs of `edit.site`.
matched_summary <- suppressMessages(dplyr::mutate(
  matched_summary,
  gene_id = assignGeneID( 
    seqnames = stringr::str_extract(edit.site, "[\\w]+"), 
    positions = as.numeric(stringr::str_extract(edit.site, "[\\w]+$")), 
    reference = ref_genome, 
    ref.genes = ref_genes, 
    onco.genes = onco_genes, 
    special.genes = special_genes
  )
))

# Paired regions: annotated at the region midpoint.
paired_regions <- suppressMessages(dplyr::mutate(
  paired_regions,     
  gene_id = assignGeneID(
    seqnames = seqnames, 
    positions = mid, 
    reference = ref_genome, 
    ref.genes = ref_genes, 
    onco.genes = onco_genes, 
    special.genes = special_genes
  )
))

# Pileups: sequence name and coordinate are the leading and trailing word runs
# of `clus.ori`.
pile_up_summary <- suppressMessages(dplyr::mutate(
  pile_up_summary,
  gene_id = assignGeneID( 
    seqnames = stringr::str_extract(clus.ori, "[\\w]+"), 
    positions = as.numeric(stringr::str_extract(clus.ori, "[\\w]+$")), 
    reference = ref_genome, 
    ref.genes = ref_genes, 
    onco.genes = onco_genes, 
    special.genes = special_genes
  )
))


## On-target summary ----
## The three tables below share one layout per alt_specimen: the on-target
## abundance formatted by pNums(), and the percentage of total abundance that
## is on-target. A row counts as on-target when its edit site is among the
## expanded on-target positions.
# Algnmts
tbl_ot_algn <- algnmts %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(
    ot_algns = pNums(
      sum(abund * as.integer(edit.site %in% expandPosStr(on_targets)))
    ),
    ot_algns_pct = 100 * sum(
        abund * as.integer(edit.site %in% expandPosStr(on_targets))
      ) /
      sum(abund)
  ) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(algnmts) == 0 ) tbl_ot_algn <- tbl_ot_algn[0,]

# Probable edited sites
tbl_ot_prob <- probable_algns %>% 
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(
    ot_prob = pNums(
      sum(abund * as.integer(edit.site %in% expandPosStr(on_targets)))
    ),
    ot_prob_pct = 100 * sum(
        abund * as.integer(edit.site %in% expandPosStr(on_targets))
      ) /
      sum(abund)
  ) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(probable_algns) == 0 ) tbl_ot_prob <- tbl_ot_prob[0,]

# Pile ups of read alignments
tbl_ot_pile <- pile_up_algns %>% 
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(
    ot_pile = pNums(
      sum(abund * as.integer(edit.site %in% expandPosStr(on_targets)))
    ),
    ot_pile_pct = 100 * sum(
        abund * as.integer(edit.site %in% expandPosStr(on_targets))
      ) /
      sum(abund)
  ) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(pile_up_algns) == 0 ) tbl_ot_pile <- tbl_ot_pile[0,]

# Paired or flanking alignments
# Abundance is first totalled per alt_specimen and on/off-target label, then
# reduced to the on-target total and its percentage of the specimen total.
tbl_ot_pair <- paired_regions %>%
  dplyr::mutate(
    on.off.target = factor(
      on.off.target, levels = c("On-target", "Off-target")
    )
  ) %>%
  dplyr::group_by(alt_specimen, on.off.target) %>%
  dplyr::summarise(cnt = sum(abund)) %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(
    ot_pair = pNums(sum(ifelse(on.off.target == "On-target", cnt, 0))),
    ot_pair_pct = 100 * sum(ifelse(on.off.target == "On-target", cnt, 0)) /
      sum(cnt)
  ) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(paired_regions) == 0 ) tbl_ot_pair <- tbl_ot_pair[0,]

# Target sequence matched sites (same two-step summary as for paired regions)
tbl_ot_match <- matched_summary %>%
  dplyr::group_by(alt_specimen, on.off.target) %>%
  dplyr::summarise(cnt = sum(abund)) %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(
    ot_match = pNums(sum(ifelse(on.off.target == "On-target", cnt, 0))),
    ot_match_pct = 100 * sum(ifelse(on.off.target == "On-target", cnt, 0)) /
      sum(cnt)
  ) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(matched_summary) == 0 ) tbl_ot_match <- tbl_ot_match[0,]

# On-target percentage per alt_specimen and matched target, spread to one
# column per target, completed over all alt_specimen levels, and labelled with
# the specimen annotation.
tbl_ot_eff <- matched_summary %>%
  dplyr::group_by(alt_specimen, on.off.target, target.match) %>%
  dplyr::summarise(cnt = sum(abund)) %>%
  dplyr::ungroup() %>% 
  dplyr::group_by(alt_specimen, target.match) %>%
  dplyr::summarise(
    ot_eff_pct = 100 * sum(ifelse(on.off.target == "On-target", cnt, 0)) /
      sum(cnt)
  ) %>%
  dplyr::ungroup() %>%
  tidyr::spread(key = target.match, value = ot_eff_pct) %>%
  tidyr::complete(alt_specimen) %>%
  as.data.frame() %>%
  dplyr::left_join(
    dplyr::select(combo_overview, alt_specimen, annotation), 
    by = "alt_specimen"
  ) %>%
  dplyr::select(alt_specimen, annotation, dplyr::everything())

# Force a zero-row table when there was no input
if( nrow(matched_summary) == 0 ) tbl_ot_eff <- tbl_ot_eff[0,]

# Summary table
# Start from the combo overview (NA annotations relabelled "Mock") and join
# the on-target percentage column of each per-criterion table onto it.
ot_tbl_summary <- combo_overview %>%
  dplyr::mutate(
    annotation = factor(
      ifelse(is.na(annotation), "Mock", paste(annotation)), 
      levels = levels(annotation)
    )
  ) %>%
  dplyr::select(-specimen)

# Columns 1 and 3 of each table are alt_specimen and the percentage.
ot_tbl_summary <- Reduce(
    function(x,y) dplyr::left_join(x, y, by = "alt_specimen"),
    list(
      tbl_ot_algn[,c(1,3)], tbl_ot_pile[,c(1,3)],
      tbl_ot_pair[,c(1,3)], tbl_ot_match[,c(1,3)]
    ),
    init = ot_tbl_summary
  ) %>%
  dplyr::arrange(alt_specimen) %>%
  dplyr::select(-nuclease, -treatment, -combo)



## On-target incorporation distribution ----
## For matched on-target alignments, compute the signed distance between the
## alignment edge (start for "+" strand, end otherwise) and the edit site.
on_tar_dists <- matched_algns %>%
  dplyr::filter(on.off.target == "On-target") %>%
  dplyr::mutate(
    target = stringr::str_extract(string = target.match, pattern = "[\\w]+"),
    pos = as.numeric(
      stringr::str_extract(string = edit.site, pattern = "[0-9]+$")
    ),
    edit.site.dist = ifelse(strand == "+", start - pos, end - pos)
  ) %>%
  dplyr::left_join(combo_overview, by = "alt_specimen") %>%
  dplyr::select(
    alt_specimen, target, annotation, edit.site, edit.site.dist, strand, abund
  )

# Density of the absolute distances per annotation, evaluated from 0 to
# `upstream_dist`; annotations with fewer than 10 rows get NULL.
on_tar_dens <- lapply(
  split(on_tar_dists, on_tar_dists$annotation), 
  function(x){

    if( nrow(x) >= 10 ){
      return(
        density(abs(x$edit.site.dist), from = 0, to = upstream_dist, bw = 1)
      )
    }else{
      return(NULL)
    }

  }
)

# Abundance per annotation / target / distance / strand. `strand.cnt` is the
# log10 abundance, negated for strands other than "+".
on_tar_dists <- dplyr::group_by(
    on_tar_dists, annotation, target, edit.site.dist, strand
  ) %>%
  dplyr::summarise(cnt = sum(abund)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    strand.cnt = ifelse(
      strand == "+", log(cnt, base = 10), -log(cnt, base = 10))
  )

# Force a zero-row table when there was no input
if( nrow(matched_algns) == 0 ) on_tar_dists <- on_tar_dists[0,]

# With a single annotation overall, blank the label
if( length(unique(combo_overview$annotation)) == 1 ){
  on_tar_dists$annotation <- " "
}

# Percentage of on-target abundance falling inside the displayed window, plus
# x/y positions stored alongside it.
# NOTE(review): `abs(edit.site.dist) >= -downstream_dist` is always TRUE when
# downstream_dist is non-negative -- presumably the signed distance was meant;
# confirm the intended window.
sites_included <- on_tar_dists %>% 
  dplyr::group_by(annotation, target) %>%
  dplyr::summarise(
    prop = 100 * sum(cnt[
      abs(edit.site.dist) <= upstream_dist & 
        abs(edit.site.dist) >= -downstream_dist
    ]) / sum(cnt),
    x_pos = upstream_dist,
    y_pos = 0.8 * min(strand.cnt[
      abs(edit.site.dist) <= upstream_dist & 
        abs(edit.site.dist) >= -downstream_dist
    ])
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(prop = paste0(pNums(prop, digits = 4), "%"))

# Force a zero-row table when there was no input
if( nrow(on_tar_dists) == 0 ) sites_included <- sites_included[0,]

## Off-target summary ----
## Each table counts, per alt_specimen, the number of distinct cluster origins
## that are off-target.
# All alignments
# Off-target here means the edit site is not among the expanded on-target
# positions.
tbl_ft_algn <- algnmts %>%
  dplyr::filter(!edit.site %in% expandPosStr(on_targets)) %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(ft_algns = dplyr::n_distinct(clus.ori)) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(algnmts) == 0 ) tbl_ft_algn <- tbl_ft_algn[0,]

# Probable edit sites
tbl_ft_prob <- probable_algns %>%
  dplyr::filter(on.off.target == "Off-target") %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(ft_prob = dplyr::n_distinct(clus.ori)) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(probable_algns) == 0 ) tbl_ft_prob <- tbl_ft_prob[0,]

# Pile ups
tbl_ft_pile <- pile_up_algns %>%
  dplyr::filter(on.off.target == "Off-target") %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(ft_pile = dplyr::n_distinct(clus.ori)) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

# Force a zero-row table when there was no input
if( nrow(pile_up_algns) == 0 ) tbl_ft_pile <- tbl_ft_pile[0,]

# Paired or flanked loci
tbl_ft_pair <- paired_regions %>%
  dplyr::filter(on.off.target == "Off-target") %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(ft_pair = n()) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

if( nrow(paired_regions) == 0 ) tbl_ft_pair <- tbl_ft_pair[0,]

# target sequence matched
tbl_ft_match <- matched_summary %>%
  dplyr::filter(on.off.target == "Off-target") %>%
  dplyr::group_by(alt_specimen) %>%
  dplyr::summarise(ft_match = dplyr::n()) %>%
  dplyr::ungroup() %>% 
  as.data.frame()

if( nrow(matched_summary) == 0 ) tbl_ft_match <- tbl_ft_match[0,]

# Off-target summary table
# Starts from combo_overview (missing annotations relabelled "Mock", keeping
# the existing factor levels) and left-joins the alignment, pile-up, pair and
# match counts by alt_specimen. tbl_ft_prob is not part of this join.
ft_tbl_summary <- combo_overview %>%
  dplyr::mutate(
    annotation = factor(
      ifelse(is.na(annotation), "Mock", paste(annotation)),
      levels = levels(annotation)
    )
  ) %>%
  dplyr::select(-specimen) %>%
  dplyr::left_join(tbl_ft_algn, by = "alt_specimen") %>%
  dplyr::left_join(tbl_ft_pile, by = "alt_specimen") %>%
  dplyr::left_join(tbl_ft_pair, by = "alt_specimen") %>%
  dplyr::left_join(tbl_ft_match, by = "alt_specimen") %>%
  dplyr::arrange(alt_specimen) %>%
  dplyr::select(-combo, -nuclease, -treatment)

# Evaluation summary ----
# Per specimen and annotation, format the span of on-target efficiencies
# across targets as "x%" (single value) or "lo - hi%". Groups without any
# finite efficiency end up containing "Inf" in the label and are set to NA.
ot_eff_range <- tbl_ot_eff %>%
  tidyr::gather(key = "target", value = "eff", -alt_specimen, -annotation) %>%
  dplyr::group_by(alt_specimen, annotation) %>%
  dplyr::summarise(
    eff_min = round(min(eff, na.rm = TRUE), digits = 1),
    eff_max = round(max(eff, na.rm = TRUE), digits = 1),
    eff_rg = ifelse(
      eff_min == eff_max,
      sprintf("%.1f%%", eff_max),
      sprintf("%1$.1f - %2$.1f%%", eff_min, eff_max)
    )
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(eff_rg = ifelse(grepl("Inf", eff_rg), NA, eff_rg)) %>%
  dplyr::select(-eff_min, -eff_max)

if( nrow(tbl_ot_eff) == 0 ){
  ot_eff_range <- ot_eff_range[0,]
}

# Evaluation summary table: on-target efficiency range and off-target counts
# joined per specimen/annotation, then joined with tbl_algn_counts.
eval_summary <- ot_eff_range %>%
  dplyr::full_join(
    ft_tbl_summary, by = c("alt_specimen", "annotation")
  ) %>%
  # Derive the specimen key by removing a trailing "(...)" suffix from
  # alt_specimen, then align it to the known specimen levels.
  dplyr::mutate(
    specimen = stringr::str_remove(as.character(alt_specimen), "\\([\\w]+\\)$"),
    specimen = factor(specimen, levels = specimen_levels)
  ) %>%
  dplyr::full_join(tbl_algn_counts, by = "specimen") %>%
  # The abundance column kept depends on abundance_option: "UMItags" for
  # "umi", "Reads" for "read", otherwise "Alignments".
  dplyr::select(
    "alt_specimen", "annotation", 
    dplyr::case_when(
      abundance_option == "umi" ~ "UMItags",
      abundance_option == "read" ~ "Reads",
      TRUE ~ "Alignments"
    ), "eff_rg", "ft_match", -"specimen"
  ) %>%
  # Display names for the table; "\n" puts a line break in two of the headers.
  dplyr::rename(
    "Specimen" = alt_specimen, "Annotation" = annotation, 
    "On-target\nEfficiency" = eff_rg, "Predicted\nOff-targets" = ft_match
  )

## Onco-gene enrichment analysis ----
# Draw as many random sites as the largest per-specimen count found in either
# the paired regions or the matched summary.
rand_sites <- selectRandomSites(
  num = max(c(
    table(paired_regions$alt_specimen),
    table(matched_summary$alt_specimen)
  )),
  ref.genome = ref_genome,
  drop.extra.seqs = TRUE,
  rnd.seed = 1
)

rand_sites$gene_id <- suppressMessages(
  assignGeneID(
    seqnames = seqnames(rand_sites),
    positions = start(rand_sites),
    reference = ref_genome,
    ref.genes = ref_genes,
    onco.genes = onco_genes,
    special.genes = special_genes
  )
)

# Gene totals for the random sites. "*" is stripped from gene ids before
# de-duplication; ids containing "~" count as onco, ids containing "!" as
# special.
rand_df <- local({

  rand_gene_ids <- unique(gsub("\\*", "", rand_sites$gene_id))

  data.frame(
    annotation = "Random",
    "total" = length(rand_gene_ids),
    "onco" = sum(stringr::str_detect(rand_gene_ids, "~")),
    "special" = sum(stringr::str_detect(rand_gene_ids, "!"))
  )

})

# Same totals taken over the reference gene set.
ref_df <- data.frame(
  annotation = "--",
  "total" = length(unique(ref_genes$annot_sym)),
  "onco" = sum(unique(onco_genes) %in% ref_genes$annot_sym),
  "special" = sum(unique(special_genes) %in% ref_genes$annot_sym)
)

# Pile-up sites of the experimental specimens, split by the annotation of the
# specimen they belong to.
pile_up_list <- pile_up_summary %>%
  dplyr::filter(alt_specimen %in% sapply(combos_exp_specimen_list, "[[", 1)) %>%
  dplyr::mutate(
    specimen = stringr::str_remove(as.character(alt_specimen), "\\([\\w]+\\)$"),
    specimen = factor(specimen, levels = specimen_levels)
  ) %>%
  split(
    f = as.character(annot_overview$annotation)[
      match(.$specimen, annot_overview$specimen)
    ]
  )

# One row per annotation: unique genes, and how many carry the "~" (onco) or
# "!" (special) marker. An empty, correctly typed frame is used when there
# are no pile ups.
if( length(pile_up_list) == 0 ){

  pile_up_df <- data.frame(
    annotation = character(),
    total = numeric(),
    onco = numeric(),
    special = numeric()
  )

}else{

  pile_up_df <- pile_up_list %>%
    lapply(function(annot_sites){

      ids <- unique(gsub("\\*", "", annot_sites$gene_id))

      data.frame(
        "total" = length(ids),
        "onco" = sum(stringr::str_detect(ids, "~")),
        "special" = sum(stringr::str_detect(ids, "!"))
      )

    }) %>%
    dplyr::bind_rows(.id = "annotation") %>%
    dplyr::mutate(annotation = as.character(annotation))

}

# Paired regions of the experimental specimens, split by the annotation of
# the specimen they belong to.
paired_list <- paired_regions %>%
  dplyr::filter(alt_specimen %in% sapply(combos_exp_specimen_list, "[[", 1)) %>%
  dplyr::mutate(
    specimen = stringr::str_remove(as.character(alt_specimen), "\\([\\w]+\\)$"),
    specimen = factor(specimen, levels = specimen_levels)
  ) %>%
  split(
    f = as.character(annot_overview$annotation)[
      match(.$specimen, annot_overview$specimen)
    ]
  )

# One row per annotation: unique genes, and how many carry the "~" (onco) or
# "!" (special) marker. An empty, correctly typed frame is used when there
# are no paired regions.
if( length(paired_list) == 0 ){

  paired_df <- data.frame(
    annotation = character(),
    total = numeric(),
    onco = numeric(),
    special = numeric()
  )

}else{

  paired_df <- paired_list %>%
    lapply(function(annot_sites){

      ids <- unique(gsub("\\*", "", annot_sites$gene_id))

      data.frame(
        "total" = length(ids),
        "onco" = sum(stringr::str_detect(ids, "~")),
        "special" = sum(stringr::str_detect(ids, "!"))
      )

    }) %>%
    dplyr::bind_rows(.id = "annotation") %>%
    dplyr::mutate(annotation = as.character(annotation))

}

# Target-matched sites split by the annotation looked up through
# combo_overview.
matched_list <- split(
  x = matched_summary,
  f = as.character(
    combo_overview$annotation[
      match(matched_summary$alt_specimen, combo_overview$alt_specimen)
    ]
  )
)

# One row per annotation. Unlike the pile-up and paired tables, "total" here
# is the number of rows rather than the number of unique genes; the onco and
# special counts are still taken over unique gene ids.
if( length(matched_list) == 0 ){

  matched_df <- data.frame(
    annotation = character(),
    total = numeric(),
    onco = numeric(),
    special = numeric()
  )

}else{

  matched_df <- matched_list %>%
    lapply(function(annot_sites){

      ids <- unique(gsub("\\*", "", annot_sites$gene_id))

      data.frame(
        "total" = nrow(annot_sites),
        "onco" = sum(stringr::str_detect(ids, "~")),
        "special" = sum(stringr::str_detect(ids, "!"))
      )

    }) %>%
    dplyr::bind_rows(.id = "annotation")

}

# Stack the reference, pile-up, flanking-pair and target-matched gene counts,
# labelling each row with its origin and dropping rows without any genes.
enrich_df <- list(
    "Reference" = ref_df,
    "Pile Ups" = pile_up_df,
    "Flanking Pairs" = paired_df,
    "Target Matched" = matched_df
  ) %>%
  dplyr::bind_rows(.id = "origin") %>%
  dplyr::filter(total > 0)

# Fisher's exact test of every row against row 1 on a 2x2 table of
# (total - onco, onco), followed by Benjamini-Hochberg adjustment.
enrich_df$onco.p.value <- p.adjust(
  sapply(seq_len(nrow(enrich_df)), function(row_i){

    ref_cnt <- as.numeric(enrich_df[1, c("total", "onco"), drop = TRUE])
    qry_cnt <- as.numeric(enrich_df[row_i, c("total", "onco"), drop = TRUE])

    contingency <- matrix(
      c(abs(diff(ref_cnt)), ref_cnt[2], abs(diff(qry_cnt)), qry_cnt[2]),
      nrow = 2
    )

    fisher.test(contingency)$p.value

  }),
  method = "BH"
)

# Same test for the special gene counts.
enrich_df$special.p.value <- p.adjust(
  sapply(seq_len(nrow(enrich_df)), function(row_i){

    ref_cnt <- as.numeric(enrich_df[1, c("total", "special"), drop = TRUE])
    qry_cnt <- as.numeric(enrich_df[row_i, c("total", "special"), drop = TRUE])

    contingency <- matrix(
      c(abs(diff(ref_cnt)), ref_cnt[2], abs(diff(qry_cnt)), qry_cnt[2]),
      nrow = 2
    )

    fisher.test(contingency)$p.value

  }),
  method = "BH"
)

# Power of the Fisher test for each row compared against row 1, for the onco
# and the special gene proportions, followed by a fixed column order.
# NOTE(review): statmod::power.fisher.test is believed to estimate power by
# simulation, so values may differ between runs unless a seed is set, and the
# order of these two calls would then matter -- confirm.
enrich_df <- enrich_df %>%
  dplyr::mutate(
    # p1/n1 always come from row 1; p2/n2 from the row being evaluated.
    onco.power = sapply(seq_len(n()), function(i){

      statmod::power.fisher.test(
        p1 = onco[1] / total[1],
        p2 = onco[i] / total[i],
        n1 = total[1], n2 = total[i]
      )

    }),
    special.power = sapply(seq_len(n()), function(i){

      statmod::power.fisher.test(
        p1 = special[1] / total[1],
        p2 = special[i] / total[i],
        n1 = total[1], n2 = total[i]
      )

    })
  ) %>%
  dplyr::select(
    origin, annotation, total, 
    onco, onco.p.value, onco.power, 
    special, special.p.value, special.power
  ) 

## Off-target sequence analysis ----
# Absolute distance between each matched alignment and the position at the
# end of its edit.site string (start is used on "+", end otherwise), with the
# annotation attached per alt_specimen.
ft_MESL <- dplyr::left_join(
  dplyr::mutate(
    matched_algns,
    edit.site.dist = abs(ifelse(
      strand == "+",
      start - as.numeric(stringr::str_extract(edit.site, "[0-9]+$")),
      as.numeric(stringr::str_extract(edit.site, "[0-9]+$")) - end
    ))
  ),
  dplyr::select(combo_overview, alt_specimen, annotation),
  by = "alt_specimen"
)

if( nrow(ft_MESL) == 0 ){

  # No matched alignments: empty table with the expected columns.
  ft_MESL <- ft_MESL %>%
    dplyr::mutate(MESL = numeric(0), gene_id = character(0)) %>%
    dplyr::select(annotation, edit.site, gene_id, MESL)

}else{

  # Score every alignment against the on-target density of its annotation and
  # keep, per annotation / edit site / gene, the largest score as a percent
  # (never below 0).
  ft_MESL <- ft_MESL %>%
    dplyr::group_by(annotation) %>%
    dplyr::mutate(
      ESL = predictESProb(
        z = edit.site.dist,
        density = on_tar_dens[[unique(annotation)]]
      ),
      gene_id = matched_summary$gene_id[
        match(edit.site, matched_summary$edit.site)
      ]
    ) %>%
    dplyr::group_by(annotation, edit.site, gene_id) %>%
    dplyr::summarise(MESL = 100 * max(c(0,ESL), na.rm = TRUE)) %>%
    dplyr::ungroup()

}

# Matched off-target sequences with their annotation and MESL score attached.
ft_seqs <- dplyr::left_join(
  dplyr::left_join(
    dplyr::select(
      matched_summary,
      alt_specimen, aligned.sequence, target.match, edit.site,
      target.mismatch, on.off.target, abund, gene_id
    ),
    dplyr::select(combo_overview, alt_specimen, annotation),
    by = "alt_specimen"
  ),
  ft_MESL,
  by = c("annotation", "edit.site", "gene_id")
)

# Sum abundance and keep the largest MESL per sequence. Grouping is by
# annotation when support data was supplied, otherwise by combo.
# NOTE(review): `combo` is not among the columns selected into ft_seqs above;
# confirm where it is expected to come from when args$support is NULL.
if( !is.null(args$support) ){

  ft_seqs <- ft_seqs %>%
    dplyr::group_by(
      annotation, target.match, edit.site, aligned.sequence,
      target.mismatch, on.off.target, gene_id
    ) %>%
    dplyr::summarise(abund = sum(abund), MESL = max(MESL, na.rm = TRUE))

}else{

  ft_seqs <- ft_seqs %>%
    dplyr::group_by(
      combo, target.match, edit.site, aligned.sequence,
      target.mismatch, on.off.target, gene_id
    ) %>%
    dplyr::summarise(abund = sum(abund), MESL = max(MESL, na.rm = TRUE))

}

if( nrow(matched_summary) == 0 ){
  ft_seqs <- ft_seqs[0,]
}

# Order by abundance, then MESL (both descending), then mismatch count; keep
# only the leading word of on.off.target and apply the output column names.
ft_seqs <- ft_seqs %>%
  dplyr::arrange(desc(abund), desc(MESL), target.mismatch) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    on.off.target = stringr::str_extract(on.off.target, "[\\w]+")
  ) %>%
  dplyr::rename(
    "target" = on.off.target,
    "mismatch" = target.mismatch,
    "target.seq" = target.match,
    "abund" = abund
  )

# Split into one table per target sequence. With support data the list is
# keyed "<annotation> - <target.seq>" in annotation/target order; without it
# the key is "<target.seq> (<combo>)".
if( !is.null(args$support) ){

  ft_seqs_conds <- with(
    dplyr::arrange(ft_seqs, annotation, target.seq),
    unique(paste0(annotation, " - ", target.seq))
  )

  ft_seqs_list <- split(
    x = ft_seqs,
    f = factor(
      paste0(ft_seqs$annotation, " - ", ft_seqs$target.seq),
      levels = ft_seqs_conds
    )
  )

}else{

  ft_seqs_list <- split(
    ft_seqs, paste0(ft_seqs$target.seq, " (", ft_seqs$combo, ")")
  )

}

if( nrow(matched_summary) == 0 ){
  ft_seqs_list <- NULL
}


# Data consolidated for output object ----
# Human readable list of config names: the name itself for a single config,
# otherwise "a, b, and c".
set_names <- local({

  config_names <- names(configs)
  num_configs <- length(configs)

  ifelse(
    num_configs == 1,
    config_names,
    paste0(
      paste(config_names[seq_len(num_configs - 1)], collapse = ", "),
      ", and ",
      config_names[num_configs]
    )
  )

})

## Write output file
# Everything gathered by the evaluation is bundled into one nested list and
# serialized to args$output.
saveRDS(
  object = list(
    params = list(
      set_names = set_names,
      configs = configs,
      soft_version = soft_version,
      build_version = build_version,
      input_vc = vc_check,
      specimen_levels = specimen_levels,
      alt_specimen_levels = alt_specimen_levels
    ),
    spec_info = list(
      sample_info = sample_info,
      target_seqs = target_seqs,
      target_tbl = target_tbl,
      on_targets = on_targets,
      combos_set_tbl = combos_set_tbl,
      treatment = treatment,
      treatment_df = treatment_df,
      nuclease = nuclease,
      nuclease_df = nuclease_df,
      nuclease_treatment_df = nuclease_treatment_df,
      nuclease_profiles = nuc_profiles,
      supp_data = supp_data,
      spec_overview = spec_overview,
      annot_overview = annot_overview,
      combo_overview = combo_overview
    ),
    incorp_data = list(
      algnmts = algnmts,
      probable_algns = probable_algns,
      matched_algns = matched_algns,
      matched_summary = matched_summary,
      paired_algns = paired_algns,
      paired_regions = paired_regions,
      pile_up_algns = pile_up_algns,
      pile_up_summary = pile_up_summary
    ),
    summary_tbls = list(
      ot_tbl_summary = ot_tbl_summary,
      ot_eff_summary = tbl_ot_eff,
      ft_tbl_summary = ft_tbl_summary,
      eval_summary = eval_summary
    ),
    edit_models = list(
      on_tar_dists = on_tar_dists,
      on_tar_dens = on_tar_dens,
      sites_included = sites_included
    ),
    enrich_data = list(
      rand_sites = rand_sites,
      rand_df = rand_df,
      enrich_df = enrich_df
    ),
    ft_data = ft_seqs_list
  ),
  file = args$output
)

# Fail loudly when the file did not appear on disk.
if( !file.exists(args$output) ){
  stop("\n  Cannot verify existence of output file:\n  ", args$output, "\n")
}

if( !args$quiet ){
  cat("Evaluation complete, output writen to:\n  ", args$output, "\n")
}

q(status = 0)

R From line 9 of rscripts/evaluate_incorp_data.R

options(stringsAsFactors = FALSE, scipen = 99, width = 999)

# Directory holding this script, taken from the "--file=" entry of the full
# command line.
code_dir <- local({

  invocation <- commandArgs(trailingOnly = FALSE)
  script_flag <- grep("--file=", invocation, value = TRUE)

  dirname(sub(pattern = "--file=", replacement = "", x = script_flag))

})

# Help texts for the argument parser are kept in a YAML file next to the code.
desc <- yaml::yaml.load_file(
  file.path(code_dir, "descriptions/filt.desc.yml")
)


# Set up arguments and workflow of script --------------------------------------
## Argument parser =============================================================
parser <- argparse::ArgumentParser(
  description = desc$program_short_description,
  usage = "nuc filt <seqFile(s)> [-h/--help, -v/--version] [optional args]"
)

# Every entry is the full argument list of one parser$add_argument() call.
# Entries are registered in the order listed.
local({

  arg_specs <- list(
    list("seqFile", nargs = "+", type = "character", help = desc$seqFile),
    list(
      "-o", "--output", nargs = "+", type = "character", help = desc$output
    ),
    list("-i", "--index", nargs = "*", type = "character", help = desc$index),
    list("-s", "--seq", nargs = "*", type = "character", help = desc$seq),
    list(
      "-m", "--mismatch", nargs = "+", type = "integer", default = 0,
      help = desc$mismatch
    ),
    list(
      "-r", "--refseqs", nargs = "+", type = "character", help = desc$refseqs
    ),
    list(
      "--aligntype", nargs = 1, type = "character", default = "ov",
      help = desc$aligntype
    ),
    list(
      "--pctID", nargs = 1, type = "integer", default = 95,
      help = desc$pctID
    ),
    list(
      "--pctIDtype", nargs = 1, type = "character", default = "global",
      help = desc$pctIDtype
    ),
    list(
      "--subMatAdj", nargs = "+", type = "character", default = FALSE,
      help = desc$subMatAdj
    ),
    list(
      "--gapOpen", nargs = 1, type = "integer", default = 10,
      help = desc$gapOpen
    ),
    list(
      "--gapExt", nargs = 1, type = "integer", default = 4,
      help = desc$gapExt
    ),
    list(
      "--minAlignLength", nargs = 1, type = "integer", default = 20,
      help = desc$minAlignLength
    ),
    list(
      "--readNamePattern", nargs = 1, type = "character",
      default = "[\\w:-]+", help = desc$readNamePattern
    ),
    list(
      "-c", "--cores", nargs = 1, default = 1, type = "integer",
      help = desc$cores
    ),
    list(
      "--stat", nargs = 1, default = FALSE, type = "character",
      help = desc$stat
    ),
    list("--header", action = "store_true", help = desc$header),
    list("-n", "--negSelect", action = "store_true", help = desc$negSelect),
    list("--any", action = "store_true", help = desc$any),
    list("--compress", action = "store_true", help = desc$compress),
    list("-q", "--quiet", action = "store_true", help = desc$quiet)
  )

  for( spec in arg_specs ){
    do.call(parser$add_argument, spec)
  }

})


## Parse cmd line args =========================================================
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

## Checks and balances =========================================================
# Parallel operation has not been constructed yet, so any request for more
# than one core stops the script. Values below one fall back to a single core.
if( args$cores > 1 ){

  stop("\n  Parallel options have not yet been implemented.\n")

}else if( args$cores < 1 ){

  args$cores <- 1

}

# Each input file needs its own output file.
if( length(args$seqFile) != length(args$output) ){
  stop(
    "\n  The same number of input and output file names need to be provided.\n")
}

if( length(args$index) > 1 ){
  stop(
    "\n  Only one index file can be used at a time. ",
    "Please consolidate indices.\n"
  )
}

# A mismatch allowance is needed for every filtering sequence; when the counts
# differ, the first value is applied to all sequences.
if( length(args$mismatch) != length(args$seq) ){
  args$mismatch <- rep(args$mismatch[1], length(args$seq))
}

if( length(args$seq) > 0 ){

  # Upper-case first so lower-case "u" is converted to "T" as well; converting
  # before upper-casing left a "U" behind that then failed the check below.
  args$seq <- gsub("U", "T", toupper(args$seq))

  # Every character must be a code known to Biostrings::IUPAC_CODE_MAP.
  if( 
    any(!unlist(strsplit(paste(args$seq, collapse = ""), "")) %in% 
      names(Biostrings::IUPAC_CODE_MAP)) 
  ){
    stop("\n  Unknown nucleotides detected in input filtering sequence(s).\n")
  }

}

if( !args$pctIDtype %in% c("global", "local") ){
  stop("\n  Input '--pctIDtype' must be either 'local' or 'global' [default].")
}

# Sequence file extension: a literal dot followed by fasta, fastq, fa or fq,
# ending the file name or followed by another extension (e.g. ".gz").
seq_ext_pattern <- "\\.(fasta|fastq|fa|fq)(?=\\.|$)"

# Determine input sequence file type(s)
# The type is resolved for every input file from its own file name, so inputs
# of different formats are each read with the matching reader.
seq_type <- stringr::str_extract(basename(args$seqFile), seq_ext_pattern)

if( any(!seq_type %in% c(".fa", ".fq", ".fasta", ".fastq")) ){

  stop(
    "\n  Unrecognized sequence file type, please convert to '*.fasta' or ", 
    "'*.fastq'. Gzip compression is acceptable as well.\n"
  )

}

seq_type <- ifelse(seq_type %in% c(".fa", ".fasta"), "fasta", "fastq")

# Determine sequence output file type(s)
if( length(args$output) > 0 ){

  out_type <- stringr::str_extract(basename(args$output), seq_ext_pattern)

  if( any(!out_type %in% c(".fa", ".fq", ".fasta", ".fastq")) ){

    stop(
      "\n  Unrecognized output sequence file type, please change to ", 
      "'*.fasta' or '*.fastq'.\n"
    )

  }

  out_type <- ifelse(out_type %in% c(".fa", ".fasta"), "fasta", "fastq")

}

# Identify filtering type
# Codes of the methods in play: 1 = index file, 2 = several input files,
# 3 = filtering sequence(s), 4 = reference sequence(s).
select_methods <- c(
  if( length(args$index) == 1 ) 1,
  if( length(args$seqFile) > 1 ) 2,
  if( length(args$seq) > 0 ) 3,
  if( length(args$refseqs) > 0 ) 4
)

methods <- c(
  "input indices", "multiple file input indices",
  "sequence content", "sequence matching reference(s)"
)[select_methods]

# One-sentence description of the selection, printed with the inputs.
filt_type <- paste0(
  if( args$negSelect ) "negative" else "positive",
  " selection using ",
  paste(methods, collapse = if( args$any ) " or " else " and "),
  "."
)


## Input arguments table =======================================================
# Two-column table of argument names and their comma-joined values.
input_table <- data.frame(
  "Variables" = paste0(names(args), " :"),
  "Values" = unname(vapply(args, paste, character(1), collapse = ", "))
)

# Fixed display order of the arguments.
input_table <- input_table[
  match(
    c(
      "seqFile :", "output :", "index :", "header :", "negSelect :", "seq :",
      "mismatch :", "refseqs :", "aligntype :", "pctID :", "pctIDtype :",
      "subMatAdj :", "gapOpen :", "gapExt :", "minAlignLength :",
      "readNamePattern :", "compress :", "cores :"
    ),
    input_table$Variables
  ),
]

if( !args$quiet ){

  cat("\nFilter Inputs:\n")
  print(
    data.frame(input_table, row.names = NULL),
    right = FALSE,
    row.names = FALSE
  )
  cat("\n  Filtering methods include", filt_type, "\n")

}


# Additional supporting functions ----------------------------------------------
# Sourced in this order from the supporting_scripts directory next to the code.
invisible(lapply(
  c(
    "writeSeqFiles.R", "nucleotideScoringMatrices.R",
    "substituteAdjustments.R", "utility_funcs.R"
  ),
  function(support_script){
    source(file.path(code_dir, "supporting_scripts", support_script))
  }
))

#' Filter sequences based on input arguments
#'
#' This function is the basis for the script. Depending on which arguments
#' were supplied, up to four selection methods are applied and their results
#' combined into one set of indices per input file:
#'   1. read names shared across multiple input files,
#'   2. read names listed in an index file,
#'   3. reads containing the requested nucleotide sequence(s),
#'   4. reads aligning to the reference sequence(s).
#'
#' @param input.seqs list of sequence objects, one per input file, as returned
#' by ShortRead::readFasta() or ShortRead::readFastq().
#' @param args list of parsed command line arguments. Elements read here:
#' readNamePattern, negSelect, any, index, header, seq, mismatch, refseqs,
#' aligntype, pctID, pctIDtype, subMatAdj, gapOpen, gapExt, minAlignLength.
#' @return list with one element per entry of `input.seqs`, each a vector of
#' the indices of the sequences selected from that input. With `args$any` the
#' indices passing at least one method are returned, otherwise only those
#' passing every method that was applied.
filterSeqFile <- function(input.seqs, args){

  # Sequence file extension: a literal dot followed by fasta, fastq, fa or fq,
  # ending the file name or followed by another extension (e.g. ".gz").
  seq_ext_pattern <- "\\.(fasta|fastq|fa|fq)(?=\\.|$)"

  # Results of the individual methods; NULL means "method not applied".
  multi_filter_idx <- NULL
  index_filter_idx <- NULL
  seq_filter_idx <- NULL
  ref_filter_idx <- NULL

  ## Identify sequence names matching across multiple sequence files
  if( length(input.seqs) > 1 ){

    multi_input_ids <- lapply(input.seqs, function(seqs){
      stringr::str_extract(
        string = as.character(unique(ShortRead::id(seqs))), 
        pattern = args$readNamePattern
      )
    })

    # Number of input files each read name was found in
    multi_input_tbl <- table(unlist(multi_input_ids))

    if( args$negSelect ){
      multi_input_names <- names(multi_input_tbl)[which(multi_input_tbl == 1)]
    }else if( args$any ){
      multi_input_names <- names(multi_input_tbl)[which(multi_input_tbl > 1)]
    }else{
      multi_input_names <- names(multi_input_tbl)[
        which(multi_input_tbl == length(input.seqs))
      ]
    }

    multi_filter_idx <- lapply(input.seqs, function(seqs, idx){

        ids <- stringr::str_extract(
          string = as.character(ShortRead::id(seqs)), 
          pattern = args$readNamePattern
        )

        which(ids %in% idx)

      }, 
      idx = multi_input_names
    )

  }


  ## Identify sequence names by matching to index file
  # NOTE(review): args$negSelect is not applied to this method, unlike the
  # other three -- confirm whether that is intended.
  if( length(args$index) == 1 ){

    input_ids <- lapply(input.seqs, function(seqs){
      stringr::str_extract(
        string = as.character(ShortRead::id(seqs)), 
        pattern = args$readNamePattern
      )
    })

    index_df <- read.delim(args$index, header = args$header)

    # Read names are taken from the first column of the index file
    index <- stringr::str_extract(
      string = as.character(index_df[,1]), 
      pattern = args$readNamePattern
    )

    index_filter_idx <- lapply(input_ids, function(ids, idx){
        which(ids %in% idx)
      }, 
      idx = index
    )

  }


  ## Identify sequences by matching input nucleotide sequence
  if( length(args$seq) > 0 ){

    seq_filter_idx <- lapply(
      input.seqs, 
      function(seqs, pattern, mismatch, neg, any){

        # Indices selected by each pattern with its own mismatch allowance
        vcp <- mapply(function(pat, mis, seqs, neg){

            v <- Biostrings::vcountPattern(
              pat, ShortRead::sread(seqs), max.mismatch = mis, fixed = FALSE)

            if( neg ){
              return(which(v == 0))
            }else{
              return(which(v > 0))
            }

          }, 
          pat = pattern, mis = mismatch, 
          MoreArgs = list(seqs = seqs, neg = neg),
          SIMPLIFY = FALSE
        )

        # Number of patterns that selected each index
        vcp_tbl <- table(unlist(vcp))

        if( any ){
          return(as.numeric(names(vcp_tbl[which(vcp_tbl >= 1)])))
        }else{
          return(as.numeric(names(vcp_tbl[which(vcp_tbl == length(pattern))])))
        }

      }, 
      pattern = args$seq, 
      mismatch = args$mismatch, 
      neg = args$negSelect, 
      any = args$any
    )

  }


  ## Identify sequence that match to reference sequence(s)
  if( length(args$refseqs) > 0 ){

    ref_filter_idx <- lapply(
      input.seqs,
      function(seqs, refs, alntype, pctID, idtype, subadj, gapOpen, gapExt, 
               minAlignLength, neg){

        # Load reference sequences, resolving the type of every reference file
        # from its own file name.
        ref_types <- stringr::str_extract(basename(refs), seq_ext_pattern)

        if( any(!ref_types %in% c(".fa", ".fq", ".fasta", ".fastq")) ){

          stop(
            "\n  Unrecognized sequence file type, please convert to '*.fasta' or ", 
            "'*.fastq'. Gzip compression is acceptable as well.\n"
          )

        }

        ref_types <- ifelse(ref_types %in% c(".fa", ".fasta"), "fasta", "fastq")

        refs <- mapply(
          function(file, file_type){

            if( file_type == "fasta" ){
              return(ShortRead::readFasta(file))
            }else{
              return(ShortRead::readFastq(file))
            }

          }, 
          file = refs, 
          file_type = ref_types, 
          SIMPLIFY = FALSE
        )

        if( length(refs) > 1 ){
          refs <- serialAppendS4(refs)
        }else{
          refs <- refs[[1]]
        }

        # Alignment type
        align_types <- structure(
          c("global", "local", "overlap", "global-local", "local-global"),
          names = c("gg", "ll", "ov", "gl", "lg")
        )

        alntype <- align_types[alntype]

        # Score only?
        SO <- idtype == 'global'

        # Interpret adjustment if any: entries starting with "i" apply to the
        # input sequences, entries starting with "r" to the references.
        input_adjs <- stringr::str_extract(
          subadj[grep("^i", subadj)], "[\\w]{2}$"
        )

        refer_adjs <- stringr::str_extract(
          subadj[grep("^r", subadj)], "[\\w]{2}$"
        )

        # Apply adjustments and convert to character vectors
        seqs <- substituteAdjustments(ShortRead::sread(seqs), input_adjs)
        refs <- substituteAdjustments(ShortRead::sread(refs), refer_adjs)

        # One alignment result per reference, each covering all reads
        alignments <- lapply(
          refs, 
          function(ref, seqs, alntype, gapOpen, gapExt, SO){

            Biostrings::pairwiseAlignment(
              pattern = seqs, 
              subject = ref, 
              type = alntype, 
              gapOpening = gapOpen,
              gapExtension = gapExt,
              substitutionMatrix = usanmat(),
              scoreOnly = SO
            )

          },
          seqs = Biostrings::DNAStringSet(seqs),
          alntype = alntype,
          gapOpen = gapOpen,
          gapExt = gapExt,
          SO = SO
        )

        if( idtype == "global" ){

          # Best score over all references, relative to the read length
          max_score <- 100 * apply(
            matrix(unlist(alignments), ncol = length(refs)), 1, max
          ) / nchar(seqs)

          # Only scores are available in this mode and they are normalized by
          # the full read length, so the read length also serves as the
          # aligned length for the minAlignLength check.
          top_score_len <- nchar(seqs)

        }else if( idtype == "local" ){

          # Rows are reads, columns are references
          local_score <- matrix(
            unlist(lapply(alignments, function(x) Biostrings::score(x))), 
            ncol = length(refs)
          )

          local_size <- matrix(
            unlist(lapply(alignments, function(x) x@pattern@range@width)),
            ncol = length(refs)
          )

          pct_ident <- 100 * local_score / local_size

          # An empty alignment (width 0) is treated as having no identity
          pct_ident[!is.finite(pct_ident)] <- 0

          # Exactly one best reference per read (the first one on ties), so
          # the vectors below always stay parallel to the reads.
          best_ref <- vapply(
            seq_len(nrow(pct_ident)), 
            function(i) which.max(pct_ident[i, ]), 
            integer(1)
          )

          best_cell <- cbind(seq_len(nrow(pct_ident)), best_ref)

          top_score_len <- local_size[best_cell]
          max_score <- pct_ident[best_cell]

        }else{

          stop("\n  Input error, pctIDtype must be either 'local' or 'global'.")

        }

        # Positive selection needs both criteria to pass; negative selection
        # returns the reads failing either of them.
        if( neg ){
          return( which(max_score < pctID | top_score_len < minAlignLength) )
        }else{
          return( which(max_score >= pctID & top_score_len >= minAlignLength) )
        }

      },
      refs = args$refseqs,
      alntype = args$aligntype,
      pctID = args$pctID,
      idtype = args$pctIDtype,
      subadj = args$subMatAdj,
      gapOpen = args$gapOpen,
      gapExt = args$gapExt,
      minAlignLength = args$minAlignLength,
      neg = args$negSelect
    )

  }

  # Consolidate indices from each method employed 
  applied_filters <- Filter(
    Negate(is.null), 
    list(multi_filter_idx, index_filter_idx, seq_filter_idx, ref_filter_idx)
  )

  lapply(seq_along(input.seqs), function(i){

    idx <- unlist(lapply(applied_filters, "[[", i))

    if( args$any ){
      return(unique(idx))
    }else{
      # Keep the indices reported by every applied method
      idx_tbl <- table(idx)
      return(as.numeric(names(idx_tbl)[idx_tbl == length(applied_filters)]))
    }

  })

}


# Identify indices of input file(s) for filtering ------------------------------
# Read every input file with the reader matching its detected type.
input_seqs <- Map(
  function(file, file_type){

    read_seqs <- if( file_type == "fasta" ){
      ShortRead::readFasta
    }else{
      ShortRead::readFastq
    }

    read_seqs(file)

  },
  file = args$seqFile,
  file_type = seq_type
)

output_indices <- filterSeqFile(input_seqs, args)

# Subset each input to the indices selected for it.
output_seqs <- Map(
  function(seqs, idx) seqs[idx],
  seqs = input_seqs,
  idx = output_indices
)


# Write output files -----------------------------------------------------------
# Optional stat file: one "<sample>,reads,<count>" line per output file.
if( args$stat != FALSE ){

  # Sample name: last path component of the output, up to the first ".fa"
  sample_name <- vapply(
    strsplit(args$output, "/", fixed = TRUE),
    function(parts) parts[[length(parts)]],
    character(1)
  )

  sample_name <- vapply(
    strsplit(sample_name, ".fa", fixed = TRUE),
    function(parts) parts[[1]],
    character(1)
  )

  write.table(
    data.frame(
      sampleName = sample_name,
      metric = "reads",
      count = lengths(output_seqs)
    ),
    file = args$stat,
    sep = ",",
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE
  )

}


# Remove any existing output files before writing the filtered sequences.
null <- sapply(args$output, unlink)

null <- mapply(
  writeSeqFiles,
  seqs = output_seqs,
  file = args$output,
  MoreArgs = list(compress = args$compress)
)

q()

R From line 10 of rscripts/filt.R

options(stringsAsFactors = FALSE, scipen = 99, width = 180)


# Set up and gather command line arguments ----
parser <- argparse::ArgumentParser(
  description = "Generate an iGUIDE report for input evaluation data.",
  usage = paste0(
    "Rscript generate_IGUIDE_report.R <eval.rds> -o <output> ",
    "[-h/--help, -v/--version] [optional args]"
  )
)

# Positional input: the evaluation dataset
parser$add_argument(
  "eval", nargs = 1, type = "character",
  help = paste0(
    "Evaluation dataset, in rds format. Can be generated by the ",
    "'iguide eval' subcommand."
  )
)

# Required output location
parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", required = TRUE,
  help = "Output report file, extension not required."
)

# Optional extras written along with the report
parser$add_argument(
  "-b", "--tables", action = "store_true",
  help = "Generate tables along with output report (csv formats)."
)

parser$add_argument(
  "-f", "--figures", action = "store_true",
  help = "Generate figures along with output report (pdf and png formats)."
)

parser$add_argument(
  "-d", "--data", action = "store_true",
  help = "Data to generate the report will be saved as an R image with output."
)

# Report appearance
parser$add_argument(
  "-t", "--format", nargs = 1, type = "character", default = "html",
  help = "Output format for report. Either 'pdf' or 'html' (default)."
)

parser$add_argument(
  "-g", "--graphic", action = "store_true",
  help = "Includes an opening graphic on the report."
)

# Locations of the template and of the install
parser$add_argument(
  "--template", nargs = 1, type = "character",
  default = "tools/rscripts/report_templates/iGUIDE_report_template.Rmd",
  help = "File path to standard or custom iGUIDE report template."
)

parser$add_argument(
  "--iguide_dir", nargs = 1, type = "character", default = "IGUIDE_DIR",
  help = "iGUIDE install directory path, do not change for normal applications."
)


args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# --iguide_dir is either an existing directory or the name of an environment
# variable holding the install path.
root_dir <- if( dir.exists(args$iguide_dir) ){
  args$iguide_dir
}else{
  Sys.getenv(args$iguide_dir)
}

if( !dir.exists(root_dir) ){
  stop(paste0("\n  Cannot find install path to iGUIDE: ", root_dir, ".\n"))
}

args$iguide_dir <- root_dir

# Supported report formats and the matching output format names
report_formats <- c("html" = "html_document", "pdf" = "pdf_document")

if( !args$format %in% names(report_formats) ){
  stop("Please input either 'html' or 'pdf' for format.\n",
       "Other formats not supported.")
}

output_format <- report_formats[args$format]

## Resolve template file path.
# A template found under the install directory wins over the path as given.
template_path <- file.path(args$template)

if( file.exists(file.path(root_dir, args$template)) ){

  template_path <- file.path(root_dir, args$template)

}else if( !file.exists(template_path) ){

  stop("\nCannot find template file: ", args$template, ".\n")

}

template_path <- normalizePath(template_path)


## Construct input table and print to terminal
## One row per parsed argument; multi-valued arguments are joined with ", ".
input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = sapply(seq_along(args), function(i){
    paste(args[[i]], collapse = ", ")
  })
)

## Reorder the rows into a fixed display order.
input_table <- input_table[
  match(
    c(
      "eval :", "output :", "tables :", "figures :", "data :", "graphic :", 
      "format :", "template :", "iguide_dir :"
    ),
    input_table$Variables),
]

cat("\niGUIDE Report Inputs:\n")

## Left-aligned, without row names.
print(
  data.frame(input_table),
  right = FALSE, 
  row.names = FALSE
)


# Load dependencies ----
cat("\nLoading dependencies.\n")

add_packs <- c("magrittr", "knitr", "iguideSupport")

## require() returns TRUE/FALSE per package instead of raising an error, which
## lets every missing package be reported at once below.
add_packs_loaded <- suppressMessages(
  sapply(add_packs, require, character.only = TRUE)
)

## Print the load status of each package and abort if any failed to load.
if( !all(add_packs_loaded) ){

  print(
    data.frame(
      "R-Packages" = names(add_packs_loaded), 
      "Loaded" = add_packs_loaded
    ), 
    right = FALSE,
    row.names = FALSE
  )

  stop("Check dependancies.\n")

}


# Import metadata and consolidate into report objects ----
cat("Importing evaluation data...\n")

## The evaluation file is tried relative to the install directory first, then
## as given.
if( file.exists(file.path(root_dir, args$eval)) ){

  eval_path <- normalizePath(file.path(root_dir, args$eval))

}else if( file.exists(file.path(args$eval)) ){

  eval_path <- normalizePath(file.path(args$eval))

}else{

  stop("\n  Cannot find evaluation dataset: ", args$eval, "\n")

}

## The evaluation dataset is an RDS object; the sections below read its
## elements with `$`.
eval_data <- readRDS(eval_path)

## Configuration
configs <- eval_data$params$configs

## Load reference genome
## The reference is named by the 'Ref_Genome' entry of the run config(s). A
## value containing a literal ".fa" (which covers ".fa", ".fasta" and ".fastq")
## is treated as a sequence file, looked up under the install directory first
## and then as given. Any other value is matched against the installed
## BSgenome packages.
ref_genome_name <- unique(sapply(configs, "[[", "Ref_Genome"))

## fixed = TRUE matches the dot literally. As a regular expression ".fa" also
## matched any character followed by "fa", so BSgenome names such as
## "BSgenome.Cfamiliaris.UCSC.canFam3" were wrongly handled as file paths.
if( grepl(".fa", ref_genome_name, fixed = TRUE) ){

  ref_genome_in_root <- file.path(root_dir, ref_genome_name)

  if( !(file.exists(ref_genome_in_root) | file.exists(ref_genome_name)) ){
    stop("Specified reference genome file not found.")
  }

  ref_file_type <- ifelse(
    grepl(".fastq", ref_genome_name, fixed = TRUE), 
    "fastq", 
    "fasta"
  )

  ## Prefer the copy under the install directory when both locations exist.
  if( file.exists(ref_genome_in_root) ){

    ref_genome <- Biostrings::readDNAStringSet(
      filepath = ref_genome_in_root,
      format = ref_file_type
    )

  }else{

    ref_genome <- Biostrings::readDNAStringSet(
      filepath = ref_genome_name, 
      format = ref_file_type
    )

  }

}else{

  ## The configured name is used as a pattern and must identify exactly one
  ## installed BSgenome package.
  genome <- grep(
    pattern = ref_genome_name, 
    x = unique(BSgenome::installed.genomes()), 
    value = TRUE
  )

  if( length(genome) == 0 ){

    cat("\nInstalled genomes include:")
    print(unique(BSgenome::installed.genomes()))
    cat("\nSelected reference genome not in list.")
    stop("Error: Genome not available.")

  }else if( length(genome) > 1 ){

    cat("\nInstalled genomes include:")
    print(unique(BSgenome::installed.genomes()))
    cat(
      "\nPlease be more specific about reference genome.",
      "Multiple matches to input."
    )
    stop("Error: Multiple genomes requested.")

  }

  ## Attach the package, then fetch the object that carries the package name.
  suppressMessages(library(genome, character.only = TRUE))

  ref_genome <- get(genome)

}

## Get versioning and params

soft_version <- eval_data$params$soft_version

build_version <- eval_data$params$build_version

## Unique, sorted signatures across all configs, joined into one string.
signature <- paste(
  unique(sort(unlist(lapply(configs, "[[", "signature")))), 
  collapse = ", ")

## Determine processing parameters
## Some parameters will need to be an "all or nothing" approach, including:
##   - UMItags
##   - recoverMultihits
## Depending on these parameters others (upstream/downstream_dist, ...) may need
## to be consistent between runs otherwise, the primary config file (first one),
## will be used for parameterization.

## TRUE only when every config enables the option.
umitag_option <- all(unlist(lapply(configs, "[[", "UMItags")))
multihit_option <- all(unlist(lapply(configs, "[[", "recoverMultihits")))

## First distinct (lower-cased) abundance method across configs; NA when no
## config provides one.
abundance_option <- unique(
  tolower(unlist(lapply(configs, "[[", "Abundance_Method")))
)[1]

if( is.na(abundance_option) ) abundance_option <- "Fragment"

## UMI-based abundance requires UMItag data in every config.
if( abundance_option == "umi" & !umitag_option ){
  stop(
    "\n  Abundance method has been set to use UMItags, yet the current",
    "\n  configuration does not capture UMItag data (UMItags : FALSE).",
    "\n  Please correct this inconsistency before continuing analysis."
  )
}

if( multihit_option ){

  ## With multihit recovery enabled for all runs, the distance and pile-up
  ## criteria must agree across configs.
  upstream_dist <- unique(sapply(configs, function(x) x$upstreamDist))
  downstream_dist <- unique(sapply(configs, function(x) x$downstreamDist))
  pile_up_min <- unique(sapply(configs, function(x) x$pileUpMin))

  if( 
    length(upstream_dist) > 1 | 
    length(downstream_dist) > 1 | 
    length(pile_up_min) > 1 
  ){

    stop(
      "\n  Inconsistant upstream or downstream distances between config files.\n",
      "  Comparisons between groups with different run specific criteria\n", 
      "  is not recommended when considering the recover multihit option.\n"
    )

  }

}else{

  ## Otherwise the first config supplies the criteria.
  upstream_dist <- configs[[1]]$upstreamDist
  downstream_dist <- configs[[1]]$downstreamDist
  pile_up_min <- configs[[1]]$pileUpMin

}

## Combine sampleInfo files
## The objects below are read from the evaluation dataset under names that are
## not referenced again in this script (presumably consumed by the Rmd
## template).

sample_info <- eval_data$spec_info$sample_info

specimen_levels <- eval_data$params$specimen_levels
alt_specimen_levels <- eval_data$params$alt_specimen_levels

## TRUE when the supplementary data table has at least one row.
support_present <- nrow(eval_data$spec_info$supp_data) > 0

## Identify all targets used
## Drop the run identifier, de-duplicate, and give display column names.
target_tbl <- eval_data$spec_info$target_tbl %>%
  dplyr::select(-run_set) %>%
  dplyr::distinct() %>%
  dplyr::rename(
    "Nuclease" = nuclease,
    "Target Name" = target,
    "Sequence" = sequence
  )

## Identify on-target edit sites
on_targets <- eval_data$spec_info$on_targets

## Treatment across runs
treatment <- eval_data$spec_info$treatment
treatment_df <- eval_data$spec_info$treatment_df

## Nuclease profiles
nuc_profiles <- eval_data$spec_info$nuclease_profiles

## Combo information
## Nuclease and treatment assignments joined per run and specimen.
nuc_treatment_unmod_df <- eval_data$spec_info$nuclease_df %>%
  dplyr::full_join(treatment_df, by = c("run_set", "specimen"))

nuclease_treatment_df <- eval_data$spec_info$nuclease_treatment_df
combos_set_tbl <- eval_data$spec_info$combos_set_tbl

## Display table of combinations; the combo label is wrapped in parentheses.
combos_tbl <- combos_set_tbl %>%
  dplyr::select(-run_set) %>%
  dplyr::distinct() %>%
  dplyr::mutate(combo = paste0("(", combo, ")")) %>%
  dplyr::rename(
    "Combination" = combo,
    "Nuclease" = nuclease,
    "Treatment" = treatment
  )

## Load in supporting information ----
supp_data <- eval_data$spec_info$supp_data

## Consolidate supplementary data ----
## Relabel specimens: rows whose nuclease or treatment is "Mock" keep their
## specimen name, all others take the matching alt_specimen label.
spec_overview <- eval_data$spec_info$spec_overview %>%
  dplyr::mutate(
    specimen = ifelse(
      nuc_treatment_unmod_df$nuclease[
        match(specimen, nuc_treatment_unmod_df$specimen)
      ] == "Mock" | nuc_treatment_unmod_df$treatment[
        match(specimen, nuc_treatment_unmod_df$specimen)
      ] == "Mock",
      as.character(specimen),
      as.character(nuclease_treatment_df$alt_specimen)[
        match(specimen, nuclease_treatment_df$specimen)
      ]
    )
  )

## The run column is only informative when more than one run is present.
if( length(unique(spec_overview$run_set)) == 1 ){
  spec_overview <- dplyr::select(spec_overview, -run_set)
}else{
  spec_overview <- dplyr::rename(spec_overview, "Run Name" = run_set)
}

annot_overview <- eval_data$spec_info$annot_overview

## Annotation labels are suffixed with the combination, e.g. "name (combo)",
## and kept as a factor in order of first appearance.
combo_overview <- nuclease_treatment_df %>%
  dplyr::left_join(annot_overview, by = "specimen") %>%
  dplyr::mutate(
    annotation = paste0(as.character(annotation), " (", combo, ")"),
    annotation = factor(annotation, levels = unique(annotation))
  )


## Read in experimental data and concatenate different sets ----
incorp_data <- eval_data$incorp_data


## Info graphic data ----
## The four alignment sets, in the order they are labelled below.
graphic_order <- c("algnmts", "pile_up_algns", "paired_algns", "matched_algns")
graphic_data <- incorp_data[graphic_order]

## One GRanges per alignment set, sharing the reference genome's seqinfo.
graphic_grl <- GenomicRanges::GRangesList(lapply(
  graphic_data, 
  GenomicRanges::makeGRangesFromDataFrame, 
  seqinfo = GenomicRanges::seqinfo(ref_genome)
))


# Genomic Distribution of edited sites ----
## Same ranges as above, but each range carries only the annotation of its
## specimen as metadata. All Bioconductor calls are namespace-qualified, like
## the call above, because this script does not attach GenomicRanges or
## S4Vectors itself.
genomic_grl <- GenomicRanges::GRangesList(lapply(
  graphic_data, 
  function(x){

    y <- GenomicRanges::makeGRangesFromDataFrame(
      x, seqinfo = GenomicRanges::seqinfo(ref_genome)
    )

    S4Vectors::mcols(y) <- combo_overview[
      match(x$alt_specimen, combo_overview$alt_specimen), 
      "annotation", 
      drop = FALSE
    ]

    y

  }
))

## Number of distinct annotations, with a floor of one.
num_conds <- max(length(unique(combo_overview$annotation)), 1)

names(genomic_grl) <- c(
  "All Align.", "Pileup Align.", "Flanking Pairs", "Target Matched"
)


# On-target summary ----
## alt_specimen becomes the specimen column, and a trailing "(word)" suffix is
## stripped from the annotation.
ot_tbl_summary <- eval_data$summary_tbls$ot_tbl_summary %>%
  dplyr::rename("specimen" = alt_specimen) %>%
  dplyr::mutate(
    annotation = stringr::str_remove(annotation, "\\([\\w]+\\)$")
  ) %>%
  dplyr::select(
    "specimen", "annotation", "ot_algns_pct", "ot_pile_pct", 
    "ot_pair_pct", "ot_match_pct"
  )

## Same relabelling; specimen and annotation are moved to the front.
ot_eff_summary <- eval_data$summary_tbls$ot_eff_summary %>%
  dplyr::rename("specimen" = alt_specimen) %>%
  dplyr::mutate(
    annotation = stringr::str_remove(annotation, "\\([\\w]+\\)$")
  ) %>%
  dplyr::select(specimen, annotation, dplyr::everything())

eval_summary <- eval_data$summary_tbls$eval_summary

# On-target distribution of incorporations ----
on_tar_dists <- eval_data$edit_models$on_tar_dists
sites_included <- eval_data$edit_models$sites_included

# Off-target summary ----
## Same relabelling as the on-target table, keeping the off-target columns.
ft_tbl_summary <- eval_data$summary_tbls$ft_tbl_summary %>%
  dplyr::rename("specimen" = alt_specimen) %>%
  dplyr::mutate(
    annotation = stringr::str_remove(annotation, "\\([\\w]+\\)$")
  ) %>%
  dplyr::select(
    "specimen", "annotation", "ft_algns", "ft_pile", "ft_pair", "ft_match"
  )


# Off-target sequence analysis ----
ft_seqs_list <- eval_data$ft_data


# Onco-gene enrichment analysis ----
enrich_df <- eval_data$enrich_data$enrich_df


# Data passed to Rmd for report generation ----
set_names <- eval_data$params$set_names

# Normalize file output path
## An empty placeholder is written so the path exists when it is normalized,
## then the placeholder is removed again.
write(c(), file = args$output)
args$output <- normalizePath(args$output)
unlink(args$output)

## Split the normalized path into its directory and file name.
output_path <- unlist(strsplit(args$output, "/"))
output_dir <- paste(output_path[seq_len(length(output_path)-1)], collapse = "/")
output_file <- output_path[length(output_path)]

## Append the format's extension unless the name already ends with it. The dot
## is escaped so only a literal ".html" / ".pdf" suffix counts; the unescaped
## form also accepted names such as "reporthtml".
if( args$format == "html" & !stringr::str_detect(output_file, "\\.html$") ){
  output_file <- paste0(output_file, ".html")
}

if( args$format == "pdf" & !stringr::str_detect(output_file, "\\.pdf$") ){
  output_file <- paste0(output_file, ".pdf")
}

## The figure directory shares the report name, with the trailing extension
## word replaced, e.g. "report.html" -> "report.figures".
figure_path <- file.path(
  output_dir, gsub("[\\w]+$", "figures", output_file, perl = TRUE)
)

null <- dir.create(figure_path)

## The tables directory is only created on request (-b/--tables).
if( args$tables ){

  tables_path <- file.path(
    output_dir, gsub("[\\w]+$", "tables", output_file, perl = TRUE)
  )

  null <- dir.create(tables_path)

}

## On request (-d/--data), save the global environment as an .RData file that
## shares the report's name, swapping the format extension for ".RData".
if( args$data ){

  if( args$format == "html" ){

    save.image(file = file.path(
      output_dir, stringr::str_replace(output_file, ".html$", ".RData")
    )) 

  }else if( args$format == "pdf" ){

    save.image(file = file.path(
      output_dir, stringr::str_replace(output_file, ".pdf$", ".RData")
    )) 

  }

}

## Render the template into the report. No `envir` is passed, so the template
## is evaluated with rmarkdown's default environment.
if( args$format == "html" ){

  ## HTML reports additionally receive the iGUIDE stylesheet from the install
  ## directory.
  css_path <- normalizePath(
    file.path(root_dir, "tools/rscripts/report_templates/iguide.css")
  )

  rmarkdown::render(
    input = template_path,
    output_format = output_format, 
    output_file = output_file,
    output_dir = output_dir,
    output_options = list("css" = css_path)
  )

}else{

  rmarkdown::render(
    input = template_path,
    output_format = output_format, 
    output_file = output_file,
    output_dir = output_dir
  )

}

## Unless figures were requested (-f/--figures), remove the figure files that
## match the three patterns below, then the figure directory itself.
if( !args$figures ){

  tmp_fig_paths <- c(
    list.files(
      path = figure_path, pattern = "incorp_dist", full.names = TRUE
    ),
    list.files(
      path = figure_path, pattern = "genomic_dens", full.names = TRUE
    ),
    list.files(
      path = figure_path, pattern = "off_target_seqs", full.names = TRUE
    )
  )

  ## sprintf() is vectorised, so one line is printed per file.
  cat(sprintf("Removing temporary files: %s\n", tmp_fig_paths), sep = "")
  null <- file.remove(tmp_fig_paths)
  cat("Removing temorary directory:", figure_path, "\n")
  ## NOTE(review): file.remove() is applied to a directory here; this
  ## presumably only succeeds when the directory is empty -- confirm.
  null <- file.remove(figure_path)

}

## End the R session.
q()

R From line 9 of rscripts/generate_iGUIDE_report.R

## Session options (no automatic factors, plain rather than scientific number
## formatting, wide console output) and a fixed random seed.
options(stringsAsFactors = FALSE, scipen = 99, width = 180)
set.seed(1)


# Set up and gather command line arguments ----
parser <- argparse::ArgumentParser(
  description = "Generate an iGUIDE summary from input evaluation data.",
  usage = paste(
    "Rscript generate_IGUIDE_summary.R <eval.rds> [optional args]",
    "[-h/--help, -v/--version]"
  )
)

## Positional: evaluation dataset, read with readRDS() further down.
parser$add_argument(
  "eval", nargs = 1, type = "character",
  help = paste(
    "Evaluation dataset, in rds format. Can be generated by the",
    "'iguide eval' subcommand."
  )
)

## The default FALSE acts as a sentinel for "print to screen"; the script tests
## `args$output != FALSE` to decide between file and console output.
parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", default = FALSE,
  help = paste(
    "Output report file, extention not required. Will be writen as text",
    "file. If no output given, results will be printed to screen.",
    "Example output name: summary.iGUIDE_Run.txt or summary.iguide_run"
  )
)

## Percent threshold; divided by 100 before comparison with the test power
## columns of the enrichment table.
parser$add_argument(
  "-p", "--power_filt", nargs = 1, type = "integer", default = 0,
  help = paste(
    "Specify a integer between 0 and 100 indicating the percent for",
    "which to filter statistical comparisons for gene enrichment based",
    "on power."
  ),
  metavar = "INT"
)

## Threshold compared directly against the MESL column of the off-target
## tables.
parser$add_argument(
  "-m", "--mesl_filt", nargs = 1, type = "integer", default = 0,
  help = paste(
    "Specify a integer between 0 and 100 indicating the likelyhood for",
    "which to filter off-target sites based on the Mean Edit Site",
    "Likelyhood (MESL) model."
  ),
  metavar = "INT"
)

## Either a directory path or the name of an environment variable holding one.
parser$add_argument(
  "--iguide_dir", nargs = 1, type = "character", default = "IGUIDE_DIR",
  help = "iGUIDE install directory path, do not change for normal applications."
)


## Parse the command line, then resolve the iGUIDE install directory.
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

## --iguide_dir is used directly when it names an existing directory; otherwise
## it is treated as the name of an environment variable holding the path.
root_dir <- if( dir.exists(args$iguide_dir) ){
  args$iguide_dir
}else{
  Sys.getenv(args$iguide_dir)
}

## Abort when neither interpretation yields a directory; otherwise store the
## resolved path back on the argument list.
if( dir.exists(root_dir) ){
  args$iguide_dir <- root_dir
}else{
  stop(paste0("\n  Cannot find install path to iGUIDE: ", root_dir, ".\n"))
}

# Normalize file output path
## Only needed when an output file was requested; the default FALSE means the
## summary is printed to the screen.
if( args$output != FALSE ){

  ## An empty placeholder is written so the path exists when it is normalized,
  ## then the placeholder is removed again.
  write(c(), file = args$output)
  args$output <- normalizePath(args$output)
  unlink(args$output)

  ## dirname()/basename() replace manual splitting on "/", which mishandled
  ## files directly under the root and non-"/" path separators.
  output_dir <- dirname(args$output)
  output_file <- basename(args$output)

  ## Append ".txt" unless the name already ends with a literal ".txt".
  if( !stringr::str_detect(output_file, "\\.txt$") ){
    output_file <- paste0(output_file, ".txt")
  }

  ## Start from a clean file, since all later writes append.
  args$output <- file.path(output_dir, output_file)
  unlink(args$output)

}

## Additional functions to help with output 
#' Emit a piece of the summary to the output file or to the console.
#'
#' Data frames are rendered as plain-ascii pandoc tables; anything else is
#' pasted with a trailing newline.
#'
#' @param obj data.frame to tabulate, or a character value to print.
#' @param args parsed argument list; `args$output` is either FALSE (print to
#'   the console) or the path of the file to append to.
#' @param big.mark thousands separator handed to pander for numeric cells.
#' @param style pandoc table style handed to pander.
#' @param ... further arguments forwarded to `pander::pandoc.table.return()`.
#' @return Called for its side effect (the result of `cat()`).
catOrWrite <- function(obj, args, big.mark = ",", style = "simple", ...){

  if( is.data.frame(obj) ){

    obj <- pander::pandoc.table.return(
      obj, row.names = FALSE, 
      split.cells = Inf, split.tables = Inf, style = style, 
      # Forward the separator: as a named formal it is not part of `...`, so
      # without this line the caller's (and the default) value was dropped.
      big.mark = big.mark,
      # Numeric columns are right-justified, all others centred.
      justify = ifelse(
        sapply(seq_len(ncol(obj)), function(i){
          is.numeric(obj[, i, drop = TRUE])
        }),
        "right", "centre"
      ),
      plain.ascii = TRUE,
      ...
    )

    # Drop the first newline of the rendered table.
    obj <- stringr::str_remove(obj, "\n")

  }else{

    obj <- paste0(obj, "\n")

  }

  if( args$output != FALSE ){
    cat(obj, file = args$output, append = TRUE)
  }else{
    cat(obj)
  }

}


# Load dependencies ----
add_packs <- c("magrittr", "iguideSupport")

## require() returns TRUE/FALSE per package instead of raising an error, which
## lets every missing package be reported at once below.
add_packs_loaded <- suppressMessages(
  sapply(add_packs, require, character.only = TRUE)
)

## Print the load status of each package and abort if any failed to load.
if( !all(add_packs_loaded) ){

  print(
    data.frame(
      "R-Packages" = names(add_packs_loaded), 
      "Loaded" = add_packs_loaded
    ), 
    right = FALSE,
    row.names = FALSE
  )

  stop("Check dependancies.\n")

}


# Import metadata and consolidate into report objects ----
## The evaluation file is tried relative to the install directory first, then
## as given.
if( file.exists(file.path(root_dir, args$eval)) ){

  eval_path <- normalizePath(file.path(root_dir, args$eval))

}else if( file.exists(file.path(args$eval)) ){

  eval_path <- normalizePath(file.path(args$eval))

}else{

  stop("\n  Cannot find evaluation dataset: ", args$eval, "\n")

}

eval_data <- readRDS(eval_path)

## Configuration
configs <- eval_data$params$configs

## Get versioning and params
set_names <- eval_data$params$set_names
soft_version <- eval_data$params$soft_version
build_version <- eval_data$params$build_version

## Unique, sorted signatures across all configs, joined into one string.
signature <- paste(
  unique(sort(unlist(lapply(configs, "[[", "signature")))), 
  collapse = ", ")

## TRUE only when every config enables UMItags.
umitag_option <- all(unlist(lapply(configs, "[[", "UMItags")))
## First distinct (lower-cased) abundance method across configs; NA when no
## config provides one.
abundance_option <- unique(
  tolower(unlist(lapply(configs, "[[", "Abundance_Method")))
)[1]

if( is.na(abundance_option) ) abundance_option <- "Fragment"

## UMI-based abundance requires UMItag data in every config.
if( tolower(abundance_option) == "umi" & !umitag_option ){
  stop(
    "\n  Abundance method has been set to use UMItags, yet the current",
    "\n  configuration does not capture UMItag data (UMItags : FALSE).",
    "\n  Please correct this inconsistency before continuing analysis."
  )
}


# Start summary ----
## Header block. Run names are collapsed into one string (as the signature
## already is); otherwise paste0() vectorises over several run names and the
## whole header is emitted once per run.
null <- catOrWrite(
  paste0(
    "iGUIDE Summary:\n",
    "  run(s) : ", paste(set_names, collapse = ", "), "\n",
    "  author(s): ", signature, "\n",
    "  generated : ", "[", paste(Sys.time()), "]\n",
    "  software version : ", soft_version, "\n",
    "  build version : ", build_version, "\n",
    "\n************************************************************\n"
  ), 
  args
)

# Analysis overview table ----
## A trailing "(word)" suffix is stripped from the specimen names.
eval_summary <- eval_data$summary_tbls$eval_summary %>%
  dplyr::mutate(Specimen = stringr::str_remove(Specimen, "\\([\\w]+\\)$"))

## Missing values are shown as zero.
eval_summary[is.na(eval_summary)] <- 0

null <- catOrWrite(
  "Table 1. Analysis overview with specific data highlights.", 
  args
)

null <- catOrWrite(eval_summary, args, missing = 0, style = "multiline")

null <- catOrWrite("", args)

# Specimen summary table ----
specimen_levels <- eval_data$params$specimen_levels
alt_specimen_levels <- eval_data$params$alt_specimen_levels
spec_overview <- eval_data$spec_info$spec_overview
on_targets <- eval_data$spec_info$on_targets

## The run column is only informative when more than one run is present.
if( length(unique(spec_overview$run_set)) == 1 ){
  spec_overview <- dplyr::select(spec_overview, -run_set)
}else{
  spec_overview <- dplyr::rename(spec_overview, "Run Name" = run_set)
}

## Classify every alignment by its edit site (none -> Independent, listed in
## on_targets -> On-target, otherwise Off-target), sum abundance per
## alt_specimen and class, spread the classes into columns, and join the
## result onto the specimen overview.
## NOTE(review): expandPosStr() is not defined in this script; presumably
## provided by iguideSupport -- confirm.
spec_overview <- eval_data$incorp_data$algnmts %>%
  dplyr::mutate(type = ifelse(
    is.na(edit.site),
    "Independent", 
    ifelse(edit.site %in% expandPosStr(on_targets), "On-target", "Off-target")
    )
  ) %>%
  dplyr::group_by(alt_specimen, type) %>%
  dplyr::summarise(abund = sum(abund)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    alt_specimen = factor(alt_specimen, levels = alt_specimen_levels),
    type = factor(type, levels = c("On-target", "Off-target", "Independent"))
  ) %>%
  tidyr::spread(key = "type", value = "abund", fill = 0) %>%
  ## Add zero-filled rows for alt_specimen levels without any alignment.
  tidyr::complete(
    alt_specimen, 
    fill = list("On-target" = 0, "Off-target" = 0, "Independent" = 0)
  ) %>%
  ## Recover the plain specimen name by stripping the trailing "(word)".
  dplyr::mutate(
    specimen = factor(
      stringr::str_remove(as.character(alt_specimen), "\\([\\w]+\\)$"), 
      levels = specimen_levels
    )
  ) %>%
  dplyr::left_join(spec_overview, ., by = "specimen") %>%
  dplyr::select(alt_specimen, dplyr::everything(), -specimen) %>%
  dplyr::rename(specimen = alt_specimen)

null <- catOrWrite(
  "Table 2. Specimen overview covering reads, umitags, and alignments:", 
  args
)

null <- catOrWrite(spec_overview, args, missing = 0, style = "multiline")

null <- catOrWrite("", args)

# Combination info table ----
combo_overview <- eval_data$spec_info$combo_overview

## Drop the run identifier, de-duplicate, wrap the combo label in parentheses,
## and give display column names.
combos_tbl <- eval_data$spec_info$combos_set_tbl %>%
  dplyr::select(-run_set) %>%
  dplyr::distinct() %>%
  dplyr::mutate(combo = paste0("(", combo, ")")) %>%
  dplyr::rename(
    "Combination" = combo,
    "Nuclease" = nuclease,
    "Treatment" = treatment
  )

null <- catOrWrite(
  "Table 3. Combinations of nuclease(s) and treatment(s).",
  args
)

null <- catOrWrite(combos_tbl, args)

# Target info table ----
## Identify all targets used
## "Edit Loci" lists, one per line, the on_targets entries whose name equals
## the target name.
target_tbl <- eval_data$spec_info$target_tbl %>%
  dplyr::select(-run_set) %>%
  dplyr::distinct() %>%
  dplyr::rename(
    "Nuclease" = nuclease,
    "Target Name" = target,
    "Sequence" = sequence
  ) %>%
  dplyr::mutate(
    "Edit Loci" = sapply(
      `Target Name`, 
      function(x){
        paste(on_targets[which(names(on_targets) == x)], collapse = "\n")
      }
    )
  )

null <- catOrWrite(
  "Table 4. Target pattern table specifying sequences and edited loci:", 
  args
)

## keep.line.breaks preserves the newline-separated loci within a cell.
null <- catOrWrite(
  target_tbl, args, style = "multiline", keep.line.breaks = TRUE
)

null <- catOrWrite("", args)


# On-target summary table ----
## Translate alt_specimen to the specimen name via combo_overview and keep the
## four on-target percentage columns.
ot_tbl_summary <- eval_data$summary_tbls$ot_tbl_summary %>%
  dplyr::mutate(
    specimen = combo_overview$specimen[
      match(alt_specimen, combo_overview$alt_specimen)
      ]
  ) %>%
  dplyr::select(
    "specimen", "annotation", "ot_algns_pct", "ot_pile_pct", 
    "ot_pair_pct", "ot_match_pct"
  )

## Display headers; the embedded newlines give two-line column titles.
names(ot_tbl_summary) <- c(
  "Specimen", "Annotation", "All\nAlign.", "Align.\nPileups", 
  "Flanking\nPairs", "Target\nMatched"
)

null <- catOrWrite(
  "Table 5. On-target editing percentages based on alignment criteria:",
  args
)

null <- catOrWrite(
  ot_tbl_summary, args, 
  digits = 4, round = 2, style = "multiline", missing = 0, 
  keep.line.breaks = TRUE
)

null <- catOrWrite("", args)


# On-target distribution of incorporations ----
on_tar_dists <- eval_data$edit_models$on_tar_dists
sites_included <- eval_data$edit_models$sites_included

## Per annotation and target: the five default quantiles (0-100% in steps of
## 25%) of the absolute distance to the edit site. Rle() receives the distances
## and `cnt`; presumably `cnt` acts as run lengths, weighting each distance by
## its count -- confirm against S4Vectors::Rle. The quantiles are passed
## through a ";"-joined string and split back into one column per quantile.
on_tar_dist_summary <- on_tar_dists %>%
  dplyr::group_by(annotation, target) %>%
  dplyr::summarise(
    quant = paste(
      round(quantile(S4Vectors::Rle(abs(edit.site.dist), cnt)), digits = 0), 
      collapse = ";"
    )
  ) %>%
  dplyr::ungroup() %>%
  ## Attach the `prop` column of sites_included for each annotation/target.
  dplyr::left_join(
    dplyr::select(sites_included, annotation, target, prop),
    by = c("annotation", "target")
  ) %>%
  dplyr::select(annotation, target, prop, quant) %>%
  tidyr::separate(
    quant, paste0(as.character(100*seq(0, 1, 0.25)), "%"), sep = ";"
  ) %>%
  dplyr::rename(
    "Annotation" = annotation,
    "Target" = target,
    "Inclusion Pct." = prop
  )

null <- catOrWrite(
  "Table 6. On-target incorporation profile, quantile counts given in % columns:",
  args
)

null <- catOrWrite(on_tar_dist_summary, args, style = "multiline", missing = 0)

null <- catOrWrite("", args)

# On-target editing efficiency ----
## Translate alt_specimen to the specimen name via combo_overview, move it to
## the front, and drop alt_specimen.
ot_eff_summary <- eval_data$summary_tbls$ot_eff_summary %>%
  dplyr::mutate(
    specimen = combo_overview$specimen[
      match(alt_specimen, combo_overview$alt_specimen)
      ]
  ) %>%
  dplyr::select(specimen, dplyr::everything(), -alt_specimen)

## Only the first two headers are renamed; the remaining columns keep the
## names they have in the evaluation data.
names(ot_eff_summary)[c(1,2)] <- c("Specimen", "Annotation")

null <- catOrWrite(
  "Table 7. Estimate of On-target editing efficiency (percent) for each target by specimen.",
  args
)

## Missing estimates are shown as "-".
null <- catOrWrite(ot_eff_summary, args, style = "multiline", missing = "-")

null <- catOrWrite("", args)

# Off-target summary ----
## Same specimen translation, keeping the four off-target count columns.
ft_tbl_summary <- eval_data$summary_tbls$ft_tbl_summary %>%
  dplyr::mutate(
    specimen = combo_overview$specimen[
      match(alt_specimen, combo_overview$alt_specimen)
      ]
  ) %>%
  dplyr::select(
    "specimen", "annotation", "ft_algns", "ft_pile", "ft_pair", "ft_match"
  )

## Display headers; the embedded newlines give two-line column titles.
names(ft_tbl_summary) <- c(
  "Specimen", "Annotation", "All\nAlign.", "Align.\nPileups", 
  "Flanking\nPairs", "Target\nMatched"
)

null <- catOrWrite(
  "Table 8. Off-target loci counts from criteria-based alignments:",
  args
)

null <- catOrWrite(
  ft_tbl_summary, args,
  digits = 1, big.mark = ",", missing = 0, 
  keep.line.breaks = TRUE, style = "multiline"
)

null <- catOrWrite("", args)


# Onco-gene enrichment analysis ----
## Keep the count, p-value and power columns, then retain rows where either
## test reaches the requested power (--power_filt is a percent, hence / 100).
enrich_df <- eval_data$enrich_data$enrich_df %>%
  dplyr::select(
    origin, annotation, total, 
    onco, onco.p.value, onco.power, 
    special, special.p.value, special.power
  ) %>%
  dplyr::filter(
    onco.power >= args$power_filt / 100 | special.power >= args$power_filt / 100
  )


names(enrich_df) <- c(
  "Origin", "Annotation", "Total Gene Count", "Onco Related Count", 
  "Onco Enrich. p-value", "Onco Test Power", "Special Gene Count", 
  "Special Enrich. p-value", "Special Test Power"
)

null <- catOrWrite(
  "Table 9. Off-target gene enrichment:",
  args
)

if( nrow(enrich_df) > 0 ){

  ## Spaces become newlines so the long headers wrap within their columns.
  names(enrich_df) <- gsub(" ", "\n", names(enrich_df))

  ## Format the p-value and power columns (5, 6, 8, 9) with three decimals.
  ## `[[` always yields the column vector, so this also works when the table
  ## is a tibble, where `[, i]` would hand sprintf() a data frame.
  for( stat_col in c(5, 6, 8, 9) ){
    enrich_df[[stat_col]] <- sprintf(
      "%.3f", round(enrich_df[[stat_col]], digits = 3)
    )
  }

  null <- catOrWrite(
    enrich_df, args,
    digits = 4, style = "multiline", keep.line.breaks = TRUE
  )

}else if( args$power_filt > 0){

  ## Nothing passed the power filter.
  null <- catOrWrite(
    paste0(
      "  No gene enrichment was observed with at a power ",
      "greater than or equal to ", args$power_filt, "%."
    ), 
    args
  )

}else{

  null <- catOrWrite("  No gene enrichment was observed.", args)

}

null <- catOrWrite("", args)

# Off-target sequence analysis ----
nuc_profiles <- eval_data$spec_info$nuclease_profiles
ft_seqs_list <- eval_data$ft_data

## Full target sequence per target name: the PAM of the target's nuclease is
## appended when PAM_Loc is "3p", prepended when it is "5p", and left out
## otherwise.
full_target_seqs <- structure(
  sapply(seq_len(nrow(target_tbl)), function(i){

    nuc <- target_tbl$Nuclease[i]
    sequence <- target_tbl$Sequence[i]

    ifelse(
      nuc_profiles[[nuc]]$PAM_Loc == "3p",
      paste0(sequence, nuc_profiles[[nuc]]$PAM),
      ifelse(
        nuc_profiles[[nuc]]$PAM_Loc == "5p",
        paste0(nuc_profiles[[nuc]]$PAM, sequence),
        sequence
      )
    )

  }),
  names = target_tbl$`Target Name`
)

## One table per element of ft_seqs_list, numbered from Table 10 onwards.
null <- lapply(seq_along(ft_seqs_list), function(i){

  null <- catOrWrite(paste0("Table ", i+9, ". Off-Target Loci:"), args)
  null <- catOrWrite(paste0("  Annotation : ", names(ft_seqs_list)[i]), args)

  ## Reference sequence(s) looked up by the unique target.seq values.
  target_ref_seq <- full_target_seqs[unique(ft_seqs_list[[i]]$target.seq)]

  ## NOTE(review): divSeq() is not defined in this script; presumably provided
  ## by iguideSupport -- confirm. Rows are kept when MESL meets --mesl_filt.
  null <- dplyr::select(
      ft_seqs_list[[i]], target, gene_id, edit.site, abund, MESL, 
      aligned.sequence, mismatch
    ) %>%
    dplyr::mutate(
      aligned.sequence = divSeq(aligned.sequence, target_ref_seq)
    ) %>%
    dplyr::rename(
      "Target" = target, "Gene ID" = gene_id, "Edit Site" = edit.site,
      "Abund." = abund, "Aligned Sequence" = aligned.sequence,
      "Mismatch" = mismatch
    ) %>%
    dplyr::filter(MESL >= args$mesl_filt) %>%
    catOrWrite(args)

  null <- catOrWrite("", args)

})

## End the R session.
q()

R Quant From line 9 of rscripts/generate_iGUIDE_summary.R

# Export an installed BSgenome to a .2bit or .fasta file.
# Usage: Rscript generate_ref_genome.R <genome> <outputfile>
args <- commandArgs(trailingOnly = TRUE)

if( length(args) > 2 ){
  stop("More than genome and output file name supplied.")
}else if( length(args) < 2 ){
  stop("Please provide inputs as: genome outputfile.")
}

genome <- args[1]
outfile <- args[2]

# Check outputfile name
# These cheap checks run before BSgenome is loaded so that a bad output
# argument fails fast. The dots are escaped so that only a literal ".2bit" or
# ".fasta" suffix is accepted.
to_2bit <- grepl("\\.2bit$", outfile)
to_fasta <- grepl("\\.fasta$", outfile)

if( !to_2bit && !to_fasta ){
  stop("Specify output format by output file extention: .2bit or .fasta")
}

if( file.exists(outfile) ) stop("Output file already exists.")

# Conditional checks
# The requested name is used as a pattern and must identify exactly one
# installed BSgenome package.
suppressMessages(library(BSgenome))
genName <- grep(genome, unique(installed.genomes()), value = TRUE)

if( length(genName) < 1 ){

  stop("No matched BSgenome installed. Please install.")

}else if( length(genName) > 1 ){

  message("Installed matching genomes:\n")
  message(paste(genName, collapse = ", "))
  stop("Ambiguous match to requested genome. Please specify.")

}

# Load requested genome
# Attach the package, then fetch the object that carries the package name.
suppressMessages(library(genName, character.only = TRUE))
genome <- BiocGenerics::get(genName)

if( to_2bit ){
  # Write to 2bit output format
  BSgenome::export(genome, outfile, format = "2bit")
}else{
  # Write to fasta output format
  BSgenome::export(genome, outfile, format = "fasta", compress = FALSE)
}

# Report whether the export produced the file.
if( file.exists(outfile) ){
  message("Genome ", genName, " written to file.")
}else{
  message("Check for output file: ", outfile)
}

q()

R BSgenome From line 4 of rscripts/generate_ref_genome.R

## Session options: no automatic factors, plain rather than scientific number
## formatting, and wide console output.
options(stringsAsFactors = FALSE, scipen = 99, width = 120)

# Set up and gather command line arguments ----
parser <- argparse::ArgumentParser(
  description = "Generate an iGUIDE Stat report for core and eval data.",
  usage = paste(
    "Rscript generate_stat_report.R <core.stat> <eval.stat> -o <output>",
    "-c <config> [-h/--help, -v/--version] [optional args]"
  )
)

## --core and --eval are marked required: both are passed to file.exists()
## unconditionally further down, so omitting one previously failed there with
## an uninformative error instead of an argparse usage message.
parser$add_argument(
  "-r", "--core", nargs = 1, type = "character", required = TRUE,
  help = paste(
    "Core stat object generated by iGUIDE run command. Requires csv format."
  )
)

parser$add_argument(
  "-e", "--eval", nargs = 1, type = "character", required = TRUE,
  help = paste(
    "Eval stat object generated by iGUIDE run command. Requires csv format."
  )
)

## NOTE(review): the help text says csv, but the script reads this file with
## readRDS() -- confirm which is intended.
parser$add_argument(
  "-i", "--incorpSites", nargs = 1, type = "character", required = TRUE,
  help = "Unique sites csv file from project directory."
)

parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", required = TRUE,
  help = "Output report file, extension not required."
)

parser$add_argument(
  "-c", "--config", nargs = 1, type = "character", required = TRUE,
  help = "Run specific config file in yaml format."
)

## Report format; validated below (case-insensitively) against 'html' / 'pdf'.
parser$add_argument(
  "-f", "--format", nargs = 1, type = "character", default = "html",
  help = "Output format for report. Either 'pdf' or 'html' (default)."
)

## Template path; looked up relative to the install directory first, then as
## given.
parser$add_argument(
  "-t", "--template", nargs = 1, type = "character", 
  default = "tools/rscripts/report_templates/iGUIDE_stat_template.Rmd",
  help = "File path to standard or custom stat report template."
)

## Either a directory path or the name of an environment variable holding one.
parser$add_argument(
  "--iguide_dir", nargs = 1, type = "character", default = "IGUIDE_DIR",
  help = "iGUIDE install directory path, do not change for normal applications."
)


## Parse the command line.
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))


## --iguide_dir is used directly when it names an existing directory; otherwise
## it is treated as the name of an environment variable holding the path.
if( !dir.exists(args$iguide_dir) ){
  root_dir <- Sys.getenv(args$iguide_dir)
}else{
  root_dir <- args$iguide_dir
}

## Abort when neither interpretation yields a directory.
if( !dir.exists(root_dir) ){
  stop(paste0("\n  Cannot find install path to iGUIDE: ", root_dir, ".\n"))
}else{
  args$iguide_dir <- root_dir
}

## Directory of the running script, taken from the "--file=" entry of the full
## command line.
code_dir <- dirname(sub(
  pattern = "--file=", 
  replacement = "", 
  x = grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
))

# Check input file ----
core_file <- args$core
eval_file <- args$eval
sites_file <- args$incorpSites

## All three input files must exist.
if(
  !file.exists(core_file) | !file.exists(eval_file) | !file.exists(sites_file)
  ){
  stop("\n  Cannot find input stat files. Check inputs.")
}

## A previous output of the same name is removed.
if( file.exists(args$output) ){
  cat("Removing existing output file of the same name: ", args$output, "\n")
  unlink(args$output)
}


# Check config input ----
if( !file.exists(args$config) ){
  stop("\n  Cannot find config file: ", args$config, ".\n")
}

config <- yaml::yaml.load_file(args$config)

# Check config for defaults ----
## Absent keys are given defaults: UMItags on, alternate UMI method off.
if( !"UMItags" %in% names(config) ) config$UMItags <- TRUE

if( !"Alternate_UMI_Method" %in% names(config) ){
  config$Alternate_UMI_Method <- FALSE
}

# Check for format ----
## Map of accepted --format values (matched case-insensitively) to the
## rmarkdown output format names.
report_formats <- c("html" = "html_document", "pdf" = "pdf_document")

if( !tolower(args$format) %in% names(report_formats) ){
  stop(
    "\n  Please input either 'html' or 'pdf' for format.\n",
    "  Other formats not supported."
  )
}

output_format <- report_formats[tolower(args$format)]

# Check for template path ----
## The path is tried relative to the install directory first, then as given.
if( file.exists(file.path(root_dir, args$template)) ){

  template_path <- normalizePath(file.path(root_dir, args$template))

}else if( file.exists(file.path(args$template)) ){

  template_path <- normalizePath(file.path(args$template))

}else{

  stop("\n  Cannot find template file: ", args$template, ".\n")

}


# Load required r-packages ----
## require() returns TRUE/FALSE per package rather than raising an error.
packs_loaded <- sapply(
  c("magrittr", "knitr"), require, character.only = TRUE
)

## Abort when ANY package failed to load and name the missing ones. The
## previous `!any()` test only fired when every package was missing, and the
## message listed the packages that had loaded.
if( !all(packs_loaded) ){
  stop(
    "\n  Could not find required r-package: ", 
    paste(names(packs_loaded)[!packs_loaded], collapse = ", "), 
    ".\n"
  )
}


# Get versioning ----
## Software version is read from the ".version" file in the install directory.
soft_version <- as.character(read.delim(
  file = file.path(root_dir, ".version"), header = FALSE
))

## Build version ("b<major>.<minor>.<patch>") is extracted from the names of
## files in <install dir>/etc that contain "build.b<digits>".
build_version <- list.files(file.path(root_dir, "etc")) %>%
  grep(pattern = "build.b[0-9\\.]+.*", x = ., value = TRUE) %>%
  stringr::str_extract(pattern = "b[0-9]+\\.[0-9]+\\.[0-9]+")

signature <- config[["signature"]]

# Load input data ----
# Core statistics without the unique-alignment columns; those three columns 
# are recomputed from the incorporation site data below.
core_stat_df <- dplyr::select(
  read.csv(core_file),
  -align.unique.reads, -align.unique.algns, -align.unique.loci
)

eval_stat_df <- read.csv(eval_file)

# Per-sample counts over the reads of type "uniq": distinct read IDs, distinct
# alignments, and distinct loci (start for "+" strand, end otherwise).
site_stat_df <- dplyr::summarise(
  dplyr::group_by(
    dplyr::filter(readRDS(sites_file)$reads, type == "uniq"),
    sampleName
  ),
  align.unique.reads = dplyr::n_distinct(ID),
  align.unique.algns = dplyr::n_distinct(seqnames, start, end, strand),
  align.unique.loci = dplyr::n_distinct(
    seqnames, strand, ifelse(strand == "+", start, end)
  )
)

# Combine all statistics by sample; values missing after the joins become 0.
stat_df <- core_stat_df %>%
  dplyr::full_join(eval_stat_df, by = "sampleName") %>%
  dplyr::full_join(site_stat_df, by = "sampleName") %>%
  dplyr::mutate_all(function(col) ifelse(is.na(col), 0, col))

sampleName_levels <- unique(stat_df$sampleName)

# Reserved bin names that are kept out of `sampleNames` and always listed 
# after the samples in `sampleName_levels`.
non_sample_bins <- c("ambiguous_reads", "degenerate_reads", "unassigned_reads")

# Logical filtering works for any subset of the bins being present. Negative 
# indexing with match() fails when a bin is absent, because match() returns NA
# for it and NA cannot be combined with negative subscripts.
sampleNames <- sampleName_levels[!sampleName_levels %in% non_sample_bins]

sampleName_levels <- c(sampleNames, non_sample_bins)


# Read attrition table ----
# Columns are chosen from the config: the primer-trim column depends on 
# `Alternate_UMI_Method`, the UMI column on `UMItags`, and the two 
# consolidation columns are included only when the aligner is "blat".
read_tbl <- stat_df %>%
  dplyr::select(c(
    "sampleName", "demulti.reads", "R1.trim.reads", 
    if( config$Alternate_UMI_Method ){
      "R1.primer.trim.reads"
    }else{
      "R2.primer.trim.reads"
    },
    "R2.trim.reads", 
    if( config$UMItags ) "umitags.reads", 
    "filt.reads", 
    if( tolower(config$Aligner) == "blat" ){
      c("R1.consol.reads", "R2.consol.reads")
    },
    "align.unique.reads", 
    "align.chimera.reads", 
    "align.multihit.reads"
  )) %>%
  dplyr::mutate(sampleName = factor(sampleName, levels = sampleName_levels)) %>%
  dplyr::arrange(sampleName)

# Shorten the headers by removing the trailing "reads" suffix.
names(read_tbl) <- stringr::str_replace(
  string = names(read_tbl), pattern = ".reads$", replacement = ""
)


# Alignment outcome table ----
# Alignment columns for the rows listed in `sampleNames`, ordered by the 
# order of `sampleNames`.
algn_tbl <- stat_df %>%
  dplyr::select(
    sampleName, 
    align.unique.reads, align.unique.algns, align.unique.loci, 
    align.multihit.reads, align.multihit.lengths, align.multihit.clusters, 
    align.chimera.reads
  ) %>%
  dplyr::filter(is.element(sampleName, sampleNames)) %>%
  dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) %>%
  dplyr::arrange(sampleName)

# Remove the "align" prefix from the headers.
names(algn_tbl) <- stringr::str_replace(
  string = names(algn_tbl), pattern = "align.", replacement = ""
)


# Incorporation breakdown table ----
# Metrics reported in the table, in display order.
incorp_levels <- c(
  "eval.total.algns", "eval.combined.algns", 
  "eval.pileup.algns", "eval.paired.algns", "eval.matched.algns", 
  "eval.ontarget.algns", "eval.offtarget.algns"
)

# Going long, completing over the metric factor, and going wide again makes 
# sure every metric in `incorp_levels` exists as a column before selection.
incorp_tbl <- stat_df[
    , is.element(names(stat_df), c("sampleName", incorp_levels))
  ] %>%
  tidyr::gather(key = "metric", value = "counts", -sampleName) %>%
  dplyr::mutate(metric = factor(metric, levels = incorp_levels)) %>%
  tidyr::complete(metric) %>%
  dplyr::mutate(metric = as.character(metric)) %>%
  tidyr::spread(key = metric, value = counts) %>%
  dplyr::select(
    sampleName, 
    eval.total.algns, eval.combined.algns, eval.pileup.algns, 
    eval.paired.algns, eval.matched.algns, 
    eval.ontarget.algns, eval.offtarget.algns
  ) %>%
  dplyr::filter(is.element(sampleName, sampleNames)) %>%
  dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) %>%
  dplyr::arrange(sampleName)

# Remove the "eval" prefix first, then the "algns" suffix, from the headers.
names(incorp_tbl) <- stringr::str_replace(
  stringr::str_replace(names(incorp_tbl), "eval.", ""), ".algns$", ""
)


# Initiate report generation ----
# Normalize file output path
# A placeholder file is written so normalizePath() is handed an existing path,
# then the placeholder is removed again. `winslash = "/"` keeps the separator
# consistent with the "/" split below on every platform.
write(c(), file = args$output)
output_file <- normalizePath(args$output, winslash = "/")
unlink(output_file)

# Separate the directory from the file name.
output_path <- unlist(strsplit(output_file, "/"))
output_dir <- paste(output_path[seq_len(length(output_path)-1)], collapse = "/")
output_file <- output_path[length(output_path)]

# Append the extension matching the output format when it is missing. The dot
# is escaped so that only a literal ".html" / ".pdf" suffix counts.
if( 
  output_format == "html_document" && 
    !stringr::str_detect(output_file, "\\.html$") 
  ){
  output_file <- paste0(output_file, ".html")
}

if( 
  output_format == "pdf_document" && 
    !stringr::str_detect(output_file, "\\.pdf$") 
  ){
  output_file <- paste0(output_file, ".pdf")
}

# HTML reports are rendered with the iguide stylesheet; other formats use the
# default output options.
if( output_format == "html_document" ){

  css_path <- normalizePath(file.path(code_dir, "report_templates/iguide.css"))

  rmarkdown::render(
    input = template_path,
    output_format = output_format, 
    output_file = output_file,
    output_dir = output_dir,
    output_options = list("css" = css_path)
  )

}else{

  rmarkdown::render(
    input = template_path,
    output_format = output_format, 
    output_file = output_file,
    output_dir = output_dir
  )

}

q()

R From line 14 of rscripts/generate_stat_report.R

options(stringsAsFactors = FALSE, scipen = 99, warn = -1)
suppressMessages(library("magrittr"))


# Set up and gather command line arguments ----
# Command line interface: one positional config file plus optional output 
# file, verbosity switch, and install location.
parser <- argparse::ArgumentParser(
  description = "List samples associated with a config file for iGUIDE.",
  usage = "iguide list_samples <path/to/config.file> <options>"
)

parser$add_argument(
  "config", nargs = 1, type = "character",
  help = "Run specific config file in yaml format."
)

# `FALSE` is the sentinel for "no output file requested".
parser$add_argument(
  "-o", "--output", nargs = 1, type = "character", default = FALSE,
  help = "Output file name .csv, .tsv, or .rds format."
)

parser$add_argument(
  "-v", "--verbose", action = "store_true", 
  help = "Turns on diagnositc-based messages."
)

parser$add_argument(
  "--install_path", nargs = 1, type = "character", default = "IGUIDE_DIR",
  help = "iGUIDE install directory path, do not change for normal applications."
)

## Set arguments with parser
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

## Resolve the install directory from `--install_path`. The value is first
## read as the name of an environment variable (the default is "IGUIDE_DIR"); 
## when no such variable is set, an existing directory of that name is used 
## directly. Otherwise `root_dir` is "", as it is when IGUIDE_DIR is unset.
root_dir <- Sys.getenv(args$install_path, unset = NA)

if( is.na(root_dir) ){
  root_dir <- if( dir.exists(args$install_path) ) args$install_path else ""
}

## Directory of this script, taken from the `--file=` argument of the R 
## command line.
code_dir <- dirname(sub(
  pattern = "--file=", 
  replacement = "", 
  x = grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
))


# One row per parsed argument; multi-valued arguments are collapsed with ", ".
input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = vapply(
    args, 
    function(value) paste(value, collapse = ", "), 
    character(1), 
    USE.NAMES = FALSE
  )
)

# Put the rows into a fixed display order.
input_table <- input_table[
  match(
    c("config :", "output :", "verbose :", "install_path :"), 
    input_table$Variables
  ),
]

## Log inputs
# The table is only echoed when verbose output was requested.
if( args$verbose ){

  cat("List Sample Inputs\n")
  print(x = data.frame(input_table), right = FALSE, row.names = FALSE)

}


# Load files ----
## Config
if( !file.exists(args$config) ){
  stop("\nCannot find config file: ", args$config, ".\n")
}

config <- yaml::yaml.load_file(args$config)

## Sample Info
# The path from the config is tried as given first, then relative to 
# `root_dir`; the run stops when the file is at neither location.
if( file.exists(config$Sample_Info) ){

  sample_info <- data.table::fread(config$Sample_Info, data.table = FALSE)

}else if( file.exists(file.path(root_dir, config$Sample_Info)) ){

  sample_info <- data.table::fread(
    input = file.path(root_dir, config$Sample_Info), 
    data.table = FALSE
  )

}else{

  stop("\nCannot find associated Sample Info file: ", config$Sample_Info, ".\n")

}

## Supplemental Info
# A value of "." means that no supplemental file is configured; a missing key 
# is handled the same way. Both lookup locations return a plain data.frame, 
# and a file found at neither location only raises a warning.
if( !is.null(config$Supplemental_Info) && config$Supplemental_Info != "." ){
  if( file.exists(config$Supplemental_Info) ){

    supp_info <- data.table::fread(
      input = config$Supplemental_Info, 
      data.table = FALSE
    )

  }else if( file.exists(file.path(root_dir, config$Supplemental_Info)) ){

    supp_info <- data.table::fread(
      input = file.path(root_dir, config$Supplemental_Info), 
      data.table = FALSE
    )

  }else{

    warning(
      "Cannot find Supplemental Info file: ", config$Supplemental_Info, ".\n"
    )

  }
}

# Join appropriate tables together and / or format for output ----

# Locate the column named by the config and standardize it to "sampleName".
sample_col <- match(config$Sample_Name_Column, names(sample_info))

# A missing config key gives a zero-length match, an unknown column gives NA;
# both are reported with the same message.
if( length(sample_col) != 1 || is.na(sample_col) ){
  stop("\nCannot isolate sampleName column: ", config$Sample_Name_Column, ".\n")
}

names(sample_info)[sample_col] <- "sampleName"

# The specimen is the leading run of word characters of each sample name; the
# result has one row per specimen with its number of sample rows.
# `dplyr::n()` is namespace-qualified because this script only attaches 
# magrittr, so a bare `n()` cannot be relied on to resolve.
sample_info <- sample_info %>%
  dplyr::mutate(
    specimen = stringr::str_extract(sample_info$sampleName, "[\\w]+")
  ) %>%
  dplyr::group_by(specimen) %>%
  dplyr::summarise(replicates = dplyr::n()) %>%
  dplyr::ungroup()

# Supplemental data, when it was loaded, is attached by specimen.
if( exists("supp_info") ){
  sample_info <- dplyr::left_join(sample_info, supp_info, by = "specimen")
}


# Output consolidated information ----
# Without an output file (the `FALSE` default) the table is printed to the 
# terminal; otherwise it is written with the shared writer script.
if( args$output == FALSE ){

  # Run name: the word characters preceding ".config.yml" in the config path.
  run_name <- stringr::str_extract(
    stringr::str_extract(args$config, "[\\w]+.config.yml$"), 
    "[\\w]+"
  )

  cat(paste0("\nSpecimen Info for : ", run_name, "."))
  pander::pandoc.table(sample_info, style = "simple", split.table = Inf)

}else{

  source(file.path(code_dir, "supporting_scripts/writeOutputFile.R"))
  writeOutputFile(as.data.frame(sample_info), args$output)

}

q()

R magrittr From line 19 of rscripts/list_samples.R

options(stringsAsFactors = FALSE, scipen = 99, width = 120)
suppressMessages(library("magrittr"))

# Directory of this script, derived from the `--file=` argument that appears 
# in the full R command line.
code_dir <- dirname(
  sub(
    "--file=", "", 
    grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
  )
)

# Help texts for the command line arguments are kept in a YAML file that sits 
# next to the script.
desc <- yaml::yaml.load_file(
  file.path(code_dir, "descriptions", "samqc.desc.yml")
)

# Set up and gather command line arguments
# Most help texts come from the `desc` list loaded from the description file.
parser <- argparse::ArgumentParser(
  description = desc$program_short_description,
  usage = "Rscript samqc.R <bam> <bai> [-h/--help, -v/--version] [optional args]"
)

## Required positional inputs
parser$add_argument(
  "bam", nargs = 1, type = "character", help = desc$bam
)

parser$add_argument(
  "bai", nargs = 1, type = "character", help = desc$bai
)

## Output files; only the unique output is mandatory
parser$add_argument(
  "-o", "--uniqOutput", nargs = 1, type = "character", required = TRUE,
  help = desc$uniqOutput
)

parser$add_argument(
  "--condSites", nargs = 1, type = "character", help = desc$condSites
)

parser$add_argument(
  "--chimeras", nargs = 1, type = "character", help = desc$chimeras
)

parser$add_argument(
  "--multihits", nargs = 1, type = "character", help = desc$multihits
)

# `FALSE` is the sentinel for "no stat file requested".
parser$add_argument(
  "--stat", nargs = 1, type = "character", default = FALSE, help = desc$stat
)

## Reference and filtering thresholds
parser$add_argument(
  "-g", "--refGenome", nargs = 1, type = "character", default = "hg38",
  help = desc$refGenome
)

parser$add_argument(
  "--maxAlignStart", nargs = 1, type = "integer", default = 5L,
  help = desc$maxAlignStart
)

parser$add_argument(
  "--minPercentIdentity", nargs = 1, type = "integer", default = 95L,
  help = desc$minPercentIdentity
)

parser$add_argument(
  "--minTempLength", nargs = 1, type = "integer", default = 30L,
  help = desc$minTempLength
)

parser$add_argument(
  "--maxTempLength", nargs = 1, type = "integer", default = 2500L,
  help = desc$maxTempLength
)

parser$add_argument(
  "--keepAltChr", action = "store_true", help = desc$keepAltChr
)

# The help text states the same default as the `default` argument.
parser$add_argument(
  "--batches", nargs = 1, type = "integer", default = 25L,
  help = paste(
    "A tuning parameter to batch process the alignments, specifies how many", 
    "batches to do. Default: 25."
  )
)

parser$add_argument(
  "--readNamePattern", nargs = 1, type = "character", 
  default = "[\\w\\:\\-\\+]+", help = desc$readNamePattern
)

parser$add_argument(
  "--saveImage", nargs = 1, type = "character", help = desc$saveImage
)


args <- parser$parse_args(commandArgs(trailingOnly = TRUE))


# Print Inputs to terminal
# One row per parsed argument; multi-valued arguments are collapsed with ", ".
input_table <- data.frame(
  "Variables" = paste0(names(args), " :"), 
  "Values" = vapply(
    args, 
    function(value) paste(value, collapse = ", "), 
    character(1), 
    USE.NAMES = FALSE
  )
)

# Keep the arguments listed here, in this display order.
input_table <- input_table[
  match(
    c(
      "bam :", "bai :", "uniqOutput :", "condSites :", "chimeras :", 
      "multihits :", "stat :", "refGenome :", "maxAlignStart :", 
      "minPercentIdentity :", "minTempLength :", "maxTempLength :", 
      "keepAltChr :", "readNamePattern :"
    ),
    input_table$Variables
  ),
]

cat("\nSAM QC Inputs:\n")
print(data.frame(input_table), right = FALSE, row.names = FALSE)


# Load supporting scripts
# Each helper file is sourced into the global environment.
invisible(lapply(
  c("printHead.R", "condenseSites.R", "writeOutputFile.R"),
  function(script) source(file.path(code_dir, "supporting_scripts", script))
))

# Confirm that the expected helper functions now exist in the workspace.
if( !all(is.element(c("printHead", "condenseSites", "writeOutputFile"), ls())) ){
  stop(
    "\n  Cannot load supporting scripts. ",
    "You may need to clone from github again.\n"
  )
}

# Load reference genome
# A value containing the literal text ".fa" is treated as a sequence file 
# (".fastq" selects the fastq reader, anything else the fasta reader). Any 
# other value is matched against the names of the installed BSgenome packages 
# and has to identify exactly one of them.
if( grepl(".fa", args$refGenome, fixed = TRUE) ){

  if( !file.exists(args$refGenome) ){
    stop("\n  Specified reference genome file not found.\n")
  }

  ref_file_type <- ifelse(
    grepl(".fastq", args$refGenome, fixed = TRUE), "fastq", "fasta"
  )

  ref_genome <- Biostrings::readDNAStringSet(
    args$refGenome, format = ref_file_type
  )

}else{

  installed_genomes <- unique(BSgenome::installed.genomes())

  genome <- grep(args$refGenome, installed_genomes, value = TRUE)

  if( length(genome) == 0 ){

    # cat() writes one genome per line; print() would show "\n" escaped.
    cat("\nInstalled genomes include:\n")
    cat(installed_genomes, sep = "\n")
    stop("\n  Selected reference '", args$refGenome, "'genome not in list.\n")

  }else if( length(genome) > 1 ){

    cat("\nInstalled genomes include:\n")
    cat(installed_genomes, sep = "\n")
    stop(
      "\n  Please be more specific about reference genome. ", 
      "Multiple matches to input.\n"
    )

  }

  # Attach the matched package and fetch the genome object of the same name.
  suppressMessages(library(genome, character.only = TRUE))
  ref_genome <- get(genome)

}

## Determine an associated sample name
# The sample name is the file name of the unique output (the text after the 
# last "/") cut at its first ".".
sampleName <- unlist(strsplit(
  utils::tail(unlist(strsplit(args$uniqOutput, "/")), n = 1), 
  ".", 
  fixed = TRUE
))[1]


## Set up stat object
# Start with an empty table when a stat file was requested.
if( args$stat != FALSE ){

  stat <- data.frame(
    sampleName = character(0),
    metric = character(0),
    count = character(0)
  )

}

# Additional functions ----
#' Load alignments from a sorted BAM file into a data.frame
#' @param bam path to sorted BAM file (*.bam).
#' @param bai path to BAM index file (*.bai).
#' @param params character vector naming the fields to import. Refer to the
#' SAMtools or BWA manual for field names.
#' @param tags character vector naming the additional tags to import. Again,
#' refer to the SAMtools or BWA manual for tag names.
#' @param onlyPairMapped logical. When TRUE (default) the flag filter asks for
#' paired reads where neither the read nor its mate is unmapped; when FALSE 
#' all three flag criteria are passed as NA.
#' @return data.frame with one column per entry of `params` followed by one 
#' column per entry of `tags`.

loadBAM <- function(bam, bai, params, tags, onlyPairMapped = TRUE){

  # Flag criteria for the scan
  flag_filter <- Rsamtools::scanBamFlag(
    isPaired = ifelse(onlyPairMapped, TRUE, NA), 
    isUnmappedQuery = ifelse(onlyPairMapped, FALSE, NA),
    hasUnmappedMate = ifelse(onlyPairMapped, FALSE, NA)
  )

  scan_param <- Rsamtools::ScanBamParam(
    flag = flag_filter, what = params, tag = tags
  )

  # Remove the outer list level of the scan result so the requested fields 
  # and the `tag` element can be reached directly.
  algn <- unlist(
    Rsamtools::scanBam(file = bam, index = bai, param = scan_param),
    recursive = FALSE
  )

  # The leading elements are taken as the requested fields, in `params` order.
  df <- as.data.frame(algn[seq_along(params)])

  # Add each requested tag as a column of its own.
  for( tag_name in tags ){
    df[, tag_name] <- algn$tag[[tag_name]]
  }

  df

}

#' Calculate the global percent identity for alignments from cigar and MD tags
#' @param cigar character vector of cigar strings.
#' @param MD character vector of MDz tags.
#' @description Both input parameters must be vectors of the same length and 
#' indexed accordingly. For every alignment the A/T/G/C letters in the MD tag 
#' are counted as mismatches, the lengths of all `M` operations in the cigar 
#' minus those mismatches are the matches, and the lengths of all 
#' H/S/M/I/D/X/= operations are the total length.
#' @return numeric vector of `100 * match / length`, rounded to one decimal, 
#' in the order of the inputs.

calcPctID <- function(cigar, MD){
  # Must have same length to calc pct ID
  stopifnot( length(cigar) == length(MD) )

  # The row count is taken from the input directly, so the function does not 
  # depend on `n()` being resolvable without dplyr attached.
  n_algn <- length(cigar)

  # Sum, per alignment, the lengths of all cigar operations matched by 
  # `op_pattern`; `op_letters` removes the operation letter from each hit.
  sumCigarOps <- function(op_pattern, op_letters){
    hits <- stringr::str_extract_all(cigar, op_pattern, simplify = TRUE)
    rowSums(
      matrix(as.numeric(gsub(op_letters, "", hits)), nrow = n_algn), 
      na.rm = TRUE
    )
  }

  # Every base letter found in the MD tag counts as one mismatch; the padding
  # of the simplified matrix never matches a base.
  mismatch <- rowSums(
    matrix(
      stringr::str_extract_all(MD, "[ATGC]", simplify = TRUE) %in% 
        c("A", "T", "G", "C"), 
      nrow = n_algn
    ), 
    na.rm = TRUE
  )

  match <- sumCigarOps("[0-9]+M", "M") - mismatch
  length <- sumCigarOps("[0-9]+[HSMIDX=]", "[HSMIDX=]")

  round(100 * (match / length), digits = 1)

}

#' Count clipped bases at the start or end of alignments from cigar strings
#' @param cigar character vector of cigar strings.
#' @param type character, which clipping to count: hard ("H", "h"), soft 
#' ("S", "s"), or both ("both", default). Case is ignored.
#' @param end character, the side of the cigar string to inspect: "5p" 
#' (default) anchors at the start of the string, "3p" at its end. Case is 
#' ignored.
#' @return numeric vector, one count per cigar string; 0 where the string has 
#' no clipping operation at the requested side.

cntClipped <- function(cigar, type = "both", end = "5p"){

  # Normalize and validate the options
  type <- tolower(type)
  end <- tolower(end)

  stopifnot( type %in% c("both", "h", "s") )
  stopifnot( end %in% c("5p", "3p") )

  # Operation letters to look for; soft clipping is the fall-through case
  clip_ops <- switch(type, "both" = "[HS]", "h" = "[H]", "[S]")
  clip_pat <- paste0("[0-9]+", clip_ops)

  # Anchor the pattern to the requested side of the cigar string
  clip_pat <- if( end == "5p" ){
    paste0("^", clip_pat)
  }else{
    paste0(clip_pat, "$")
  }

  # Drop the operation letter from each hit and total the lengths per string
  clip_hits <- stringr::str_extract_all(cigar, clip_pat, simplify = TRUE)
  clip_lens <- as.numeric(gsub("[HS]", "", clip_hits))

  rowSums(matrix(clip_lens, nrow = length(cigar)), na.rm = TRUE)

}

#' Process alignment data to valid paired-end alignments representing the input
#' template DNA (GenomicRanges based implementation).
#' @param id character vector indicating grouping of alignments.
#' @param chr character vector of seqnames. If using reference genome, these 
#' will need to match seqnames present in the reference object passed to 
#' `refGen`.
#' @param strand character vector of strand or alignment orientation, must be 
#' either "+" or "-".
#' @param pos numeric or integer vector indicating the "start" of the alignment.
#' @param width numeric or integer vector indicating the width of the alignment.
#' @param type character vector indicating type of alignment 
#' ("anchor" or "adrift").
#' @param minLen numeric or integer value indicating the minimum distance 
#' between the two alignments that should be considered.
#' @param maxLen numeric or integer value indicating the maximum distance 
#' between the two alignments that should be considered.
#' @param refGen BSgenome object or other object with GenomeInfoDb::seqinfo.
#' @return data.frame with columns `id`, `chr`, `strand`, `start`, `end`, 
#' ordered by the first appearance of each `id`.
#' @note The original header describes this method as deprecated; 
#' NOTE(review): presumably in favor of `processAlignments` - confirm.

.processAlignments <- function(id, chr, strand, pos, width, type, minLen = 30L,
                              maxLen = 2500L, refGen = NULL){

  # Check inputs
  inputs <- list(
    "grp" = id, "chr" = chr, "strand" = strand, 
    "pos" = pos, "width" = width, "type" = type
  )

  stopifnot( length(unique(sapply(inputs, length))) == 1 ) # All same length

  # Combine into data.frame and build GenomicRanges
  # `posid` labels the strand-dependent first position of each alignment.
  input_df <- as.data.frame(inputs) %>%
    dplyr::mutate(
      grp = as.character(grp),
      start = pos,
      end = pos + width - 1,
      type = as.character(type),
      strand = as.character(strand),
      posid = paste0(type, ":", chr, strand, ifelse(strand == "+", start, end))
    ) %>%
    dplyr::select(grp, chr, strand, start, end, type, posid)

  input_gr <- GenomicRanges::GRanges(
    seqnames = as.character(input_df$chr),
    ranges = IRanges::IRanges(
      start = as.numeric(input_df$start), 
      end = as.numeric(input_df$end)
    ),
    strand = as.character(input_df$strand),
    seqinfo = if(!is.null(refGen)){ GenomeInfoDb::seqinfo(refGen) }else{ NULL },
    grp = as.character(input_df$grp),
    type = as.character(input_df$type),
    posid = as.character(input_df$posid)
  )

  # Find overlaps within maxLen for anchors and adrift
  # Each range is first reduced to its single start-side position.
  grl <- GenomicRanges::split(
    GenomicRanges::flank(input_gr, width = -1, start = TRUE), 
    input_gr$type
  )

  # Flip strand of adrift hits to enforce opposite strand requirement
  GenomicRanges::strand(grl$adrift) <- ifelse(
    GenomicRanges::strand(grl$adrift) == "+", "-", "+"
  )

  # Reduce to unique locations to minimize work
  red_list <- lapply(
    grl, GenomicRanges::reduce, min.gapwidth = 0L, with.revmap = TRUE
  )

  # ID all anchor-to-adrift alignment pairs
  ovlp_hits <- GenomicRanges::findOverlaps(
    red_list$anchor, red_list$adrift, maxgap = maxLen
  )

  # Gather data for each type
  # `dplyr::n()` is namespace-qualified so the row count resolves without 
  # dplyr being attached.
  anchor_df <- as.data.frame(red_list$anchor) %>%
    dplyr::mutate(
      seqnames = as.character(seqnames),
      strand = as.character(strand),
      type = "anchor",
      anchorid = seq_len(dplyr::n()),
      posid = paste0("anchor:", seqnames, strand, start)
    )

  # The adrift strand was flipped above, so it is flipped back in `posid`.
  adrift_df <- as.data.frame(
      red_list$adrift[S4Vectors::subjectHits(ovlp_hits)]
    ) %>%
    dplyr::mutate(
      seqnames = as.character(seqnames),
      strand = as.character(strand),
      type = "adrift",
      anchorid = S4Vectors::queryHits(ovlp_hits),
      posid = paste0(
        "adrift:", seqnames, ifelse(strand == "+", "-", "+"), start
      )
    )

  # Combine hits to form valid paired-end alignments
  # Anchor rows always pass; adrift rows must lie on the expected side of 
  # their anchor and within [minLen, maxLen] of it.
  combo_df <- dplyr::bind_rows(anchor_df, adrift_df) %>%
    dplyr::group_by(anchorid) %>%
    dplyr::mutate(
      anchor.dist = start[type == "anchor"] - start,
      anchor.upstream = ifelse(
        type == "adrift", 
        ifelse(strand == "+", anchor.dist < 0, anchor.dist > 0),
        TRUE
      ),
      right.size = ifelse(
        type == "adrift",
        abs(anchor.dist) >= minLen & abs(anchor.dist) <= maxLen,
        TRUE
      )
    ) %>%
    dplyr::filter(anchor.upstream & right.size) %>%
    dplyr::ungroup()

  # Stretch each remaining adrift position to span back to its anchor.
  cond_df <- combo_df %>%
    dplyr::group_by(anchorid) %>%
    dplyr::mutate(
      anchor.posid = posid[type == "anchor"],
      adrift.posid = posid,
      start = ifelse(strand == "+", start - abs(anchor.dist), start),
      end = ifelse(strand == "+", end, end + abs(anchor.dist))
    ) %>%
    dplyr::filter(type == "adrift") %>%
    dplyr::ungroup()

  adrift_revmap <- cond_df$revmap

  # Expand the reduced adrift positions back to one row per input alignment, 
  # keeping only the pairs whose anchor belongs to the same read group.
  cond_df[rep(seq_len(nrow(cond_df)), lengths(adrift_revmap)),] %>%
    dplyr::mutate(
      grp = grl$adrift$grp[BiocGenerics::unlist(adrift_revmap)]
    ) %>%
    dplyr::filter(
      paste0(grp, ":", anchor.posid) %in% 
        paste0(input_df$grp, ":", input_df$posid)
    ) %>%
    dplyr::mutate(grp = factor(grp, levels = unique(id))) %>%
    dplyr::arrange(grp) %>%
    dplyr::mutate(grp = as.character(grp)) %>%
    dplyr::select("id" = grp, "chr" = seqnames, strand, start, end)

}

#' Process alignment data to valid paired-end alignments representing the input
#' template DNA.
#' @param id character vector indicating grouping of alignments.
#' @param chr character vector of seqnames.
#' @param strand character vector of strand or alignment orientation, must be 
#' either "+" or "-".
#' @param pos numeric or integer vector indicating the "start" of the alignment.
#' @param width numeric or integer vector indicating the width of the alignment.
#' @param type character vector indicating type of alignment 
#' ("anchor" or "adrift").
#' @param minLen numeric or integer value indicating the minimum distance 
#' between the two alignments that should be considered.
#' @param maxLen numeric or integer value indicating the maximum distance 
#' between the two alignments that should be considered.
#' @param refGen BSgenome object or other object with GenomeInfoDb::seqinfo.
#' Accepted for interface compatibility; the function body does not use it.
#' @param batches integer indicating the number of batches to serialize the 
#' data processing with. The number of reads analyzed within a batch will be
#' the number of unique `id`'s divided by the `batches`.
#' @return data.frame with columns `id`, `chr`, `strand`, `start`, `end`; one 
#' row per anchor / adrift combination of a read that lies on opposite 
#' strands of the same chromosome and is `minLen` to `maxLen` apart.

processAlignments <- function(id, chr, strand, pos, width, type, minLen = 30L,
                              maxLen = 2500L, refGen = NULL, batches = 25L){

  # Check inputs
  inputs <- list(
    "grp" = id, "chr" = chr, "strand" = strand, 
    "pos" = pos, "width" = width, "type" = type
  )

  stopifnot( length(unique(sapply(inputs, length))) == 1 ) # All same length

  # Combine into data.frame; `pos` becomes the strand-dependent first 
  # position of each alignment (start for "+", end otherwise).
  input_df <- as.data.frame(inputs) %>%
    dplyr::mutate(
      grp = as.character(grp),
      start = pos,
      end = pos + width - 1,
      type = as.character(type),
      strand = as.character(strand),
      pos = ifelse(strand == "+", start, end)
    ) %>%
    dplyr::select(grp, chr, strand, pos, type)

  # Row indices of `input_df` per read, overall and by alignment type
  idx_list <- IRanges::IntegerList(split(seq_len(nrow(input_df)), input_df$grp))

  anchor_idx_list <- idx_list[
    IRanges::LogicalList(split(input_df$type == "anchor", input_df$grp))
  ]

  adrift_idx_list <- idx_list[
    IRanges::LogicalList(split(input_df$type == "adrift", input_df$grp))
  ]

  # Assign reads to batches of roughly equal size
  batch_list <- split(
    seq_along(idx_list), 
    ceiling(seq_along(idx_list) / (length(idx_list) / batches))
  )

  dplyr::bind_rows(lapply(seq_along(batch_list), function(i){

    # Batch number written to the terminal as a progress indicator
    print(i)
    idxs <- batch_list[[i]]

    # Identify which reads to analyze
    x <- names(idx_list)[idxs]

    # Pull in all anchors associated with reads
    anchor_aligns <- input_df[unlist(anchor_idx_list[x]),]

    # Pull in all adrift alignments associated with reads
    adrift_aligns <- input_df[unlist(adrift_idx_list[x]),] %>%
      dplyr::select(grp, "chr.d" = chr, "strand.d" = strand, "pos.d" = pos)

    anc_idx <- IRanges::IntegerList(
      split(seq_len(nrow(anchor_aligns)), anchor_aligns$grp)
    )

    adr_idx <- IRanges::IntegerList(
      split(seq_len(nrow(adrift_aligns)), adrift_aligns$grp)
    )

    # Only reads holding both alignment types can form a pair. Restricting 
    # both lists to the shared reads, in the same order, keeps the anchor and
    # adrift entries aligned even when a read lacks one of the types.
    paired_ids <- base::intersect(names(anc_idx), names(adr_idx))
    anc_idx <- anc_idx[paired_ids]
    adr_idx <- adr_idx[paired_ids]

    # Per read, every anchor is combined with every adrift alignment: each
    # anchor index is repeated once per adrift alignment, and the adrift
    # indices are repeated once per anchor.
    exp_anc_idxs <- unlist(lapply(
      seq_along(anc_idx), 
      function(j) rep(anc_idx[[j]], each = length(adr_idx[[j]]))
    ))

    adrift_aligns[
        unlist(unname(adr_idx[rep(names(anc_idx), lengths(anc_idx))])),
      ] %>%
      dplyr::mutate(
        chr.n = anchor_aligns$chr[exp_anc_idxs],
        strand.n = anchor_aligns$strand[exp_anc_idxs],
        pos.n = anchor_aligns$pos[exp_anc_idxs]
      ) %>%
      dplyr::filter(
        # Filter for opposite strands
        strand.n != strand.d,
        # Filter for correct size window
        ifelse(strand.n == "+", pos.d - pos.n, pos.n - pos.d) >= minLen,
        ifelse(strand.n == "+", pos.d - pos.n, pos.n - pos.d) <= maxLen,
        # Filter for same chromosome
        chr.n == chr.d
      ) %>%
      dplyr::mutate(
        start = ifelse(strand.n == "+", pos.n, pos.d),
        end = ifelse(strand.n == "+", pos.d, pos.n)
      ) %>%
      dplyr::select(
        "id" = grp, "chr" = chr.n, "strand" = strand.n, start, end
      )

  }))


}

#' Determine if pair of reads are mapped
#' @param flag numeric or integer vector of flag codes indicating mapping 
#' status. This integer will be converted into binary bits and decoded to 
#' determine if the flag indicates paired mapping.
#' @description Given flag integer codes, this function returns a logical to 
#' indicate if the pair of reads are both mapped. If one or both reads are 
#' unmapped, then the return is "FALSE".
#' @return logical vector with one element per flag (length 0 for empty 
#' input).

pair_is_mapped <- function(flag){

  #Check if input is in correct format.
  stopifnot( all(is.numeric(flag) | is.integer(flag)) )

  # Switch flag codes to binary bit matrix, one row per flag with the lowest
  # bit in the first column
  x <- matrix(as.integer(intToBits(flag)), ncol = 32, byrow = TRUE)

  # Flag codes designate 3rd and 4th bits to indicate unmapped read or mate
  # As long as both are zero, then the pair of reads are both mapped.
  # `drop = FALSE` keeps the matrix shape for a single flag, which rowSums()
  # requires.
  rowSums(x[, c(3,4), drop = FALSE]) == 0

}

#' Determine whether an alignment belongs to the read or its mate
#' @param flag numeric or integer vector of flag codes indicating mapping 
#' status. Each code is converted into binary bits and the 7th bit is 
#' inspected.
#' @param output optional character vector of length 2 holding the labels to 
#' return for a set 7th bit (first element) and an unset one (second element).
#' @description Given flag integer codes, this function returns a logical 
#' vector (TRUE where the 7th bit is set) when `output` is NULL, otherwise the
#' matching label from `output` for each flag.

read_or_mate <- function(flag, output = NULL){

  # Input has to be numeric
  stopifnot( all(is.numeric(flag) | is.integer(flag)) )

  # One row per flag code, lowest bit in the first column
  bit_mat <- matrix(as.integer(intToBits(flag)), ncol = 32, byrow = TRUE)

  # The 7th bit marks the 1st read of the pair (read); the 8th marks the mate
  is_read <- bit_mat[, 7] == 1

  if( !is.null(output) ){
    return(ifelse(is_read, output[1], output[2]))
  }

  is_read

}

# Additional parameters ---- 
# Fields to read from the BAM file
bam_params <- c(
  "qname", "flag", "rname", "strand", "pos", "qwidth", "mapq", "cigar"
)

# Tags to read from the BAM file
bam_tags <- c("MD")

# Import read alignments and filter on input criteria ----
input_hits <- loadBAM(
  bam = args$bam, 
  bai = args$bai, 
  params = bam_params, 
  tags = bam_tags
)

# Show the first imported alignments together with overall counts
printHead(
  input_hits,
  title = "Head of input alignments",
  caption = sprintf(
    "%1$s total alignments from %2$s reads.", 
    nrow(input_hits), 
    length(unique(input_hits$qname))
  )
)

# Nothing to process: write empty outputs and quit
if( nrow(input_hits) < 1 ){ 
  cat("\nNo alignments in input bam file.\n")
  writeNullOutput(args)
  q()
}

## Initial quality filtering: min percent ID, minimum size, max align start ----
# Alignments are labelled as anchor or adrift from their flag, limited to 
# pairs with both reads mapped, and then filtered on percent identity, query 
# width, and the number of clipped bases at the start.
read_hits <- input_hits %>%
  dplyr::mutate(
    pairMapped = pair_is_mapped(flag),
    type = read_or_mate(flag, c("anchor", "adrift"))
  ) %>% 
  dplyr::filter(pairMapped) %>%
  dplyr::mutate(
    clip5p = cntClipped(cigar),
    pctID = calcPctID(cigar, MD)
  ) %>%
  dplyr::filter(
    pctID >= args$minPercentIdentity,
    qwidth >= args$minTempLength,
    clip5p <= args$maxAlignStart
  )

# Names of reads left with only one alignment type after the filters above
read_wo_pairs_after_init_filter <- read_hits %>%
  dplyr::group_by(qname) %>%
  dplyr::summarise(
    anchors = sum(type == "anchor"), 
    adrifts = sum(type == "adrift")
  ) %>%
  dplyr::filter(!(anchors > 0 & adrifts > 0)) %>%
  dplyr::pull(qname)

read_hits <- read_hits %>%
  dplyr::filter(!qname %in% read_wo_pairs_after_init_filter)

# Stop if there are no remaining alignments
if( nrow(read_hits) == 0 | dplyr::n_distinct(read_hits$type) == 1 ){

  cat(
    "\nNo valid alignments were found within the data given input criteria.\n"
  )

  writeNullOutput(args)
  q()

}

## Additional quality filtering: orientation structure, min and max size ----
# The template size limits given on the command line are handed to 
# `processAlignments`; without them the function falls back to its own 
# defaults (30 and 2500) whatever the user requested.
# `lociPairKey` joins an integer code for the strand-dependent first position
# with one for the opposite end; `readPairKey` is an integer code per read.
all_valid_aligns <- with(
    read_hits, 
    processAlignments(
      qname, rname, strand, pos, qwidth, type, 
      minLen = args$minTempLength, maxLen = args$maxTempLength,
      refGen = ref_genome, batches = args$batches
    )
  ) %>%
  dplyr::mutate(
    lociPairKey = paste0(
      as.integer(factor(
        paste0(chr, strand, ifelse(strand == "+", start, end))
      )), ":", 
      as.integer(factor(
        paste0(chr, strand, ifelse(strand == "+", end, start))
      ))
    ),
    readPairKey = as.integer(factor(id))
  )

## Remove alternative sequence alignments if requested during input ----
# Sequences are recognized as alternative by an underscore in their name.
if( !args$keepAltChr ){
  all_valid_aligns <- dplyr::filter(
    all_valid_aligns, !stringr::str_detect(chr, stringr::fixed("_"))
  )
}

### Print out top of valid alignments
printHead(
  all_valid_aligns,
  title = "Head of valid alignments",
  caption = sprintf(
    "%1$s valid alignments from %2$s reads.", 
    nrow(all_valid_aligns), 
    length(unique(all_valid_aligns$id))
  )
)

# Stop if there are no remaining alignments
if( nrow(all_valid_aligns) == 0 ){

  cat("\nNo valid alignments were found after QC filtering.\n")
  writeNullOutput(args)
  q()

}


## Group alignments into unique and multihit alignments ----
# A read with exactly one valid alignment is unique; a read with several is a
# multihit. `dplyr::n()` is namespace-qualified because this script only 
# attaches magrittr, so a bare `n()` cannot be relied on to resolve.
uniq_aligns <- all_valid_aligns %>%
  dplyr::group_by(id) %>%
  dplyr::filter(dplyr::n() == 1) %>%
  dplyr::ungroup()

multihits <- all_valid_aligns %>%
  dplyr::group_by(id) %>%
  dplyr::filter(dplyr::n() > 1) %>%
  dplyr::ungroup()

## Recover any reads not captured in the two groups above ----
failed_reads <- input_hits %>%
  dplyr::filter(
    !qname %in% c(unique(uniq_aligns$id), unique(multihits$id))
  )

# Log allocated read counts
cat(
  "\nReads associated with types of alignments:\n",
  "  unique alignments  : ", 
  format(length(unique(uniq_aligns$id)), big.mark = ","), "\n",
  "  multihit alignments: ", 
  format(length(unique(multihits$id)), big.mark = ","), "\n",
  "  chimera artifacts  : ", 
  format(length(unique(failed_reads$qname)), big.mark = ","), "\n"
)


# Bin reads that would map to different loci on the same read (chimeras)
# Unique and multihit templates all map successfully to genomic loci. Other
# templates are sequenced but fail the selection criteria: either they lack an
# alignment to the reference genome (anchor or adrift did not align) or they 
# map to two distant genomic loci. The latter are termed chimeras and are 
# considered to be artifacts of PCR amplification.

if( !is.null(args$chimeras) ){

  # Record the number of failed reads when a stat file was requested
  if( args$stat != FALSE ){

    add_stat <- data.frame(
      sampleName = sampleName,
      metric = "chimera.reads",
      count = length(unique(failed_reads$qname))
    )

    stat <- rbind(stat, add_stat)

  }

  # The failed alignments are written out as a one-element list in rds format
  chimeraData <- list("failed_reads" = failed_reads)

  writeOutputFile(chimeraData, file = args$chimeras, format = "rds")

}


## Build the GRanges of uniquely aligned reads and write it out ----
# Columns are renamed to the names used for GRanges construction (seqnames,
# ID); the remaining selected columns are carried along as metadata.
uniq_sites <- GenomicRanges::makeGRangesFromDataFrame(
  dplyr::select(
    dplyr::mutate(
      uniq_aligns,
      width = as.integer(end - start + 1),
      sampleName = sampleName
    ),
    seqnames = chr, start, end, width, strand,
    lociPairKey, readPairKey, sampleName, ID = id
  ),
  keep.extra.columns = TRUE,
  seqinfo = GenomeInfoDb::seqinfo(ref_genome)
)

writeOutputFile(uniq_sites, file = args$uniqOutput)

### Show the first uniquely mapped sites. The caption reports the number of
### distinct 1-bp start positions and the number of alignments.
printHead(
  uniq_sites,
  caption = sprintf(
    paste(
      "Alignments yeilded %1$s unique anchor sites from %2$s",
      "properly-paired and aligned reads."
    ),
    length(GenomicRanges::reduce(
      x = GenomicRanges::flank(uniq_sites, width = -1, start = TRUE),
      min.gapwidth = 0L
    )),
    length(uniq_sites)
  ),
  title = "Head of uniquely mapped genomic loci"
)

# Record unique-alignment metrics when a stat file was requested
if( args$stat != FALSE ){

  # One row per metric, in order: distinct read IDs, distinct alignment
  # ranges, and distinct 1-bp start positions of the alignments.
  add_stat <- data.frame(
    sampleName = sampleName,
    metric = c("unique.reads", "unique.algns", "unique.loci"),
    count = c(
      length(unique(uniq_sites$ID)),
      length(unique(uniq_sites)),
      length(GenomicRanges::reduce(
        GenomicRanges::flank(uniq_sites, -1, start = TRUE), min.gapwidth = 0L
      ))
    )
  )
  stat <- rbind(stat, add_stat)

}

## Optionally condense the unique sites, write them, and summarise ----
if( !is.null(args$condSites) ){

  cond_sites <- condenseSites(
    uniq_sites,
    list.bp.counts = TRUE,
    keep.cols = "sampleName"
  )
  writeOutputFile(cond_sites, file = args$condSites)

  # Caption totals: number of condensed sites, summed fragLengths, and
  # summed counts columns.
  printHead(
    cond_sites,
    caption = sprintf(
      paste(
        "There were %1$s unique anchor sites identified with a total",
        "of %2$s unique template lengths and %3$s read counts."
      ),
      length(cond_sites),
      sum(cond_sites$fragLengths),
      sum(cond_sites$counts)
    ),
    title = "Head of unique anchor sites"
  )

}


## Write multihits output and record stats ----
# Runs only when a multihit output file was requested. Produces three objects
# that are written together as one rds file:
#   unclustered_multihits        - one range per (alignment, read) combination
#   clustered_multihit_positions - per-cluster list of start positions
#   clustered_multihit_lengths   - per-cluster Rle of median template widths
if( !is.null(args$multihits) ){

  # Start from empty containers so the output can still be written when
  # there are no multihit alignments.
  unclustered_multihits <- GenomicRanges::GRanges()
  clustered_multihit_positions <- GenomicRanges::GRangesList()
  clustered_multihit_lengths <- list()

  if( nrow(multihits) > 0 ){

    # Convert the multihit alignments into a GRanges object, keeping
    # lociPairKey, readPairKey, ID and sampleName as metadata columns.
    # NOTE(review): the previous comment here referred to coupled_loci and
    # paired_loci objects that are not visible in this part of the script.
    multihit_templates <- multihits %>%
      dplyr::mutate(
        width = end - start + 1,
        sampleName = sampleName
      ) %>%
      dplyr::select(
        "seqnames" = chr, start, end, width, strand, 
        lociPairKey, readPairKey, "ID" = id, sampleName
      ) %>%
      GenomicRanges::makeGRangesFromDataFrame(
        keep.extra.columns = TRUE, 
        seqinfo = GenomeInfoDb::seqinfo(ref_genome)
      )

    # One row per distinct (sampleName, ID, readPairKey) combination.
    multihit_keys <- multihits %>%
      dplyr::mutate(sampleName = sampleName) %>%
      dplyr::distinct(sampleName, "ID" = id, readPairKey) %>%
      dplyr::select(sampleName, ID, readPairKey)

    # Rounded median alignment width per readPairKey. The medians are taken
    # over all candidate alignments sharing a readPairKey, so every read with
    # that key receives the same value.
    multihit_medians <- round(
      median(GenomicRanges::width(split(
        x = multihit_templates, 
        f = multihit_templates$readPairKey
      )))
    )

    # Look the medians up by readPairKey (used as a character name).
    multihit_keys$medians <- multihit_medians[
      as.character(multihit_keys$readPairKey)
    ]

    # flank() with width = -1 and start = TRUE: a 1-bp range at the start of
    # each alignment.
    multihits_pos <- GenomicRanges::flank(
      x = multihit_templates, width = -1, start = TRUE
    )

    # Merge nearby start positions; revmap records which input positions
    # were merged into each reduced range.
    multihits_red <- GenomicRanges::reduce(
      x = multihits_pos, min.gapwidth = 5L, with.revmap = TRUE
    )  #! Should make min.gapwidth an option

    revmap <- multihits_red$revmap

    # Build an edge list over readPairKeys: within every reduced range, each
    # member (nodes) is linked to the member with the lowest index
    # (axil_nodes, repeated once per member of that range).
    axil_nodes <- as.character(S4Vectors::Rle(
      values = multihit_templates$readPairKey[min(revmap)], 
      lengths = lengths(revmap)
    ))

    nodes <- multihit_templates$readPairKey[unlist(revmap)]
    edgelist <- unique(matrix( c(axil_nodes, nodes), ncol = 2 ))

    # Cluster readPairKeys using the undirected graph built from the edges.
    multihits_cluster_data <- igraph::clusters(
      igraph::graph.edgelist(el = edgelist, directed = FALSE)
    )

    # Lookup table from readPairKey (row name) to cluster ID.
    # NOTE(review): this assumes igraph orders vertices by first appearance
    # in the row-wise edge list, matching unique(t(edgelist)) - confirm.
    clus_key <- data.frame(
      row.names = unique(as.character(t(edgelist))),
      "clusID" = multihits_cluster_data$membership
    )

    multihits_pos$clusID <- clus_key[
      as.character(multihits_pos$readPairKey), "clusID"
    ]

    # Sort positions by cluster so that each cluster is contiguous.
    multihits_pos <- multihits_pos[order(multihits_pos$clusID)]

    clustered_multihit_index <- as.data.frame(
      GenomicRanges::mcols(multihits_pos)
    )

    # Run-length encode lociPairKey (levels in order of appearance) and split
    # the encoding by cluster; the run values are then the lociPairKeys of
    # each cluster with consecutive repeats collapsed.
    multihit_loci_rle <- S4Vectors::Rle(factor(
      x = clustered_multihit_index$lociPairKey, 
      levels = unique(clustered_multihit_index$lociPairKey)
    ))

    multihit_loci_intL <- split(
      multihit_loci_rle, clustered_multihit_index$clusID
    )

    # For every run value pick the first position carrying that lociPairKey
    # (match() returns the first hit) and drop the metadata columns.
    clustered_multihit_positions <- GenomicRanges::granges(
      x = multihits_pos[
        match(
          x = BiocGenerics::unlist(S4Vectors::runValue(multihit_loci_intL)), 
          table = clustered_multihit_index$lociPairKey
        )
      ]
    )

    # Regroup the selected positions by cluster; the group sizes are the
    # number of run values per cluster.
    clustered_multihit_positions <- GenomicRanges::split(
      x = clustered_multihit_positions,
      f = S4Vectors::Rle(
        values = seq_along(multihit_loci_intL), 
        lengths = S4Vectors::width(S4Vectors::runValue(
          multihit_loci_intL
        )@partitioning)
      )
    )

    # Attach the cluster ID to every read via its readPairKey.
    readPairKey_cluster_index <- unique(
      clustered_multihit_index[,c("readPairKey", "clusID")]
    )

    multihit_keys$clusID <- readPairKey_cluster_index$clusID[
      match(
        as.character(multihit_keys$readPairKey), 
        readPairKey_cluster_index$readPairKey
      )
    ]

    # Sort by median width so equal medians are adjacent before run-length
    # encoding, then split the encoded medians by cluster.
    multihit_keys <- multihit_keys[order(multihit_keys$medians),]

    clustered_multihit_lengths <- split(
      x = S4Vectors::Rle(multihit_keys$medians), 
      f = multihit_keys$clusID
    )

    # Expand multihit_templates from one entry per alignment to one entry per
    # (alignment, read): each alignment is repeated once for every row of
    # multihit_keys that shares its readPairKey.
    multihit_keys <- multihit_keys[order(multihit_keys$readPairKey),]

    # Row indices of multihit_keys grouped by readPairKey.
    multihit_readPair_read_exp <- IRanges::IntegerList(
      split(x = seq_len(nrow(multihit_keys)), f = multihit_keys$readPairKey)
    )

    unclustered_multihits <- multihit_templates

    # Reorder / repeat the index groups to line up with the alignments.
    multihit_readPair_read_exp <- multihit_readPair_read_exp[
      as.character(unclustered_multihits$readPairKey)
    ]

    # Repeat each alignment by the size of its index group.
    unclustered_multihits <- unclustered_multihits[S4Vectors::Rle(
      values = seq_along(unclustered_multihits),
      lengths = S4Vectors::width(multihit_readPair_read_exp@partitioning)
    )]

    # Label every expanded range with the read ID and sample name of the
    # multihit_keys row it was expanded for.
    names(unclustered_multihits) <- multihit_keys$ID[
      BiocGenerics::unlist(multihit_readPair_read_exp)
    ]

    unclustered_multihits$ID <- multihit_keys$ID[
      BiocGenerics::unlist(multihit_readPair_read_exp)
    ]

    unclustered_multihits$sampleName <- multihit_keys$sampleName[
      BiocGenerics::unlist(multihit_readPair_read_exp)
    ]

  }

  # Positions and lengths must describe the same number of clusters.
  stopifnot(
    length(clustered_multihit_positions) == length(clustered_multihit_lengths)
  )

  multihitData <- list(
    unclustered_multihits, 
    clustered_multihit_positions, 
    clustered_multihit_lengths
  )

  names(multihitData) <- c(
    "unclustered_multihits", 
    "clustered_multihit_positions", 
    "clustered_multihit_lengths"
  )

  writeOutputFile(multihitData, file = args$multihits, format = "rds")

  # Summarise the multihit results in the log.
  printHead(
    data.frame(
      "multihit_reads" = length(unique(names(unclustered_multihits))),
      "multihit_alignments" = length(unique(unclustered_multihits)),
      "multihit_clusters" = length(clustered_multihit_positions),
      "multihit_lengths" = sum(lengths(clustered_multihit_lengths))
    ),
    title = "Multihit metrics", 
    caption = "Metrics highlighting the observation of multiple aligning reads."
  )

  # Append the multihit metrics to the running stat table when requested.
  if( args$stat != FALSE ){

    add_stat <- data.frame(
      sampleName = sampleName,
      metric = c("multihit.reads", "multihit.lengths", "multihit.clusters"),
      count = c(
        length(unique(names(unclustered_multihits))), 
        sum(lengths(clustered_multihit_lengths)), 
        length(clustered_multihit_positions))
    )

    stat <- rbind(stat, add_stat)

  }

}


# Write the accumulated stat table as comma-separated values without a header
# when a stat file was requested
if( args$stat != FALSE ){
  write.table(
    stat,
    file = args$stat,
    quote = FALSE,
    sep = ",",
    col.names = FALSE,
    row.names = FALSE
  )
}

# Optionally save the workspace image to the requested file
if( !is.null(args$saveImage) ){
  save.image(args$saveImage)
}

q()

R magrittr From line 10 of rscripts/samqc.R

options(width = 999, scipen = 99, stringsAsFactors = FALSE)

# Directory containing this script, taken from the "--file=" entry of the
# full command line.
code_dir <- dirname(
  sub(
    "--file=",
    "",
    grep("--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
  )
)

# Help texts for the command line arguments are kept in a YAML file next to
# the script.
desc <- yaml::yaml.load_file(
  file.path(code_dir, "descriptions", "trim.desc.yml")
)

#' Define the command line interface and parse the supplied arguments
parser <- argparse::ArgumentParser(
  usage = "Rscript trim.R <seqFile> [-h/--help, -v/--version] [optional args]",
  description = desc$program_short_description
)

# Input sequence file (positional) and output file
parser$add_argument(
  "seqFile", help = desc$seqFile, type = "character", nargs = 1
)
parser$add_argument(
  "-o", "--output", help = desc$output, type = "character", nargs = 1
)

# Sequences to trim; "." is the placeholder default for "none"
parser$add_argument(
  "-l", "--leadTrimSeq", help = desc$leadTrimSeq,
  type = "character", nargs = 1, default = "."
)
parser$add_argument(
  "-r", "--overTrimSeq", help = desc$overTrimSeq,
  type = "character", nargs = 1, default = "."
)

# Phasing and mismatch allowances
parser$add_argument(
  "--phasing", help = desc$phasing,
  type = "integer", nargs = 1, default = 0
)
parser$add_argument(
  "--maxMismatch", help = desc$maxMismatch, type = "integer", nargs = 1
)
parser$add_argument(
  "--leadMismatch", help = desc$leadMismatch,
  type = "integer", nargs = "+", default = 0
)
parser$add_argument(
  "--overMismatch", help = desc$overMismatch,
  type = "integer", nargs = 1, default = 0
)

# Length limits
parser$add_argument(
  "--overMaxLength", help = desc$overMaxLength,
  type = "integer", nargs = 1, default = 20
)
parser$add_argument(
  "--overMinLength", help = desc$overMinLength,
  type = "integer", nargs = 1, default = 3
)
parser$add_argument(
  "--minSeqLength", help = desc$minSeqLength,
  type = "integer", nargs = 1, default = 30
)

# Random ID collection; FALSE means "do not collect"
parser$add_argument(
  "--collectRandomIDs", help = desc$collectRandomIDs,
  type = "character", nargs = "+", default = FALSE
)

# Quality trimming parameters
parser$add_argument(
  "--badQualBases", help = desc$badQualBases,
  type = "integer", nargs = 1, default = 5
)
parser$add_argument(
  "--qualSlidingWindow", help = desc$qualSlidingWindow,
  type = "integer", nargs = 1, default = 10
)
parser$add_argument(
  "--qualThreshold", help = desc$qualThreshold,
  type = "character", nargs = 1, default = '?'
)

# Stat file (FALSE means "none") and number of cores
parser$add_argument(
  "--stat", help = desc$stat, type = "character", nargs = 1, default = FALSE
)
parser$add_argument(
  "-c", "--cores", help = desc$cores, type = "integer", nargs = 1, default = 1
)

# Boolean switches
parser$add_argument(
  "--compress", help = desc$compress, action = "store_true"
)
parser$add_argument(
  "--noFiltering", help = desc$noFiltering, action = "store_true"
)
parser$add_argument(
  "--noQualTrimming", help = desc$noQualTrimming, action = "store_true"
)



args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# A sequence file is mandatory
if( is.null(args$seqFile) ){
  stop("\n  Please choose a sequence file (fasta or fastq).\n")
}

# --maxMismatch, when given, overrides both specific mismatch settings
if( !is.null(args$maxMismatch) ){
  args$overMismatch <- args$maxMismatch
  args$leadMismatch <- args$maxMismatch
}

# An overMaxLength of 0 means "use the full length of overTrimSeq"
if( args$overMaxLength == 0 ){
  args$overMaxLength <- nchar(args$overTrimSeq)
}

# Random IDs can only be collected when leadTrimSeq contains an "N"
if( all(args$collectRandomIDs != FALSE) && !grepl("N", args$leadTrimSeq) ){
  cat(
    "\n  No random nucleotides (Ns) found in leadTrimSeq.",
    "Turning off collection of randomIDs.\n"
  )
  args$collectRandomIDs <- FALSE
}

# Replace the "." placeholders with empty strings
if( args$leadTrimSeq == "." ) args$leadTrimSeq <- ""
if( args$overTrimSeq == "." ) args$overTrimSeq <- ""

# Non-positive core requests fall back to a single core
if( args$cores <= 0 ) args$cores <- 1

# Two-column table of argument names and their comma-collapsed values
input_table <- data.frame(
  "Variables" = paste0(names(args), " :"),
  "Values" = unname(vapply(args, paste, character(1), collapse = ", "))
)

# Arrange the rows in a fixed display order
input_table <- input_table[
  match(
    paste0(
      c(
        "seqFile", "output", "leadTrimSeq", "overTrimSeq", "phasing",
        "maxMismatch", "leadMismatch", "overMismatch", "overMaxLength",
        "overMinLength", "minSeqLength", "collectRandomIDs", "noFiltering",
        "noQualTrimming", "badQualBases", "qualSlidingWindow",
        "qualThreshold", "stat", "compress", "cores"
      ),
      " :"
    ),
    input_table$Variables
  ),
]

cat("\nTrim Inputs:\n")
print(
  data.frame(input_table, row.names = NULL),
  row.names = FALSE,
  right = FALSE
)

# Reduce number of requested cores if needed.
# The available core count is read once through the parallel namespace. The
# script never attaches the parallel package, so an unqualified detectCores()
# call would fail. detectCores() may return NA when the count cannot be
# determined; in that case the requested value is left untouched.
if( args$cores > 1 ){
  avail_cores <- parallel::detectCores()
  if( !is.na(avail_cores) && args$cores > avail_cores ){
    cat(
      "\n  Requested cores is greater than availible for system.",
      "Changing cores to max allowed."
    )
    args$cores <- avail_cores
  }
}

# Source the helper scripts that live in supporting_scripts/ beside this file.
# source() evaluates in the global environment by default, so the helpers end
# up in the workspace even though it is called from within lapply().
invisible(lapply(
  c("trimLeading.R", "trimOverreading.R", "writeSeqFiles.R", "utility_funcs.R"),
  function(script_file){
    source(file.path(code_dir, "supporting_scripts", script_file))
  }
))

# Abort if any of the expected helper functions is absent from the workspace
if(
  length(setdiff(
    c("trimLeading", "trimOverreading", "writeSeqFiles", "logSeqData",
      "serialAppendS4"),
    ls()
  )) > 0
){
  stop(
    "\n  Cannot load supporting scripts. ",
    "You may need to clone from github again.\n"
  )
}

# File types of the input and output sequence files
seq_type <- seqFileType(args$seqFile)
out_type <- seqFileType(args$output)

# File type of the random ID output, only when random IDs are collected
if( all(args$collectRandomIDs != FALSE) ){
  random_type <- seqFileType(args$collectRandomIDs)
}

# Load the reads with the reader matching the input type; anything that is
# not "fasta" is read as fastq
seqs <- if( seq_type == "fasta" ){
  ShortRead::readFasta(args$seqFile)
}else{
  ShortRead::readFastq(args$seqFile)
}

# Log a summary of the input reads
input_tbl <- logSeqData(seqs)
cat("\nInput sequence information:\n")
print(input_tbl, row.names = FALSE)

# Empty input: log it, hand off to writeNullFile(), and end the session
if( length(seqs) < 1 ){

  cat(
    "\n  No reads remaining to trim. Terminating script after writing output.\n"
  )
  writeNullFile(
    file = args$output,
    compress = args$compress,
    stat = args$stat,
    write.random = args$collectRandomIDs
  )
  q()

}

# Quality trimming with ShortRead::trimTailw(), using the bad-base count,
# quality threshold and sliding window taken from the command line.
# Runs only for fastq input and when --noQualTrimming was not given.
## Below block sets the OpenMP threads to the cores specified in args.
if( !args$noQualTrimming & seq_type == "fastq" ){

  # Set the thread count through ShortRead's internal routine and keep the
  # returned value so it can be restored.
  # NOTE(review): on.exit() is called at script top level here, not inside a
  # function - confirm that the restore actually runs when expected.
  nthreads <- .Call(ShortRead:::.set_omp_threads, as.integer(args$cores))
  on.exit(.Call(ShortRead:::.set_omp_threads, nthreads))

  # The half width of the window is half of --qualSlidingWindow, rounded.
  seqs <- ShortRead::trimTailw(
    object = seqs, 
    k = args$badQualBases, 
    a = args$qualThreshold, 
    halfwidth = round(args$qualSlidingWindow/2)
  )

  # Log info
  qual_trimmed_tbl <- logSeqData(seqs)
  cat("\nSequence information remaining after quality trimming:\n")
  print(qual_trimmed_tbl, row.names = FALSE)

}

# No reads left at this point: log it, hand off to writeNullFile(), and end
# the session
if( length(seqs) < 1 ){

  cat(
    "\n  No reads remaining to trim. Terminating script after writing output.\n"
  )
  writeNullFile(
    file = args$output,
    compress = args$compress,
    stat = args$stat,
    write.random = args$collectRandomIDs
  )
  q()

}

# Keep only reads long enough to hold the leading trim sequence, the phasing
# bases, and the minimum sequence length combined
seqs <- seqs[
  Biostrings::width(seqs) >=
    (nchar(args$leadTrimSeq) + args$phasing + args$minSeqLength)
]

# Log what is left after the length filter
len_trimmed_tbl <- logSeqData(seqs)
cat("\nSequence information remaining after minimum length trimming:\n")
print(len_trimmed_tbl, row.names = FALSE)

# Trim sequences, either on a single core or multiple cores.
# Both branches perform the same two steps - leading (5') trimming followed by
# overreading (3') trimming - and leave the result in trimmed_seqs (plus
# random_seqs when random IDs are collected).
if( args$cores <= 1 ){

  # Trim 5' end or leading end. Conditionals present for added features.
  # Without a leading trim sequence the reads are passed through unchanged.
  if( nchar(args$leadTrimSeq) > 0 ){

    trimmed_seqs <- trimLeading(
      seqs,
      trim.sequence = args$leadTrimSeq,
      phasing = args$phasing,
      max.mismatch = args$leadMismatch,
      collect.random = all(args$collectRandomIDs != FALSE),
      filter = !args$noFiltering
    )

  }else{

    trimmed_seqs <- seqs

  }


  # Collect random sequences if desired. In that case the result of
  # trimLeading() is unpacked through its randomSequences and
  # trimmedSequences elements.
  if( all(args$collectRandomIDs != FALSE) ){
    random_seqs <- trimmed_seqs$randomSequences
    trimmed_seqs <- trimmed_seqs$trimmedSequences
  }

  # Log info
  lead_trimmed_tbl <- logSeqData(trimmed_seqs)
  cat("\nSequence information remaining after lead trimming:\n")
  print(lead_trimmed_tbl, row.names = FALSE)

  # Overread trimming
  if( nchar(args$overTrimSeq) > 0 ){

    # Determine percent identity from allowable mismatch.
    percent_id <- (nchar(args$overTrimSeq) - args$overMismatch) / 
      nchar(args$overTrimSeq)

    # Trim 3' end or overreading portion of sequences.
    trimmed_seqs <- trimOverreading(
      seqs = trimmed_seqs, 
      trim.sequence = args$overTrimSeq, 
      percent.id = percent_id, 
      max.seq.length = args$overMaxLength,
      min.seq.length = args$overMinLength
    )

    # Log info
    over_trimmed_tbl <- logSeqData(trimmed_seqs)
    cat("\nSequence information remaining after overreading trimming:\n")
    print(over_trimmed_tbl, row.names = FALSE)

  }

}else{

  # Split sequences up evenly across cores for trimming. Reads are assigned
  # to consecutive groups by their position in seqs.
  split.seqs <- split(
    seqs, ceiling(seq_along(seqs)/(length(seqs)/args$cores))
  )

  # Set up buster the cluster
  # NOTE(review): the cluster is only stopped at the end of this branch; an
  # error in between would skip the stopCluster() call.
  buster <- parallel::makeCluster(args$cores)

  # Trim 5' end or leading section of sequence while capturing random sequences,
  # if desired. Added features required workflow changes.
  if( nchar(args$leadTrimSeq) > 0 ){

    trimmed_seqs <- parallel::parLapply(
      buster,
      split.seqs,
      trimLeading,
      trim.sequence = args$leadTrimSeq,
      phasing = args$phasing,
      max.mismatch = args$leadMismatch,
      collect.random = all(args$collectRandomIDs != FALSE),
      filter = !args$noFiltering
    )

    if( all(args$collectRandomIDs != FALSE) ){

      # Per-chunk random sequences; each chunk holds one entry per random
      # sequence set.
      random_seqs <- lapply(trimmed_seqs, "[[", "randomSequences")

      # Regroup from chunk-major to set-major: for every set i, gather the
      # i-th entry of every chunk j and combine them with serialAppendS4().
      random_seqs <- lapply(seq_along(random_seqs[[1]]), function(i){
        serialAppendS4(
          lapply(seq_along(random_seqs), function(j){
            random_seqs[[j]][[i]]
          })
        )
      })

      trimmed_seqs <- lapply(trimmed_seqs, "[[", "trimmedSequences")

    }

  }else{

    trimmed_seqs <- split.seqs

  }

  # Combine the per-chunk results into a single object.
  trimmed_seqs <- serialAppendS4(trimmed_seqs)

  # Log info
  lead_trimmed_tbl <- logSeqData(trimmed_seqs)
  cat("\nSequence information remaining after lead trimming:\n")
  print(lead_trimmed_tbl, row.names = FALSE)

  # The method for overread trimming sequentially aligns shorter fragments of 
  # the overTrimSeq, and solely requiring mismatches could lead to some issues.
  # Therefore the same percent identity is required across all alignments, 
  # however long.
  if( nchar(args$overTrimSeq) > 0 ){

    # Re-split, since lead trimming may have changed the number of reads.
    trimmed_seqs <- split(
      x = trimmed_seqs, 
      f = ceiling(seq_along(trimmed_seqs)/(length(trimmed_seqs)/args$cores))
    )

    percent_id <- (nchar(args$overTrimSeq) - args$overMismatch) / 
      nchar(args$overTrimSeq)

    # Trim 3' end or overreading portion of the sequence.
    trimmed_seqs <- parallel::parLapply(
      buster,
      trimmed_seqs,
      trimOverreading,
      trim.sequence = args$overTrimSeq, 
      percent.id = percent_id, 
      max.seq.length = args$overMaxLength,
      min.seq.length = args$overMinLength
    )

    trimmed_seqs <- serialAppendS4(trimmed_seqs)

    # Log info
    over_trimmed_tbl <- logSeqData(trimmed_seqs)
    cat("\nSequence information remaining after overreading trimming:\n")
    print(over_trimmed_tbl, row.names = FALSE)

  }

  # Stop buster before he gets out of control.
  parallel::stopCluster(buster)

}


# If no reads remain after trimming, terminate and write output.
# `seqs` is the pre-trimming object, so testing it alone cannot detect that
# trimming removed every read; `trimmed_seqs` is therefore checked as well.
# With --noFiltering the untrimmed input reads are recovered further below,
# so an empty `trimmed_seqs` alone does not end the run in that mode.
if(
  length(seqs) == 0 ||
    (!args$noFiltering && length(trimmed_seqs) == 0)
){

  cat(
    "\n  No reads remaining to trim. Terminating script after writing output.\n"
  )

  writeNullFile(
    file = args$output, 
    write.random = args$collectRandomIDs, 
    stat = args$stat,
    compress = args$compress
  )

  q()

}


# Re-apply the minimum length requirement, this time to the trimmed reads
trimmed_seqs <- trimmed_seqs[
  !(Biostrings::width(trimmed_seqs) < args$minSeqLength)
]

# Recover filtered reads if requested.
# With --noFiltering every input read is written: reads that survived
# trimming are taken from trimmed_seqs, all others are re-read from the input
# file untrimmed, and the combined set is put back into input order.
if( args$noFiltering ){

  inputSeqs <- if( seq_type == "fasta" ){
    ShortRead::readFasta(args$seqFile)
  }else{
    ShortRead::readFastq(args$seqFile)
  }

  # Compare read IDs as plain character vectors through the namespaced
  # accessor, the same way the random ID subsetting further below does; the
  # script does not attach ShortRead, so a bare id() call may not resolve.
  is_trimmed <- as.character(ShortRead::id(inputSeqs)) %in%
    as.character(ShortRead::id(trimmed_seqs))

  matched_idx <- which(is_trimmed)
  unmatched_idx <- which(!is_trimmed)
  untrimmed_seqs <- inputSeqs[unmatched_idx]

  # Trimmed reads first, untrimmed reads second; ordering by the original
  # input indices restores the input order.
  # NOTE(review): this assumes trimmed_seqs is still in input order and that
  # read IDs are unique - confirm.
  output_seqs <- Biostrings::append(trimmed_seqs, untrimmed_seqs)
  output_seqs <- output_seqs[order(c(matched_idx, unmatched_idx))]

}else{

  output_seqs <- trimmed_seqs

}


# Log a summary of the reads that will be written
final_trimmed_tbl <- logSeqData(output_seqs)
cat("\nSequence information remaining:\n")
print(final_trimmed_tbl, row.names = FALSE)


# Write the read count to the stat file when one was requested
if( args$stat != FALSE ){

  # Sample name: last "/"-separated component of the output path, cut at the
  # first literal ".fa"
  sample_name <- utils::tail(unlist(strsplit(args$output, "/")), n = 1)
  sample_name <- unlist(strsplit(sample_name, ".fa", fixed = TRUE))[1]

  write.table(
    data.frame(
      sampleName = sample_name,
      metric = "reads",
      count = length(output_seqs)
    ),
    file = args$stat,
    quote = FALSE,
    sep = ",",
    col.names = FALSE,
    row.names = FALSE
  )

}

# When random IDs are collected, keep in every random sequence set only the
# entries whose read ID is still present among the trimmed reads
if( all(args$collectRandomIDs != FALSE) ){

  random_seqs <- lapply(
    seq_along(random_seqs),
    function(set_idx, ids){
      rand_set <- random_seqs[[set_idx]]
      rand_set[as.character(ShortRead::id(rand_set)) %in% ids]
    },
    ids = as.character(ShortRead::id(trimmed_seqs))
  )

}

# Sequences have been trimmed and random sequences collected (if desired).
# The next step is to write the output file(s).
# For fasta format, this is as simple as writing out the sequences currently
# in the environment. For fastq format, the quality scores for the trimmed
# bases must be loaded and trimmed as well.

# Write the sequence file, compressed when --compress was given.
writeSeqFiles(
  seqs = output_seqs,
  compress = args$compress,
  file = args$output
)

# Write randomID file(s).
# One file is written per random sequence set. When the number of file names
# given on the command line differs from the number of sets, numbered names
# are derived from the first given name: <prefix>.<n>.<random_type>.
if( all(args$collectRandomIDs != FALSE) ){

  if( length(args$collectRandomIDs) != length(random_seqs) ){

    # Split on the literal ".fa"; without fixed = TRUE the dot is a regex
    # wildcard and the name would be cut at any character followed by "fa"
    # (for example inside a directory called "sofa").
    new_file_name <- unlist(
      strsplit(args$collectRandomIDs[[1]], ".fa", fixed = TRUE)
    )

    # seq_along() instead of 1:length() so an empty list does not yield the
    # indices 1 and 0.
    new_names <- paste0(
      new_file_name[[1]], ".", seq_along(random_seqs), ".", random_type
    )

    args$collectRandomIDs <- new_names
  }

  # The return value is not needed; it is only captured to keep it from
  # printing.
  null <- mapply(
    writeSeqFiles,
    seqs = random_seqs,
    file = args$collectRandomIDs,
    MoreArgs = list(compress = args$compress)
  )

}

# Announce completion and end the R session
cat("\nScript completed.\n")

quit()