A Snakemake workflow to analyse and visualise Illumina Infinium Methylation arrays
Code Snippets
Each rule delegates its work to an R script (or a short shell command) through a script: or shell: directive:

script: "../scripts/annotate.R"
script: "../scripts/dmrcatecpg.R"
script: "../scripts/dmrcatedmr.R"
script: "../scripts/pca.R"
script: "../scripts/densityheatmap.R"
script: "../scripts/filter.R"
script: "../scripts/normalise.R"
script: "../scripts/import.R"
script: "../scripts/getmset.R"
script: "../scripts/getrset.R"
script: "../scripts/getgrset.R"
script: "../scripts/densityqc.R"
script: "../scripts/densityqc.R"
script: "../scripts/boxplotqc.R"
script: "../scripts/controlstripqc.R"
script: "../scripts/detpqc.R"
script: "../scripts/msetqc.R"
script: "../scripts/densityqc.R"
script: "../scripts/densityqc.R"
script: "../scripts/boxplotqc.R"
shell: "wget https://zhouserver.research.chop.edu/InfiniumAnnotation/current/{params.array}/{params.array}.{params.organism}.manifest.rds -O resources/{params.array}.{params.organism}.manifest.rds"
shell: "wget https://raw.githubusercontent.com/sirselim/illumina450k_filtering/master/48639-non-specific-probes-Illumina450k.csv -O {output.probes}"
script: "../scripts/rankplot.R"
script: "../scripts/tracks.R"
../scripts/annotate.R

addAnno <- function(dmrs, outputLoc = "nearestLocation", featureLocForDistance = "TSS",
                    bindingRegion = c(-2000, 2000), organism = "hg38") {
  library(GenomicRanges)
  library(ChIPpeakAnno)
  library(org.Hs.eg.db)
  dmrs <- GRanges(dmrs)
  if (organism == "hg38") {
    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg38.knownGene)
  }
  if (organism == "hg19") {
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene)
  }
  seqlevelsStyle(dmrs) <- seqlevelsStyle(annoData)
  anno_dmrs <- annotatePeakInBatch(dmrs, AnnotationData = annoData, output = outputLoc,
                                   FeatureLocForDistance = featureLocForDistance,
                                   bindingRegion = bindingRegion)
  anno_dmrs$symbol <- xget(anno_dmrs$feature, org.Hs.egSYMBOL)
  return(anno_dmrs)
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  library(DMRcate)
  library(rtracklayer)

  dmrs <- readRDS(input$rds)

  # params
  outputLoc <- params$output                            # "nearestLocation"
  featureLocForDistance <- params$featureLocForDistance # "TSS"
  bindingRegion <- params$bindingRegion                 # c(-2000, 2000)
  organism <- params$organism

  # output
  save <- output$csv

  # run annotation
  dmrs <- addAnno(dmrs, outputLoc, featureLocForDistance, bindingRegion, organism)

  # save output
  write.csv(as.data.frame(dmrs), save)
  rtracklayer::export(dmrs, output$bed)
  saveRDS(dmrs, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/boxplotqc.R

plotBoxplots <- function(beta, phenodata, fill, group = "Sample_Group") {
  t_beta <- t(beta)
  t_beta <- as.data.frame(t_beta, drop = FALSE)
  t_beta$Sample_name <- phenodata$Sample_Name
  t_beta$Group <- as.character(phenodata[[group]])
  melt_beta <- melt(t_beta)
  colnames(melt_beta) <- c("Sample", "Group", "CpG", "B.Val")
  bp <- ggplot(melt_beta, aes(x = Sample, y = B.Val, fill = Group)) +
    geom_boxplot() +
    labs(title = "B vals Samples Pre Norm", x = "Sample", y = "B.Val") +
    theme_classic() +
    coord_flip() +
    scale_fill_manual(values = fill)
  return(bp)
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  library(ggplot2)
  library(reshape2)

  GRset <- readRDS(input$rds)
  beta <- getBeta(GRset)
  phenodata <- pData(GRset)
  fill <- unlist(params$fill)

  # TODO See BW-31
  bp <- plotBoxplots(beta, phenodata, fill, group = "status")

  ggsave(output$pdf, plot = bp)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/controlstripqc.R

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  RGSet <- readRDS(input$rds)
  qcReport(RGSet, pdf = output$pdf)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/densityheatmap.R

densityHeatmapWrap <- function(GRset, col, samples_names, fill, cluster_columns = TRUE,
                               clustering_distance_columns = "ks") {
  names(fill) <- pData(GRset)[, col]
  ha1 <- HeatmapAnnotation(group = pData(GRset)[, col], col = list(group = fill))
  mat <- as.matrix(getBeta(GRset))
  colnames(mat) <- pData(GRset)[, samples_names]
  p <- densityHeatmap(mat, top_annotation = ha1, cluster_columns = cluster_columns,
                      clustering_distance_columns = clustering_distance_columns,
                      ylab = "Beta-values")
  return(p)
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  library(ComplexHeatmap)

  GRset <- readRDS(input$rds)
  samples_names <- params$samples_names
  col <- params$group
  fillList <- params$fill

  # Check that the names supplied are in the GRset metadata
  stopifnot(names(fillList) %in% pData(GRset)[, col])
  fill <- sapply(pData(GRset)[, col], function(x, fillList) { as.character(fillList[x]) },
                 fillList = fillList)
  # ensure character vector is in the correct order for plotting
  stopifnot(names(fill) == pData(GRset)[, col])

  cluster_columns <- params$cluster_columns
  clustering_distance_columns <- params$clustering_distance_columns

  # Plot
  pdf(output$pdf)
  p <- densityHeatmapWrap(GRset, col, samples_names, fill,
                          cluster_columns = params$cluster_columns,
                          clustering_distance_columns = params$clustering_distance_columns)
  draw(p)
  dev.off()
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/densityqc.R

getData <- function(GRset, type) {
  if (type == "beta") {
    data <- getBeta(GRset)
  }
  if (type == "M") {
    data <- getM(GRset)
  }
  return(data)
}

plotDensity <- function(data, phenodata, output, fill, group = "Sample_Group") {
  ## Density plot
  pdf(output)
  densityPlot(data, sampGroups = phenodata[[group]], legend = FALSE, pal = fill)
  legend("top", legend = levels(factor(phenodata[[group]])), text.col = fill)
  dev.off()
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  library(ggplot2)

  GRset <- readRDS(input$rds)
  data <- getData(GRset, params$type)
  phenodata <- pData(GRset)
  fill <- unlist(params$fill)

  # TODO See BW-31
  plotDensity(data, phenodata, output$pdf, fill, group = "status")
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/detpqc.R

qcPval <- function(detP, targets, output, fill, group = "Sample_Group") {
  pal <- fill
  data <- colMeans(detP)
  names(data) <- targets$Sample_Name
  pdf(output, width = 12)
  par(mar = c(9, 8, 1, 1))
  barplot(data, col = pal[factor(targets[[group]])], las = 2, cex.names = 0.8,
          ylim = c(0, 0.02), ylab = "Mean detection p-values")
  abline(h = 0.01, col = "red")
  legend("topright", legend = levels(factor(targets[[group]])), fill = pal, bg = "white")
  dev.off()
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)

  # tsv file location
  RGset <- readRDS(input$rds)
  detP <- detectionP(RGset)
  targets <- read.metharray.sheet(params$dir)
  fill <- unlist(params$fill)

  # TODO See BW-31
  qcPval(detP, targets, output$pdf, fill, group = "status")
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
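The bar plot draws the conventional 0.01 mean detection p-value threshold as a red line. A common follow-up (not part of this script, which only plots the QC) is to drop samples whose mean detection p-value sits above that line before normalisation. A minimal sketch, assuming an RGChannelSet saved by the import rule (file path hypothetical):

library(minfi)
RGset <- readRDS("results/rgset.rds")   # hypothetical path to the imported RGChannelSet
detP <- detectionP(RGset)
keep <- colMeans(detP) < 0.01           # samples below the 0.01 line shown in the QC plot
RGset.flt <- RGset[, keep]              # subset columns (samples) of the RGChannelSet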
../scripts/dmrcatecpg.R

modelMatrix <- function(data) {
  # Get phenotype data
  names <- colnames(data)

  # Set condition factor
  if ("condition" %in% names) {
    condition <- factor(data$condition)
    n.condition <- nlevels(condition)
    is.condition <- n.condition > 1
  } else {
    is.condition <- FALSE
  }

  # Set batch factor
  if ("batch" %in% names) {
    batch <- factor(data$batch)
    n.batch <- nlevels(batch)
    is.batch <- n.batch > 1
  } else {
    is.batch <- FALSE
  }

  # Set block factor
  if ("block" %in% names) {
    block <- factor(data$block)
    n.block <- nlevels(block)
    is.block <- n.block > 1
  } else {
    is.block <- FALSE
  }

  # Construct design matrix
  if (is.condition & !is.batch & !is.block) {
    design <- model.matrix(~ 0 + condition)
  }
  if (is.condition & is.batch & !is.block) {
    design <- model.matrix(~ 0 + condition + batch)
  }
  if (is.condition & !is.batch & is.block) {
    design <- model.matrix(~ 0 + condition + block)
  }
  if (is.condition & is.batch & is.block) {
    design <- model.matrix(~ 0 + condition + batch + block)
  }

  # Rename condition coefficients
  which.condition <- seq_len(n.condition)
  colnames(design)[which.condition] <- levels(condition)

  # Required for DMRcate if using a numeric coef
  # colnames(design)[1] <- "(Intercept)"

  # Return design matrix
  design
}

# Makes sure the sample tsv is in the same order as the methylation object colData
checkOrder <- function(sampledata, rds, colname = "Sample_Name") {
  sampledata <- sampledata[match(colData(rds)[[colname]], sampledata$sample), ]
  return(sampledata)
}

main <- function(input, output, params, log, config) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(DMRcate)
  library(rtracklayer)
  library(minfi)
  library(limma)

  GRset <- readRDS(input$rds)
  data <- read.table(params$samples, header = TRUE)
  data <- checkOrder(data, GRset)
  mod <- modelMatrix(data)

  #### Run DMRcate
  # Already ratio converted, so no need to run:
  # GRsetRatio <- ratioConvert(GRset)
  GRsetRatio <- GRset

  arraytype <- params$arraytype
  # cpg.annotate expects the array string as 450K but the config specifies it as HM450
  if (arraytype == "HM450") {
    arraytype <- "450K"
  }

  analysis.type <- params$analysistype

  # Using a limma-style contrast matrix makes it easier to specify the comparison explicitly in the config
  # See BW-23
  con <- params$contrast

  # limma-style contrast matrix
  cont.matrix <- makeContrasts(contrasts = con, levels = mod)

  # must be a column name in cont.matrix
  # should correspond to the contrast given as param con
  coef <- colnames(cont.matrix)[1]
  stopifnot(coef == con)

  fdr <- params$fdr

  # See BW-23: added cont.matrix = cont.matrix, which allowed us to remove the intercept
  # from the design matrix. Otherwise the design matrix must have an intercept and coef must be numeric.
  cpg.annotation <- cpg.annotate("array", GRsetRatio, arraytype = arraytype,
                                 cont.matrix = cont.matrix, analysis.type = analysis.type,
                                 design = mod, coef = coef, contrasts = TRUE, fdr = fdr)

  write.csv(as.data.frame(cpg.annotation@ranges), file = output$csv, quote = FALSE)
  saveRDS(cpg.annotation, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@config)
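modelMatrix() builds an intercept-free design from the condition (plus optional batch/block) columns of the samples table and renames the condition coefficients to the factor levels, so the contrast parameter in the config is written directly against the group names. An illustrative sketch with two hypothetical groups, Normal and Tumour (the group names and contrast string are assumptions, not taken from this workflow's config):

library(limma)
condition <- factor(c("Normal", "Normal", "Tumour", "Tumour"))
design <- model.matrix(~ 0 + condition)
colnames(design)[1:2] <- levels(condition)   # as done in modelMatrix()
# "Tumour-Normal" is the form a params$contrast value would take
cont.matrix <- makeContrasts(contrasts = "Tumour-Normal", levels = design)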
../scripts/dmrcatedmr.R

liftover <- function(results.ranges, chainfile) {
  # hg19ToHg38.over.chain from https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/
  ch <- import.chain(chainfile)
  results <- rtracklayer::liftOver(results.ranges, ch)
  results <- unlist(results)
  return(results)
}

main <- function(input, output, params, log, config) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(DMRcate)
  library(rtracklayer)
  library(minfi)
  library(ExperimentHub)

  cpg.annotation <- readRDS(input$rds)

  # Default values below; the fdr cutoff (pcutoff = "fdr") is applied as advised by the authors
  dmrcoutput <- dmrcate(cpg.annotation, lambda = 1000, C = 2)
  results.ranges <- extractRanges(dmrcoutput, genome = "hg19")
  # results.ranges <- dmrcateExtractRanges(dmrcoutput, genome = "hg19")
  seqlevelsStyle(results.ranges) <- "UCSC"

  # chainfile <- params$chainfile
  chainfile <- input$chainfile
  if (params$liftover) {
    results <- liftover(results.ranges, chainfile)
  } else {
    results <- results.ranges
  }

  # order by fdr column of choice
  fdr <- params$fdr
  results <- results[order(mcols(results)[[fdr]]), ]

  write.csv(as.data.frame(results), file = output$csv, quote = FALSE)
  saveRDS(results, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@config)
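When liftover to hg38 is enabled, the rule expects an uncompressed UCSC chain file as input. A hedged sketch of fetching it from the URL mentioned in the comment above; the destination path and the use of the R.utils package are assumptions, and the workflow itself may obtain the file differently:

# hypothetical download of the UCSC hg19 -> hg38 chain file
url <- "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz"
download.file(url, "resources/hg19ToHg38.over.chain.gz")
R.utils::gunzip("resources/hg19ToHg38.over.chain.gz")   # import.chain() needs the uncompressed file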
../scripts/filter.R

filterByDetP <- function(GRset, RGset) {
  detP <- detectionP(RGset)
  detP <- detP[match(featureNames(GRset), rownames(detP)), ]
  # remove any probes that have failed in one or more samples
  keep <- rowSums(detP < 0.01) == ncol(GRset)
  # filter out bad probes
  GRset <- GRset[keep, ]
  return(GRset)
}

filterBySexChrom <- function(GRset) {
  annEPIC <- minfi::getAnnotation(GRset)
  keep <- !(featureNames(GRset) %in% annEPIC$Name[annEPIC$chr %in% c("chrX", "chrY")])
  GRset <- GRset[keep, ]
  return(GRset)
}

filterBySnpProbes <- function(GRset) {
  GRset <- dropLociWithSnps(GRset, snps = c("SBE", "CpG"), maf = 0)
  return(GRset)
}

filterByXReactiveProbes <- function(GRset, xprobes) {
  xReactiveProbes <- xprobes
  keep <- !(featureNames(GRset) %in% xReactiveProbes$TargetID)
  GRset <- GRset[keep, ]
  return(GRset)
}

filterByExtendedAnno <- function(GRset, anno) {
  ### Drop all probes that are cross-reactive, do not map to hg38/hg19, or carry the
  ### extended SNP annotation (see the Nucleic Acids Research paper)
  cpg_mask <- names(anno)[anno$MASK_general == TRUE]
  keep <- !(featureNames(GRset) %in% cpg_mask)
  GRset <- GRset[keep, ]
  return(GRset)
}

getAnno <- function(annoType = "hg38", array = "HM450", static = TRUE) {
  # anno <- sesameDataGetAnno("EPIC/EPIC.hg38.manifest.rds")
  if (static) {
    anno <- readRDS(paste0("resources/", array, ".", annoType, ".manifest.rds"))
  }
  if (!static) {
    sesameDataCache(array)
    anno <- sesameDataGetAnno(paste0(array, "/", array, ".", annoType, ".manifest.rds"))
  }
  return(anno)
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  # library(sesameData)

  GRset <- readRDS(input$rds)

  # or could do hg38Anno <- sesameDataGetAnno("EPIC/EPIC.hg38.manifest.rds")
  # anno <- readRDS(params$anno)
  # Have gone with downloading the annotation per analysis - but can add to /resources if too slow
  anno <- getAnno(params$anno, params$array, params$static)

  RGset <- readRDS(params$rgset)

  # "48639-non-specific-probes-Illumina450k.csv"
  # xprobes <- read.csv(file = params$xprobes, stringsAsFactors = FALSE)
  xprobes <- read.csv(file = input$xprobes, stringsAsFactors = FALSE)

  # Apply each filter to the already-filtered object so that the filters accumulate
  GRsetFilt <- filterByDetP(GRset, RGset)
  GRsetFilt <- filterBySexChrom(GRsetFilt)
  GRsetFilt <- filterBySnpProbes(GRsetFilt)
  GRsetFilt <- filterByXReactiveProbes(GRsetFilt, xprobes)
  GRsetFilt <- filterByExtendedAnno(GRsetFilt, anno)

  # Note we may like to add additional filtering based on updates to the manifest from Illumina:
  # See https://github.com/sirselim/illumina450k_filtering
  saveRDS(GRsetFilt, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/getgrset.R

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  RSet <- readRDS(input$rds)
  GRset <- mapToGenome(RSet)
  saveRDS(GRset, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/getmset.R

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  RGset <- readRDS(input$rds)
  MSet <- preprocessRaw(RGset)
  saveRDS(MSet, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/getrset.R

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  MSet <- readRDS(input$rds)
  RSet <- ratioConvert(MSet, what = params$what, keepCN = params$keepCN)
  saveRDS(RSet, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/import.R

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  targets <- read.metharray.sheet(input$dir)
  sample_table <- read.table(input$samples, header = TRUE)

  # match targets with sample table
  targets <- targets[match(sample_table[, "sample"], targets[, "Sample_Name"]), ]
  stopifnot(targets[, "Sample_Name"] == sample_table[, "sample"])

  # add sample table to targets
  targets <- cbind.data.frame(targets, sample_table)

  RGset <- read.metharray.exp(targets = targets)
  saveRDS(RGset, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
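Taken together, the import, getmset, getrset and getgrset rules walk the standard minfi object hierarchy from raw IDAT intensities to a genome-mapped ratio set. Roughly equivalent to the following sketch (the IDAT directory path and the what/keepCN values are assumptions; in the workflow they come from inputs and params):

library(minfi)
targets <- read.metharray.sheet("data/idats")    # sample sheet + IDAT directory (hypothetical path)
RGset <- read.metharray.exp(targets = targets)   # RGChannelSet: raw red/green intensities
MSet  <- preprocessRaw(RGset)                    # MethylSet: methylated/unmethylated signals
RSet  <- ratioConvert(MSet, what = "both", keepCN = TRUE)  # RatioSet: beta and M values
GRset <- mapToGenome(RSet)                       # GenomicRatioSet: probes placed on the genome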
../scripts/msetqc.R

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  library(ggplot2)

  MSet <- readRDS(input$rds)
  qc <- getQC(MSet)
  pdf(output$pdf)
  plotQC(qc)
  dev.off()
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/normalise.R

normalise <- function(RGset, type) {
  if (type == "preprocessFunnorm") {
    # Returns a GRset
    obj <- preprocessFunnorm(RGset)
  }
  if (type == "preprocessSWAN") {
    # NOTE - this returns an MSet not a GRset
    obj <- preprocessSWAN(RGset)
  }
  if (type == "preprocessQuantile") {
    # Returns a GRset
    obj <- preprocessQuantile(RGset)
  }
  if (type == "preprocessNoob") {
    # NOTE - this returns an MSet not a GRset
    obj <- preprocessNoob(RGset)
  }
  if (type == "preprocessIllumina") {
    # NOTE - this returns an MSet not a GRset
    # Also not a recommended type of normalisation
    obj <- preprocessIllumina(RGset)
  }
  return(obj)
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  RGset <- readRDS(input$rds)
  print(sessionInfo())
  GRset <- normalise(RGset, type = params$type)
  saveRDS(GRset, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
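As the comments note, some preprocessing choices (SWAN, Noob, Illumina) return a MethylSet rather than a GenomicRatioSet; the workflow's later rules handle the conversion and genome mapping. A quick class check could look like this sketch, reusing the normalise() function above (the type value is just an example):

obj <- normalise(RGset, type = "preprocessSWAN")   # illustrative call
if (!is(obj, "GenomicRatioSet")) {
  # MethylSet output still needs ratio conversion and genome mapping (handled by later rules)
  obj <- mapToGenome(ratioConvert(obj))
}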
../scripts/pca.R

plotPCA <- function(x, ...) {
  UseMethod("plotPCA")
}

plotPCA.GenomicRatioSet <- function(x, col, fill) {
  mat <- getBeta(x)
  var <- matrixStats::rowVars(mat)
  num <- min(500, length(var))
  ind <- order(var, decreasing = TRUE)[seq_len(num)]
  pca <- prcomp(t(mat[ind, ]))
  pct <- (pca$sdev ^ 2) / sum(pca$sdev ^ 2)
  dat <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], group = pData(x)[, col])
  p <- ggplot(dat, aes_string(x = "PC1", y = "PC2", color = "group")) +
    geom_point(size = 3) +
    xlab(paste0("PC1: ", round(pct[1] * 100), "% variance")) +
    ylab(paste0("PC2: ", round(pct[2] * 100), "% variance")) +
    coord_fixed() +
    scale_colour_manual(values = fill)
  return(p)
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(ggplot2)
  library(minfi)

  x <- readRDS(input$rds)

  # convert fill to a named vector as unlist preserves names
  fill <- unlist(params$fill)

  p <- plotPCA(x, col = params$group, fill = fill)
  ggsave(output$pdf, plot = p)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/rankplot.R

prepRes <- function(results, yaxis = "meandiff", symbol = "overlapping.genes") {
  results <- as.data.frame(mcols(results)[, c(symbol, yaxis)])
  # Change colnames (expects data parsed as symbol, enrichment)
  colnames(results) <- c("symbol", "score")
  # Rank by value
  results <- results[order(-results$score), ]
  results$rank <- rep(1:length(results$score))
  return(results)
}

#### Mean diff rank plots
# ranked_data is assumed to be a two-column data frame of gene symbol and enrichment score,
# with columns named symbol and score
plotRank <- function(ranked_data, save, genes_of_interest = NULL, textsize = 0.8, pad = 2,
                     pointsize = 1, highlight = 30, title = "Rank Plot", yaxis = "meandiff",
                     number_show = 10, num_score_thres = 0.5) {
  library(plyr)
  library(ggrepel)
  library(ggplot2)

  # Define gene names of interest if none provided
  if (is.null(genes_of_interest)) {
    genes_of_interest <- c(head(ranked_data$symbol, (number_show / 2)),
                           tail(ranked_data$symbol, (number_show / 2)))
    genes_of_interest <- genes_of_interest[!is.na(genes_of_interest)]
  }

  # Define state column for plotting of coloured points and labelled points
  ranked_data$state <- rep(FALSE, length(rownames(ranked_data)))
  ranked_data$state <- ifelse(ranked_data$rank %in% 1:highlight |
                                ranked_data$rank %in% (length(ranked_data$rank) - highlight):length(ranked_data$rank),
                              "Yes", "No")
  # This will also pick up DMRs with associated genes that are not unique
  # (multiple peaks per gene if they are within the top hits)
  ranked_data$state <- ifelse(ranked_data$symbol %in% genes_of_interest &
                                abs(ranked_data$score) > num_score_thres,
                              "Highlight", ranked_data$state)

  # Write out the underlying CSV file of ranks in case it is required downstream
  write.table(as.data.frame(ranked_data), file = save, quote = FALSE, sep = "\t", row.names = FALSE)

  # ggplot obj
  num_highlight <- round(min(number_show, nrow(ranked_data[ranked_data$state == "Highlight", ])) / 2)
  highlight_df <- rbind.data.frame(head(ranked_data[ranked_data$state == "Highlight", ], num_highlight),
                                   tail(ranked_data[ranked_data$state == "Highlight", ], num_highlight))

  p <- ggplot(ranked_data, aes(rank, score)) +
    geom_point(aes(col = state), size = pointsize, alpha = 0.5) +
    geom_point(data = subset(ranked_data, state %in% c("Yes")), aes(col = state), size = pointsize, alpha = 0.05) +
    geom_point(data = subset(ranked_data, state %in% c("Highlight")), aes(col = state), size = pointsize, alpha = 0.5) +
    scale_color_manual(values = c("#f05b5b", "#828282", "#5b88f0"), guide = FALSE) +
    geom_text_repel(data = highlight_df, aes(label = symbol), size = textsize, point.padding = pad) +
    geom_rect(data = head(ranked_data, 1),
              aes(ymin = -num_score_thres, ymax = num_score_thres, xmin = -Inf, xmax = Inf), alpha = 0.5) +
    geom_hline(yintercept = 0, linetype = "solid", color = "black", size = 0.5, alpha = 0.5) +
    geom_hline(yintercept = -num_score_thres, linetype = "dashed", color = "black", size = 0.5, alpha = 0.5) +
    geom_hline(yintercept = num_score_thres, linetype = "dashed", color = "black", size = 0.5, alpha = 0.5) +
    ggtitle(title) +
    labs(y = yaxis, x = "Rank") +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5, size = rel(1.6)),
          axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = rel(1.6)),
          axis.title.x = element_text(size = rel(1.6)),
          axis.text.y = element_text(size = rel(1.6)),
          axis.title.y = element_text(size = rel(1.6)))
  return(p)
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  library(DMRcate)

  results <- readRDS(input$rds)

  # Define params for the rank plot function
  # note: the DMRcate results column used for ranking is named meandiff (in the past it was meanbetafc)
  save <- output$tsv
  genes_of_interest <- params$genes_of_interest
  width <- params$width
  height <- params$height
  textsize <- params$textsize
  pad <- params$pad
  pointsize <- params$pointsize
  highlight <- params$highlight
  title <- params$title
  yaxis <- params$yaxis
  number_show <- params$number_show
  num_score_thres <- params$num_score_thres
  symbol <- params$symbol # overlapping.genes

  # Prepare results table
  ranked_data <- prepRes(results, yaxis, symbol = symbol)

  # Plot rank plot
  p <- plotRank(ranked_data, save, genes_of_interest, textsize, pad, pointsize,
                highlight, title, yaxis, number_show, num_score_thres)

  ggsave(output$pdf, plot = p, width = width, height = height)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
../scripts/tracks.R

getTrackObj <- function(filter, anno = "hg38", array = "HM450", combine = "mean", by = "status") {
  library(TxDb.Hsapiens.UCSC.hg38.knownGene)
  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

  manifest <- readRDS(paste0("resources/", array, ".", anno, ".manifest.rds"))

  # only keep CpGs which are in the final filtered GenomicRatioSet
  keep <- names(manifest) %in% featureNames(filter)
  manifest <- manifest[keep, ]

  # Get Beta table from GRset
  beta <- getBeta(filter)

  # match order of beta table with manifest metadata
  beta <- beta[match(names(manifest), rownames(beta)), ]

  # check rownames equal
  stopifnot(rownames(beta) == names(manifest))

  # Add new colnames to the beta table matching colData
  stopifnot(colnames(beta) == rownames(colData(filter)))
  colnames(beta) <- colData(filter)$Sample_Name

  # Add beta signal to the tracks GRanges mcols instead of the other data
  tracks <- manifest
  mcols(tracks) <- beta

  # Turn into a list of separate GRanges objects
  if (is.null(combine)) {
    tracksList <- lapply(colnames(mcols(tracks)),
                         function(x, tracks) { tracks[, colnames(mcols(tracks)) == x] },
                         tracks = tracks)
    names(tracksList) <- colnames(mcols(tracks))
    tracksList <- lapply(tracksList, filterTrackOverlaps)
  } else {
    combine_by <- unique(colData(filter)[[by]])
    tracksList <- lapply(combine_by, combineBeta, tracks = tracks, colData = colData(filter),
                         by = by, combine = combine)
    names(tracksList) <- combine_by
  }
  return(tracksList)
}

# Combine samples together based on a colData column name and label into combined tracks
combineBeta <- function(label, tracks, colData, by, combine = "mean", samplename = "Sample_Name") {
  library(TxDb.Hsapiens.UCSC.hg38.knownGene)
  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
  samples_int <- colData[colData[[by]] %in% label, ][[samplename]]
  if (combine == "mean") {
    combineMeta <- rowMeans(as.data.frame(mcols(tracks[, colnames(mcols(tracks)) %in% samples_int])))
  }
  if (combine == "median") {
    combineMeta <- rowMedians(as.data.frame(mcols(tracks[, colnames(mcols(tracks)) %in% samples_int])))
  }
  tracks_new <- tracks
  mcols(tracks_new) <- combineMeta
  tracks_new <- filterTrackOverlaps(tracks_new)
  return(tracks_new)
}

# Parse the GRanges object to ensure it is ready for output
# Ensures the Beta column is labelled score
# Removes indeterminate chrs (*)
# Removes CpG sites which overlap boundaries - this should not be the case with any CpG GRanges obj,
# but rtracklayer also does not want them to share boundaries
filterTrackOverlaps <- function(tracks) {
  library(TxDb.Hsapiens.UCSC.hg38.knownGene)
  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
  colnames(mcols(tracks)) <- "score"
  seqlevelsStyle(tracks) <- "UCSC"
  seqlevels(tracks) <- seqnames(seqinfo(tracks))[seqnames(seqinfo(tracks)) != "*"]
  seqinfo(tracks) <- seqinfo(txdb)[seqnames(seqinfo(tracks))[seqnames(seqinfo(tracks)) != "*"]]
  # take tracks which share boundaries and remove them
  tracks <- tracks[!tail(start(tracks), -1) <= head(end(tracks), -1)]
  # resort
  tracks <- sort(tracks)
  return(tracks)
}

# Save out a track via rtracklayer
saveTrack <- function(sample, tracks, fileExt = ".BigWig", location = "./") {
  library(rtracklayer)
  track <- tracks[sample][[1]]
  rtracklayer::export(track, paste0(location, sample, fileExt))
}

main <- function(input, output, params, log) {
  # Log
  out <- file(log$out, open = "wt")
  err <- file(log$err, open = "wt")
  sink(out, type = "output")
  sink(err, type = "message")

  # Script
  library(minfi)
  library(DMRcate)
  library(rtracklayer)

  filter <- readRDS(input$rds)

  # params
  anno <- params$anno       # "hg38", "hg19"
  array <- params$array     # "EPIC", "HM450"
  combine <- params$combine # "mean", "median"
  by <- params$by           # "sample"

  # output
  save <- output$rds

  # build track objects
  tracks <- getTrackObj(filter, anno, array, combine, by)

  # save output
  # BigWig
  lapply(names(tracks), saveTrack, tracks = tracks, fileExt = ".BigWig", location = params$bwLocation)
  # bedGraph
  lapply(names(tracks), saveTrack, tracks = tracks, fileExt = ".bedGraph", location = params$bwLocation)

  # save out the list of GRanges
  saveRDS(tracks, file = output$rds)
}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
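The exported per-sample or per-group tracks can be pulled back into R for a quick sanity check with rtracklayer; a minimal sketch, assuming a track file produced by the rule above (the filename and output directory are hypothetical):

library(rtracklayer)
bw <- import("results/tracks/Tumour.BigWig")   # GRanges with the beta values in the score column
summary(bw$score)                              # beta values should lie in [0, 1]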
Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/zifornd/memeth
Name: memeth
Version: 1
Downloaded: 0
Copyright: Public Domain
License: MIT License
Keywords: (none listed)