A workflow for RNA-seq analysis in Snakemake.
Table of Contents

- Introduction
- Requirements
- Usage
- Configuration
- Contributing
- Thanks
- License
- Code Snippets
Introduction
This workflow is a bioinformatics analysis pipeline for RNA sequencing data. It is built using Snakemake, a scalable bioinformatics workflow engine.
Requirements
This workflow requires the following software to run:
- [Snakemake](https://snakemake.readthedocs.io)
- [Conda](https://docs.conda.io)
Usage
Clone the workflow into your working directory:

```bash
git clone https://github.com/jma1991/rnaseq.git
```

Execute the workflow and deploy software dependencies via conda:

```bash
snakemake --use-conda
```
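Depending on your Snakemake version, you may also need to tell the scheduler how many CPU cores it is allowed to use. The core count below is only an illustrative example; adjust it to your machine:

```bash
# Run the workflow with conda-managed environments, using up to four cores
snakemake --use-conda --cores 4
```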
Configuration
Configure the workflow by editing the files in the `config` directory:

- `config.yaml` is a YAML file containing the workflow metadata.
- `samples.csv` is a CSV file containing the sample metadata (see the example below).
- `units.csv` is a CSV file containing the unit metadata.
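The bundled R scripts read `config/samples.csv` with a `sample` column (used as row names) and a `condition` column (used as the experimental factor in the design), so a minimal `samples.csv` might look like the sketch below. The sample and condition names are purely illustrative; use the template shipped in the `config` directory as the authoritative reference.

```csv
sample,condition
ctrl_rep1,control
ctrl_rep2,control
treat_rep1,treated
treat_rep2,treated
```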
Contributing
To contribute to the workflow, clone this repository locally and commit your code on a separate branch. Please generate unit tests for your code and run the linter before opening a pull request:

```bash
snakemake --generate-unit-tests   # generate unit tests
snakemake --lint                  # run the linter
```
You can find more detail in our Contributing Guide. Participation in this open source project is subject to a Code of Conduct.
Thanks
I would like to thank Johannes Köster for developing the Snakemake workflow engine and Istvan Albert for writing the Biostar Handbook.
License
This workflow is licensed under the MIT license.
Copyright © 2020, James Ashmore
Code Snippets
```
shell: "bamCoverage --bam {input.bam} --outFileName {output.wig} --numberOfProcessors {threads} --normalizeUsing RPKM --skipNonCoveredRegions 1> {log.out} 2> {log.err}"
script: "../scripts/deseq2.R"
script: "../scripts/counts.R"
script: "../scripts/normcounts.R"
script: "../scripts/logcounts.R"
script: "../scripts/results.R"
script: "../scripts/analyzeDuprates.R"
script: "../scripts/duprateExpDensPlot.R"
shell: "fastqc -o {params.out} -t {threads} {input.fqz} 1> {log.out} 2> {log.err}"
shell: "genomepy install -g results/genomepy -r 'chrX' -a {wildcards.genome} 1> {log.out} 2> {log.err}"
shell: "gunzip -c {input} > {output}"
shell: "kallisto index -i {output.idx} {input.fas} 1> {log.out} 2> {log.err}"
shell: "kallisto quant -i {input.idx} -o {params.out} -t {threads} {params.arg} {input.fqz} 1> {log.out} 2> {log.err}"
script: "../scripts/limma.R"
script: "../scripts/voom.R"
script: "../scripts/counts.R"
script: "../scripts/normcounts.R"
script: "../scripts/logcounts.R"
script: "../scripts/toptable.R"
script: "../scripts/dist.R"
script: "../scripts/prcomp.R"
script: "../scripts/cmdscale.R"
script: "../scripts/diff.R"
script: "../scripts/pvalue.R"
script: "../scripts/volcano.R"
script: "../scripts/heatmap.R"
shell: "bam_stat.py -i {input} 1> {output} 2> {log}"
shell: "inner_distance.py -i {input.bam} -r {input.bed} -o {params.out} 1> {log.out} 2> {log.err}"
shell: "infer_experiment.py -i {input.bam} -r {input.bed} 1> {output.txt} 2> {log.err}"
shell: "junction_annotation.py -i {input.bam} -r {input.bed} -o {params.out} 2> {log.err}"
shell: "junction_saturation.py -i {input.bam} -r {input.bed} -o {params.out} 2> {log.err}"
shell: "read_distribution.py -i {input.bam} -r {input.bed} 1> {output.txt} 2> {log.err}"
shell: "read_duplication.py -i {input.bam} -o {params.out} 2> {log.err}"
shell: "geneBody_coverage.py -i {input.bam} -r {input.bed} -o {params.out} 2> {log.err}"
shell: "read_GC.py -i {input.bam} -o {params.out} 2> {log.err}"
shell: "STAR --runMode genomeGenerate --runThreadN {threads} --genomeDir {output.dir} --genomeFastaFiles {input.fas} --sjdbGTFfile {input.gtf} 1> {log.out} 2> {log.err}"
shell: "STAR --runMode alignReads --runThreadN {threads} --genomeDir {input.idx} --readFilesIn {params.fqz} --readFilesCommand gunzip -c --outFileNamePrefix {params.out} --outSAMtype BAM SortedByCoordinate --outTmpDir /tmp/TMPDIR/{wildcards.sample} 1> {log.out} 2> {log.err}"
```
```r
main <- function(input, output, params, log, threads) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(dupRadar)

    mat <- analyzeDuprates(
        bam = input$bam,
        gtf = input$gtf,
        stranded = params$stranded,
        paired = params$paired,
        threads = threads
    )

    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@threads)
```
```r
calculateMDS <- function(x) {
    # Perform multi-dimensional scaling
    d <- dist(t(x))
    d <- cmdscale(d)
    d <- as.data.frame(d)
    colnames(d) <- c("MDS.1", "MDS.2")
    return(d)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(ggplot2)

    dat <- read.csv("config/samples.csv", stringsAsFactors = FALSE)
    mat <- read.csv(input$csv, row.names = 1)

    mds <- calculateMDS(mat)
    dat <- merge(dat, mds, by.x = "sample", by.y = "row.names")

    plt <- ggplot(dat, aes(MDS.1, MDS.2, colour = condition)) +
        geom_point() +
        labs(x = "MDS 1", y = "MDS 2") +
        theme_bw() +
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
counts <- function(object) {
    # Return raw counts
    UseMethod("counts")
}

counts.DESeqDataSet <- function(object) {
    # DESeq2 method
    return(DESeq2::counts(object))
}

counts.DGEList <- function(object) {
    # edgeR method
    return(object$counts)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    obj <- readRDS(input$rds)
    mat <- counts(object = obj)
    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
DESeqDataSet <- function(object, ...) {
    # Dispatch method
    UseMethod("DESeqDataSet")
}

DESeqDataSet.tximport <- function(object, colData, design) {
    # Create DESeqDataSet object from tximport
    DESeqDataSetFromTximport(txi = object, colData = colData, design = design)
}

DESeqDataSet.rsubread <- function(object, colData, design) {
    # Create DESeqDataSet object from rsubread
    DESeqDataSetFromMatrix(countData = object$counts, colData = colData, design = design)
}

main <- function(input, output, log, config) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(DESeq2)

    obj <- readRDS(input$rds)
    dat <- read.csv("config/samples.csv", row.names = "sample", stringsAsFactors = FALSE)

    dds <- DESeqDataSet(object = obj, colData = dat, design = ~ condition)
    dds <- DESeq(dds)

    saveRDS(dds, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@log, snakemake@config)
```
```r
add.status <- function(x, fdr.threshold = 0.05) {
    # Determine the status of each gene in the results table
    x$status <- factor("NS", levels = c("Up", "NS", "Down"))
    x$status[x$log2FoldChange > 0 & x$FDR < fdr.threshold] <- "Up"
    x$status[x$log2FoldChange < 0 & x$FDR < fdr.threshold] <- "Down"
    return(x)
}

add.symbol <- function(x, n = 50) {
    # Show symbol for the top N genes in the results table
    y <- subset(x, status %in% c("Up", "Down"))
    r <- rank(-y$baseMean) + rank(-abs(y$log2FoldChange)) + rank(y$PValue)
    n <- min(n, nrow(y))
    i <- sort(r, decreasing = FALSE, index.return = TRUE)$ix[seq_len(n)]
    i <- x$geneId %in% y$geneId[i]
    x$symbol <- ""
    x$symbol[i] <- x$geneName[i]
    return(x)
}

shrink.log2FoldChange <- function(x, lfc = 3) {
    # Shrink absolute log2FoldChange values greater than threshold
    x$log2FoldChange[x$log2FoldChange > lfc] <- lfc
    x$log2FoldChange[x$log2FoldChange < -lfc] <- -lfc
    return(x)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(ggplot2)
    library(ggrepel)
    library(scales)

    res <- read.csv(input$csv)
    res <- add.status(res, fdr.threshold = 0.05)
    res <- add.symbol(res, n = 50)
    res <- shrink.log2FoldChange(res, lfc = 3)
    res <- res[order(res$PValue, decreasing = TRUE), ]

    col <- c("Up" = "#d8b365", "NS" = "#f5f5f5", "Down" = "#5ab4ac")

    lab <- c(
        "Up"   = sprintf("Up (%s)", comma(sum(res$status == "Up"))),
        "NS"   = sprintf("NS (%s)", comma(sum(res$status == "NS"))),
        "Down" = sprintf("Down (%s)", comma(sum(res$status == "Down")))
    )

    plt <- ggplot(res, aes(log10(baseMean), log2FoldChange, colour = status, label = symbol)) +
        geom_point() +
        scale_colour_manual(values = col, labels = lab) +
        geom_text_repel(size = 1.88, colour = "#000000", show.legend = FALSE, max.overlaps = Inf) +
        labs(x = "Normalized mean (log10)", y = "Fold change (log2)", colour = "Status") +
        theme_bw() +
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
pheatmap.mat <- function(x) {
    # Return distance matrix
    as.matrix(dist(t(x), method = "euclidean"))
}

pheatmap.color <- function(x) {
    # Return color vector
    colorRampPalette(rev(RColorBrewer::brewer.pal(n = 5, name = x)))(100)
}

pheatmap.breaks <- function(x) {
    # Return breaks vector
    seq(0, max(x), length.out = 101)
}

pheatmap.cluster_rows <- function(x) {
    # Return hclust object for rows
    hclust(as.dist(x), method = "complete")
}

pheatmap.cluster_cols <- function(x) {
    # Return hclust object for columns
    hclust(as.dist(x), method = "complete")
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(pheatmap)
    library(RColorBrewer)

    dat <- read.csv(input$csv, row.names = 1)
    mat <- pheatmap.mat(dat)

    pheatmap(
        mat = mat,
        color = pheatmap.color("Blues"),
        breaks = pheatmap.breaks(mat),
        cluster_rows = pheatmap.cluster_rows(mat),
        cluster_cols = pheatmap.cluster_cols(mat),
        filename = output$pdf,
        width = 7,
        height = 7
    )

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(dupRadar)

    mat <- read.csv(input$csv)

    pdf(output$pdf)
    duprateExpDensPlot(DupMat = mat)
    dev.off()

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
selectCPM <- function(x) {
    id1 <- which(colnames(x) == "falsePos") + 1
    id2 <- ncol(x)
    idx <- seq(from = id1, to = id2)
    cpm <- x[, idx]
    cpm <- as.matrix(cpm)
    return(cpm)
}

pheatmap.mat <- function(x) {
    # Scale rows by 'variance-aware' Z-transformation
    M <- rowMeans(x, na.rm = TRUE)
    DF <- ncol(x) - 1
    isNA <- is.na(x)
    anyNA <- any(isNA)
    if (anyNA) {
        mode(isNA) <- "integer"
        DF <- DF - rowSums(isNA)
        DF[DF == 0] <- 1
    }
    x <- x - M
    V <- rowSums(x^2, na.rm = TRUE) / DF
    x <- x / sqrt(V + 0.01)
}

pheatmap.color <- function(x) {
    # Return color vector
    colorRampPalette(rev(RColorBrewer::brewer.pal(n = 5, name = x)))(100)
}

pheatmap.breaks <- function(x) {
    # Return breaks vector
    abs <- max(abs(x))
    abs <- min(abs, 5)
    seq(-abs, +abs, length.out = 101)
}

pheatmap.cluster_rows <- function(x) {
    # Return hclust object for rows
    hclust(dist(x, method = "euclidean"), method = "complete")
}

pheatmap.cluster_cols <- function(x) {
    # Return hclust object for columns
    hclust(dist(t(x), method = "euclidean"), method = "complete")
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(pheatmap)

    res <- read.csv(input$csv, row.names = 1)
    ind <- order(res$PValue)[seq_len(50)]
    res <- res[ind, ]

    cpm <- selectCPM(res)
    mat <- pheatmap.mat(cpm)

    pheatmap(
        mat = mat,
        color = pheatmap.color("RdBu"),
        breaks = pheatmap.breaks(mat),
        cluster_rows = pheatmap.cluster_rows(mat),
        cluster_cols = pheatmap.cluster_cols(cpm),
        labels_row = res$geneName,
        filename = output$pdf,
        width = 6,
        height = 8
    )

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
DGEList <- function(object, ...) {
    # Dispatch method
    UseMethod("DGEList")
}

DGEList.rsubread <- function(object, samples, group) {
    # Rsubread method
    edgeR::DGEList(counts = object$counts, samples = samples, group = group)
}

DGEList.tximport <- function(object, samples, group) {
    # tximport method
    counts <- tximport::makeCountsFromAbundance(object$counts, object$abundance, object$length, countsFromAbundance = "lengthScaledTPM")
    edgeR::DGEList(counts = counts, samples = samples, group = group)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(edgeR)
    library(limma)

    obj <- readRDS(input$rds)
    dat <- read.csv("config/samples.csv", row.names = "sample", stringsAsFactors = FALSE)

    dge <- DGEList(object = obj, samples = dat, group = dat$condition)
    ind <- filterByExpr(dge, group = dat$condition)
    dge <- dge[ind, , keep.lib.sizes = FALSE]
    dge <- calcNormFactors(dge)

    saveRDS(dge, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
logcounts <- function(object) {
    # Return log-transformed counts
    UseMethod("logcounts")
}

logcounts.DESeqDataSet <- function(object) {
    # DESeq2 method
    rld <- DESeq2::rlog(object, blind = FALSE)
    return(SummarizedExperiment::assay(rld))
}

logcounts.DGEList <- function(object) {
    # edgeR method
    return(edgeR::cpm(object, log = TRUE))
}

logcounts.EList <- function(object) {
    # limma method
    return(object$E)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    obj <- readRDS(input$rds)
    mat <- logcounts(object = obj)
    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
normcounts <- function(object) {
    # Return normalized counts
    UseMethod("normcounts")
}

normcounts.DESeqDataSet <- function(object) {
    # DESeq2 method
    return(DESeq2::counts(object, normalized = TRUE))
}

normcounts.DGEList <- function(object) {
    # edgeR method
    return(edgeR::cpm(object))
}

normcounts.EList <- function(object) {
    # limma method
    return(2 ^ object$E)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    obj <- readRDS(input$rds)
    mat <- normcounts(object = obj)
    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
calculatePCA <- function(x, n = 500) {
    var <- apply(x, 1, var)
    num <- min(length(var), n)
    sel <- order(var, decreasing = TRUE)[seq_len(num)]
    sub <- x[sel, ]
    pca <- prcomp(t(sub))
    dev <- pca$sdev^2 / sum(pca$sdev^2)
    dat <- data.frame("PCA.1" = pca$x[, 1], "PCA.2" = pca$x[, 2], row.names = colnames(sub))
    attr(dat, "percentVar") <- dev[1:2]
    return(dat)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(ggplot2)

    dat <- read.csv("config/samples.csv", stringsAsFactors = FALSE)
    mat <- read.csv(input$csv, row.names = 1)

    pca <- calculatePCA(mat, n = 500)
    dat <- merge(dat, pca, by.x = "sample", by.y = "row.names")
    var <- attr(pca, "percentVar")

    plt <- ggplot(dat, aes(PCA.1, PCA.2, color = condition)) +
        geom_point() +
        xlab(paste0("PC1: ", round(var[1] * 100), "% variance")) +
        ylab(paste0("PC2: ", round(var[2] * 100), "% variance")) +
        coord_fixed() +
        theme_bw() +
        theme(
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(ggplot2)
    library(magick)

    res <- read.csv(input$csv, row.names = 1)

    plt <- ggplot(res, aes(x = PValue)) +
        geom_histogram(binwidth = 0.05, colour = "black", fill = "gainsboro", boundary = 0) +
        labs(x = "P value", y = "Frequency") +
        theme_bw() +
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
main <- function(input, output, log, wildcards) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(DESeq2)

    dds <- readRDS(input$rds)
    ann <- read.delim(input$tsv, header = FALSE, col.names = c("gene_id", "gene_name"))

    con <- c("condition", wildcards$A, wildcards$B)
    res <- results(dds, contrast = con)
    res <- res[order(res$pvalue), ]

    cpm <- counts(dds, normalized = TRUE)[rownames(res), ]

    idx <- list(
        A = which(dds$condition %in% wildcards$A),
        B = which(dds$condition %in% wildcards$B)
    )

    dat <- data.frame(
        geneId = ann$gene_id[match(rownames(res), ann$gene_id)],
        geneName = ann$gene_name[match(rownames(res), ann$gene_id)],
        baseMean = rowMeans(cpm[, c(idx$A, idx$B)]),
        baseMeanA = rowMeans(cpm[, idx$A]),
        baseMeanB = rowMeans(cpm[, idx$B]),
        foldChange = 2 ^ res$log2FoldChange,
        log2FoldChange = res$log2FoldChange,
        PValue = res$pvalue,
        PAdj = p.adjust(res$pvalue, method = "hochberg"),
        FDR = res$padj,
        falsePos = round(seq_along(res$padj) * res$padj)
    )

    out <- merge(dat, cpm[, c(idx$A, idx$B)], by.x = "geneId", by.y = "row.names", sort = FALSE)

    write.csv(out, file = output$csv, quote = FALSE, row.names = FALSE)

}

main(snakemake@input, snakemake@output, snakemake@log, snakemake@wildcards)
```
```r
main <- function(input, output, log, wildcards) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(limma)

    obj <- readRDS(input$rds)
    ann <- read.delim(input$tsv, header = FALSE, col.names = c("transcript_id", "gene_id", "gene_name"))

    fit <- lmFit(obj)
    str <- paste(wildcards$A, wildcards$B, sep = "-")
    lvl <- colnames(fit$design)
    con <- makeContrasts(contrasts = str, levels = lvl)
    fit <- contrasts.fit(fit, contrasts = con)
    fit <- eBayes(fit)

    res <- topTable(fit, number = Inf, sort.by = "P")
    cts <- 2 ^ obj$E[rownames(res), ]

    idx <- list(
        A = which(fit$design[, wildcards$A] == 1),
        B = which(fit$design[, wildcards$B] == 1)
    )

    dat <- data.frame(
        geneId = ann$gene_id[match(rownames(res), ann$gene_id)],
        geneName = ann$gene_name[match(rownames(res), ann$gene_id)],
        baseMean = rowMeans(cts[, c(idx$A, idx$B)]),
        baseMeanA = rowMeans(cts[, idx$A]),
        baseMeanB = rowMeans(cts[, idx$B]),
        foldChange = 2 ^ res$logFC,
        log2FoldChange = res$logFC,
        PValue = res$P.Value,
        PAdj = p.adjust(res$P.Value, method = "hochberg"),
        FDR = res$adj.P.Val,
        falsePos = round(seq_along(res$adj.P.Val) * res$adj.P.Val)
    )

    out <- merge(dat, cts[, c(idx$A, idx$B)], by.x = "geneId", by.y = "row.names", sort = FALSE)

    write.csv(out, file = output$csv, quote = FALSE, row.names = FALSE)

}

main(snakemake@input, snakemake@output, snakemake@log, snakemake@wildcards)
```
```r
add.status <- function(x, fdr.threshold = 0.05) {
    # Determine the status of each gene in the results table
    x$status <- factor("NS", levels = c("Up", "NS", "Down"))
    x$status[x$log2FoldChange > 0 & x$FDR < fdr.threshold] <- "Up"
    x$status[x$log2FoldChange < 0 & x$FDR < fdr.threshold] <- "Down"
    return(x)
}

add.symbol <- function(x, n = 50) {
    # Show symbol for the top N genes in the results table
    i <- sort(x$FDR, decreasing = FALSE, index.return = TRUE)$ix[seq_len(n)]
    x$symbol <- ""
    x$symbol[i] <- x$geneName[i]
    return(x)
}

main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(ggplot2)
    library(ggrepel)
    library(scales)

    res <- read.csv(input$csv, row.names = 1)
    res <- add.status(res, fdr.threshold = 0.05)
    res <- add.symbol(res, n = 50)
    res <- res[order(res$FDR, decreasing = TRUE), ]

    col <- c("Up" = "#d8b365", "NS" = "#f5f5f5", "Down" = "#5ab4ac")

    lab <- c(
        "Up"   = sprintf("Up (%s)", comma(sum(res$status == "Up"))),
        "NS"   = sprintf("NS (%s)", comma(sum(res$status == "NS"))),
        "Down" = sprintf("Down (%s)", comma(sum(res$status == "Down")))
    )

    plt <- ggplot(res, aes(log2FoldChange, -log10(PValue), colour = status, label = symbol)) +
        geom_point() +
        scale_colour_manual(values = col, labels = lab) +
        geom_text_repel(size = 1.88, colour = "#000000", show.legend = FALSE, max.overlaps = Inf) +
        labs(x = "Fold change (log2)", y = "P value (-log10)", colour = "Status") +
        theme_bw() +
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
```r
main <- function(input, output, log) {

    # Log function
    out <- file(log$out, open = "wt")
    err <- file(log$err, open = "wt")
    sink(out, type = "output")
    sink(err, type = "message")

    # Script function
    library(edgeR)
    library(limma)

    dge <- readRDS(input$rds)

    mod <- model.matrix(~ 0 + condition, data = dge$samples)
    colnames(mod) <- unique(dge$samples$condition)

    els <- voom(dge, design = mod)

    saveRDS(els, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@log)
```
Support
- Future updates