Snakemake scRNAseq: A single cell RNA-seq workflow

public public 1yr ago 0 bookmarks

A single cell RNA-seq workflow following Lun, McCarthy and Marioni 2016 and Soneson and Robinson 2018 , with added more recent functionality.

Code Snippets

18
19
script:
    "../scripts/cell-cycle.R"
36
37
script:
    "../scripts/cell-cycle-scores.R"
25
26
script:
    "../scripts/cellassign.R"
42
43
script:
    "../scripts/plot-cellassign.R"
67
68
script:
    "../scripts/celltype-tsne.R"
89
90
script:
    "../scripts/plot-gene-expression.R"
18
19
script:
    "../scripts/load-counts.R"
35
36
script:
    "../scripts/diffexp.R"
55
56
script:
    "../scripts/plot-differential-expression.R"
21
22
script:
    "../scripts/filter-cells.R"
23
24
script:
    "../scripts/normalize.R"
40
41
script:
    "../scripts/batch-effect-removal.R"
35
36
script:
    "../scripts/qc.R"
SnakeMake From line 35 of rules/qc.smk
53
54
script:
    "../scripts/explained-variance.R"
SnakeMake From line 53 of rules/qc.smk
77
78
script:
    "../scripts/plot-gene-gene-expression.R"
SnakeMake From line 77 of rules/qc.smk
96
97
script:
    "../scripts/gene-tsne.R"
SnakeMake From line 96 of rules/qc.smk
38
39
script:
    "../scripts/hvg.R"
69
70
script:
    "../scripts/hvg-correlation.R"
89
90
script:
    "../scripts/hvg-pca.R"
111
112
script:
    "../scripts/hvg-tsne.R"
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(limma)

sce <- readRDS(snakemake@input[["sce"]])
cycle.assignments <- readRDS(snakemake@input[["cycles"]])
colData(sce)$G1 <- cycle.assignments$scores$G1
colData(sce)$G2M <- cycle.assignments$scores$G2M

# setup design matrix
model <- snakemake@params[["model"]]
design <- model.matrix(as.formula(model), data=colData(sce))

# store design matrix
saveRDS(design, file=snakemake@output[["design_matrix"]])

# remove batch effects based on variables
batch.removed <- removeBatchEffect(logcounts(sce), covariates=design)
logcounts(sce) <- batch.removed
saveRDS(sce, file=snakemake@output[["sce"]])
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Sys.setenv(MKL_THREADING_LAYER = "GNU")

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(SingleCellExperiment)
library(tensorflow)
library(cellassign)
library(ComplexHeatmap)
library(viridis)
library(ggsci)

is.float <- function(x) {
    (typeof(x) == "double") && (x %% 1 != 0)
}

sce <- readRDS(snakemake@input[["sce"]])
parent <- snakemake@wildcards[["parent"]]

# get parent fit and filter sce to those cells
parent_fit <- snakemake@input[["fit"]]
if(length(parent_fit) > 0) {
    parent_fit <- readRDS(parent_fit)$cell_type
    is_parent_type <- rownames(parent_fit)[parent_fit$cell_type == parent]
    sce <- sce[, is_parent_type]
}

markers <- read.table(snakemake@input[["markers"]], row.names = NULL, header = TRUE, sep="\t", stringsAsFactors = FALSE, na.strings = "")
markers[is.na(markers$parent), "parent"] <- "root"
markers <- markers[markers$parent == parent, ]

# convert markers into something cellAssign understands
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
get_genes <- function (x) {
    if(is.na(x)) {
        vector()
    } else {
        sapply(strsplit(x, ","), trim)
    }
}
genes <- vector()
for(g in markers$genes) {
    genes <- c(genes, get_genes(g))
}
genes <- sort(unique(genes))

if(length(genes) < 1) {
    stop("Markers have to contain at least two different genes in union.")
}

marker_mat <- matrix(0, nrow = nrow(markers), ncol = length(genes))
colnames(marker_mat) <- genes
rownames(marker_mat) <- markers$name
for(i in 1:nrow(markers)) {
    cell_type <- markers[i, "name"]
    marker_mat[cell_type, ] <- genes %in% get_genes(markers[i, "genes"])
}
marker_mat <- t(marker_mat)
marker_mat <- marker_mat[rownames(marker_mat) %in% rownames(sce),, drop=FALSE]


# apply cellAssign
sce <- sce[rownames(marker_mat), ]
# remove genes with 0 counts in all cells and cells with 0 counts in all genes
sce <- sce[rowSums(counts(sce)) != 0, colSums(counts(sce)) != 0]
# obtain batch effect model
model <- readRDS(snakemake@input[["design_matrix"]])
# constrain to selected cells and remove intercept (not allowed for cellassign)
model <- model[colnames(sce), colnames(model) != "(Intercept)"]
# normalize float columns (as recommended in cellassign manual)
float_cols <- apply(model, 2, is.float)
model[, float_cols] <- apply(model[, float_cols], 2, scale)
if(nrow(sce) == 0) {
    stop("Markers do not match any gene names in the count matrix.")
}
# fit
fit <- cellassign(exprs_obj = sce, marker_gene_info = marker_mat, s = sizeFactors(sce), learning_rate = 1e-2, B = 20, shrinkage = TRUE, X = model)

# add cell names to results
cells <- colnames(sce)
rownames(fit$mle_params$gamma) <- cells
fit$cell_type <- data.frame(cell_type = fit$cell_type)
rownames(fit$cell_type) <- cells

saveRDS(fit, file = snakemake@output[["fit"]])

save.image()
# plot heatmap
source(file.path(snakemake@scriptdir, "common.R"))
sce <- assign_celltypes(fit, sce, snakemake@params[["min_gamma"]])

pdf(file = snakemake@output[["heatmap"]])
pal <- pal_d3("category20")(ncol(marker_mat))
names(pal) <- colnames(marker_mat)
celltype <- HeatmapAnnotation(df = data.frame(celltype = colData(sce)$celltype), col = list(celltype = pal))
Heatmap(logcounts(sce), col = viridis(100), clustering_distance_columns = "canberra", clustering_distance_rows = "canberra", use_raster = TRUE, show_row_dend = FALSE, show_column_dend = FALSE, show_column_names = FALSE, top_annotation = celltype, name = "logcounts")
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)

sce <- readRDS(snakemake@input[[1]])

# determine cell cycle
species = snakemake@params[["species"]]
if (species == "mouse") {
    markers <- "mouse_cycle_markers.rds"
} else if (species == "human") {
    markers <- "human_cycle_markers.rds"
} else {
    stop("Unsupported species. Only mouse and human are supported.")
}

# read trained data
pairs <- readRDS(system.file("exdata", markers, package="scran"))

# obtain assignments
assignments <- cyclone(sce, pairs, gene.names=rowData(sce)$ensembl_gene_id)

# store assignments
saveRDS(assignments, file=snakemake@output[[1]])
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")


library(RColorBrewer)

assignments <- readRDS(snakemake@input[["rds"]])
annotation <- read.table(snakemake@input[["cells"]], header=TRUE, row.names=1)
covariate <- gsub("-", ".", snakemake@wildcards[["covariate"]])
covariate.values <- unique(annotation[, covariate])
n <- length(covariate.values)

# plot
pdf(file=snakemake@output[[1]])
# set color palette
palette(brewer.pal(n=n, name="Dark2"))
plot(assignments$score$G1, assignments$score$G2M,
     xlab="G1 score", ylab="G2/M score", pch=16,
     col=annotation[, covariate])
legend("topright", legend=covariate.values, col=palette()[covariate.values], pch=16)
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)
library(ggsci)
source(file.path(snakemake@scriptdir, "common.R"))

seed <- as.integer(snakemake@wildcards[["seed"]])
target_parent <- snakemake@wildcards[["parent"]]
parents <- snakemake@params[["parents"]]
fits <- snakemake@input[["fits"]]

sce <- readRDS(snakemake@input[["sce"]])

for(i in 1:length(fits)) {
    cellassign_fit <- fits[i]
    parent <- parents[i]
    cellassign_fit <- readRDS(cellassign_fit)
    sce <- assign_celltypes(cellassign_fit, sce, snakemake@params[["min_gamma"]])
    if(parent == target_parent) {
        break
    }
}

style <- theme(
    axis.text=element_text(size=12),
    axis.title=element_text(size=16))

# plot t-SNE
pdf(file=snakemake@output[[1]], width = 12)
set.seed(seed)
plotTSNE(sce, colour_by="celltype") + scale_fill_d3(alpha = 1.0) + scale_colour_d3(alpha = 1.0) + style
dev.off()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(tidyverse)
library(SingleCellExperiment)
library(scran)
library(edgeR)
library(dplyr)

source(file.path(snakemake@scriptdir, "common.R"))

sce <- readRDS(snakemake@input[["sce"]])
for(cellassign_fit in snakemake@input[["cellassign_fits"]]) {
    cellassign_fit <- readRDS(cellassign_fit)
    sce <- assign_celltypes(cellassign_fit, sce, snakemake@params[["min_gamma"]])
}
# handle constrain celltypes
constrain_celltypes <- snakemake@params[["constrain_celltypes"]]
if(!is.null(constrain_celltypes)) {
    celltypes <- constrain_celltypes[["celltypes"]]
    common_var <- constrain_celltypes[["common"]]

    sce <- sce[, colData(sce)$celltype %in% celltypes]

    if(!is.null(common_var)) {
        if(!(common_var %in% colnames(colData(sce)))) {
            stop(paste("covariate", common_var, "not found in cell metadata"))
        }
        is_common_in_all <- apply(table(colData(sce)[, c(common_var, "celltype")]) > 0, 1, all)
	common_in_all <- names(is_common_in_all)[is_common_in_all]
	sce <- sce[, colData(sce)[, common_var] %in% common_in_all]
        colData(sce) <- droplevels(colData(sce))
    }
}
# only keep the requested cells
if(length(snakemake@params[["celltypes"]]) > 0) {
    sce <- sce[, colData(sce)$celltype %in% snakemake@params[["celltypes"]]]
}
colData(sce)$celltype <- factor(colData(sce)$celltype)
colData(sce)$detection_rate <- cut(colData(sce)$detection_rate, 10)


# convert to edgeR input
y <- convertTo(sce, type = "edgeR", col.fields = colnames(colData(sce)))

# obtain design matrix
design <- model.matrix(as.formula(snakemake@params[["design"]]), data=y$samples)

y <- calcNormFactors(y)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design)
qlf <- glmQLFTest(fit, coef = snakemake@params[["coef"]])

saveRDS(y, file = snakemake@output[["edger_dge"]])

write_tsv(rownames_to_column(topTags(qlf, n = 100000)$table, var = "gene"), snakemake@output[["table"]])

pdf(file = snakemake@output[["bcv"]])
plotBCV(y)
dev.off()

pdf(file = snakemake@output[["md"]])
plotMD(qlf)
abline(h = c(-1,1), col = "grey")
dev.off()

pdf(file = snakemake@output[["disp"]])
plotQLDisp(fit)
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)

sce <- readRDS(snakemake@input[["rds"]])
annotation <- read.table(snakemake@input[["cells"]], row.names=1, header=TRUE)

pdf(file=snakemake@output[[1]])
plotExplanatoryVariables(sce,
    variables=c("total_counts_Spike",
                "log10_total_counts_Spike",
                colnames(annotation))) + theme(
    axis.text=element_text(size=12),
    axis.title=element_text(size=16)
)
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)

sce <- readRDS(snakemake@input[[1]])

# drop cells with too few counts or expressed features
libsize.drop <- isOutlier(sce$total_counts, nmads=3, type="lower", log=TRUE)
feature.drop <- isOutlier(sce$total_features_by_counts, nmads=3, type="lower", log=TRUE)

# drop cells with too high proportion of mito genes or spike-ins expressed
mito.drop <- isOutlier(sce$pct_counts_Mt, nmads=3, type="higher")
spike.drop <- isOutlier(sce$pct_counts_Spike, nmads=3, type="higher")

# write stats
stats <- data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop),
     ByMito=sum(mito.drop), BySpike=sum(spike.drop), Remaining=ncol(sce))
write.table(stats, file=snakemake@output[["stats"]], sep='\t',
            row.names=FALSE, quote=FALSE)

# filter sce
sce <- sce[,!(libsize.drop | feature.drop | mito.drop | spike.drop)]

saveRDS(sce, file=snakemake@output[["rds"]])
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)
library(ggsci)
library(viridis)

seed <- as.integer(snakemake@wildcards[["seed"]])
gene <- snakemake@wildcards[["gene"]]

sce <- readRDS(snakemake@input[["sce"]])

colData(sce)[, gene] <- logcounts(sce)[gene, ]

style <- theme(
    axis.text=element_text(size=12),
    axis.title=element_text(size=16))

# plot t-SNE
pdf(file=snakemake@output[[1]], width = 12)
set.seed(seed)
plotTSNE(sce, colour_by=gene) + scale_fill_viridis(alpha = 1.0) + scale_colour_viridis(alpha = 1.0) + style
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)
library(RBGL)
library(Rgraphviz)
library(gplots)


fdr <- snakemake@params[["fdr"]]
sce <- readRDS(snakemake@input[["sce"]])
hvgs <- read.table(snakemake@input[["hvg"]], row.names=1)

topn <- min(nrow(hvgs), snakemake@params[["top_n"]])
hvgs <- hvgs[1:topn, ]

# find correlated pairs
set.seed(100)
var.cor <- correlatePairs(sce, subset.row=rownames(hvgs))
sig.cor <- var.cor$FDR <= fdr
write.table(file=snakemake@output[["corr"]], var.cor, sep="\t", quote=FALSE, row.names=FALSE)


# define graph with significant correlation as edges
g <- ftM2graphNEL(cbind(var.cor$gene1, var.cor$gene2)[sig.cor,],
     W=NULL, V=NULL, edgemode="undirected")
# find highly connected clusters
cl <- highlyConnSG(g)$clusters
cl <- cl[order(lengths(cl), decreasing=TRUE)]


# plot correlation graph
pdf(file=snakemake@output[["graph"]])
plot(g, "neato", attrs=list(node=list(fillcolor="lightblue", color="lightblue")))
dev.off()


# choose significantly correlated genes
chosen <- unique(c(var.cor$gene1[sig.cor], var.cor$gene2[sig.cor]))
# get normalized expressions
norm.exprs <- logcounts(sce)[chosen,,drop=FALSE]


# plot heatmap
pdf(file=snakemake@output[["heatmap"]])
# z-score
heat.vals <- norm.exprs - rowMeans(norm.exprs)
heat.out <- heatmap.2(heat.vals, col=bluered, symbreak=TRUE, trace="none", cexRow=0.6)
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)

fdr <- snakemake@params[["fdr"]]
covariate <- gsub("-", ".", snakemake@wildcards[["covariate"]])
sce <- readRDS(snakemake@input[["sce"]])
var.cor <- read.table(snakemake@input[["var_cor"]], header=TRUE)
sig.cor <- var.cor$FDR <= fdr

# choose significantly correlated genes
chosen <- unique(c(var.cor$gene1[sig.cor], var.cor$gene2[sig.cor]))

style <- theme(
    axis.text=element_text(size=12),
    axis.title=element_text(size=16))

# plot PCA
pdf(file=snakemake@output[[1]])
plotPCA(sce[chosen, ], colour_by=covariate,
        ncomponents=3) + style
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)

use.spikes <- snakemake@params[["use_spikes"]]
fdr <- snakemake@params[["fdr"]]
min.bio.comp <- snakemake@params[["min_bio_comp"]]

sce <- readRDS(snakemake@input[["sce_norm"]])
design <- readRDS(snakemake@input[["design_matrix"]])

# apply trend model to obtain per-gene variance
# batch effects are modeled in design matrix
var.fit <- trendVar(sce, use.spikes=use.spikes, design=design)
var.out <- decomposeVar(sce, var.fit)


# from now on use expressions with removed batch effects
sce <- readRDS(snakemake@input[["sce_batch"]])


# plot mean vs. variance (spikes are red)
pdf(file=snakemake@output[["mean_vs_variance"]])
plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
     ylab="Variance of log-expression")
o <- order(var.out$mean)
lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2)
cur.spike <- isSpike(sce)
points(var.out$mean[cur.spike], var.out$total[cur.spike], col="red", pch=16)
dev.off()


# determine HVGs (highly variable genes)
hvg.out <- var.out[which(var.out$FDR <= fdr & var.out$bio >= min.bio.comp),]
# sort
hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),]


# store HVGs in table on disk
write.table(file=snakemake@output[["hvg"]], hvg.out, sep="\t", quote=FALSE, col.names=NA)


# plot expression distributions
pdf(file=snakemake@output[["hvg_expr_dist"]])
print(hvg.out)
plotExpression(sce, rownames(hvg.out)[1:snakemake@params[["show_n"]]]) + theme(
    axis.text=element_text(size=12),
    axis.title=element_text(size=16)
)
dev.off()


# store variance estimates
saveRDS(var.out, file=snakemake@output[["var"]])
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)

seed <- as.integer(snakemake@wildcards[["seed"]])
fdr <- snakemake@params[["fdr"]]
covariate <- gsub("-", ".", snakemake@wildcards[["covariate"]])

sce <- readRDS(snakemake@input[["sce"]])
var.cor <- read.table(snakemake@input[["var_cor"]], header=TRUE)
sig.cor <- var.cor$FDR <= fdr

# choose significantly correlated genes
chosen <- unique(c(var.cor$gene1[sig.cor], var.cor$gene2[sig.cor]))

style <- theme(
    axis.text=element_text(size=12),
    axis.title=element_text(size=16))

# plot t-SNE
pdf(file=snakemake@output[[1]])
set.seed(seed)
plotTSNE(sce[chosen, ], colour_by=covariate) + style
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)


all.counts <- as.matrix(read.table(snakemake@input[["counts"]], row.names=1, header=TRUE))
all.annotation <- read.table(snakemake@input[["cells"]], row.names=1, header=TRUE)
sce <- SingleCellExperiment(
  assays = list(counts = all.counts),
  colData = all.annotation
)

# get feature annotation
species = snakemake@params[["species"]]
if (species == "mouse") {
    dataset <- "mmusculus_gene_ensembl"
    symbol <- "mgi_symbol"
} else if (species == "human") {
    dataset <- "hsapiens_gene_ensembl"
    symbol <- "hgnc_symbol"
} else {
    stop("Unsupported species. Only mouse and human are supported.")
}

sce <- getBMFeatureAnnos(sce, filters=c(snakemake@params[["feature_ids"]]), attributes = c("ensembl_gene_id", symbol, "chromosome_name", "gene_biotype", "start_position", "end_position"), dataset=dataset, host=snakemake@config[["counts"]][["biomart"]])
rowData(sce)[, "gene_symbol"] <- rowData(sce)[, symbol]
rownames(sce) <- uniquifyFeatureNames(rownames(sce), rowData(sce)$gene_symbol)

# get mitochondrial genes
is.mito <- colData(sce)$chromosome_name == "MT"

# annotate spike-ins
is.spike <- grepl(snakemake@params[["spike_pattern"]], rownames(sce))
isSpike(sce, "Spike") <- is.spike

# calculate metrics
sce <- calculateQCMetrics(sce, feature_controls=list(Spike=is.spike, Mt=is.mito))
colData(sce)[, "detection_rate"] <- sce$total_features_by_counts / nrow(sce)

saveRDS(sce, file=snakemake@output[[1]])
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)

sce <- readRDS(snakemake@input[[1]])

# pre-cluster in order to group similar cells
# this avoids a violation of the non-DE assumption
preclusters <- quickCluster(sce)

# compute size factors by pooling cells according to the clustering
sce <- computeSumFactors(sce, clusters=preclusters, min.mean=snakemake@params[["min_count"]])

pdf(file=snakemake@output[["scatter"]])
plot(sizeFactors(sce), sce$total_counts/1e6, log="xy",
     ylab="Library size (millions)", xlab="Size factor")
dev.off()

# use size factors to calculate normalized counts in log space (available under logcounts(sce)) 
sce <- normalize(sce)

saveRDS(sce, file=snakemake@output[["rds"]])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(ComplexHeatmap)
library(viridis)

set.seed(562374)

fit <- readRDS(snakemake@input[[1]])

pdf(file = snakemake@output[[1]])
Heatmap(fit$mle_params$gamma, col = viridis(100), clustering_distance_rows = "canberra", use_raster = TRUE, show_row_dend = FALSE, show_column_dend = FALSE, show_row_names = FALSE)
dev.off()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(tidyverse)
library(edgeR)

gene_of_interest <- snakemake@wildcards[["gene"]]
coef <- snakemake@params[["coef"]]

diffexp <- read_tsv(snakemake@input[["diffexp"]])
y <- readRDS(snakemake@input[["edger_dge"]])

log_cpm <- as_tibble(cpm(y, log=TRUE), rownames="gene") %>% gather(key = "cell", value = "cpm", -gene) %>% filter(gene == gene_of_interest)

if (nrow(log_cpm) == 0) {
    stop(paste("Error: gene not found:", gene_of_interest))
}

log_cpm <- log_cpm %>% mutate(condition = as.factor(y$design[, coef]))

diffexp <- diffexp %>% filter(gene == gene_of_interest)

fmt_float <- function(x) {
    formatC(x, digits=2, format="g")
}
coef_name <- colnames(y$design)[coef]
pdf(file = snakemake@output[[1]])
ggplot(log_cpm, aes(x = condition, y = cpm, fill = condition)) + geom_violin() + geom_jitter(alpha = 0.5, size = 1) + labs(fill = coef_name, title = gene_of_interest, subtitle = paste("p =", fmt_float(diffexp$PValue), ", FDR =",  fmt_float(diffexp$FDR), ", log2fc =", fmt_float(diffexp$logFC))) + theme_classic()
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)
library(ggsci)
source(file.path(snakemake@scriptdir, "common.R"))

seed <- as.integer(snakemake@wildcards[["seed"]])

sce <- readRDS(snakemake@input[["sce"]])

cellassign_fit <- snakemake@input[["fit"]]
cellassign_fit <- readRDS(cellassign_fit)
sce <- assign_celltypes(cellassign_fit, sce, snakemake@params[["min_gamma"]])


# plot gene expression
pdf(file=snakemake@output[[1]], width = 12, height = 12)
plotExpression(sce, snakemake@params[["genes"]], snakemake@params[["feature"]])
dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)
library(scran)
library(ggsci)
library(ggpubr)
source(file.path(snakemake@scriptdir, "common.R"))

gene_a <- snakemake@wildcards[["gene_a"]]
gene_b <- snakemake@wildcards[["gene_b"]]

aes_name <- function(name) {
    paste0("`", name, "`")
}

sce <- readRDS(snakemake@input[["sce"]])
constrain_celltypes <- snakemake@params[["constrain_celltypes"]]

for(cellassign_fit in snakemake@input[["fits"]]) {
    cellassign_fit <- readRDS(cellassign_fit)
    sce <- assign_celltypes(cellassign_fit, sce, snakemake@params[["min_gamma"]], constrain_celltypes)
}

# handle constrain celltypes
if(!is.null(constrain_celltypes)) {
    celltypes <- constrain_celltypes
    print(celltypes)
    sce <- sce[, colData(sce)$celltype %in% celltypes]
}

pdf(file=snakemake@output[[1]], width = 4, height = 4)
data <- as.data.frame(t(logcounts(sce[c(gene_a, gene_b), ])))
vmin <- min(data)
vmax <- max(data)

regression <- snakemake@params[["regression"]]
correlation <- snakemake@params[["correlation"]]
dropout_threshold <- snakemake@params[["dropout_threshold"]]

data[, "dropout"] <- apply(data, 1, min) < dropout_threshold
non_dropout_data <- data[!data$dropout, ]

p <- ggplot(data, aes_string(x=aes_name(gene_a), y=aes_name(gene_b), color=aes_name("dropout"))) +
    geom_point(shape=1) + 
    scale_color_manual(values = c("#000000", "#666666")) +
    theme_classic() + 
    theme(legend.position = "none")

if(regression != FALSE) {
    regression <- as.formula(regression)
    p = p + geom_smooth(method="lm", color = "red", formula = formula, data = non_dropout_data) +
	    stat_regline_equation(data = non_dropout_data, label.x.npc = "left", label.y.npc = "top", formula = formula,
				  aes(label = paste(..eq.label.., ..rr.label.., ..adj.rr.label.., sep = "~~~~")))
}
if(correlation != FALSE) {
    p = p + stat_cor(method = correlation, data = non_dropout_data) + geom_smooth(method="lm", data = non_dropout_data, color = "red")
}

# pass plot to PDF
p

dev.off()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(scater)

sce <- readRDS(snakemake@input[[1]])

# plot library sizes
pdf(file=snakemake@output[["libsizes"]])
hist(sce$total_counts/1e6, xlab="Library sizes (millions)", main="",
     breaks=20, col="grey80", ylab="Number of cells")
dev.off()

# plot number of expressed genes
pdf(file=snakemake@output[["expressed"]])
hist(sce$total_features_by_counts, xlab="Number of expressed genes", main="",
     breaks=20, col="grey80", ylab="Number of cells")
dev.off()

# plot mitochondrial proportion
pdf(file=snakemake@output[["mito_proportion"]])
hist(sce$pct_counts_Mt, xlab="Mitochondrial proportion (%)",
     ylab="Number of cells", breaks=20, main="", col="grey80")
dev.off()


# plot spike-in proportion
pdf(file=snakemake@output[["spike_proportion"]])
hist(sce$pct_counts_Spike, xlab="ERCC proportion (%)",
     ylab="Number of cells", breaks=20, main="", col="grey80")
dev.off()
ShowHide 31 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/t-i-r/single-cell-rna-seq
Name: single-cell-rna-seq
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Other Versions:
Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...