The evaluation of MERFISHtools and the underlying model, as published in the corresponding article.
Evaluation of MERFISHtools
This Snakemake workflow generates the entire analysis of the forthcoming manuscript "A Bayesian model for single cell transcript expression analysis on MERFISH data".
Requirements
- Any 64-bit Linux installation with GLIBC 2.5 or newer (i.e., any Linux distribution newer than CentOS 6).
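You can check the GLIBC version of your system with, for example:

```bash
ldd --version | head -n1
```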
Setup
Step 1: Set up Snakemake
To run this workflow, you need to set up Snakemake via the Conda package manager. This does not require admin privileges.
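For example, with Conda already installed, Snakemake can be obtained from the conda-forge and Bioconda channels (a minimal sketch; you may prefer to install it into a dedicated environment instead):

```bash
conda install -c conda-forge -c bioconda snakemake
```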
Step 2: Download the workflow
First, create a working directory:
mkdir merfishtools-evaluation
cd merfishtools-evaluation
Then, download the workflow archive from https://doi.org/10.5281/zenodo.752340.
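This can also be done directly on the command line; the file URL below is an assumption based on Zenodo's usual layout, so follow the DOI in a browser if it differs:

```bash
wget https://zenodo.org/record/752340/files/merfishtools-evaluation.tar.bz2
```

Finally, extract the downloaded archive with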
tar -xf merfishtools-evaluation.tar.bz2
Step 3: Run the workflow
You can then perform a dry-run of the workflow with
snakemake -n
and execute the workflow using, e.g., 24 cores with
snakemake --cores 24 --use-conda
Note that the argument
--use-conda
is mandatory, so that Snakemake can deploy the software packages delivered together with the workflow.
For further possibilities, see
snakemake --help
and http://snakemake.bitbucket.org.
Author
License
Code Snippets
```python
import pandas as pd
from scipy.spatial import ConvexHull

data = pd.read_table(snakemake.input[0], index_col=0)


def get_area(rnas):
    hull = ConvexHull(rnas[["x", "y"]])
    return hull.area


def get_first(rnas):
    return rnas.iloc[0]


cells = data.groupby(level=0)
area = cells[["x", "y"]].aggregate(get_area).iloc[:, 0]
x = cells["cell_x"].aggregate(get_first)
y = cells["cell_y"].aggregate(get_first)

props = pd.DataFrame({"area": area, "x": x, "y": y})
props.to_csv(snakemake.output[0], sep="\t")
```
```python
import pandas as pd

# load data
exprs = pd.read_table(snakemake.input[0], index_col=[0, 1])
exprs = (exprs["corrected"] + exprs["exact"]).unstack(0).fillna(0)
exprs.to_csv(snakemake.output[0], sep="\t")
```
```python
import pandas as pd

# load data
exprs = pd.read_table(snakemake.input[0], index_col=[0, 1])["expr_map"].unstack(0).fillna(0)
exprs.to_csv(snakemake.output[0], sep="\t")
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("4.15in", "1.8in")

a = load_svg(snakemake.input.a)
b = load_svg(snakemake.input.b)
c = load_svg(snakemake.input.c)
d = load_svg(snakemake.input.d)

b.moveto(100, 0)
c.moveto(200, 0)
d.moveto(300, 0)

la = label_plot(5, 10, "a")
lb = label_plot(95, 10, "b")
lc = label_plot(195, 10, "c")
ld = label_plot(295, 10, "d")

fig.append([a, b, c, d, la, lb, lc, ld])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fx, fy = "5.1in", "1.3in"
if snakemake.wildcards.dataset == "1001genesData":
    fx, fy = "5.5in", "1.3in"
fig = sg.SVGFigure(fx, fy)

a = load_svg(snakemake.input.a)
b = load_svg(snakemake.input.b)
c = load_svg(snakemake.input.c)
d = load_svg(snakemake.input.d)

xb = 170
xc = 275
xd = 380
if snakemake.wildcards.dataset == "1001genesData":
    xb = 170
    xc = 290
    xd = 405

b.moveto(xb, 0)
c.moveto(xc, 0)
d.moveto(xd, 0)

la = label_plot(5, 10, "a")
lb = label_plot(xb, 10, "b")
lc = label_plot(xc, 10, "c")
ld = label_plot(xd, 10, "d")

fig.append([a, b, c, d, la, lb, lc, ld])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("7.5in", "1.9in")

a = load_svg(snakemake.input.mhd4[0])
b = load_svg(snakemake.input.mhd4[1])
c = load_svg(snakemake.input.mhd2[0])
d = load_svg(snakemake.input.mhd2[1])

b.moveto(170, 0)
c.moveto(340, 0)
d.moveto(510, 0)

la = label_plot(5, 10, "a")
lb = label_plot(175, 10, "b")
lc = label_plot(345, 10, "c")
ld = label_plot(515, 10, "d")

fig.append([a, b, c, d, la, lb, lc, ld])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("4.1in", "1.8in")

a = load_svg(snakemake.input[1])
b = load_svg(snakemake.input[0])

b.moveto(190, 0)

la = label_plot(5, 10, "a")
lb = label_plot(185, 10, "b")

fig.append([a, b, la, lb])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("5.3in", "1.8in")

a = load_svg(snakemake.input.a)
b = load_svg(snakemake.input.b)

a.moveto(10, 0)
b.moveto(260, 0)

la = label_plot(5, 10, "a")
lb = label_plot(255, 10, "b")

fig.append([a, b, la, lb])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("5.8in", "1.8in")

a = load_svg(snakemake.input.a)
b = load_svg(snakemake.input.b)

b.moveto(260, 0)

la = label_plot(5, 10, "a")
lb = label_plot(255, 10, "b")

fig.append([a, b, la, lb])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("6.4in", "1.7in")

a = load_svg(snakemake.input[0])
b = load_svg(snakemake.input[1])
#c = load_svg(snakemake.input.c)
#d = load_svg(snakemake.input.d)
e = load_svg(snakemake.input[2])
#f = load_svg(snakemake.input.f)

b.moveto(190, 0)
#c.moveto(380, 0)
#d.moveto(0, 160)
e.moveto(380, 0)
#f.moveto(380, 160)

la = label_plot(5, 10, "a")
lb = label_plot(195, 10, "b")
lc = label_plot(385, 10, "c")

fig.append([a, b, e, la, lb, lc])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("6.5in", "2.1in")

a = load_svg(snakemake.input[0])
b = load_svg(snakemake.input[1])
c = load_svg(snakemake.input[2])
d = load_svg(snakemake.input[3])

a.moveto(10, 10, scale=0.6)
b.moveto(20, 130, scale=0.4)
c.moveto(250, 10, scale=0.65)
d.moveto(440, 10, scale=1.0)

la = label_plot(5, 10, "a")
lb = label_plot(5, 130, "b")
lc = label_plot(240, 10, "c")
ld = label_plot(430, 10, "d")

fig.append([a, b, c, d, la, lb, lc, ld])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("6.5in", "1.8in")

a = load_svg(snakemake.input.a)
b = load_svg(snakemake.input.b)

b.moveto(230, 0)

la = label_plot(5, 10, "a")
lb = label_plot(225, 10, "b")

fig.append([a, b, la, lb])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("5.8in", "3.3in")

a = load_svg(snakemake.input.a)
b = load_svg(snakemake.input.b)
c = load_svg(snakemake.input.c)
d = load_svg(snakemake.input.d)
e = load_svg(snakemake.input.e)
f = load_svg(snakemake.input.f)

b.moveto(260, 0)
c.moveto(0, 160)
d.moveto(128, 160)
e.moveto(250, 160)
f.moveto(384, 160)

la = label_plot(5, 10, "a")
lb = label_plot(265, 10, "d")
lc = label_plot(5, 170, "b")
ld = label_plot(130, 170, "c")
le = label_plot(255, 170, "e")
lf = label_plot(385, 170, "f")

fig.append([a, b, c, d, e, f, la, lb, lc, ld, le, lf])
fig.save(snakemake.output[0])
```
```python
import svgutils.transform as sg
from common import load_svg, label_plot

fig = sg.SVGFigure("5.6in", "1.8in")

a = load_svg(snakemake.input.a)
b = load_svg(snakemake.input.b)

a.moveto(10, 0)
b.moveto(260, 0)

la = label_plot(5, 10, "a")
lb = label_plot(255, 10, "b")

fig.append([a, b, la, lb])
fig.save(snakemake.output[0])
```
```python
import numpy as np
import pandas as pd

dataset = pd.read_table(snakemake.input[0], index_col=0)
dataset = dataset.loc[int(snakemake.wildcards.experiment)]
group = snakemake.config["groups"][snakemake.wildcards.group]
dataset = dataset[dataset["Cell_ID"].astype(np.str).str.match(group)]
dataset = dataset.loc[:, ["Cell_ID", "Gene_Name", "Exact_Match",
                          "Cell_Position_X", "Cell_Position_Y",
                          "RNA_Position_X", "RNA_Position_Y"]]
dataset.columns = ["cell", "feat", "dist", "cell_x", "cell_y", "x", "y"]
dataset["dist"] = 1 - dataset["dist"]
dataset.to_csv(snakemake.output[0], sep="\t", index=False)
```
```r
library("biomaRt")
ensembl = useMart("ensembl")  #, dataset="hsapiens_gene_ensembl", verbose=TRUE)
ensembl = useDataset("hsapiens_gene_ensembl", mart=ensembl)
library("GOstats")
library("GO.db")
library("org.Hs.eg.db")
library("mutoss")

est <- read.table(snakemake@input[[1]], header = TRUE, stringsAsFactors = FALSE)
rownames(est) <- est$feat

# translate gene names to entrez ids
ids <- getBM(attributes = c("hgnc_symbol", "entrezgene"),
             filters = "hgnc_symbol",
             values = est$feat,
             mart = ensembl)
# take the first entrez id in case of ambiguity
ids <- ids[!duplicated(ids$hgnc_symbol), ]
rownames(ids) <- ids$hgnc_symbol
# lookup entrez ids
est$entrez <- ids[est$feat, "entrezgene"]

# define foreground genes
significant <- est$diff_fdr <= 0.05
foreground <- est[significant, ]

if(nrow(foreground) > 0) {
    # define test parameters
    params <- new("GOHyperGParams",
                  geneIds = foreground$entrez,
                  universeGeneIds = est$entrez,
                  ontology = "BP",
                  annotation = "org.Hs.eg",
                  pvalueCutoff = 0.05,
                  conditional = TRUE,
                  testDirection = "over")
    results <- hyperGTest(params)
    print(results)
    goterms <- summary(results)

    # correct for multiple testing. We use Benjamini-Yekutieli here, because the performed tests are strongly dependent
    #by <- BY(goterms$Pvalue, 0.05)
    #goterms$adjPvalue <- by[["adjPValues"]]

    write.table(goterms, file = snakemake@output[["terms"]],
                row.names = FALSE, quote = FALSE, sep = "\t")

    # associate genes with go terms
    ids <- ids[!is.na(ids$entrezgene), ]
    rownames(ids) <- ids$entrezgene
    genes <- stack(geneIdUniverse(results))
    colnames(genes) <- c("entrezid", "goterm")
    genes <- genes[, c("goterm", "entrezid")]
    genes$gene <- ids[genes$entrezid, "hgnc_symbol"]
    genes$entrezid <- NULL
    genes$cv <- est[genes$gene, "cv_map"]
    genes$fdr <- est[genes$gene, "diff_fdr"]

    write.table(genes, file = snakemake@output[["genes"]],
                row.names = FALSE, quote = FALSE, sep = "\t")
} else {
    file.create(snakemake@output[["terms"]])
    file.create(snakemake@output[["genes"]])
}
```
```python
import pandas as pd

cdf = pd.read_table(snakemake.input.cdf, index_col=0)
scales = pd.read_table(snakemake.input.scales, index_col=0, squeeze=True, header=None)
cdf["expr"] *= scales[int(snakemake.wildcards.experiment)]
cdf.to_csv(snakemake.output[0], sep="\t")
```
```python
import pandas as pd

expr = pd.read_table(snakemake.input.expr, index_col=0)
scales = pd.read_table(snakemake.input.scales, index_col=0, squeeze=True, header=None)
expr *= scales[int(snakemake.wildcards.experiment)]
expr.to_csv(snakemake.output[0], sep="\t")
```
```python
import pandas as pd
import numpy as np

exprs = [pd.read_table(f, index_col=0) for f in snakemake.input]

# calculate upper quartiles
quartiles = np.array([e.unstack().quantile(0.75) for e in exprs])
mean_quartile = quartiles.mean()

# calculate scale factors for upper quartile normalization
scales = pd.Series([mean_quartile / q for q in quartiles],
                   index=snakemake.params.experiments,
                   name="scale_factors")
scales.to_csv(snakemake.output[0], sep="\t")
```
```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
import random

random.seed(2139)

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
fig = plt.figure(figsize=snakemake.config["plots"]["figsize"])
ax = fig.add_subplot(111, aspect='equal')

expr_a = pd.Series()
expr_b = pd.Series()
for f in snakemake.input[:1]:
    exprs = pd.read_table(f, index_col=0)
    for a, b in combinations(exprs.columns, 2):
        expr_a = expr_a.append(exprs[a])
        expr_b = expr_b.append(exprs[b])

expr_a = np.log10(1 + expr_a)
expr_b = np.log10(1 + expr_b)

dropout = np.logical_or(np.logical_and(expr_a > 0.2, expr_b <= 0.2),
                        np.logical_and(expr_a <= 0.2, expr_b > 0.2))
high = np.logical_or(expr_a >= 10, expr_b >= 10)

sns.kdeplot(expr_a[~dropout], expr_b[~dropout], shade=True, cmap="Greys", shade_lowest=False, ax=ax, clip=[0, 2])
sns.kdeplot(expr_a[dropout], expr_b[dropout], shade=True, cmap="Reds", shade_lowest=False, ax=ax, clip=[0, 2])
#sns.kdeplot(expr_a, expr_b, shade=True, cmap="Greys", shade_lowest=False, ax=ax)

plt.xlabel("log10 expression")
plt.ylabel("log10 expression")

sns.despine()
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

PSEUDOCOUNTS = 1

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
plt.figure(figsize=[snakemake.config["plots"]["figsize"][1]] * 2)


def get_means(raw_counts):
    return raw_counts.groupby(level=1).mean() + PSEUDOCOUNTS


def get_cvs(batch_means):
    g = batch_means.groupby(level=1)
    return g.std() / g.mean()


def read_raw_counts(f):
    m = pd.read_table(f, index_col=[0, 1]).sum(axis=1).unstack(fill_value=0)
    return m.stack()


def normalize(raw_counts):
    quartiles = np.array([counts.quantile(0.75) for counts in raw_counts])
    mean_quartile = quartiles.mean()
    print([mean_quartile / q for counts, q in zip(raw_counts, quartiles)])
    return [counts * (mean_quartile / q) for counts, q in zip(raw_counts, quartiles)]


raw_counts = normalize([read_raw_counts(f) for f in snakemake.input.raw_counts])
batch_means = [get_means(counts) for counts in raw_counts]
batch_means = pd.concat(batch_means, keys=range(len(batch_means)))
raw_cvs = get_cvs(batch_means)
#print(raw_cvs[raw_cvs < 0.1])

posterior_cvs = pd.read_table(snakemake.input.diffexp, index_col=0)[snakemake.wildcards.estimate]

plt.scatter(posterior_cvs, raw_cvs.loc[posterior_cvs.index], s=2, c="k", alpha=0.6, edgecolors="face")
#sns.regplot(posterior_cvs, raw_cvs.loc[posterior_cvs.index], color="k", scatter_kws={"s": 2, "alpha": 0.6})
limits = (0, min(plt.xlim()[1], plt.ylim()[1]))
limits = (0, 1)
plt.plot(limits, limits, "r--")
plt.xlim(limits)
plt.ylim(limits)
plt.xlabel("posterior CV" + (" (conservative)" if snakemake.wildcards.estimate == "cv_ci_lower" else ""))
plt.ylabel("raw CV")

sns.despine()
plt.savefig(snakemake.output[0], bbox_inches="tight")
print(raw_cvs)
```
```python
import pandas as pd
import matplotlib as mpl
mpl.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import numpy as np

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)

small = pd.concat([pd.read_table(f, index_col=0) for f in snakemake.input.small], axis=1)
large = pd.concat([pd.read_table(f, index_col=0) for f in snakemake.input.large], axis=1)
small_counts = pd.concat([pd.read_table(f, index_col=0) for f in snakemake.input.small_counts], axis=1)
large_counts = pd.concat([pd.read_table(f, index_col=0) for f in snakemake.input.large_counts], axis=1)

common_genes = small.index.intersection(large.index)

means = lambda estimates: estimates.loc[common_genes].mean(axis="columns")
small = means(small)
large = means(large)
small_counts = means(small_counts)
large_counts = means(large_counts)

errors = pd.concat([
    pd.DataFrame({"mhd4": small,
                  "error": np.log2((large + 1) / (small + 1)),
                  "approach": "posterior estimates"}),
    pd.DataFrame({"mhd4": small,
                  "error": np.log2((large_counts + 1) / (small_counts + 1)),
                  "approach": "raw counts"})
])

min_value = 0.1  #min(small.min(), large.min())
max_value = 100  #max(small.max(), large.max())

fig = plt.figure(figsize=snakemake.config["plots"]["figsize"])
sns.swarmplot(y="error", x="approach", data=errors, palette=["red", "black"], size=4)
#plt.scatter("mhd4", "error", c="black", s=3, data=errors[errors["approach"] == "raw counts"])
#plt.scatter("mhd4", "error", c="red", s=3, data=errors[errors["approach"] == "posterior estimates"])
#plt.scatter(small_counts, large_counts - small_counts, s=3, c="black", label="raw counts")
#plt.ylabel("MHD2 - MHD4")
plt.xlabel("")
plt.ylabel("log2 fold change")
plt.legend()
sns.despine()

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import pandas as pd
import matplotlib as mpl
mpl.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import numpy as np

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)

small = pd.concat([pd.read_table(f, index_col=0) for f in snakemake.input.small], axis=1)
large = pd.concat([pd.read_table(f, index_col=0) for f in snakemake.input.large], axis=1)

common_genes = small.index.intersection(large.index)

means = lambda estimates: estimates.loc[common_genes].mean(axis="columns")
small = means(small)
large = means(large)

smape = (large - small).abs().sum() / (large + small).sum()
#relative_accuracy = np.log10(large / small)

small = np.log10(small)
large = np.log10(large)

min_value = np.log10(0.1)  #min(small.min(), large.min())
max_value = np.log10(100)  #max(small.max(), large.max())

fig = plt.figure(figsize=snakemake.config["plots"]["figsize"])
fig.add_subplot(111, aspect='equal')
sns.regplot(small, large, line_kws={"color": "red"}, scatter_kws={"color": "black", "s": 3})
plt.plot([min_value, max_value], [min_value, max_value], "k--")

label = "raw counts" if snakemake.wildcards.type == "counts" else "posterior counts"
plt.xlabel("MHD4 " + label)
plt.ylabel("MHD2 " + label)
plt.xlim((min_value, max_value))
plt.ylim((min_value, max_value))
plt.text(x=0.5, y=1, s="SMAPE={:.0%}".format(smape),
         horizontalalignment="center", verticalalignment="top",
         transform=plt.gca().transAxes)

logticks = lambda ticks: ["$10^{{{:.0f}}}$".format(y) for y in ticks]
ax = plt.gca()
ax.locator_params(nbins=4)
ax.set_xticklabels(logticks(ax.get_xticks()))
ax.set_yticklabels(logticks(ax.get_yticks()))

sns.despine()
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use("svg")
import matplotlib.pyplot as plt
import seaborn as sns
import re

pattern = re.compile(r"DEBUG: TYPE=EM-iteration, CELL=(?P<cell>\d+), x=(?P<expr>\[(\d+, )+\d+\])")
exprs = []
with open(snakemake.input[0]) as f:
    for l in f:
        m = pattern.match(l)
        if re.match(r"DEBUG: TYPE=EM-iteration, CELL=(?P<cell>\d+)", l):
            print("match")
        if m and m.group("cell") == "1":
            exprs.append(eval(m.group("expr")))
exprs = pd.DataFrame(exprs)

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
x, y = snakemake.config["plots"]["figsize"]
plt.figure(figsize=(x * 1.5, y))

for feat, e in exprs.iteritems():
    curr = e[1:].reset_index(drop=True)
    last = e[:-1].reset_index(drop=True)
    #e = ((curr + 1) / (last + 1))
    plt.semilogy(np.arange(e.size), e, "k-", linewidth=1, alpha=0.5)

sns.despine()
plt.xlabel("EM iteration")
plt.ylabel("expression")
#plt.ylim((0, 200))
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use("svg")
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
x, y = snakemake.config["plots"]["figsize"]
plt.figure(figsize=(x * 1.5, y))

error_rates = pd.concat([pd.read_table(f, index_col=0) for f in snakemake.input],
                        keys=snakemake.params.experiments)
error_rates = pd.DataFrame(error_rates.stack()).reset_index()
error_rates.columns = ["experiment", "pos", "error-rate", "rate"]
error_rates["pos"] += 1

sns.barplot(x="pos", y="rate", hue="error-rate", data=error_rates,
            errwidth=1, linewidth=0, palette=["red", "grey"], saturation=0.7)
plt.ylim((0.0, plt.ylim()[1] + 0.1))
plt.ylabel("")
plt.xlabel("position")
sns.despine()

plt.savefig(snakemake.output[0], bbox_inches="tight")

print("mean error rates per position")
means = error_rates.groupby(["error-rate", "pos"])["rate"].mean()
print("p0", list(means.loc["p0"].values))
print("p1", list(means.loc["p1"].values))
print("mean overall")
print(error_rates.groupby("error-rate")["rate"].mean())
```
```python
import pandas as pd
import matplotlib as mpl
mpl.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)

codebook = pd.read_table(snakemake.input.codebook, index_col=0)

all_known_counts = []
raw_errors = []
for raw_counts, known_counts in zip(snakemake.input.raw_counts, snakemake.input.known_counts):
    known_counts = pd.read_table(known_counts, index_col=[0, 1])
    known_counts = known_counts.reindex(codebook.index, level=1)
    all_known_counts.append(known_counts)

    raw_counts = pd.read_table(raw_counts, index_col=[0, 1])
    raw_counts = raw_counts["exact"] + raw_counts["corrected"]
    raw_counts = raw_counts.reindex(known_counts.index, fill_value=0)
    raw_errors.append(raw_counts - known_counts["count"])
raw_error_mean = pd.concat(raw_errors).mean()

errors = []
for uncertainty in [0, 5, 10, 20, 30]:
    u = "err-{}%".format(uncertainty) if uncertainty > 0 else "default"
    for mean, posterior_counts, known_counts in zip(snakemake.params.means,
                                                    snakemake.input.get(u),
                                                    all_known_counts):
        posterior_estimates = pd.read_table(posterior_counts, index_col=[0, 1])
        posterior_estimates = posterior_estimates.reindex(known_counts.index, fill_value=0)
        errors.append(pd.DataFrame({"error": posterior_estimates["expr_map"] - known_counts["count"],
                                    "mean": mean,
                                    "uncertainty": uncertainty}))
errors = pd.concat(errors)

x, y = snakemake.config["plots"]["figsize"]
plt.figure(figsize=(x * 1.5, y))
colors = sns.xkcd_palette(["light red"])
sns.violinplot(x="uncertainty", y="error", data=errors, bw=1, inner="quartile",
               palette=colors, linewidth=1)
plt.plot(plt.xlim(), [0, 0], "-k", linewidth=1, zorder=-5)
plt.plot(plt.xlim(), [raw_error_mean] * 2, ":k", linewidth=1, zorder=-5)
sns.despine()
plt.xlabel("error rate underestimation (%)")
plt.ylabel("predicted - truth")

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.set(style="ticks", palette="colorblind", context="paper")
plt.figure(figsize=snakemake.config["plots"]["figsize"])

for f in snakemake.input:
    counts = pd.read_table(f, index_col=[0, 1])
    plt.scatter(counts["exact"], counts["corrected"], s=2, c="k", alpha=0.6,
                edgecolors="face", rasterized=True)

plt.xlabel("exact counts")
plt.ylabel("corrected counts")
plt.xlim((0, 1000))
plt.ylim((0, 1000))
plt.locator_params(nbins=4)
#ax = plt.gca()
#ax.set_yscale('log')
#ax.set_xscale('log')
sns.despine()

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
plt.figure(figsize=snakemake.config["plots"]["figsize"])

means = []
stds = []
for f in snakemake.input:
    exprs = pd.read_table(f, index_col=0)
    for cell in exprs:
        expr = exprs[cell]
        means.append(expr.mean())
        stds.append(expr.std())
        expr = np.log10(expr + 1)
        sns.kdeplot(expr, color="black", clip=[0, expr.max()], alpha=0.1, lw=1,
                    label="", kernel="gau", bw="scott")
means = np.array(means)
stds = np.array(stds)
cvs = stds / means

sns.kdeplot(np.log10(means + 1), color="red", clip=[0, means.max()], label="means")
sns.kdeplot(np.log10(stds + 1), color="red", linestyle="--", clip=[0, stds.max()],
            label="standard deviations")

m = means.mean()
cv = cvs.mean()
print(m, cv)

#def nbinom_params(mean, cv):
#    variance = (cv * mean) ** 2
#    p = 1 - ((variance - mean) / variance)
#    n = (mean * p) / (1 - p)
#    return n, p

#d = np.random.negative_binomial(*nbinom_params(m, cv), 5000)
#print(np.mean(d), np.std(d), np.std(d) / np.mean(d))
#sns.kdeplot(np.log10(d + 1), color="red", linestyle=":", clip=[0, d.max()], label="simulated example")

plt.xlim([0, plt.xlim()[1]])
plt.xlabel("log10 counts")
plt.ylabel("density")
plt.legend(loc="upper right")
sns.despine()

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import pandas as pd
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
idx = pd.IndexSlice

import merfishtools

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
plt.figure(figsize=snakemake.config["plots"]["figsize"])

cdf = merfishtools.read_cdf(snakemake.input.expr).loc[idx[:, snakemake.wildcards.gene], :]
est = merfishtools.read_exp_estimates(snakemake.input.expr_est).loc[idx[:, snakemake.wildcards.gene], :].iloc[0]
print(est)

raw_counts = pd.read_table(
    snakemake.input.raw_counts, index_col=1).loc[snakemake.wildcards.gene]
count_exact = raw_counts["exact"]
count_corrected = raw_counts["corrected"]
count_total = count_exact + count_corrected

merfishtools.plot_pmf(
    cdf,
    map_value=est["expr_map"],
    credible_interval=est[["expr_ci_lower", "expr_ci_upper"]],
    legend=False)

# plot raw counts
ylim = plt.ylim()
ylim = [0, ylim[1]]
plt.vlines([count_total], *ylim, colors="grey", linestyles="--", label="total count", zorder=6)
plt.vlines([count_exact], *ylim, colors="grey", linestyles=":", label="exact count", zorder=6)
plt.ylim(ylim)

plt.xlabel("expression")
plt.ylabel("PMF")
if snakemake.wildcards.legend == "legend":
    #plt.legend(loc="upper left", bbox_to_anchor=(0.5, 1))
    plt.legend(loc="best")
plt.xlim([0, plt.xlim()[1]])
sns.despine()

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
idx = pd.IndexSlice

import merfishtools

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
plt.figure(figsize=snakemake.config["plots"]["figsize"])

cdf = merfishtools.read_cdf(snakemake.input.fc).loc[snakemake.wildcards.gene]
est = merfishtools.read_diffexp_estimates(snakemake.input.fc_est).loc[snakemake.wildcards.gene]

merfishtools.plot_cdf(cdf,
                      map_value=est["log2fc_map"],
                      credible_interval=est[["log2fc_ci_lower", "log2fc_ci_upper"]],
                      legend=snakemake.wildcards.legend == "legend")

plt.xlabel("log2 fold change")
plt.ylabel("CDF")
if snakemake.wildcards.legend == "legend":
    plt.legend(loc="best")
sns.despine()

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms import bipartite
import pandas as pd
import seaborn as sns

terms = pd.read_table(snakemake.input.terms, index_col=0)
genes = pd.read_table(snakemake.input.genes)
genes.index = genes["goterm"]

terms = terms[(terms["Pvalue"] <= 0.05) & (terms["Size"] >= 5)]
genes = genes.loc[terms.index]
genes = genes.set_index(["goterm", "gene"])

#matrix = ~genes["cv"].unstack().isnull()
matrix = genes["cv"].unstack()
matrix = matrix.loc[terms.index]
matrix.sort_index(axis=1, inplace=True)
print(matrix)

annot = genes["fdr"].unstack()
annot = annot.loc[terms.index]
annot.sort_index(axis=1, inplace=True)
print(annot)
significant = annot <= 0.05
annot[significant] = "•"
isna = annot.isnull()
annot = annot.fillna(" ")
print(annot)
annot = annot.where(significant | isna, other=" ")
print(annot)

plt.figure(figsize=(10, 2))
sns.heatmap(matrix, cmap=plt.cm.Reds, annot=annot, fmt="s", cbar_kws={"label": "CV"})
plt.ylabel("")
plt.xlabel("")

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.axes_grid.inset_locator import inset_axes

import merfishtools

sns.set(style="ticks", context=snakemake.wildcards.context)
sns.set_palette("Paired", n_colors=7)

fig_x, fig_y = snakemake.config["plots"]["figsize"]
figsize = (fig_x * 4, fig_y * 4)
plt.figure(figsize=figsize)

exprs = [pd.read_table(f, index_col=0).stack(0) for f in snakemake.input.exprs]
exprs = pd.concat(exprs, keys=snakemake.params.expmnts)

diffexp = merfishtools.read_diffexp_estimates(snakemake.input.diffexp)
diffexp.sort_values("cv_map", inplace=True)
cdfs = merfishtools.read_cdf(snakemake.input.diffexp_cdf)

significant = diffexp[diffexp["diff_fdr"] <= 0.05]
if not significant.empty:
    significant = significant[~significant.index.str.startswith("blank") &
                              ~significant.index.str.startswith("notarget")]
exprs = exprs.loc[:, significant.index]

n = exprs.index.levels[1].size
for i, (gene, gene_exprs) in enumerate(exprs.groupby(level=1)):
    vmax = gene_exprs.quantile(0.95)
    ax = plt.subplot(4, 4, i + 1)
    for _, exp_exprs in gene_exprs.groupby(level=0):
        ax = sns.kdeplot(exp_exprs, ax=ax, linewidth=1, alpha=0.5, clip=(0, vmax), clip_on=True)
    if i == n - 1:
        plt.xlabel("gene expression")
    plt.ylabel(gene)
    plt.xlim((0, vmax))
    sns.despine()
    plt.yticks([])

    # plot cdf
    cdf_ax = inset_axes(ax, width="30%", height=0.5, loc=1)
    cdf = cdfs.loc[gene]
    est = significant.loc[gene]
    merfishtools.plot_cdf(cdf,
                          map_value=est["cv_map"],
                          credible_interval=est[["cv_ci_lower", "cv_ci_upper"]],
                          legend=False)
    plt.setp(cdf_ax.get_xticklabels(), rotation=45, ha="right")
    cdf_ax.tick_params(pad=1)
    plt.locator_params(nbins=4)
    sns.despine()

plt.tight_layout()
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import matplotlib as mpl
mpl.use("svg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

errors = []
for m, f in zip(snakemake.params.ms, snakemake.input):
    d = pd.read_table(f)
    d["m"] = m
    errors.append(d)
errors = pd.concat(errors)
errors["error"] = errors["error"].abs()
print(errors)

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
colors = sns.xkcd_palette(["grey", "light red", "red"])

g = sns.FacetGrid(errors, col="type")
g.map(sns.boxplot, "m", "error").despine()

#plt.xlabel("1-bits")
#plt.ylabel("absolute error")
#plt.legend(loc="best")
#sns.despine()
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import matplotlib as mpl
mpl.use("svg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

data = []
for exprs, counts in zip(snakemake.input.estimates, snakemake.input.counts):
    exprs = pd.read_table(exprs, index_col=[0, 1])
    counts = pd.read_table(counts, index_col=[0, 1])
    d = pd.concat([exprs, counts], axis="columns")
    d["diff"] = d["expr_map"] - d["expr_naive"]
    data.append(d)
data = pd.concat(data)

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
fx, fy = snakemake.config["plots"]["figsize"]
plt.figure(figsize=[fx * 2, fy])

colors = sns.xkcd_palette(["light red"])
hist = data["diff"].value_counts(normalize=True)
hist = hist.reindex(np.arange(hist.index.min(), hist.index.max(), dtype=np.int64), fill_value=0)
sns.barplot(hist.index, hist, color=colors[0], log=True)
plt.xlabel("MAP - naive")
plt.ylabel("fraction")
sns.despine()
plt.savefig(snakemake.output.hist, bbox_inches="tight")

plt.figure(figsize=[fx, fy])
plt.scatter(x="exact", y="corrected", c="diff", data=data, cmap="viridis",
            edgecolors="face", alpha=0.7, rasterized=True)
xlim = plt.xlim()
ylim = plt.ylim()
plt.xlim([0, xlim[1]])
plt.ylim([0, ylim[1]])
xlim = plt.xlim()
ylim = plt.ylim()
# helper line
vmin = min(xlim[1], ylim[1])
plt.plot([0, vmin], [0, vmin], "k--", linewidth=1, alpha=0.7)
plt.xlabel("exact")
plt.ylabel("corrected")
sns.despine()
cb = plt.colorbar()
cb.set_label("MAP - naive")
plt.savefig(snakemake.output.scatter, bbox_inches="tight")
```
```python
from itertools import combinations
import math
import matplotlib as mpl
mpl.use("svg")
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns


def hamming_distance(a, b):
    return sum(x != y for x, y in zip(a, b))


# read data
codebook = pd.read_table(snakemake.input.codebook, index_col=0, dtype=np.dtype(str))["codeword"]
if snakemake.wildcards.pred == "raw":
    counts = pd.read_table(snakemake.input.counts, index_col=1)
    counts["total"] = counts["corrected"] + counts["exact"]
else:
    counts = pd.read_table(snakemake.input.exprs, index_col=1)
    counts["total"] = counts["expr_map"]
counts = counts.loc[codebook.index]
counts = counts.reset_index()

feature = snakemake.wildcards.feature
if feature == "maxexpr":
    feature = counts["feat"][counts["total"].argmax()]
codeword = codebook[feature]

counts["dist"] = [hamming_distance(codeword, codebook[feat]) for feat in counts["feat"]]
counts.sort_values(["dist", "feat"], inplace=True)
counts = counts[~(counts["feat"].str.startswith("blank") |
                  counts["feat"].str.startswith("notarget"))]

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
x, y = snakemake.config["plots"]["figsize"]
plt.figure(figsize=(x * 1.5, y))

sns.stripplot(x="feat", y="total", hue="dist", data=counts, rasterized=True,
              palette="Reds_d", size=2)
plt.xticks([])
plt.ylim((0, 500))
plt.legend().set_title("hamming distance to {}".format(feature))
if snakemake.wildcards.pred == "raw":
    plt.ylabel("raw counts")
else:
    plt.ylabel("posterior estimate")
plt.xlabel("transcripts")
sns.despine(offset=5)
plt.savefig(snakemake.output.strip, bbox_inches="tight")

plt.figure()
dist = 2 if 2 in counts["dist"].values else 4
for _, d in counts[counts["dist"] != dist].groupby("cell"):
    sns.kdeplot(np.log10(d["total"]), color="k", label=None, legend=False, linewidth=1, alpha=0.5)
for _, d in counts[counts["dist"] == dist].groupby("cell"):
    sns.kdeplot(np.log10(d["total"]), color="r", label=None, legend=False, linewidth=1, alpha=0.5)
if snakemake.wildcards.pred == "raw":
    plt.xlabel("log10 raw counts")
else:
    plt.xlabel("log10 posterior expression")
plt.ylabel("density")
sns.despine()
plt.savefig(snakemake.output.kde, bbox_inches="tight")
```
```python
import pandas as pd
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
plt.figure(figsize=snakemake.config["plots"]["figsize"])
ax = plt.subplot(111, aspect="equal")

for f in snakemake.input:
    exprs = pd.read_table(f, index_col=0)
    mean = exprs.mean(axis="columns")
    var = exprs.var(axis="columns")
    plt.loglog(mean, var, "k.", alpha=0.5)

max_val = max(plt.xlim()[1], plt.ylim()[1])
min_val = min(plt.xlim()[0], plt.ylim()[0])
lim = (min_val, max_val)
plt.xlim(lim)
plt.ylim(lim)

x = plt.xlim()
plt.loglog(x, x, "-r", alpha=0.8)
plt.xlabel("mean expression")
plt.ylabel("variance")
sns.despine()

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)

quantiles = np.linspace(0, 1, 100)
exprs = [pd.read_table(f, index_col=0) for f in snakemake.input]

plt.figure(figsize=(15, 15))
for (i, (a, a_name)), (j, (b, b_name)) in combinations(
        enumerate(zip(exprs, snakemake.params.experiments)), 2):
    plt.subplot2grid([len(exprs)] * 2, (i, j))
    q_a = a.unstack().quantile(quantiles)
    q_b = b.unstack().quantile(quantiles)
    vmax = max(q_a.max(), q_b.max())
    plt.loglog(q_b, q_a, "k.")
    plt.loglog([0.01, vmax], [0.01, vmax], "r--")
    plt.xlabel("experiment {}".format(b_name))
    plt.ylabel("experiment {}".format(a_name))
    sns.despine()

plt.tight_layout()
plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
x, y = snakemake.config["plots"]["figsize"]
plt.figure(figsize=(5.59, y))

pmfs = []
for i, (mean, pmf, known_counts) in enumerate(zip(
        snakemake.params.means, snakemake.input.pmfs, snakemake.input.known_counts)):
    print(mean)
    pmf = pd.read_table(pmf, index_col=[0, 1])
    known_counts = pd.read_table(known_counts, index_col=[0, 1])
    codebook = "mhd4" if snakemake.wildcards.dist == "4" else "mhd2"
    known_counts = known_counts[known_counts[codebook]]["count"]

    common = pmf.index.intersection(known_counts.index)
    known_counts = known_counts.loc[common]
    pmf = pmf.loc[common]

    pmf["expr"] -= known_counts
    pmf.set_index("expr", append=True, inplace=True)
    pmf = pmf.unstack(level=[0, 1])
    pmf.columns = known_counts
    pmf = np.exp(pmf.sample(10, axis=1, random_state=mean))
    pmfs.append(pmf)

print("plotting")
pmfs = pd.concat(pmfs, axis=1)
pmfs = pmfs.reindex(np.arange(-20, 21))
pmfs.fillna(0, inplace=True)
pmfs.sort_index(ascending=False, inplace=True)
pmfs.sort_index(inplace=True, axis=1)

sns.heatmap(pmfs, cbar=False, cmap="Greys", xticklabels=False, yticklabels=10)
plt.ylabel("predicted - truth")
plt.xlabel("PMF")

plt.savefig(snakemake.output[0], bbox_inches="tight")
```
```python
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use("svg")
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn.palettes import blend_palette
from seaborn.utils import set_hls_values


def ci_error(lower, upper, truth):
    below = np.maximum(lower - truth, 0)
    above = np.maximum(truth - upper, 0)
    return below + above


epsilon = 0.001
sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)

codebook = pd.read_table(snakemake.input.codebook, index_col=0)

errors = []
counts = []
ci_errors = []
for i, (mean, posterior_counts, raw_counts, known_counts) in enumerate(zip(
        snakemake.params.means, snakemake.input.posterior_counts,
        snakemake.input.raw_counts, snakemake.input.known_counts)):
    posterior_estimates = pd.read_table(posterior_counts, index_col=[0, 1])
    raw_counts = pd.read_table(raw_counts, index_col=[0, 1])
    raw_counts = raw_counts["exact"] + raw_counts["corrected"]
    known_counts = pd.read_table(known_counts, index_col=[0, 1])
    known_counts = known_counts.reindex(codebook.index, level=1)
    # remove PTPN14 because we have artificially increased its simulated expression
    # this will bias our plots
    known_counts = known_counts[known_counts.index.get_level_values(1) != "PTPN14"]
    raw_counts = raw_counts.reindex(known_counts.index, fill_value=0)
    posterior_estimates = posterior_estimates.reindex(known_counts.index, fill_value=0)

    dropouts = known_counts[(known_counts["count"] > 0) & (raw_counts == 0)]
    print("dropouts", dropouts)

    counts.append(pd.DataFrame({"known": known_counts["count"],
                                "raw": raw_counts,
                                "posterior": posterior_estimates["expr_map"]}))
    errors.append(pd.DataFrame({"error": raw_counts - known_counts["count"],
                                "mean": mean, "type": "raw",
                                "known": known_counts["count"]}))
    errors.append(pd.DataFrame({"error": posterior_estimates["expr_map"] - known_counts["count"],
                                "mean": mean, "type": "posterior",
                                "known": known_counts["count"]}))
    errors.append(pd.DataFrame({
        "error": ci_error(posterior_estimates["expr_ci_lower"],
                          posterior_estimates["expr_ci_upper"],
                          known_counts["count"]),
        "mean": mean, "type": "ci",
        "known": known_counts["count"]}))

counts = pd.concat(counts)


def plot_hexbin(type, path, color):
    color_rgb = mpl.colors.colorConverter.to_rgb(color)
    colors = [set_hls_values(color_rgb, l=l) for l in np.linspace(1, 0, 12)]
    cmap = blend_palette(colors, as_cmap=True)
    plt.figure(figsize=0.75 * np.array(snakemake.config["plots"]["figsize"]))
    plt.subplot(111, aspect="equal")
    #plt.scatter(counts["known"], counts[type], s=1, c="k", alpha=0.3, rasterized=True, edgecolors="face", marker="o")
    plt.hexbin(counts["known"], counts[type], cmap=cmap, gridsize=25, clip_on=True)
    maxv = max(plt.xlim()[1], plt.ylim()[1])
    plt.plot([0, maxv], [0, maxv], "--k")
    plt.xlim((0, maxv))
    plt.ylim((0, maxv))
    plt.ylabel("predicted")
    plt.xlabel("truth")
    sns.despine()
    plt.savefig(path, bbox_inches="tight")


colors = sns.xkcd_palette(["grey", "light red"])
plot_hexbin("raw", snakemake.output.scatter_raw, colors[0])
plot_hexbin("posterior", snakemake.output.scatter_posterior, colors[1])

errors = pd.concat(errors)

x, y = snakemake.config["plots"]["figsize"]
plt.figure(figsize=(x * 1.5, y))
pred_errors = errors[(errors["type"] == "raw") | (errors["type"] == "posterior")]
#bins = pd.cut(pred_errors["known"],
#              [0, 6, 11, 16, 21, 26, 30, 100000],
#              right=False,
#              labels=["0-5", "6-10", "11-15", "16-20", "21-25", "26-30", "≥30"])
#pred_errors["bin"] = bins
sns.violinplot(x="mean", y="error", hue="type", data=pred_errors,
               bw=1, split=True, inner="quartile", palette=colors, linewidth=1)
plt.plot(plt.xlim(), [0, 0], "-k", linewidth=1, zorder=-5)
plt.xlabel("mean expression")
plt.ylabel("predicted - truth")
plt.legend(loc="lower left")
sns.despine()
plt.savefig(snakemake.output.violin, bbox_inches="tight")

# plot CI errors
ci_errors = errors[errors["type"] == "ci"]["error"].astype(np.int64)
hist = ci_errors.value_counts(normalize=True)
hist = hist[[0, 1, 2]]
plt.figure(figsize=(0.2 * hist.size, y))
sns.barplot(hist.index, hist, color=colors[1])
plt.ylabel("fraction")
plt.xlabel("CI error")
sns.despine()
plt.savefig(snakemake.output.ci_errors, bbox_inches="tight")

# print RMSE
rmse = lambda errors: np.sqrt((errors ** 2).mean())
print("posterior RMSE", rmse(errors[errors["type"] == "posterior"]["error"]))
print("raw RMSE", rmse(errors[errors["type"] == "raw"]["error"]))

# write errors
errors.to_csv(snakemake.output.errors, sep="\t")
```
```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, manifold, decomposition
from itertools import combinations
from matplotlib import gridspec

experiments = np.array(list(range(1, len(snakemake.input) + 1)))

# load data
exprs = [pd.read_table(f, index_col=0) for f in snakemake.input.exprs]
exprs = pd.concat(exprs, axis="columns", keys=experiments, names=["expmnt", "cell"])
exprs = np.log10(1 + exprs.transpose())
exprs.index = exprs.index.set_levels(exprs.index.levels[1].astype(np.int64), level=1)

cellprops = [pd.read_table(f, index_col=0) for f in snakemake.input.cellprops]
cellprops = pd.concat(cellprops, keys=experiments, names=["expmnt", "cell"])
cellprops.columns = ["area", "cell_x", "cell_y"]

# reduce cell position dimensionality to 1 dimension
#pca = decomposition.PCA(n_components=1)
tsne = manifold.TSNE(n_components=1, random_state=2390)
pos = tsne.fit_transform(cellprops[["cell_x", "cell_y"]])[:, 0]
cellprops["pos"] = pos

# calculate t-SNE embedding
tsne = manifold.TSNE(random_state=21498)
X = preprocessing.scale(exprs)
embedding = pd.DataFrame(tsne.fit_transform(X), columns=["x", "y"])
embedding.index = exprs.index

# plot
sns.set(style="ticks", palette="colorblind", context=snakemake.wildcards.context)
width, height = snakemake.config["plots"]["figsize"]
fig = plt.figure(figsize=[0.75 * snakemake.config["plots"]["figsize"][0]] * 2)
plt.subplot(111, aspect="equal")

for expmnt, codebook in zip(experiments, snakemake.params.codebooks):
    embedding.loc[expmnt, "codebook"] = codebook
embedding["codebook"] = embedding["codebook"].astype("category")
embedding = pd.concat([embedding, cellprops], axis="columns")
embedding.reset_index(inplace=True)

if snakemake.wildcards.highlight == "expmnt":
    colors = sns.color_palette("Paired", len(experiments))
    highlight = [colors[e] for e in embedding["expmnt"]]
    cmap = None
elif snakemake.wildcards.highlight == "codebook":
    codebooks = embedding["codebook"].cat.categories
    colors = dict(zip(codebooks, sns.color_palette("muted", len(codebooks))))
    highlight = [colors[c] for c in embedding["codebook"]]
    cmap = None
elif snakemake.wildcards.highlight == "cellsize":
    highlight = "area"
    cmap = "viridis"
elif snakemake.wildcards.highlight == "cellpos":
    highlight = "pos"
    cmap = "viridis"

ax = plt.scatter("x", "y", c=highlight, s=5, data=embedding, cmap=cmap,
                 alpha=0.7, edgecolors="face")
plt.axis("off")
from matplotlib.ticker import NullLocator
plt.gca().xaxis.set_major_locator(NullLocator())
plt.gca().yaxis.set_major_locator(NullLocator())

if snakemake.wildcards.highlight == "cellsize":
    cb = plt.colorbar(ax)
    cb.set_label("cell size in nm²")

plt.savefig(snakemake.output[0])
```
```python
import pandas as pd
import numpy as np

raw = pd.read_table(snakemake.input[0], index_col=[0, 1])


def exact(d):
    return (1 - d).sum()


def corrected(d):
    return d.sum()


genes = raw.groupby(level=[0, 1])
counts = pd.DataFrame({"exact": genes["dist"].aggregate(exact),
                       "corrected": genes["dist"].aggregate(corrected)})
counts.to_csv(snakemake.output[0], sep="\t")
```
```python
from collections import defaultdict
import csv
import pandas as pd
import numpy as np

#def nbinom_params(mean, cv):
#    variance = (cv * mean) ** 2
#    p = 1 - ((variance - mean) / variance)
#    n = (mean * p) / (1 - p)
#    return n, p

# Fix the seed for a particular mean, so that results are reproducible.
np.random.seed(int(snakemake.wildcards.mean))

codebook_mhd4 = pd.read_table(snakemake.input.mhd4, index_col=0, dtype=np.dtype(str))["codeword"]
codebook_mhd2 = pd.read_table(snakemake.input.mhd2, index_col=0, dtype=np.dtype(str))["codeword"]
genes = set(codebook_mhd2.index) | set(codebook_mhd4.index)

with open(snakemake.output[0], "w") as known_out:
    known_out = csv.writer(known_out, delimiter="\t")
    known_out.writerow(["cell", "feat", "mhd2", "mhd4", "count"])
    for cell in range(snakemake.params.cell_count):
        random_counts = np.random.poisson(int(snakemake.wildcards.mean), len(genes))
        for gene, count in zip(genes, random_counts):
            if gene == "PTPN14":
                # PTPN14 occurs in all codebooks.
                count = np.random.poisson(1500)
            if gene.startswith("notarget") or gene.startswith("blank"):
                count = 0
            known_out.writerow([cell, gene, gene in codebook_mhd2.index,
                                gene in codebook_mhd4.index, count])
```
```python
from collections import Counter, defaultdict
from itertools import chain
import csv
import pandas as pd
import numpy as np
from bitarray import bitarray

noise_rate = 0.5

# Fix the seed for a particular mean, so that results are reproducible.
np.random.seed(int(snakemake.wildcards.mean))

p0 = snakemake.params.ds["err01"]
p1 = snakemake.params.ds["err10"]
N = snakemake.params.ds["N"]
m = snakemake.params.ds["m"]


def sim_errors(word):
    p = np.random.uniform(0, 1, len(word))
    err10 = bitarray(list(p <= p1)) & word
    err01 = bitarray(list(p <= p0)) & ~word
    readout = (word ^ err10) ^ err01
    err_10_count = err10.count(True)
    err_01_count = err01.count(True)
    errs = err_10_count + err_01_count
    assert errs == 0 or word != readout
    return readout, errs, err_01_count, err_10_count


def hamming1_env(word):
    for i in range(len(word)):
        w = word.copy()
        w[i] ^= 1
        yield w


codebook = pd.read_table(snakemake.input.codebook, index_col=0, dtype=np.dtype(str))
codebook["codeword"] = codebook["codeword"].apply(bitarray)
codebook["expressed"] = codebook["expressed"] == "1"
noise_word = bitarray("0" * len(codebook["codeword"].iloc[0]))

known_counts = pd.read_table(snakemake.input.known_counts, index_col=[0, 1])


def simulate(codebook, counts_path, readouts_path, stats_path, has_corrected=True):
    lookup_exact = {word.tobytes(): gene for gene, word, _ in codebook.itertuples()}
    if has_corrected:
        lookup_corrected = {w.tobytes(): gene
                            for gene, word, _ in codebook.itertuples()
                            for w in hamming1_env(word)}
    else:
        lookup_corrected = {}

    with open(counts_path, "w") as sim_out, open(readouts_path, "w") as readouts_out:
        sim_out = csv.writer(sim_out, delimiter="\t")
        sim_out.writerow(["cell", "feat", "dist", "cell_x", "cell_y", "x", "y"])
        readouts_out = csv.writer(readouts_out, delimiter="\t")
        readouts_out.writerow(["cell", "feat", "readout"])

        stats = []
        for cell in range(snakemake.params.cell_count):
            readouts = []
            words = []
            genes = []
            errors = []
            err_01_counts = []
            err_10_counts = []
            total_counts = 0
            for gene, word, expressed in codebook.itertuples():
                if not expressed:
                    # Skip entries that are marked as not expressed in the codebook.
                    # These can act as misidentification probes.
                    continue
                count = known_counts.loc[cell, gene]["count"]
                total_counts += count
                for _ in range(count):
                    readout, errs, err_01_count, err_10_count = sim_errors(word)
                    errors.append(errs)
                    err_01_counts.append(err_01_count)
                    err_10_counts.append(err_10_count)
                    readouts.append(readout)
                    words.append(word)
                    genes.append(gene)
            print("effective 0-1 error-rate:", sum(err_01_counts) / ((N - m) * len(readouts)))
            print("effective 1-0 error-rate:", sum(err_10_counts) / (m * len(readouts)))

            # add noise
            noise_count = int(noise_rate * total_counts)
            print("noise count: ", noise_count)
            for _ in range(noise_count):
                readout, errs, err_01_count, err_10_count = sim_errors(noise_word)
                if has_corrected:
                    if errs < 3:
                        continue
                elif errs < 4:
                    continue
                errors.append(errs)
                readouts.append(readout)
                words.append(noise_word)
                genes.append("noise")

            ReadoutCounter = lambda: defaultdict(list)
            exact_counts = ReadoutCounter()
            corrected_counts = ReadoutCounter()
            exact_miscalls = ReadoutCounter()
            corrected_miscalls = ReadoutCounter()
            exact_noise_miscalls = ReadoutCounter()
            corrected_noise_miscalls = ReadoutCounter()
            for readout, errs, word, orig_gene in zip(readouts, errors, words, genes):
                try:
                    gene = lookup_exact[readout.tobytes()]
                    exact_counts[gene].append(readout)
                    if errs > 0:
                        if orig_gene == "noise":
                            exact_noise_miscalls[gene].append(readout)
                        else:
                            exact_miscalls[gene].append(readout)
                except KeyError:
                    try:
                        gene = lookup_corrected[readout.tobytes()]
                        corrected_counts[gene].append(readout)
                        if errs > 1:
                            if orig_gene == "noise":
                                corrected_noise_miscalls[gene].append(readout)
                            else:
                                corrected_miscalls[gene].append(readout)
                    except KeyError:
                        # readout is lost
                        pass

            for gene in set(chain(exact_counts, corrected_counts)):
                for _ in range(len(exact_counts[gene])):
                    sim_out.writerow([cell, gene, 0, 0, 0, 0, 0])
                for _ in range(len(corrected_counts[gene])):
                    sim_out.writerow([cell, gene, 1, 0, 0, 0, 0])
                for readout in chain(exact_counts[gene], corrected_counts[gene]):
                    readouts_out.writerow([cell, gene, readout.to01()])

            for gene in exact_counts:
                known = known_counts.loc[cell, gene]["count"]
                counts = len(exact_counts[gene]) + len(corrected_counts[gene])
                _exact_miscalls = len(exact_miscalls[gene])
                _corrected_miscalls = len(corrected_miscalls[gene])
                noise_miscalls = len(exact_noise_miscalls[gene]) + len(corrected_noise_miscalls[gene])
                miscalls = _exact_miscalls + _corrected_miscalls + noise_miscalls
                missed = known - counts + miscalls
                stats.append([cell, gene, known, missed, counts, miscalls, noise_miscalls])

    stats = pd.DataFrame(stats)
    stats.columns = ["cell", "gene", "truth", "missed", "counts", "miscalls", "noise-miscalls"]
    stats["total"] = stats["truth"] + stats["miscalls"]
    stats["calls"] = stats["counts"] - stats["miscalls"]
    stats["call-rate"] = stats["calls"] / stats["total"]
    stats["miscall-rate"] = stats["miscalls"] / stats["total"]
    stats["missed-rate"] = stats["missed"] / stats["total"]
    stats.to_csv(stats_path, sep="\t")

    print("rates vs counts")
    print("miscalls", (stats["miscalls"] / stats["counts"]).mean())
    print("calls", (stats["calls"] / stats["counts"]).mean())
    print("missed", (stats["missed"] / stats["counts"]).mean())
    print("rates vs truth")
    print("miscalls", (stats["miscalls"] / stats["truth"]).mean())
    print("calls", (stats["calls"] / stats["truth"]).mean())
    print("missed", (stats["missed"] / stats["truth"]).mean())
    print("rates vs total")
    print("miscalls", (stats["miscalls"] / stats["total"]).mean())
    print("calls", (stats["calls"] / stats["total"]).mean())
    print("missed", (stats["missed"] / stats["total"]).mean())


print("Simulating dataset")
simulate(codebook, snakemake.output.sim_counts, snakemake.output.readouts,
         snakemake.output.stats, has_corrected=snakemake.params.ds["has_corrected"])
```
```
script:
    "scripts/format-dataset.py"
```

```
shell:
    "cut -f1 {input.template} | tail -n+2 | merfishtools gen-mhd2 "
    "-N {params.ds[N]} -m {params.ds[m]} "
    "--not-expressed '(notarget|blank).+' "
    "> {output}"
```

```
shell:
    "cut -f1 {input.template} | tail -n+2 | merfishtools gen-mhd4 "
    "-m {params.ds[m]} --not-expressed '(notarget|blank).+' "
    "> {output}"
```

```
script:
    "scripts/cell-properties.py"
```

```
script:
    "scripts/raw-counts.py"
```

```
script:
    "scripts/count-matrix.py"
```

```
shell:
    "grep -P '^{wildcards.experiment}\\t' {input.readouts} | cut -f2,3,4 | "
    "merfishtools est-error-rates {input.codebook} > {output}"
```

```
shell:
    "merfishtools est-error-rates {input.codebook} "
    "< {input.readouts} > {output}"
```

```
shell:
    "merfishtools -v exp {input.codebook} --p0 {params.ds[err01]} "
    "--p1 {params.ds[err10]} --seed 9823749 "
    "--estimate {output.est} -t {threads} "
    "--stats {output.stats} "
    "< {input.data} > {output.pmf} 2> {log}"
```

```
script:
    "scripts/expression-matrix.py"
```

```
script:
    "scripts/normalize.py"
```

```
script:
    "scripts/normalize-matrix.py"
```

```
script:
    "scripts/normalize-cdf.py"
```

```
shell:
    "merfishtools diffexp -t {threads} --pseudocounts 1 "
    "--max-null-log2fc 1.0 --cdf {output.cdf} {input} "
    "> {output.est}"
```

```
shell:
    "merfishtools multidiffexp --pseudocounts 1 "
    "-t {threads} --max-null-cv 0.5 "
    "--cdf {output.cdf} {input} > {output.est}"
```

```
script:
    "scripts/go-enrichment.R"
```

```
script:
    "scripts/simulate-counts.py"
```

```
script:
    "scripts/simulate-dataset.py"
```
```
script:
    "scripts/plot-error-rates.py"
```

```
script:
    "scripts/plot-error-rate-uncertainty.py"
```

```
script:
    "scripts/plot-cv-raw-vs-posterior.py"
```

```
script:
    "scripts/plot-naive-vs-map.py"
```

```
script:
    "scripts/plot-go-enrichment.py"
```

```
script:
    "scripts/plot-multidiffexp.py"
```

```
script:
    "scripts/plot-expression-pmf.py"
```

```
script:
    "scripts/plot-foldchange-cdf.py"
```

```
script:
    "scripts/plot-qq.py"
```

```
script:
    "scripts/plot-expression-dist.py"
```

```
script:
    "scripts/plot-overdispersion.py"
```

```
script:
    "scripts/plot-correlation.py"
```

```
script:
    "scripts/plot-tsne.py"
```

```
script:
    "scripts/plot-simulation.py"
```

```
script:
    "scripts/plot-m-vs-errors.py"
```

```
script:
    "scripts/plot-simulation-pmf.py"
```

```
script:
    "scripts/plot-dataset-correlation.py"
```

```
script:
    "scripts/plot-dataset-correlation-error.py"
```

```
script:
    "scripts/plot-neighborhood.py"
```

```
script:
    "scripts/plot-em.py"
```

```
script:
    "scripts/fig-thbs1-bias.py"
```

```
script:
    "scripts/fig-error-rates.py"
```

```
script:
    "scripts/fig-error-rate-uncertainty.py"
```

```
script:
    "scripts/fig-naive-vs-map.py"
```

```
script:
    "scripts/fig-example.py"
```

```
script:
    "scripts/fig-simulation.py"
```

```
script:
    "scripts/fig-simulation.py"
```

```
script:
    "scripts/fig-ci-error.py"
```

```
script:
    "scripts/fig-clustering.py"
```

```
script:
    "scripts/fig-cv-raw-vs-posterior.py"
```

```
script:
    "scripts/plot-exact-vs-corrected.py"
```

```
script:
    "scripts/fig-model.py"
```

```
script:
    "scripts/fig-dataset-correlation.py"
```

```
shell:
    "cairosvg -f {wildcards.fmt} {input} -o {output}"
```