This workflow creates small test datasets for NGS data analyses. The generated data is available in the folders `ref` and `reads`, such that the repository can be directly used as a git submodule for continuous integration tests.
Authors
- Johannes Köster (@johanneskoester), https://koesterlab.github.io
- Jana Jansen (@jana-ja)
- Ludmila Janzen (@l-janzen)
- Sophie Sattler (@sophsatt)
- Antonie Vietor (@AntonieV)
Usage
Step 1: Install workflow
If you simply want to use this workflow, download and extract the latest release (TODO: add link). If you intend to modify and further develop this workflow, fork this repository. Please consider providing any generally applicable modifications via a pull request.
In any case, if you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository and, if available, its DOI (see above).
Step 2: Configure workflow
Configure the workflow according to your needs by editing the file `config.yaml`. Further instructions can be found in that file.
Step 3: Execute workflow
Test your configuration by performing a dry-run via

```
snakemake -n
```

Execute the workflow locally via

```
snakemake --cores $N
```

using `$N` cores, or run it in a cluster environment via

```
snakemake --cluster qsub --jobs 100
```

or

```
snakemake --drmaa --jobs 100
```

See the Snakemake documentation for further details.
Code Snippets
```python
import pandas as pd
import seaborn as sns
#import matplotlib
#matplotlib.use("Agg")
import matplotlib.pyplot as plt
#from pysam import VariantFile

#daten = pd.read_csv('sleuth_matrix.csv', sep='\t')
daten = pd.read_csv(snakemake.input[0], sep='\t')

sns.boxenplot(data=daten, scale="linear")
plt.title("Boxen plots of the (normalized) counts of all samples")
#plt.savefig('boxenplot.svg')
plt.savefig(snakemake.output[0])
```
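The plotting scripts in this section are meant to be run through Snakemake's `script:` directive, which injects a global `snakemake` object carrying the `input` and `output` paths. For debugging a script outside the workflow, a minimal stand-in object can be supplied by hand. The sketch below is only an assumption of how that might look: the file paths are placeholders, and it presumes the script accesses nothing beyond `snakemake.input` and `snakemake.output`.

```python
# Minimal sketch (not part of the workflow): emulate the object that Snakemake's
# script: directive normally provides, so a plotting script can be run by hand.
from types import SimpleNamespace
import runpy

snakemake = SimpleNamespace(
    input=["sleuth/sleuth_matrix.csv"],   # placeholder input path
    output=["plots/boxen_plot.svg"],      # placeholder output path
)

# Execute the script in a namespace that already contains `snakemake`.
runpy.run_path("py_scripts/boxen_plot.py", init_globals={"snakemake": snakemake})
```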
```python
import sys
import json
import csv
from collections import OrderedDict

###
### gene1_name gene1_id, gene2_name, gene2_id, type, pair, split, txlist

def loadJSON(fn):
    with open(fn) as f:
        JJ = json.load(f, object_pairs_hook=OrderedDict)
    return JJ['genes']

def outputGeneTable(fusions, outf, filters=None):
    with open(outf, 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(['\t'.join(["geneA.name", "geneA.id", "geneB.name", "geneB.id",
                                       "paircount", "splitcount", "transcripts.list"])])
        for gf in fusions:
            gAname = gf['geneA']['name']
            gAid = gf['geneA']['id']
            gBname = gf['geneB']['name']
            gBid = gf['geneB']['id']
            pairs = str(gf['paircount'])
            split = str(gf['splitcount'])
            txp = [tp['fasta_record'] for tp in gf['transcripts']]
            csvwriter.writerow(['\t'.join([gAname, gAid, gBname, gBid, pairs, split, ';'.join(txp)])])

def usage():
    print("Usage: python flatten_json.py fusion.out.json genetable.csv")
    print("")
    print("  outputs a flat table listing all gene fusions, if the output file is not")

if __name__ == "__main__":
    nargs = len(sys.argv)
    if nargs < 3:  # both the pizzly JSON and the output table path are required here
        usage()
    else:
        infn = sys.argv[1]
        fusions = loadJSON(infn)
        outf = sys.stdout
        outputGeneTable(fusions, sys.argv[2])
```
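For illustration only, the fusion record below is made up, but its fields mirror what the script reads from the pizzly JSON (`geneA`/`geneB` with `name` and `id`, `paircount`, `splitcount`, and `transcripts` entries with a `fasta_record`); feeding it to `outputGeneTable` shows the shape of the resulting table. All values are placeholders.

```python
# Illustrative only: a made-up fusion record with the fields flatten_json.py expects.
# Real input is the JSON produced by pizzly and loaded via loadJSON().
fake_fusions = [{
    "geneA": {"name": "GENE1", "id": "ENSG00000000001"},   # placeholder names/IDs
    "geneB": {"name": "GENE2", "id": "ENSG00000000002"},
    "paircount": 3,
    "splitcount": 5,
    "transcripts": [{"fasta_record": "placeholder_transcript_record"}],
}]

# Writes a one-row gene table to the given path (placeholder file name).
outputGeneTable(fake_fusions, "genetable_example.csv")
```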
```python
import h5py
import numpy as np
import sys
import csv

def get_cumulative_dist(fn):
    f = h5py.File(fn, 'r')
    x = np.asarray(f['aux']['fld'], dtype='float64')
    y = np.cumsum(x) / np.sum(x)
    f.close()
    return y

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Usage: python get_fragment_length.py H5FILE [cutoff] output")
        print("")
        print("Prints 95 percentile fragment length")
        print("")
        print("    H5FILE: abundance.h5 file produced by kallisto")
        print("    cutoff: percentage cutoff to use")
    else:
        fn = sys.argv[1]
        cutoff = 0.95
        if len(sys.argv) >= 4:
            cutoff = float(sys.argv[2])
            if cutoff <= 0 or cutoff >= 1.0:
                cutoff = 0.95
        y = get_cumulative_dist(fn)
        fraglen = np.argmax(y > cutoff)  # first length at which the CDF exceeds the cutoff
        with open(sys.argv[3], 'w') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(["95 percentile fragment length"])
            csvwriter.writerow([fraglen])
```
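The core of this script is the cumulative fragment-length distribution: `np.cumsum(x)/np.sum(x)` turns the length histogram stored under `aux/fld` into a CDF, and `np.argmax(y > cutoff)` picks the first length at which that CDF exceeds the cutoff. A toy example with made-up counts:

```python
import numpy as np

# Made-up fragment-length histogram: index = fragment length, value = count.
x = np.array([0, 0, 1, 4, 10, 20, 10, 4, 1], dtype="float64")

y = np.cumsum(x) / np.sum(x)      # cumulative distribution over lengths
fraglen = np.argmax(y > 0.95)     # first length where the CDF exceeds 95%
print(fraglen)                    # -> 7 for this toy histogram
```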
```python
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA

sleuth_matrix = pd.read_csv(snakemake.input[0], sep='\t')
samples = list(sleuth_matrix)
count_sam = sleuth_matrix.shape[1]
ind = list(range(0, count_sam))
sleuth_matrix = sleuth_matrix.transpose()

n_components = 2
pca = PCA(n_components=n_components)
sl_pca = pca.fit_transform(sleuth_matrix)

colors = plt.cm.get_cmap("hsv", count_sam + 1)
plt.figure(figsize=(8, 8))
for i, sam in zip(ind, samples):
    plt.scatter(sl_pca[i, 0], sl_pca[i, 1], color=colors(i), lw=2, label=sam)

plt.title("PCA of transcript expression")
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.axis([-40, 40, -1.5, 1.5])
#plt.show()
plt.savefig(snakemake.output[0])
```
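The `transpose()` above puts samples in rows and transcripts in columns, which is the orientation scikit-learn's `PCA.fit_transform` expects (one observation per row). A minimal sketch with a made-up matrix, just to show the shapes involved:

```python
import pandas as pd
from sklearn.decomposition import PCA

# Made-up counts: rows = transcripts, columns = samples (as in the sleuth matrix).
toy = pd.DataFrame(
    {"sample_A": [10, 0, 5, 3], "sample_B": [8, 1, 7, 2], "sample_C": [0, 9, 4, 6]},
    index=["tx1", "tx2", "tx3", "tx4"],
)

pca = PCA(n_components=2)
coords = pca.fit_transform(toy.transpose())  # one row of PCA coordinates per sample
print(coords.shape)                          # -> (3, 2)
```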
```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

p_value_table = pd.read_csv(snakemake.input[0], sep='\t')
#p_value_table = pd.read_csv('p-values_all_transcripts.csv', sep='\t')
p_value = p_value_table['pval']
p_value = p_value.dropna()

pval_plot = sns.distplot(p_value, kde=False, axlabel="P-Values", color="k", norm_hist=True)
pval_plot.set(ylabel='count')
plt.title("p-value histogram")
#plt.show()
plt.savefig(snakemake.output[0])
```
```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

p_value_table = pd.read_csv(snakemake.input[0], sep='\t')
#p_value_table = pd.read_csv('p-values_all_transcripts.csv', sep='\t')

print("table: ")
# keep only the first rows of the sorted sleuth results (.loc slicing is inclusive)
p_value_table = p_value_table.loc[0:20, :]
print(p_value_table)

sns.set(style="white")
# Where do we find the condition?
sns.stripplot(x='pval', y='ext_gene', data=p_value_table, jitter=False)  # should x and y be omitted?
plt.title("Strip chart of the top 20 differentially expressed genes")
#plt.show()
plt.savefig(snakemake.output[0])
```
```r
library(ComplexHeatmap)
#library(rsvg)
library(circlize)

# input for testing directly, without the workflow:
#path.matr <- "../sleuth/sleuth_matrix.csv"
#path.dist <- "../clustering_distance.txt"
#path.p_all <- "../sleuth/p-values_all_transcripts.csv"

# Snakemake input
path.matr <- snakemake@input[["matrix"]]
path.dist <- snakemake@input[["dist"]]
path.p_all <- snakemake@input[["p_all"]]

write("\n", file = path.dist, append = TRUE)
dist <- gsub("[[:space:]]", "", unlist(read.table(path.dist, stringsAsFactors = FALSE)))
write(dist, file = path.dist, append = FALSE)

matr.so <- read.table(path.matr)
genes <- read.table(path.p_all)

# sort the sleuth results by target_id
genes <- dplyr::arrange(genes, target_id)

# replace the target_id by gene names
rownames(matr.so) = make.names(genes$ext_gene, unique = TRUE)

# remove NA rows
matr.so <- na.omit(matr.so)

# remove all-zero rows
matr.so <- subset.matrix(matr.so, rowSums(matr.so) != 0)

# determine the median (0.5 quantile) and the quartiles for the heatmap coloring
so.min <- min(matr.so)
so.quantiles <- rowMeans(apply(matr.so, 2, quantile, probs = c(.25, .5, .75)))

# build the heatmap
svg(file = snakemake@output[[1]])
Heatmap(matr.so, name = "normalized\nestimated\ncounts",
        column_title = "Samples", column_names_side = "top",
        row_title = "Transcripts", row_names_side = "right",
        row_dend_side = "left",
        col = colorRamp2(c(so.min, so.quantiles), c("darkgreen", "green", "darkred", "red")),
        cluster_rows = TRUE, cluster_columns = TRUE,
        clustering_distance_rows = dist)
dev.off()
```
```r
BiocManager::install("gage", version = "3.8")
library("gage")
```
```r
library(biomaRt)

# start sleuth:
suppressMessages({
  library("sleuth")
})

path <- getwd()  # save the current working directory

# preparations:
# build the list of paths to the individual kallisto result directories:
kal_dirs <- file.path(gsub(" ", "", paste(path, "/", snakemake@input[["kal_path"]])))
kal_dirs <- strsplit(kal_dirs, "/abu.*")[[1]][1]

s2c <- read.table(file = snakemake@input[["sam_tab"]], sep = "\t", header = TRUE, stringsAsFactors = FALSE)
s2c <- dplyr::select(s2c, sample, condition)

# the kallisto paths now have to be appended to the auxiliary table as a new column:
s2c <- dplyr::mutate(s2c, path = kal_dirs)

# map transcripts to genes
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                         dataset = "hsapiens_gene_ensembl",
                         host = 'ensembl.org')
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id", "external_gene_name"),
                      mart = mart)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id,
                     ens_gene = ensembl_gene_id, ext_gene = external_gene_name)

so <- sleuth_prep(s2c, full_model = ~condition, target_mapping = t2g)
#extra_bootstrap_summary = FALSE) #, read_bootstrap_tpm=TRUE, full_model = ~condition)

# estimate parameters for the sleuth response error measurement (full) model:
so <- sleuth_fit(so, ~condition, 'full')

# estimate parameters for the sleuth reduced model:
so <- sleuth_fit(so, ~1, 'reduced')

# perform differential analysis (testing) using the likelihood ratio test:
so <- sleuth_lrt(so, 'reduced', 'full')

# inspect the full and the reduced model:
models(so)

# inspect the significant results of the likelihood ratio test:
sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt', show_all = TRUE)
sleuth_significant <- dplyr::filter(sleuth_table, qval <= 0.05)

#### TODO - add gene names to the IDs

sleuth_save(so, "sleuth/sleuth_object")

sleuth_matrix = sleuth_to_matrix(so, 'obs_norm', 'est_counts')
write.table(sleuth_matrix, file = "sleuth/sleuth_matrix.csv", sep = "\t")
write.table(sleuth_table, file = "sleuth/p-values_all_transcripts.csv", sep = "\t")
write.table(sleuth_significant, file = "sleuth/significant_transcripts.csv", sep = "\t")
```
```r
library(rsvg)
library(png)
library(grid)
library(gridExtra)

path <- snakemake@input[["plots"]]
svg_files <- unlist(list.files(path, pattern = ".*\\.svg$"))

# convert every SVG plot to PNG
plots_png <- lapply(1:length(svg_files), function(i) {
  out_file <- gsub(" ", "", paste(path, "/", strsplit(svg_files[i], "\\.")[[1]][1], ".png"))
  rsvg_png(gsub(" ", "", paste(path, "/", svg_files[i])), out_file)
})

png_files <- unlist(list.files(path, pattern = ".*\\.png$"))

# collect all PNGs into a single PDF, one plot per page
pdf(file = snakemake@output[[1]])
for (i in seq(along = png_files)) {
  im <- rasterGrob(readPNG(gsub(" ", "", paste(path, "/", png_files[i])), native = FALSE),
                   interpolate = FALSE)
  do.call(grid.arrange, c(list(im), ncol = 1))
}
dev.off()

# remove the intermediate PNG files
delete_png <- file.remove((gsub(" ", "", paste(path, "/", png_files))))
```
```r
library(MASS)
library(calibrate)

#p_all <- read.table("../sleuth/p-values_all_transcripts.csv", header=TRUE)
#matr <- read.table("../sleuth/sleuth_matrix.csv", header=TRUE)
#samples <- read.table("../table_for_reads.csv", header=TRUE, stringsAsFactors = TRUE)

p_all <- read.table(snakemake@input[["pval"]], header = TRUE)
matr <- read.table(snakemake@input[["matrix"]], header = TRUE)
samples <- read.table(snakemake@input[["samples"]], header = TRUE, stringsAsFactors = TRUE)

# p_all is sorted by pval; reorder it by gene ID to match the matrix:
p_all <- dplyr::arrange(p_all, target_id)

if (any(p_all$target_id != row.names(matr))) {
  stop("The count data matrix and the sleuth significance results do not match!")
  quit(status = 1, runLast = FALSE)
} else {
  # replace the target_id by gene names
  rownames(matr) = make.names(p_all$ext_gene, unique = TRUE)

  condition_1 <- samples$sample[samples$condition == as.character(factor(samples$condition)[1])]
  condition_2 <- samples$sample[samples$condition == as.character(factor(samples$condition)[2])]
  samples.cond_1 <- matr[][as.character(samples$sample[condition_1])]
  samples.cond_2 <- matr[][as.character(samples$sample[condition_2])]

  # x axis: log2 fold change
  FoldChange <- rowSums(samples.cond_2) / rowSums(samples.cond_1)
  log2FC <- log2(FoldChange)
  plot_x <- log2FC

  # adjust invalid values, determine the plotting interval of the x axis
  plot_x[which(is.nan(plot_x))] = Inf
  plot_x[which(is.na(plot_x))] = 0
  min_x <- min(plot_x[is.finite(plot_x)])
  max_x <- max(plot_x[is.finite(plot_x)])

  # y axis: -log10 of the p-values
  p_val <- p_all$pval
  plot_y <- -log10(p_val)

  # adjust invalid values, determine the plotting interval of the y axis
  plot_y[which(is.nan(plot_y))] = Inf
  plot_y[which(is.na(plot_y))] = 0
  min_y <- min(abs(plot_y[is.finite(plot_y)]))
  max_y <- max(plot_y[is.finite(plot_y)])

  # build the data frame for the volcano plot with:
  # gene/target ID,
  # fold change (log2),
  # p-values from the sleuth analysis and
  # the p-values adjusted by post-hoc tests (qval, i.e. corrected for
  # alpha-error accumulation in multiple testing) from the sleuth analysis
  volcano.data <- data.frame(GeneID = p_all$ext_gene, log2FoldChange = plot_x,
                             pVal = plot_y, PostHoc_pValues = p_all$qval)

  #svg("../plots/test.svg")
  svg(file = snakemake@output[[1]])

  # create the volcano plot
  with(volcano.data, plot(log2FoldChange, pVal, pch = 20, main = "Volcano-Plot",
                          xlim = c(min_x, max_x), ylim = c(min_y, max_y),
                          xlab = "log2(FoldChange)", ylab = "-log10(p-Values)"))

  # coloring: red := p-value (after post-hoc test) < 0.05, orange := |log2FoldChange| > 1,
  #           green := significant (p < 0.05) and |log2FoldChange| > 1
  with(subset(volcano.data, PostHoc_pValues < .05), points(log2FoldChange, pVal, pch = 20, col = "red"))
  with(subset(volcano.data, abs(log2FoldChange) > 1), points(log2FoldChange, pVal, pch = 20, col = "orange"))
  with(subset(volcano.data, PostHoc_pValues < .05 & abs(log2FoldChange) > 1), points(log2FoldChange, pVal, pch = 20, col = "green"))

  # label the highlighted data points
  with(subset(volcano.data, PostHoc_pValues < .05 & abs(log2FoldChange) > 1), textxy(log2FoldChange, pVal, labs = GeneID, cex = .8))

  dev.off()
}
```
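The axes of the volcano plot are the log2 fold change between the two conditions (x) and -log10 of the sleuth p-values (y), with points highlighted when the post-hoc corrected p-value is below 0.05 and the absolute log2 fold change exceeds 1. Purely as an illustration of that transformation (the R script above is what the workflow runs; the numbers below are invented, and the highlighting is simplified to plain p-values):

```python
import numpy as np

# Made-up per-transcript count totals for two conditions and made-up p-values.
counts_cond1 = np.array([100.0, 20.0, 50.0])
counts_cond2 = np.array([400.0, 18.0,  5.0])
pval         = np.array([1e-6,  0.4,  1e-3])

log2_fc   = np.log2(counts_cond2 / counts_cond1)   # x axis of the volcano plot
neg_log_p = -np.log10(pval)                        # y axis of the volcano plot

# Highlighted points: small p-value and at least a two-fold change.
highlighted = (pval < 0.05) & (np.abs(log2_fc) > 1)
print(log2_fc, neg_log_p, highlighted)
```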
```
shell:
    "kallisto index -i {output} {input}"
```
```
shell:
    "kallisto quant --fusion --bootstrap-samples=2 -i {input.id} -o {params} {input.fq1} {input.fq2}"
```
```
script:
    "r_scripts/sleuth_script.R"
```
```
script:
    "r_scripts/volcano.R"
```
```
script:
    "r_scripts/complexHeatmap.R"
```
```
script:
    "py_scripts/pca_plot.py"
```
```
script:
    "py_scripts/boxen_plot.py"
```
```
script:
    "py_scripts/p-value_histogramm.py"
```
```
script:
    "py_scripts/strip_plot.py"
```
```
script:
    "r_scripts/svg_to_pdf.R"
```
```
shell:
    "pizzly -k 31 --gtf {input.gtf} --cache {params[0]}/indx.cache.txt --align-score 2 --insert-size 400 --fasta {input.transcript} --output {params[1]} {input.fusion}"
```
```
shell:
    "python py_scripts/flatten_json.py {input} {output}"
```
```
shell:
    "python py_scripts/get_fragment_length.py {input[0]} 0.95 {output} "  # possibly specify a different percentile

rule all_csv_plots:
    input:
        expand("plots/pizzly/pizzly_genetable_{sample}.csv", sample = samples['sample']),
        expand("plots/pizzly/pizzly_fragment_length_{sample}.csv", sample = samples['sample'])

rule gage:
    input:
    conda:
        "envs/gage.yaml"
```
```
script:
    "r_scripts/gage.R"
```