Workflow dervied form https://github.com/snakemake-workflows/rna-seq-star-deseq2 for multi condition Deseq2 and enrichment analyis
Snakemake workflow for running differential expression and enrichment analyses for experimental setups with multiple groups or multiple conditions. Results are provided as a set of HTML files with plots and result description both for each comparison defined in the config file.
Workflow was derived from https://github.com/snakemake-workflows/rna-seq-star-deseq2, with alignment steps removed.
Installation
Required dependencies
The quickest way to get up and running with this workflow is using
Installation
git clone https://github.com/schlesnerlab/multicondition-deseq2-enrichment
cd multicondition-deseq-enrichment
You can install snakemake with the yaml provided in
workflow/snakemake.yaml
with
conda env create -n snakemake -f workflow/snakemake.yaml
conda activate snakemake
In case you use snakedeploy you could also deploy this workflow with
snakedeploy deploy-workflow https://github.com/schlesnerlab/multicondition-deseq2-enrichment multicondition-deseq2-enrichment --branch main
Carnival usage
If you wish to use analysis provided by Carnival you need to provide a solver supported by CARNIVAL. Right now the following solvers are supported:
- cplex
For academic users the cplex can be downloaded here . Then the path to the cplex executable needs to added to the config file.
Usage
Setting up config and sample sheet
To use this workflow you will need:
-
RNAseq quantified data (f.e. counts) as a tsv or csv file
-
a config yaml file for your project to control the workflow
-
a tsv file with the sample information for the samples in the count file.
Further details on the configuration can be found in the config README
Supported input data
- gene count matrix
Code Snippets
32 33 | script: "../scripts/test_carnival.R" |
66 67 | script: "../scripts/test_carnival.R" |
91 92 | script: "../scripts/RMD_scripts/carnival_downstream.Rmd" |
112 113 | script: "../scripts/RMD_scripts/carnival_join.Rmd" |
132 133 | script: "../scripts/transcriptutorial/sample_resolution_dorothea.R" |
150 151 | script: "../scripts/transcriptutorial/sample_resolution_progeny.R" |
176 177 | script: "../scripts/transcriptutorial/sample_resolution_carnival.R" |
202 203 | script: "../scripts/transcriptutorial/06_analysis_CARNIVAL_results.Rmd" |
18 19 | script: "../scripts/deseq2-init.R" |
35 36 | script: "../scripts/plot-pca.R" |
65 66 | script: "../scripts/deseq2.R" |
83 84 | script: "../scripts/rlog_transform.R" |
113 114 | script: "../scripts/DESeq2_analysis.Rmd" |
158 159 | script: "../scripts/RMD_scripts/gsea_report.Rmd" |
181 182 | script: "../scripts/DEseq2_cohort.Rmd" |
209 210 | script: "../scripts/run_mitch.R" |
235 236 | script: "../scripts/export_diffexp_xlsx.R" |
255 256 | script: "../scripts/dorothea.Rmd" |
281 282 | script: "../scripts/RMD_scripts/dorothea_diffexp.Rmd" |
306 307 | script: "../scripts/RMD_scripts/progeny_analysis.Rmd" |
23 24 | script: "../scripts/count-matrix.py" |
38 39 | script: "../scripts/gtf2bed.py" |
66 67 68 | shell: "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} " "> {log[0]} 2>&1" |
96 97 98 | shell: "junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} " "> {log} 2>&1" |
121 122 | shell: "bam_stat.py -i {input} > {output} 2> {log}" |
143 144 | shell: "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}" |
171 172 | shell: "inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1" |
196 197 | shell: "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}" |
223 224 | shell: "read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1" |
250 251 | shell: "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1" |
331 332 | wrapper: "v1.19.0/bio/multiqc" |
348 349 | script: "../scripts/export_fpkm.py" |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | import pandas as pd def get_column(strandedness): if pd.isnull(strandedness) or strandedness == "none": return 8 # non stranded protocol elif strandedness == "yes": return 9 # 3rd column elif strandedness == "reverse": return 10 # 4th column, usually for Illumina truseq else: raise ValueError( ( "'strandedness' column should be empty or have the " "value 'none', 'yes' or 'reverse', instead has the " "value {}" ).format(repr(strandedness)) ) def get_fpkm_column(strandedness): if pd.isnull(strandedness) or strandedness == "none": return 11 # non stranded protocol elif strandedness == "yes": return 12 # 3rd column elif strandedness == "reverse": return 13 # 4th column, usually for Illumina truseq else: raise ValueError( ( "'strandedness' column should be empty or have the " "value 'none', 'yes' or 'reverse', instead has the " "value {}" ).format(repr(strandedness)) ) counts = [ pd.read_table( f, index_col=0, usecols=[3, get_column(strandedness)], header=None, skiprows=1 ) for f, strandedness in zip(snakemake.input, snakemake.params.strand) ] fpkm = [ pd.read_table( f, index_col=0, usecols=[3, get_fpkm_column(strandedness)], header=None, skiprows=1, ) for f, strandedness in zip(snakemake.input, snakemake.params.strand) ] gene_names = pd.read_table(snakemake.input[0], index_col=0, usecols=[3, 6], header=0) for t, sample in zip(counts, snakemake.params.samples): t.columns = [sample] for t, sample in zip(fpkm, snakemake.params.samples): t.columns = [sample] matrix = pd.concat(counts, axis=1) fpkm_mat = pd.concat(fpkm, axis=1) matrix.index.name = "gene" fpkm_mat.index.name = "gene" gene_names.index.name = "gene" # collapse technical replicates matrix = matrix.groupby(matrix.columns, axis=1).sum() fpkm_mat = fpkm_mat.groupby(fpkm_mat.columns, axis=1).sum() fpkm_mat["gname"] = gene_names["name"] matrix.to_csv(snakemake.output[0], sep="\t") fpkm_mat.to_csv(snakemake.output[1], sep="\t") |
14 15 16 17 18 19 20 | body .main-container { max-width: 1800px !important; width: 1800px !important; } body { max-width: 1800px !important; } |
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, fig.width = 12, fig.height = 12 ) library(tidyverse) library(clusterProfiler) library(DESeq2) library(enrichplot) library(ggupset) library(patchwork) library(biomaRt) library(svglite) if (!require("RNAscripts")) { devtools::install("./RNAscripts") } library("RNAscripts") library("BiocParallel") library(ComplexHeatmap) |
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | if (exists("snakemake")) { dds_path <- snakemake@input[["dds_obj"]] diffexp_tb_path <- snakemake@input[["table"]] fpkm_path <- snakemake@input[["fpkm"]] feature_counts_fp <- snakemake@input[["featureCounts"]] enrich_list <- snakemake@input[["gsea_result"]] cond_id <- snakemake@wildcards[["condition"]] write(cond_id, file = stderr()) contrast_groups <- snakemake@params[["contrast"]] coef_string <- paste0( cond_id, "_", contrast_groups[[1]], "_vs_", contrast_groups[[2]] ) samp_map <- snakemake@params[["samples"]] lfc_shrink <- snakemake@config[["diffexp"]][["shrink_lfc"]] rld_path <- snakemake@input[["rld"]] register(MulticoreParam(snakemake@threads)) lfc_threshold <- snakemake@config[["diffexp"]][["LFC_threshold"]] pvalue_threshold <- snakemake@config[["diffexp"]][["pval_threshold"]] plot_path <- snakemake@params[["plot_path"]] group_colors <- snakemake@config[["group_colors"]][[cond_id]] %>% unlist() if (!is.null(snakemake@config[["diffexp"]][["custom_model"]][[cond_id]])) { model_string <- snakemake@config[["diffexp"]][["custom_model"]][[cond_id]] } else { model_string <- snakemake@config[["diffexp"]][["model"]] } organism <- snakemake@config[["organism"]] conf <- snakemake@config } else { conf <- yaml::read_yaml("configs/VascAge_config.yaml") base_analysis_dir <- file.path(conf$dirs$BASE_ANALYSIS_DIR) cond_id <- names(conf$diffexp$contrasts)[1] comp_id <- names(conf$diffexp$contrasts[[cond_id]])[1] contrast_groups <- conf$diffexp$contrasts[[cond_id]][[comp_id]] coef_string <- paste0( cond_id, "_", contrast_groups[[1]], "_vs_", contrast_groups[[2]] ) dds_path <- file.path(paste0(base_analysis_dir), "deseq2/all.rds") diffexp_tb_path <- file.path( paste0(base_analysis_dir), glue::glue("results/diffexp/{cond_id}/{comp_id}.diffexp.tsv") ) fpkm_path <- file.path(base_analysis_dir, "fpkm/all.tsv") samp_map <- file.path(conf$samples) rld_path <- file.path( paste0(base_analysis_dir), "deseq2/rlog_transform.RDS.gz" ) register(SerialParam()) plot_path <- "./" lfc_threshold <- 0.5 pvalue_threshold <- 0.05 enrich_list <- file.path( base_analysis_dir, glue::glue("results/diffexp/{cond_id}/{comp_id}.gseares.RDS") ) group_colors <- conf[["group_colors"]][[cond_id]] %>% unlist() organism <- conf$organism lfc_shrink <- FALSE model_string <- "~ condition" } dir.create(plot_path, recursive = TRUE) dds_obj <- readRDS(dds_path) rld <- readRDS(rld_path) enrich_list <- readRDS(enrich_list) fpkm <- readr::read_tsv(fpkm_path) |
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | if (!is.null(conf$diffexp$custom_model[[cond_id]])) { model_string <- conf$diffexp$custom_model[[cond_id]] # Rerun deseq since it was intialized for (x in names(conf[["diffexp"]][["contrasts"]])) { colData(dds_obj)[, x] <- as.factor(colData(dds_obj)[, x]) } colData(dds_obj)[, cond_id] <- as.factor(colData(dds_obj)[, cond_id]) design(dds_obj) <- as.formula(model_string) dds_obj <- DESeq(dds_obj) dds_obj@colData[, cond_id] <- relevel(dds_obj@colData[, cond_id], ref = contrast_groups[2] ) } if (lfc_shrink) { dds_obj@colData[, cond_id] <- relevel(dds_obj@colData[, cond_id], ref = contrast_groups[2] ) design(dds_obj) <- as.formula(model_string) dds_obj <- DESeq(dds_obj) write(coef_string, file = stderr()) res <- lfcShrink(dds_obj, coef = stringr::str_replace_all(coef_string, "-", "."), type = "apeglm" ) } else { res <- results(dds_obj, contrast = c(cond_id, contrast_groups)) } DESeq2::plotMA(res) |
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 | sample_overview <- readr::read_tsv(samp_map) deseq_tb <- readr::read_tsv(diffexp_tb_path, col_names = c( "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj" ), skip = 1 ) sample_overview <- sample_overview %>% dplyr::filter(sample %in% rownames(colData(dds_obj))) filer <- fpkm %>% dplyr::filter(gene %in% deseq_tb$gene_id) joined_df <- join_tables(deseq_tb, filer) joined_df <- joined_df %>% dplyr::mutate(overexpressed_in = ifelse(logFoldChange > 0, contrast_groups[1], contrast_groups[2] )) mean_tb <- joined_df %>% dplyr::filter(padj < pvalue_threshold & abs(logFoldChange) > lfc_threshold) %>% RNAscripts::mean_tibble_from_mat(., "logFoldChange", contrast_groups = contrast_groups, s_map = sample_overview, cond_id = cond_id ) DT::datatable(mean_tb) |
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | diff_exp_trans <- SummarizedExperiment::assay(rld)[joined_df$gene, ] rownames(diff_exp_trans) <- joined_df$gname write(cond_id, file = stderr()) df_col <- data.frame(SummarizedExperiment::colData(rld)[, c(cond_id)]) if (!is.null(group_colors)) { col_annotation <- HeatmapAnnotation( condition = dds_obj@colData[, cond_id], col = list(condition = group_colors) ) } else { col_annotation <- HeatmapAnnotation(condition = dds_obj@colData[, cond_id]) } rownames(df_col) <- colnames(SummarizedExperiment::assay(rld)) colnames(df_col) <- cond_id index_vec <- which(joined_df$padj < pvalue_threshold & abs(joined_df$logFoldChange) > lfc_threshold) diff_exp_genes <- diff_exp_trans[index_vec, ] small_joined_df <- joined_df[joined_df$padj < pvalue_threshold & abs(joined_df$logFoldChange) > lfc_threshold, ] small_joined_df <- na.omit(small_joined_df) if (nrow(small_joined_df) > 1) { scaled_diffexp <- diff_exp_genes diffexp_heatmap <- Heatmap( head(scaled_diffexp[order( abs(small_joined_df$logFoldChange * -log10(small_joined_df$padj)), decreasing = TRUE ), ], 50), top_annotation = col_annotation ) save_cheatmap_svg( x = diffexp_heatmap, filename = file.path( plot_path, "diffexp_heatmap.svg" ) ) diffexp_heatmap } |
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 | library(EnhancedVolcano) volcano_plot <- EnhancedVolcano(as.data.frame(joined_df), lab = joined_df$gname, x = "logFoldChange", y = "padj", title = paste(contrast_groups, collapse = "_vs_"), pCutoff = 10e-3, ylab = bquote(~ -Log[10] ~ italic(Padj)), FCcutoff = lfc_threshold ) ggsave( filename = file.path(plot_path, "Volcano_plot.svg"), plot = volcano_plot ) volcano_plot |
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 | #' Uniquify a vector #' #' @param x Input factor to check #' @return corrected facotr value #' @examples #' NULL uniquify <- function(x) { if (length(x) == 1) { x } else { sprintf("%s_%02d", x, seq_along(x)) } } #' Disambiguate a vector #' #' Add numbered suffixes to redundant entries in a vector #' #' @param in_vector Vector to be disambiguated #' @importFrom stats ave #' @return The disambiguated vector. #' @export #' #' @examples #' NULL #' disambiguate_vector <- function(in_vector) { ave(in_vector, in_vector, FUN = uniquify) } diff_exp_heatmaps <- list() diff_norm_heatmaps <- list() if (!is.null(group_colors)) { col_annotation <- HeatmapAnnotation( condition = dds_obj@colData[dds_obj@colData[, cond_id] %in% contrast_groups, cond_id], col = list(condition = group_colors) ) } else { col_annotation <- HeatmapAnnotation( condition = dds_obj@colData[dds_obj@colData[, cond_id] %in% contrast_groups, cond_id] ) } filtered_join_df <- joined_df[] %>% dplyr::filter(padj < pvalue_threshold & abs(logFoldChange) > lfc_threshold) # Since it can happen that duplicate gene names appear -> uplicates are marked filtered_join_df$gname <- disambiguate_vector(filtered_join_df$gname) s_table <- filter_split_table(rld, contrast_groups, filtered_join_df, reorder_genes = FALSE, cond_id = cond_id ) r_means <- rowMeans(s_table) r_sds <- apply(s_table, 1, sd) s_table_centered <- s_table - r_means s_table_norm <- t(scale(t(s_table))) diff_exp_heatmaps <- ComplexHeatmap::Heatmap( s_table_centered, cluster_rows = TRUE, top_annotation = col_annotation, name = "Centered exp.", row_split = filtered_join_df$overexpressed_in, column_names_rot = 45, show_row_names = nrow(s_table_norm) < 100, ) diff_norm_heatmaps <- ComplexHeatmap::Heatmap( s_table_norm, cluster_rows = TRUE, top_annotation = col_annotation, name = "z-score exp.", column_names_rot = 45, show_row_names = nrow(s_table_norm) < 100 ) if (nrow(filtered_join_df) > 0) { save_cheatmap_svg(diff_exp_heatmaps, file.path(plot_path, "split_heatmap.svg"), width = 14, height = 12 ) save_cheatmap_svg(diff_norm_heatmaps, file.path("standard_redblue_heatmap.svg"), width = 14, height = 12 ) plot(diff_exp_heatmaps) plot(diff_norm_heatmaps) } |
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 | genereate_gsea_plots <- function(gsea_obj, gsea_name) { cat("\n") cat("### ", gsea_name , " \n") cat("\n") } msig_enrichment <- enrich_list$msig_enrichment msig_gsea <- enrich_list$msig_gsea kegg <- enrich_list$kegg reactome_stuff <- enrich_list$reactome_stuff ensembl_gene_list <- joined_df %>% dplyr::select(c(gene, stat)) gene_list <- joined_df %>% dplyr::select(c(gname, stat)) |
455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 | better_dotplot <- function(gset, c_groups = contrast_groups) { pos_gsea <- gset %>% dplyr::filter(NES > 0.5) %>% dplyr::arrange(desc(NES)) if (nrow(pos_gsea) > 1) { dp_pos_nes <- dotplot( pos_gsea, size = "NES", color = "p.adjust", showCategory = 20, title = glue::glue("Pathways enriched in {contrast_groups[1]}") ) + scale_size(range = c(1, 7), limits = c(1, max(gset@result$NES))) } else { dp_pos_nes <- NULL } neg_gsea <- gset %>% dplyr::filter(NES < -0.5) %>% dplyr::arrange(NES) if (nrow(neg_gsea) > 1) { dp_neg_nes <- dotplot( neg_gsea, size = "NES", color = "p.adjust", showCategory = 20, title = glue::glue("Pathways enriched in {contrast_groups[2]}") ) + scale_size(range = c(7, 1), limits = c(min(gset@result$NES), -1)) } else { dp_neg_nes <- NULL } return(list(dp_pos_nes, dp_neg_nes)) } #' Title #' #' @param d_plot #' @param p_group #' @param p_path #' #' @return #' @export #' #' @examples save_dotplots <- function(d_plot, p_group, p_path = plot_path, gsea_type) { ggplot2::ggsave( filename = file.path( p_path, glue::glue("{p_group}_{gsea_type}_dplot.svg") ), d_plot, width = 10, height = 10 ) } org_db <- get_org_db(organism) do_gseaplots <- function(enrich_res, gsea_name) { cat("\n") cat("### ", gsea_name , " \n") cat("\n") id_class <- conf[["gsea"]][[gsea_name]][["id_class"]] all_plots <- list() if (!is.null(enrich_res)) { if (nrow(enrich_res) > 1) { enrich_res <- clusterProfiler::setReadable(enrich_res, org_db, id_class) z <- enrichplot::upsetplot(enrich_res) all_plots$upset <- z if (conf[["gsea"]][[gsea_name]][["use_gsea"]]) { enrich_plots <- RNAscripts::plot_enrichment(enrich_res, X = "ID", Y = "NES", pval = "p.adjust" ) ggsave(filename = file.path(plot_path, paste0( contrast_groups[1], "_vs_", contrast_groups[2], "_", gsea_name, "_enrichplot.svg" )), plot = enrich_plots) print(enrich_plots) all_plots$enrichplots <- enrich_plots dplot <- better_dotplot(enrich_res, c_groups = contrast_groups) x<- purrr::map(dplot, print) purrr::map2(dplot, contrast_groups, save_dotplots, gsea_type = gsea_name ) all_plots$dotplots <- dplot cnetplot <- cnetplot(enrich_res, foldChange = set_names(joined_df$logFoldChange, gene_list$gname), node_label = "all", layout = "nicely", cex_label_category = 0.8, cex_label_gene = 0.6 ) print(cnetplot) ggsave( filename = file.path(plot_path, paste0( contrast_groups[1], "_vs_", contrast_groups[2], "_", gsea_name, "_cnet.svg" )), cnetplot, width = 16, height = 16 ) all_plots <- cnetplot } else { enrich_plots <- NULL all_plots$dotplots <- dotplot(msig_enrichment) print(dotplot) } } else { all_plots <- NULL } } cat("\n \n") return(all_plots) } all_plots <- purrr::map2(enrich_list, names(enrich_list), do_gseaplots) |
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, dev = "png", fig.width = 12, fig.height = 12 ) require(dplyr) library(magrittr) library(ComplexHeatmap) library(pheatmap) library(PCAtools) library(clusterProfiler) library(RNAscripts) if (exists("snakemake")) { diffexp_tables_paths <- snakemake@input[["tables"]] contrast_groups <- snakemake@params[["contrast"]] contrast_list <- names(snakemake@params[["contrast"]]) cond_id <- snakemake@wildcards[["condition"]] fpkm_path <- snakemake@input[["fpkm"]] dds_path <- snakemake@input[["dds_obj"]] rld_path <- snakemake@input[["rld"]] write(paste0( class(diffexp_tables_paths), length(diffexp_tables_paths), " and ", class(contrast_list), " ", length(contrast_list) ), file = stderr()) threads <- snakemake@threads lfc_threshold <- snakemake@config[["diffexp"]][["LFC_threshold"]] pvalue_threshold <- snakemake@config[["diffexp"]][["pval_threshold"]] group_colors <- snakemake@config[["group_colors"]][[cond_id]] %>% unlist() run_pert <- snakemake@config[["perform_perturbation"]] } else { conf <- yaml::read_yaml("../../configs/VascAge_config.yaml") SARIFA_DIR <- "/Users/heyechri/Documents/software/heyer/multicondition-deseq2-enrichment" BASE_ANALYSIS_DIR <- file.path(conf$dirs$BASE_ANALYSIS_DIR) cond_id <- names(conf$diffexp$contrasts)[1] diffexp_tables_paths <- as.list(file.path( BASE_ANALYSIS_DIR, glue::glue("results/diffexp/{cond_id}/{names(conf$diffexp$contrasts[[cond_id]])}.diffexp.tsv") )) contrast_list <- names(conf$diffexp$contrasts$SARIFA) fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") dds_path <- file.path(BASE_ANALYSIS_DIR, "deseq2/all.rds") rld_path <- file.path(BASE_ANALYSIS_DIR, "deseq2/rlog_transform.RDS.gz") threads <- 2 lfc_threshold <- 0.5 pvalue_threshold <- 0.05 run_pert <- FALSE group_colors <- conf[["group_colors"]][[cond_id]] %>% unlist() } |
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | ## READ all DIFFexp tables diff_exp_tables <- purrr::map(diffexp_tables_paths, readr::read_tsv, col_names = c( "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj" ), skip = 1 ) names(diff_exp_tables) <- contrast_list all_ensembl <- diff_exp_tables[[1]][, 1] ### Set Rownames for data frame diff_exp_frames <- purrr::map(diff_exp_tables, function(x) { gene_ids <- x$gene_id diff_exp_fr <- as.data.frame(x[, -1]) rownames(diff_exp_fr) <- gene_ids diff_exp_fr }) ### Read FPKM Table fpkm <- readr::read_tsv(fpkm_path) all_genes <- fpkm %>% dplyr::filter(gene %in% all_ensembl$gene_id) %>% dplyr::select(gname) Big_tables <- purrr::map(diff_exp_tables, join_tables, fpkm = fpkm) |
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | build_comb_gene_set <- function(comb_mat, comb_mat_order) { combination_gene_sets <- list() for (c_name in ComplexHeatmap::comb_name(comb_mat)) { gnames <- ComplexHeatmap::extract_comb(m = comb_mat, comb_name = c_name) groups_index <- stringr::str_split(c_name, "") %>% unlist() group_names <- ComplexHeatmap::set_name(comb_mat)[as.logical(as.numeric(groups_index))] merged_group_name <- paste(group_names, collapse = " + ") combination_gene_sets[[merged_group_name]] <- tibble::tibble( gene_names = gnames, lfc = rep(0, length(gnames)) ) } if (length(combination_gene_sets) > 30) { combination_gene_sets <- combination_gene_sets[c(match(names(comb_mat_order), ComplexHeatmap::comb_name(comb_mat))[1:30])] } else { combination_gene_sets <- combination_gene_sets } combination_gene_sets } group_name_combinations <- c() gene_names <- purrr::map(Big_tables, function(x) { x %>% dplyr::pull(gname) }) sig_gene_map <- purrr::map(Big_tables, function(x) { x %>% dplyr::filter(padj < pvalue_threshold & abs(logFoldChange) > lfc_threshold) %>% dplyr::select(gene, gname) }) sig_gene_names <- purrr::map(sig_gene_map, function(x) { dplyr::pull(x, gname) }) if (any(purrr::map(sig_gene_map, nrow) > 0)) { gene_name_set <- ComplexHeatmap::list_to_matrix(sig_gene_names) comb_mat <- ComplexHeatmap::make_comb_mat(gene_name_set) comb_mat_order <- ComplexHeatmap::comb_size(comb_mat) %>% sort(decreasing = T) combination_gene_sets <- build_comb_gene_set(comb_mat = comb_mat, comb_mat_order = comb_mat_order) H_res <- run_msig_enricher(combination_gene_sets, "H", GSEA = F) H_res <- H_res[!(purrr::map_lgl(H_res, is.null))] } if (length(sig_gene_names) > 1) { test <- UpSetR::upset(UpSetR::fromList(sig_gene_names), nsets = length(sig_gene_names), nintersects = 30, order.by = c("degree", "freq"), decreasing = c(T, T) ) } else { test <- NULL } # if (!run_pert) { # small_comb <- comb_mat[comb_size(comb_mat) >= 20] # UpSet(small_comb, comb_order = order(comb_size(small_comb), # decreasing = T), ) # } test |
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | #' Create dynamic table from significant gene names #' #' @param sig_gene_names import list of genes to put in matrix #' #' @return DT::datatable #' @export #' #' @examples create_overlap_matrix <- function(sig_gene_names) { gene_name_index <- unlist(sig_gene_names) %>% unique() %>% sort() #' Takes two character vecotrs (info and index) and checks which index fields are present in info #' #' @return returns a vecor of length(index) with 1 or 0 depending on True or False check_against_index <- function(info, index) { return(as.numeric(index %in% info)) } gene_matrix <- sapply(sig_gene_names, check_against_index, index = gene_name_index) rownames(gene_matrix) <- gene_name_index gene_matrix <- cbind(gene_matrix, total = rowSums(gene_matrix)) ordered_matrix <- gene_matrix[order(gene_matrix[, "total"], decreasing = T), ] DT::datatable(ordered_matrix) } ifelse(all(purrr::map( sig_gene_names, is.null )), create_overlap_matrix(sig_gene_names), print("No enrichments present") ) |
238 239 240 241 242 243 244 245 246 247 248 249 250 251 | plot_gene_sets <- function(gset_res, plot_title) { print(plot_title) if (!is.null(gset_res)) { if (any(gset_res@result$p.adjust < 0.05)) { cnet_plot <- enrichplot::cnetplot(gset_res) + ggplot2::ggtitle(plot_title) return(cnet_plot) } } else { return(NULL) } } if (exists("H_res")) { cnet_plots <- purrr::pmap(list(H_res, names(H_res)), plot_gene_sets) } |
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | H_barplots <- purrr::map(H_res, barplot) H_heatmaps <- purrr::map(H_res, clusterProfiler::heatplot, showCategory = 10) for (set_n in names(H_res)) { if (any(H_res[[set_n]]@result$p.adjust < 0.05)) { cat("### ", set_n, "H set \n") y <- H_barplots[[set_n]] ifelse(nrow(y$data) > 0, plot(y), "") ifelse(nrow(H_res[[set_n]]) > 0, plot(dotplot(H_res[[set_n]], orderBy = "x")), "" ) cat("\n") cat("\n") ifelse(nrow(cnet_plots[[set_n]]$data) > 5, plot(cnet_plots[[set_n]]), "" ) cat("\n \n") } } |
288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 | dds_obj <- readRDS(dds_path) rld <- readRDS(rld_path) sig_transcripts <- purrr::map(sig_gene_map, function(x) { dplyr::pull(x, gene) }) %>% unlist() %>% unique() # select <- order(var(counts(dds_obj,normalized=TRUE)), # decreasing=TRUE)[1:400] df <- data.frame(SummarizedExperiment::colData(dds_obj)[, c(cond_id)]) rownames(df) <- colnames(SummarizedExperiment::assay(rld)) colnames(df) <- cond_id annotation_cols <- list(cond_id = group_colors) ha <- ComplexHeatmap::HeatmapAnnotation(condition = rld@colData[, cond_id], col = annotation_cols) normalized_expression <- SummarizedExperiment::assay(rld[sig_transcripts, ]) - rowMeans(SummarizedExperiment::assay(rld[sig_transcripts, ])) ComplexHeatmap::Heatmap(normalized_expression, cluster_rows = TRUE, show_row_names = FALSE, # col = viridis::viridis(100), top_annotation = ha, name = "Normalized gene expression in differentially expressed genes", # km = 4 ) |
326 327 328 329 330 331 332 333 334 335 336 337 | library("RColorBrewer") # vsd <- DESeq2::vst(dds_obj) sampleDists <- dist(t(SummarizedExperiment::assay(rld)), ) sampleDistMatrix <- as.matrix(sampleDists) # rownames(sampleDistMatrix) <- paste(vsd$condition) colnames(sampleDistMatrix) <- NULL colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255) ComplexHeatmap::pheatmap(sampleDistMatrix, clustering_distance_rows = sampleDists, clustering_distance_cols = sampleDists, col = colors ) |
347 348 349 350 351 352 | assay_data <- SummarizedExperiment::assay(rld) metadata <- SummarizedExperiment::colData(rld) pca_data <- PCAtools::pca(assay_data, metadata = metadata, removeVar = 0.1) scree <- screeplot(pca_data) scree |
362 363 364 365 366 367 368 369 370 371 372 373 374 | PCAtools::biplot(pca_data, colby = cond_id, legendPosition = "right") plot <- PCAtools::pairsplot(pca_data, components = PCAtools::getComponents(pca_data, c(1:5)), triangle = F, hline = 0, vline = 0, pointSize = 0.8, gridlines.major = FALSE, gridlines.minor = FALSE, colby = cond_id, title = "Pairs plot", plotaxes = T, margingaps = ggplot2::unit(c(-0.01, -0.01, -0.01, -0.01), "cm"), legendPosition = "none" ) plot |
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | library("DESeq2") parallel <- FALSE if (snakemake@threads > 1) { library("BiocParallel") # setup parallelization register(MulticoreParam(snakemake@threads)) parallel <- TRUE } if (!exists("snakemake")) { library(magrittr) the_yaml <- yaml::read_yaml("./configs/VascAge_config.yaml") BASE_DIR <- the_yaml$dirs$BASE_ANALYSIS_DIR cts <- file.path(BASE_DIR, "counts/all.tsv") %>% read.table(header = T, row.names = "gene", check.names = F) coldata <- read.table("./data/Vasc_age2020/Vascage_samples.tsv", header = TRUE, row.names = "sample", check.names = FALSE, stringsAsFactors = TRUE ) # coldata %>% dplyr::filter(cell_type == "tumor") ->coldata # cts <- cts[,rownames(small_coldata)] snakemake_conf <- yaml::read_yaml("./configs/VascAge_config.yaml") all_conditions <- names(snakemake_conf$diffexp$contrasts) } all_conditions <- names(snakemake@config$diffexp$contrasts) # colData and countData must have the same sample order, but this is ensured # by the way we create the count matrix cts <- read.table(snakemake@input[["counts"]], header = TRUE, row.names = "gene", check.names = FALSE ) coldata <- read.table(snakemake@params[["samples"]], header = TRUE, row.names = "sample", check.names = FALSE ) ## Reorder coldata rows to match cts col order (beacause deseq things) if (!all(colnames(cts) == rownames(coldata))) { sample_ids <- colnames(cts) coldata <- coldata[sample_ids, ] } # Remove NAs from Data (not supported) if (any(is.na(coldata[, c(all_conditions)]))) { na_index <- apply(coldata, 1, function(x) { any(is.na(x)) }) coldata <- coldata[!na_index, ] cts <- cts[, rownames(coldata)] } all_conditions <- names(snakemake@config$diffexp$contrasts) for (x in all_conditions) { coldata[, x] <- as.factor(coldata[, x]) } dds <- DESeqDataSetFromMatrix( countData = cts, colData = coldata, # design = as.formula("~condition")) design = as.formula(snakemake@params[["model"]]), ) # remove genes defined in config excluded_genes <- snakemake@config[["excluded_genes"]] # print(excluded_genes) if (length(excluded_genes) > 0) { ex_index <- purrr::map_int(excluded_genes, grep, rownames(dds), value = F ) if (length(ex_index) > 0) { dds <- dds[-ex_index, ] } } # remove uninformative rows dds <- dds[rowSums(counts(dds)) > ncol(dds) / 2, ] # normalization and preprocessing dds <- DESeq(dds, parallel = parallel ) cpm_filter <- apply(edgeR::cpm(counts(dds, normalized = T)), 1, function(x) { if(length(which(x > 0.5)) > 0.2 * length(x)) { val <- 1 } else { val <- 0 } as.logical(val) }) dds <- dds[cpm_filter,] saveRDS(dds, file = snakemake@output[[1]]) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | log <- file(snakemake@log[[1]], open = "wt") sink(log) sink(log, type = "message") library("DESeq2") parallel <- FALSE if (snakemake@threads > 1) { library("BiocParallel") # setup parallelization register(MulticoreParam(snakemake@threads)) parallel <- TRUE } lfc_shrink <- snakemake@config[["diffexp"]][["shrink_lfc"]] dds <- readRDS(snakemake@input[[1]]) cond_id <- snakemake@wildcards[["condition"]] contrast_groups <- snakemake@params[["contrast"]] contrast <- c(cond_id, contrast_groups) print(contrast) coef_string <- paste0(cond_id, "_", contrast_groups[[1]], "_vs_", contrast_groups[[2]]) print(coef_string) if (!is.null(snakemake@config$diffexp$custom_model[[contrast[1]]])) { # Rerun deseq since it was intialized for (x in names(snakemake@config[["diffexp"]][["contrasts"]])) { colData(dds)[, x] <- as.factor(colData(dds)[, x]) } colData(dds)[, contrast[1]] <- as.factor(colData(dds)[, contrast[1]]) design(dds) <- as.formula(snakemake@config$diffexp$custom_model[[contrast[1]]]) dds <- DESeq(dds) } zero_index <- apply(counts(dds)[, colData(dds)[, cond_id] %in% contrast_groups], 1, function(x) { length(which(x > 0)) }) # dds <- dds[zero_index > ncol(dds[,colData(dds)[,cond_id] %in% contrast_groups])/2,] res <- results(dds, contrast = contrast, parallel = parallel) resstat <- res$stat # shrink fold changes for lowly expressed genes if (lfc_shrink) { dds@colData[, cond_id] <- relevel(dds@colData[, cond_id], ref = contrast_groups[2]) dds <- DESeq(dds, parallel = parallel) res_new <- lfcShrink(dds, coef = stringr::str_replace_all(coef_string, "-", "."), type = "apeglm") print(head(res_new)) # sort by p-value res_new$stat <- resstat res_new <- res_new[, c("baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] res_new <- res_new[order(res_new$padj), ] } # store results pdf(snakemake@output[["ma_plot"]]) plotMA(res, ylim = c(-2, 2)) if (lfc_shrink) { plotMA(res_new, ylim = c(-2, 2)) } dev.off() if (lfc_shrink) { res <- res_new } # write.table(as.data.frame(res), sep = "\t", quote = FALSE, file = snakemake@output[["table"]]) |
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, dev = "png", fig.width = 12, fig.height = 12 ) if (!require(dorothea)) { BiocManager::install("dorothea") } # library(dorothea) # library(bcellViper) library(dplyr) library(viper) library(decoupleR) library(RColorBrewer) if (!require(RNAscripts)) { devtools::install("./scripts/RNAscripts", upgrade = "never") } library(RNAscripts) library(ggplot2) library(readr) library(ComplexHeatmap) library(DESeq2) library(purrr) library(magrittr) if (exists("snakemake")) { ## Input Files dds_path <- snakemake@input[["dds_path"]] fpkm_path <- snakemake@input[["fpkm_path"]] ### Output dorothea_fp <- snakemake@input[["dorothea_fp"]] # Params cond_id <- names(snakemake@config[["diffexp"]][["contrasts"]]) samp_map <- snakemake@params[["samp_map"]] plot_path <- snakemake@params[["plot_path"]] comp_groups <- snakemake@config[["signif_comp"]] color_scheme <- purrr::map(snakemake@config[["group_colors"]], unlist) organism <- snakemake@config[["organism"]] } else { the_yaml <- yaml::read_yaml("../../config/STAD.yaml") BASE_ANALYSIS_DIR <- c(the_yaml$dirs$BASE_ANALYSIS_DIR) # comp_id <- "YECplusA-vs-YEC" # contrast_groups <- c("Young-EC_apelin", "Young-EC") dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds") fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") samp_map <- the_yaml$samples plot_path <- "./" cond_id <- names(the_yaml$diffexp$contrasts) comp_groups <- the_yaml$comp_groups color_scheme <- purrr::map(the_yaml$group_colors, unlist) organism <- "Homo sapiens" } if (!dir.exists(plot_path)) { dir.create(plot_path) } |
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | dds_file <- readRDS(dds_path) Normalized_counts <- getVarianceStabilizedData(dds_file) # Normalized_counts <- assay(vst(dds_file, blind = F)) fpkm <- read_tsv(fpkm_path) filer <- fpkm %>% dplyr::filter(gene %in% rownames(Normalized_counts)) %>% dplyr::filter(!duplicated(gname)) # joined_df <- join_tables(diffexp_tb,filer) %>% dplyr::filter(!duplicated(gname)) Normalized_counts <- Normalized_counts[filer$gene, ] rownames(Normalized_counts) <- filer$gname # %>% dplyr::select(gname, stat) %>% dplyr::filter(!is.na(stat)) %>% # column_to_rownames(var = "gname") %>% as.matrix() -> diffexp_matrix expression_matrix <- Normalized_counts # regulons <- dorothea_mm %>% filter(confidence %in% c("A", "B", "C")) sample_overview <- colData(dds_file) %>% as_tibble(rownames = "sample") |
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | organism_name <- RNAscripts::get_organism_omnipath_name(organism) net <- get_dorothea(organism = organism_name, levels = c("A", "B", "C")) viper_net <- net colnames(viper_net) <- colnames(dorothea_mm) tf_activities <- decoupleR::run_viper(expression_matrix, net, minsize = 4 ) tf_activities %>% dplyr::filter(statistic == "viper") %>% tidyr::pivot_wider( id_cols = "condition", names_from = "source", values_from = "score" ) %>% tibble::column_to_rownames("condition") %>% as.matrix() %>% t() -> tf_activities plot_doro_hm <- function(dds, cond, color_list, p_data) { anno_vec <- dds_file@colData %>% as.data.frame() %>% pull(!!cond) color_vec <- color_list[[cond]] if (!is.null(color_vec)) { ha <- HeatmapAnnotation( group = anno_vec, col = list( group = as_vector(color_vec) ) ) } else { ha <- HeatmapAnnotation(group = anno_vec) } ch <- Heatmap(p_data, top_annotation = ha, clustering_distance_columns = "euclidean", clustering_method_columns = "average", show_row_names = FALSE, show_column_names = F, row_names_gp = gpar(fontsize = 8), column_names_rot = 70 ) save_cheatmap_svg(ch, file.path(plot_path, glue::glue("progeny_heatmap_{cond}.svg"))) ch } if (!is.null(color_scheme)) { purrr::map(cond_id, plot_doro_hm, dds = dds_file, p_data = tf_activities, color_list = color_scheme) } else { purrr::map(cond_id, plot_doro_hm, dds = dds_file, p_data = tf_activities, color_list = color_scheme) } |
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | plot_doro_plots <- function(tf_activities, dds_file, cond) { mean_tf_activities <- mean_tibble_from_mat( mat = tf_activities, contrast_groups = colData(dds_file)[, cond], s_map = sample_overview, cond_id = cond ) mean_tf_activities["TF"] <- rownames(tf_activities) plot_list <- list() for (c_group in unique(sample_overview %>% pull(!!cond))) { plot_table <- mean_tf_activities %>% dplyr::select(TF, c_group) %>% dplyr::arrange(!!sym(c_group)) histo <- ggplot(plot_table, aes(!!sym(c_group))) + geom_density() + ggtitle(c_group) + theme_bw() + xlab("NES") plot_table <- top_n( x = plot_table, n = 25, wt = abs(!!sym(c_group)) ) plot_table <- plot_table %>% mutate(color = ifelse(!!sym(c_group) > 0, "green", "red")) enrich_plot <- ggplot(plot_table, aes(x = TF, y = !!sym(c_group))) + scale_x_discrete(limits = plot_table$TF) + geom_point(size = 3, aes(col = color)) + geom_segment(aes( x = TF, xend = TF, y = 0, yend = !!sym(c_group) )) + coord_flip() + theme_bw() + ylab("Normalized Enrichment Score (NES)") + xlab("TF") + ggtitle(c_group) NES_plot <- ggplot(plot_table, aes(x = reorder(TF, !!sym(c_group)), y = !!sym(c_group))) + geom_bar(aes(fill = !!sym(c_group)), stat = "identity") + scale_fill_gradient2( low = "darkblue", high = "indianred", mid = "whitesmoke", midpoint = 0 ) + theme_minimal() + theme( axis.title = element_text(face = "bold", size = 12), axis.text.x = element_text( angle = 45, hjust = 1, size = 10, face = "bold" ), axis.text.y = element_text(size = 10, face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "none" ) + ylab("NES") + xlab("Pathways") + ggtitle(c_group) plot(histo) plot(enrich_plot) plot(NES_plot) } } purrr::map(cond_id, plot_doro_plots, tf_activities = tf_activities, dds_file = dds_file ) |
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | sample_acts <- run_wmean( mat = expression_matrix, network = net, .source = "source", .target = "target", .mor = "mor", times = 100, minsize = 5 ) sample_acts_mat <- sample_acts %>% dplyr::filter(statistic == "norm_wmean") %>% tidyr::pivot_wider( id_cols = "condition", names_from = "source", values_from = "score" ) %>% tibble::column_to_rownames("condition") %>% as.matrix() %>% t() if (!is.null(color_scheme)) { purrr::map(cond_id, plot_doro_hm, dds = dds_file, p_data = t(scale(t(sample_acts_mat), scale = F)), color_list = color_scheme ) } else { purrr::map(cond_id, plot_doro_hm, dds = dds_file, p_data = t(scale(t(sample_acts_mat))), color_list = color_scheme) } |
279 280 281 282 | purrr::map(cond_id, plot_doro_plots, tf_activities = t(scale(t(sample_acts_mat))), dds_file = dds_file ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | library(openxlsx) library(magrittr) library(dplyr) library(tidyr) library(glue) if (!require(RNAscripts)) { devtools::install("./scripts/RNAscripts", upgrade = "never") } library(RNAscripts) if (exists("snakemake")) { diffexp_tb_path <- snakemake@input[["table"]] cond_id <- snakemake@wildcards[["condition"]] fpkm_path <- snakemake@input[["fpkm"]] samp_map <- snakemake@params[["samp_map"]] contrast_groups <- snakemake@config[["diffexp"]][["contrasts"]][[cond_id]] contrast_names <- snakemake@params[["contrast_groups"]] names(contrast_groups) <- names(contrast_names) output_path <- snakemake@output[["outpath"]] print(c(diffexp_tb_path, fpkm_path, samp_map, contrast_groups, contrast_names)) pval_threshold <- snakemake@config[["diffexp"]][["pval_threshold"]] lfc_threshold <- snakemake@config[["diffexp"]][["LFC_threshold"]] } else { BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/apelin_exp_test" test_config <- yaml::read_yaml("/desktop-home/heyer/projects/Vascular_Aging/RNAseq/multicondition-deseq2-enrichment/configs/VascAge_Apelin_config.yaml") cond_id <- "condition" contrast_groups <- test_config$diffexp$contrasts$condition contrast_names <- test_config$diffexp$contrasts$condition diffexp_tb_path <- as.list(file.path(BASE_ANALYSIS_DIR, glue::glue("results/diffexp/condition/{names(contrast_groups)}.diffexp.tsv"))) names(diffexp_tb_path) <- names(contrast_groups) lfc_threshold <- 1 pval_threshold <- 0.05 fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") samp_map <- "/desktop-home/heyer/projects/Vascular_Aging/RNAseq/rna-seq-star-deseq2/data/apelin_2020/VascAge_samples_apelin.tsv" output_path <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/apelin_exp_test/results/diffexp/condition/excel_tables/diff_exp_man.xlsx" } print("read samples") sample_overview <- readr::read_tsv(samp_map) col_name <- c("gene", "gname") print("read fpkm table") fpkm <- readr::read_tsv(fpkm_path) sample_overview <- sample_overview %>% dplyr::filter(sample %in% colnames(fpkm)) build_output_table <- function(diff_exp_table, c_group, fpkm, col_name) { print(c_group) print(col_name) relevant_samples <- sample_overview %>% dplyr::filter(!!sym(cond_id) %in% c_group) %>% pull(sample) print(relevant_samples) deseq2_table <- readr::read_tsv(diff_exp_table, col_names = c( "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj" ), skip = 1) print("read deseq") # print(colnames(fpkm)) fpkm_data <- fpkm %>% dplyr::filter(gene %in% deseq2_table$gene_id) %>% dplyr::select(col_name, relevant_samples) colnames(fpkm_data)[-c(1:2)] <- sample_overview %>% dplyr::filter(sample %in% colnames(fpkm_data)[-c(1:2)]) %>% pull(sample) print(names(fpkm_data)) print(names(deseq2_table)) joined_df <- join_tables(deseq2_table, fpkm_data) joined_df <- joined_df %>% dplyr::mutate(overexpressed_in = ifelse(logFoldChange > 0, c_group[1], c_group[2])) output_tb <- joined_df %>% dplyr::filter(padj < pval_threshold & abs(logFoldChange) > lfc_threshold) # Shorten names to less than 31 chars or excel gets angry output_tb } output_list <- purrr::map2(diffexp_tb_path, contrast_groups, build_output_table, fpkm = fpkm, col_name = col_name) print(contrast_names) names(output_list) <- purrr::map_chr(names(contrast_names), function(comp_name) { comp_name <- ifelse(nchar(comp_name) > 31, stringr::str_trunc(comp_name, 30, "right"), comp_name) comp_name }) print(names(output_list)) # names(output_list) <- names(contrast_names) openxlsx::write.xlsx(x = output_list, file = output_path, ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | import gffutils db = gffutils.create_db( snakemake.input[0], dbfn=snakemake.output.db, force=True, keep_order=True, merge_strategy="merge", sort_attribute_values=True, disable_infer_genes=True, disable_infer_transcripts=True, ) with open(snakemake.output.bed, "w") as outfileobj: for tx in db.features_of_type("transcript", order_by="start"): bed = [s.strip() for s in db.bed12(tx).split("\t")] bed[3] = tx.id outfileobj.write("{}\n".format("\t".join(bed))) |
1 2 3 4 5 6 7 8 9 10 11 12 13 | log <- file(snakemake@log[[1]], open = "wt") sink(log) sink(log, type = "message") library("DESeq2") # load deseq2 data dds <- readRDS(snakemake@input[[1]]) # obtain normalized counts counts <- assay(dds) svg(snakemake@output[[1]]) plotPCA(dds, intgroup = snakemake@params[["pca_labels"]]) dev.off() |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | library(tidyverse) library(DESeq2) deseq_obj <- readRDS(snakemake@input[["dds_obj"]]) if (ncol(deseq_obj) < 50) { rld <- DESeq2::rlog(deseq_obj, blind = FALSE) } else { rld <- DESeq2::vst(deseq_obj, blind = FALSE) } saveRDS(rld, snakemake@output[["rld"]]) norm_counts <- assay(rld, withDimnames = T) %>% as_tibble(rownames = "gene") og_gene_names <- norm_counts$gene norm_counts$gene <- stringr::str_extract(norm_counts$gene, pattern = "^ENS[A-Z0-9]*") # this variable holds a mirror name until useEnsembl succeeds ('www' is last, because of very frequent 'Internal Server # Error's) species <- RNAscripts::get_organism_ensembl_name(snakemake@config[["organism"]]) stable_get_bm <- function(species) { mart <- "useast" rounds <- 0 while (class(mart)[[1]] != "Mart") { mart <- tryCatch( { # done here, because error function does not modify outer scope variables, I tried if (mart == "www") { rounds <- rounds + 1 } # equivalent to useMart, but you can choose the mirror instead of specifying a host biomaRt::useEnsembl(biomart = "ENSEMBL_MART_ENSEMBL", dataset = glue::glue("{species}_gene_ensembl"), mirror = mart) }, error = function(e) { # change or make configurable if you want more or less rounds of tries of all the mirrors if (rounds >= 3) { stop() } # hop to next mirror mart <- switch(mart, useast = "uswest", uswest = "asia", asia = "www", www = { # wait before starting another round through the mirrors, hoping that intermittent problems disappear Sys.sleep(30) "useast" } ) } ) } mart } mart <- stable_get_bm(species) # df <- read.table(snakemake@input']], sep='\t', header=1) gene_name_type <- snakemake@config[["gene_name_type"]] if (gene_name_type == "ENSEMBL") { g2g <- biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name"), filters = "ensembl_gene_id", values = stringr::str_extract(norm_counts$gene, pattern = "^ENS[A-Z0-9]*" ), mart = mart, ) colnames(g2g) <- c("gene", "gname") norm_counts$short_gene_names <- stringr::str_extract(norm_counts$gene, pattern = "^ENS[A-Z0-9]*") annotated <- dplyr::left_join(norm_counts, g2g, by = c(short_gene_names = "gene")) %>% dplyr::select(-short_gene_names) annotated$gname <- ifelse(annotated$gname == "" | is.na(annotated$gname), annotated$gene, annotated$gname) } else if (gene_name_type == "ENTREZ_ID") { stop("to be implemented") } else if (gene_name_type == "HGNC") { annotated <- norm_counts annotated$gname <- annotated$gene } else { stop("non valid gene name type") } # annotated$gene <- ifelse(annotated$external_gene_name == '', annotated$gene, annotated$external_gene_name) annotated$gene <- og_gene_names readr::write_tsv(annotated, snakemake@output[["fpkm"]]) |
9 10 11 12 | contrast_groups <- ifelse(exists("snakemake"), snakemake@wildcards[["contrast"]], "" ) |
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | library(CARNIVAL) library(readr) library(piano) library(dplyr) library(ggplot2) library(tibble) library(tidyr) library(dplyr) library(scales) library(plyr) library(GSEABase) library(network) library(reshape2) library(cowplot) library(pheatmap) library(ggraph) library(tidygraph) library(snowfall) |
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | if (exists("snakemake")) { carnival_path <- snakemake@input[["carnival_obj"]] tt_basepath <- snakemake@params[["tutorial_source_path"]] comp_group <- snakemake@wildcards[["contrast"]] out_file <- snakemake@output[[1]] organism <- snakemake@config[["organism"]] } else { yaml <- yaml::read_yaml("../../../configs/VascAge_cre_config.yaml") comp_group <- names(yaml$diffexp$contrasts$condition)[1] base_path <- yaml$dirs$BASE_ANALYSIS_DIR carnival_path <- file.path( base_path, "results/inversecarnival", glue::glue("{comp_group}_carnival_res.RDS.gz") ) tt_basepath <- "../transcriptutorial" out_file <- "" } source(file.path(tt_basepath, "support_enrichment.r")) source(file.path(tt_basepath, "support_networks.r")) process_carnival <- function(carnival_res) { carnival_res$weightedSIF <- carnival_res$weightedSIF %>% dplyr::filter(Weight != 0) carnival_res } msig_h <- msigdbr::msigdbr(organism, "C2", ) pathways <- loadGSC( file = dplyr::select( msig_h, c("gene_symbol", "gs_name") ), addInfo = dplyr::select( msig_h, c("gs_name", "gs_description") ) ) |
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | carnival_result <- readRDS(carnival_path) if (length(carnival_result) == 0) { write("CARNIVAL FAILED", file = out_file) quit(save = "no", status = 0) } carnival_result <- process_carnival(carnival_result) nodes_carnival <- extractCARNIVALnodes(carnival_result) sig_pathways <- runGSAhyper( genes = nodes_carnival$sucesses, universe = nodes_carnival$bg, gsc = pathways ) sig_pathways_df <- as.data.frame(sig_pathways$resTab) %>% tibble::rownames_to_column(var = "pathway") pathways_select <- sig_pathways_df %>% dplyr::select(pathway, `p-value`, `Adjusted p-value`) %>% dplyr::filter(`Adjusted p-value` <= 0.05) %>% # Signficance value fixed for now dplyr::rename(pvalue = `p-value`, AdjPvalu = `Adjusted p-value`) %>% dplyr::mutate(pathway = as.factor(pathway)) pathways_select <- data.frame(t(apply(pathways_select, 1, function(r) { aux <- unlist(strsplit(sub("_", ";", r["pathway"]), ";")) r["pathway"] <- gsub("_", " ", aux[2]) return(c(r, "source" = aux[1])) }))) if (ncol(pathways_select) == 4) { colnames(pathways_select) <- c("pathway", "pvalue", "AdjPvalu", "source") pathways_select$AdjPvalu <- as.numeric(pathways_select$AdjPvalu) ggdata <- pathways_select %>% dplyr::slice_min(AdjPvalu, n = 25) %>% dplyr::filter(AdjPvalu <= 0.05) %>% dplyr::group_by(source) %>% dplyr::arrange(AdjPvalu) %>% dplyr::slice(1:5) # Visualize top results ggplot(ggdata, aes( y = reorder(pathway, -log10(AdjPvalu)), x = -log10(AdjPvalu) ), color = source) + facet_grid(source ~ ., scales = "free", space = "free") + geom_bar(stat = "identity") + annotation_logticks(sides = "bt") + theme_bw() + theme( axis.title = element_text(face = "bold", size = 12), axis.text.y = element_text(size = 6) ) + ylab("") } |
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | plot_carn_network <- function(carn_res) { ed <- carn_res$weightedSIF colnames(ed) <- c("from", "sign", "to", "weight") nod <- data.frame(union(ed$from, ed$to)) colnames(nod) <- "nodes" nod$label <- nod$nodes joined_nod <- left_join(nod, carn_res$nodesAttributes, by = c("label" = "Node") ) yote <- tidygraph::tbl_graph( nodes = joined_nod, edges = ed ) %>% ggraph(layout = "auto") + geom_node_point(aes(color = AvgAct), size = 8) + scale_color_gradient2() + geom_edge_link(arrow = arrow(), aes( edge_colour = as.factor(sign), edge_alpha = weight )) + theme_graph() + geom_node_text(aes(label = label), vjust = 0.4, repel = T) yote } plot_carn_network(carnival_result) + ggtitle(comp_group) |
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | options(error = traceback, echo = FALSE) library(readr) library(piano) library(dplyr) library(ggplot2) library(tibble) library(tidyr) library(dplyr) library(scales) library(plyr) library(GSEABase) library(network) library(reshape2) library(cowplot) library(pheatmap) library(ggraph) library(tidygraph) library(ComplexHeatmap) library(RNAscripts) library(UpSetR) library(knitr) |
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | if (exists("snakemake")) { carnival_paths <- snakemake@input[["carnival_objs"]] %>% as.character() cond_id <- snakemake@wildcards[["condition"]] names(carnival_paths) <- names(snakemake@config[["diffexp"]][["contrasts"]][[cond_id]]) contrast_names <- names(snakemake@config[["diffexp"]][["contrasts"]][[cond_id]]) write(carnival_paths, file = stderr()) write(names(carnival_paths), file = stderr()) tt_basepath <- snakemake@params[["tutorial_source_path"]] # Output Files outpath <- snakemake@output[[1]] organism <- snakemake@config[["organism"]] } else { # BASE_ANALYSIS_DIR = '/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/Vasc_age2020/' cond_id <- "condition" the_yaml <- file.path("../../../configs/VascAge_Apelin_config.yaml") yaml_me <- yaml::read_yaml(the_yaml) BASE_ANALYSIS_DIR <- yaml_me$dirs$BASE_ANALYSIS_DIR carnival_paths <- file.path( BASE_ANALYSIS_DIR, "results/carnival/condition", paste0(names(yaml_me$diffexp$contrasts$condition), "_carnival_res.RDS.gz") ) names(carnival_paths) <- names(yaml_me$diffexp$contrasts$condition) contrast_names <- names(yaml_me$diffexp$contrasts$condition) base_path <- "" tt_basepath <- "../transcriptutorial" organism <- "Mus musculus" } source(file.path(tt_basepath, "support_enrichment.r")) source(file.path(tt_basepath, "support_networks.r")) ## Get Omnipath organism_number <- RNAscripts::get_organism_omnipath_id(organism) omniR <- OmnipathR::import_omnipath_interactions(organism = organism_number) # signed and directed omnipath_sd <- omniR %>% dplyr::filter(consensus_direction == 1 & (consensus_stimulation == 1 | consensus_inhibition == 1 )) # changing 0/1 criteria in consensus_stimulation/inhibition to -1/1 omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1 # check consistency on consensus sign and select only those in a SIF format sif <- omnipath_sd[, c("source_genesymbol", "consensus_stimulation", "consensus_inhibition", "target_genesymbol")] %>% dplyr::filter(consensus_stimulation == consensus_inhibition) %>% unique.data.frame() sif$consensus_stimulation <- NULL colnames(sif) <- c("source", "interaction", "target") # remove complexes sif$source <- gsub(":", "_", sif$source) sif$target <- gsub(":", "_", sif$target) # dorothea for CARNIVAL carnival_sample_resolution <- purrr::map(carnival_paths, read_rds) for (i in seq_along(length(carnival_sample_resolution))) { if (length(carnival_sample_resolution[[i]]) == 0) { carnival_sample_resolution[[i]] <- NULL } } if (length(carnival_sample_resolution) == 0) { write("Carnvival unsuccessful", file = stderr()) readr::write_tsv(data.frame(), file = outpath) quit(save = "no", status = 0, runLast = FALSE) } msig_h <- msigdbr::msigdbr("Mus musculus", "C2", ) pathways <- loadGSC( file = dplyr::select(msig_h, c("gene_symbol", "gs_name")), addInfo = dplyr::select(msig_h, c("gs_name", "gs_description")) ) # nodes_carnival <- extractCARNIVALnodes(carnival_sample_resolution[[1]]) |
134 135 136 137 138 139 140 141 142 | carnival_sample_resolution <- purrr::map( carnival_sample_resolution, process_carnival ) carnival_sample_resolution <- carnival_sample_resolution[purrr::map(carnival_sample_resolution, length) > 0] if (length(carnival_sample_resolution) == 0) { knitr::knit_exit(append = "All sample carnival runs failed. Aborting...") } |
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | # get only summary files from CARNIVAL results sifts <- lapply(carnival_sample_resolution, function(x) { x$weightedSIF }) nodos <- lapply(carnival_sample_resolution, function(x) { x$nodesAttributes }) write(names(sifts), file = stderr()) # Calculate the number of edges and nodes in the networks and its density node_edge <- do.call(rbind, lapply(sifts, count_edges_nodes_degree)) # Calculate degree distribution for a sample count_degree <- sifts[[1]] %>% degree_count() # degree distribution p <- data.frame(table(count_degree$total_count) / nrow(count_degree)) colnames(p) <- c("Var1", "total_degree") p <- merge.data.frame(p, data.frame(table(count_degree$in_count) / nrow(count_degree)), all = T) colnames(p) <- c("Var1", "total_degree", "in_degree") p <- merge.data.frame(p, data.frame(table(count_degree$out_count) / nrow(count_degree)), all = T) colnames(p) <- c("k", "total_degree", "in_degree", "out_degree") p <- melt(p, value.name = "p", id.vars = "k") p$k <- relevel(p$k, "0") # visualise ggdat <- as.data.frame(node_edge) %>% tibble::rownames_to_column(var = "sample") %>% dplyr::mutate(condition = gsub(".Rep[0-9]{1}", "", sample)) # Plotting # relation between number of edges and nodes ggplot(ggdat, aes(x = nodes, y = edges, color = as.factor(condition))) + geom_point() + geom_text( label = ggdat$sample, check_overlap = TRUE, vjust = 0, nudge_y = 0.5, show.legend = F ) + theme_bw(base_size = 15) + guides(color = guide_legend(title = "Conditions")) + ggtitle("Node-edge composition") |
197 198 199 200 201 202 | # network degree ggplot(ggdat, aes(x = density, y = sample, fill = as.factor(condition))) + geom_col() + theme_bw(base_size = 15) + guides(fill = guide_legend(title = "Conditions")) + ggtitle("Network degree") |
209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | # degree distribution levels(p$k) <- levels(p$k) %>% as.numeric() %>% sort() dd <- ggplot(data = p, aes(x = k, y = p, group = variable, color = variable)) + geom_point() + geom_line() + theme_bw() + theme(legend.position = "none") + guides(color = guide_legend(title = "degree type")) + ggtitle("Degree distribution") ddp <- ggplot(data = p, aes(x = as.numeric(k), y = p, group = variable, color = variable)) + geom_point() + geom_line() + scale_x_continuous( breaks = as.numeric(p$k), trans = scales::log_trans() ) + scale_y_log10( breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)) ) + annotation_logticks() + theme_bw() + guides(color = guide_legend(title = "degree type")) + ggtitle("Degree distribution (log scale)") + xlab("k (ln)") + ylab("p (log10)") plot_grid(dd, ddp, labels = "auto", rel_widths = c(1, 2)) |
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 | # create a matrix of all interactions for all samples write(dim(sif), file = stderr()) interactions <- getTopology(networks = sifts, scafoldNET = sif) colnames(interactions) <- names(carnival_sample_resolution) # FIxes bug in topology function (To lazy to actually fix) ncol_interact <- ncol(interactions) # interactions <- interactions[,-c(1:ncol_interact/2)] # get the edges per sample net_int <- apply(interactions, 2, function(x, r) { r[which(!is.na(x))] }, rownames(interactions)) # calculate Jaccard indexes per pair combined <- expand.grid(1:length(names(sifts)), 1:length(names(sifts))) jac_index <- matrix( data = NA, nrow = length(names(sifts)), ncol = length(names(sifts)), dimnames = list(names(sifts), names(sifts)) ) for (i in 1:nrow(combined)) { n <- names(sifts)[combined[i, 1]] m <- names(sifts)[combined[i, 2]] jac_index[n, m] <- length(intersect(net_int[[n]], net_int[[m]])) / length(union(net_int[[n]], net_int[[m]])) } # Visualize the indexes in a heatmap pheatmap::pheatmap(jac_index, fontsize = 14, fontsize_row = 10, fontsize_col = 10, angle_col = 45, treeheight_col = 0 ) corrplot::corrplot(jac_index, ) |
296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 | # Get common interactions in a group get_common_interactions <- function(interaction_list, samples, psmpl_per = 95, carnival_list = NULL) { stopifnot(all(samples %in% colnames(interaction_list))) interaction_list <- interaction_list[, samples] if (!is.null(carnival_list)) { nodos <- purrr::map_df(carnival_list[samples], function(carn_res) { carn_res$nodesAttributes %>% pull(AvgAct) }) nodos$nodes <- carnival_list[[1]]$nodesAttributes$Node } shared_interactions_WT <- NULL while (is.null(shared_interactions_WT)) { if (psmpl_per <= 0) { break } psmpl_per <- psmpl_per - 5 shared_interactions_WT <- getCoreInteractions(topology = interaction_list, psmpl = psmpl_per) } # Visualise the interactions if (!is.null(shared_interactions_WT)) { colnames(shared_interactions_WT) <- c("from", "sign", "to") labels_edge <- c("-1" = "inhibition", "1" = "activation") nodes <- data.frame(union(shared_interactions_WT$from, shared_interactions_WT$to)) colnames(nodes) <- "nodes" nodes$label <- nodes$nodes if (!is.null(carnival_list)) { nodes <- dplyr::inner_join(nodes, nodos) %>% mutate(sd = matrixStats::rowSds(as.matrix(.[samples]))) } else { nodes$sd <- rep(1, nrow(nodes)) } p <- tidygraph::tbl_graph(nodes = nodes, edges = shared_interactions_WT) %>% ggraph::ggraph(layout = "auto") + geom_node_point(aes(color = sd), size = 8) + geom_edge_link(arrow = arrow(), aes(edge_colour = as.factor(sign))) + theme_graph() + scale_color_viridis(option = "E") + geom_node_text(aes(label = label), nudge_y = 0.04) + labs(caption = glue::glue("Overlap in {psmpl_per} percent of groups")) } else { p <- NULL } p } get_common_interactions(interactions[], names(interactions), 100, carnival_list = carnival_sample_resolution) + ggtitle("Between all groups") # get_common_interactions(interactions[,c(6,7)], 100) + ggtitle("Apelin Treatment common interactions") |
354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 | # Get common interactions in a group create_overlap_matrix <- function(sig_gene_names) { gene_name_index <- unlist(sig_gene_names) %>% unique() %>% sort() #' Takes two character vecotrs (info and index) and checks which index #' fields are present in info #' #' @return returns a vecor of length(index) with 1 or 0 depending on #' True or False check_against_index <- function(info, index) { return(as.numeric(index %in% info)) } gene_matrix <- sapply(sig_gene_names, check_against_index, index = gene_name_index) rownames(gene_matrix) <- gene_name_index row_sum <- rowSums(gene_matrix) gene_matrix <- cbind(gene_matrix, total = rowSums(gene_matrix)) ordered_matrix <- gene_matrix[order(gene_matrix[, "total"], decreasing = T), ] DT::datatable(ordered_matrix) } build_comb_gene_set <- function(comb_mat, comb_mat_order) { combination_gene_sets <- list() for (c_name in ComplexHeatmap::comb_name(comb_mat)) { # c_name <- comb_name(comb_mat)[1] gnames <- ComplexHeatmap::extract_comb(m = comb_mat, comb_name = c_name) groups_index <- stringr::str_split(c_name, "") %>% unlist() group_names <- ComplexHeatmap::set_name(comb_mat)[as.logical(as.numeric(groups_index))] merged_group_name <- paste(group_names, collapse = " + ") combination_gene_sets[[merged_group_name]] <- tibble::tibble(gene_names = gnames, lfc = rep(0, length(gnames))) } if (length(combination_gene_sets) > 30) { combination_gene_sets <- combination_gene_sets[c(match(names(comb_mat_order), ComplexHeatmap::comb_name(comb_mat))[1:30])] } else { combination_gene_sets <- combination_gene_sets } combination_gene_sets } gsa_plot <- function(gene_list, background, pathways) { sig_pathways <- runGSAhyper( genes = gene_list, universe = background, gsc = pathways ) sig_pathways_df <- as.data.frame(sig_pathways$resTab) %>% tibble::rownames_to_column(var = "pathway") PathwaysSelect <- sig_pathways_df %>% dplyr::select(pathway, `p-value`, `Adjusted p-value`) %>% dplyr::filter(`Adjusted p-value` <= 0.05) %>% dplyr::rename(pvalue = `p-value`, AdjPvalu = `Adjusted p-value`) %>% dplyr::mutate(pathway = as.factor(pathway)) PathwaysSelect <- data.frame(t(apply(PathwaysSelect, 1, function(r) { aux <- unlist(strsplit(sub("_", ";", r["pathway"]), ";")) r["pathway"] <- gsub("_", " ", aux[2]) return(c(r, "source" = aux[1])) }))) if (ncol(PathwaysSelect) == 4) { colnames(PathwaysSelect) <- c("pathway", "pvalue", "AdjPvalu", "source") PathwaysSelect$AdjPvalu <- as.numeric(PathwaysSelect$AdjPvalu) ggdata <- PathwaysSelect %>% dplyr::slice_min(AdjPvalu, n = 25) %>% dplyr::filter(AdjPvalu <= 0.05) %>% dplyr::group_by(source) %>% dplyr::arrange(AdjPvalu) %>% dplyr::slice(1:5) # Visualize top results ggplot(ggdata, aes(y = reorder(pathway, -log10(AdjPvalu)), x = -log10(AdjPvalu)), color = source) + facet_grid(source ~ ., scales = "free", space = "free") + geom_bar(stat = "identity") + annotation_logticks(sides = "bt") + theme_bw() + theme( axis.title = element_text(face = "bold", size = 12), axis.text.y = element_text(size = 6) ) + ylab("") } } build_gene_sets <- function(node_list) { gene_name_set <- ComplexHeatmap::list_to_matrix(node_list) comb_mat <- ComplexHeatmap::make_comb_mat(gene_name_set) comb_mat_order <- ComplexHeatmap::comb_size(comb_mat) %>% sort(decreasing = T) create_overlap_matrix(node_list) combination_gene_sets_youndold <- build_comb_gene_set(comb_mat, comb_mat_order = comb_mat_order) combination_gene_sets_youndold } if (all(c(contrast_names) %in% names(interactions))) { get_common_interactions(interactions[], samples = contrast_names, psmpl_per = 95, carnival_list = carnival_sample_resolution ) nodes <- purrr::map(contrast_names, function(x) { carnival_sample_resolution[[x]]$nodesAttributes %>% dplyr::filter(AvgAct != 0) }) node_list <- purrr::map(nodes, pull, Node) names(node_list) <- contrast_names upset_act <- UpSetR::fromList(node_list) UpSetR::upset(upset_act) gene_name_set <- ComplexHeatmap::list_to_matrix(node_list) comb_mat <- ComplexHeatmap::make_comb_mat(gene_name_set) comb_mat_order <- ComplexHeatmap::comb_size(comb_mat) %>% sort(decreasing = T) # create_overlap_matrix(node_list) combination_gene_sets <- build_comb_gene_set(comb_mat, comb_mat_order = comb_mat_order) create_overlap_matrix(node_list) } |
501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 | sample_nodes <- purrr::map(carnival_sample_resolution, function(x) { x$nodesAttributes }) build_matrix_from_nodes <- function(node_list, node_type = "AvgAct") { gene_index <- purrr::map(node_list, ~ pull(., Node)) %>% unlist() %>% unique() node_mat <- purrr::map(node_list, ~ dplyr::select(., Node, !!node_type)) %>% purrr::reduce(full_join, by = "Node") colnames(node_mat) <- c("Node", names(node_list)) node_mat } avg_mat <- build_matrix_from_nodes(sample_nodes) %>% as.data.frame() rownames(avg_mat) <- avg_mat$Node avg_mat <- subset(avg_mat, select = -c(Node)) %>% as.matrix() non_zero_index <- apply(avg_mat, 1, function(x) !all(x == 0)) ComplexHeatmap::Heatmap(avg_mat[non_zero_index, ], column_names_rot = 45, row_names_gp = gpar(fontsize = 6)) |
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, dev = "png", fig.width = 12, fig.height = 12 ) if (!require(dorothea)) { BiocManager::install("dorothea") } library(dorothea) library(decoupleR) library(BiocParallel) library(dplyr) library(viper) library(RColorBrewer) if (!require(RNAscripts)) { devtools::install("./scripts/RNAscripts", upgrade = "never") } library(RNAscripts) library(ggplot2) library(readr) library(ComplexHeatmap) library(DESeq2) library(purrr) library(tidyverse) if (TRUE) { write(str(snakemake), file = stderr()) dds_path <- snakemake@input[["dds_obj"]] cond_var <- snakemake@wildcards[["condition"]] diffexp_tb_path <- snakemake@input[["table"]] fpkm_path <- snakemake@input[["fpkm"]] contrast_groups <- snakemake@wildcards[["contrast"]] register(MulticoreParam(snakemake@threads)) contrast_name <- contrast_groups plot_path <- snakemake@params[["plot_path"]] color_scheme <- snakemake@config[["group_colors"]][[cond_var]] organism <- snakemake@config[["organism"]] } else { the_yaml <- yaml::read_yaml("../../../config/STAD.yaml") snakedir <- "/Users/heyechri/Documents/software/heyer/multicondition-deseq2-enrichment" BASE_ANALYSIS_DIR <- file.path(snakedir, "data/STAD") dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds") cond_var <- names(the_yaml$diffexp$contrasts)[2] diffexp_tb_path <- file.path( paste0(BASE_ANALYSIS_DIR), glue::glue("results/diffexp/{cond_var}/{names(the_yaml$diffexp$contrasts[[cond_var]])[1]}.diffexp.tsv") ) fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") contrast_groups <- the_yaml$diffexp$contrasts[[cond_var]][1] plot_path <- "." register(SerialParam()) # s_groups<- c("d0-lung", "d15-lung", "d22-lung", "d36-lung", "18m-lung") contrast_name <- glue::glue("{contrast_groups[1]} vs {contrast_groups[2]}") comp_groups <- the_yaml$comp_groups color_scheme <- the_yaml$group_colors[[cond_var]] organism <- "Homo sapiens" } dir.create(plot_path, recursive = T) |
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | dds_obj <- readRDS(dds_path) diffexp_tb <- read_tsv(diffexp_tb_path, col_names = c( "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj" ), skip = 1 ) Normalized_counts <- getVarianceStabilizedData(dds_obj) # Normalized_counts<- assay(vst(dds_obj,blind = FALSE)) fpkm <- read_tsv(fpkm_path) filer <- fpkm %>% dplyr::filter(gene %in% rownames(Normalized_counts)) %>% dplyr::filter(!duplicated(gname)) joined_df <- join_tables(diffexp_tb, filer) %>% dplyr::filter(!duplicated(gname)) Normalized_counts <- Normalized_counts[filer$gene, ] rownames(Normalized_counts) <- filer$gname joined_df %>% dplyr::select(gname, stat) %>% dplyr::filter(!is.na(stat)) %>% tibble::column_to_rownames(var = "gname") %>% as.matrix() -> diffexp_matrix regulons <- dorothea_mm %>% filter(confidence %in% c("A", "B", "C")) |
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | if (organism == "Mus musculus") { org_name <- "mouse" } else if (organism == "Homo sapiens") { org_name <- "human" } else { stop("Organism not supported. Please set Mus musculus or Homo sapiens in the config") } net <- get_dorothea(organism = org_name, levels = c("A", "B", "C")) deg <- joined_df %>% dplyr::select(gname, logFoldChange, stat, padj) %>% dplyr::filter(!is.na(stat)) %>% mutate(padj = tidyr::replace_na(padj, 1)) %>% tibble::column_to_rownames(var = "gname") %>% as.matrix() -> diffexp_matrix design <- dds_obj@colData sample_acts <- run_wmean( mat = Normalized_counts, network = net, .source = "source", .target = "target", .mor = "mor", times = 1000, minsize = 2 ) n_tfs <- 30 # Transform to wide matrix sample_acts_mat <- sample_acts %>% filter(statistic == "norm_wmean") %>% pivot_wider( id_cols = "condition", names_from = "source", values_from = "score" ) %>% tibble::column_to_rownames("condition") %>% as.matrix() # Get top tfs with more variable means across clusters tfs <- sample_acts %>% group_by(source) %>% dplyr::summarise(std = sd(score)) %>% arrange(-abs(std)) %>% head(n_tfs, n = 30) %>% pull(source) sample_acts_mat <- sample_acts_mat[, tfs] # Scale per sample sample_acts_mat <- scale(sample_acts_mat, scale = F) # Choose color palette palette_length <- 100 my_color <- colorRampPalette(c("Darkblue", "white", "red"))(palette_length) my_breaks <- c( seq(-3, 0, length.out = ceiling(palette_length / 2) + 1), seq(0.05, 3, length.out = floor(palette_length / 2)) ) if (!is.null(color_scheme)) { ha <- rowAnnotation( group = as.character(dds_obj@colData[, cond_var]), col = list( group = as_vector(color_scheme) ) ) } else { ha <- rowAnnotation(group = as.character(dds_obj@colData[, cond_var])) } # Plot ComplexHeatmap::Heatmap(sample_acts_mat, clustering_method_rows = "average", clustering_method_columns = "average", right_annotation = ha, heatmap_legend_param = list(title = "score") ) |
183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | contrast_acts <- run_wmean( mat = deg[, "stat", drop = FALSE], net = net, .source = "source", .target = "target", .mor = "mor", times = 1000, minsize = 3 ) contrast_acts f_contrast_acts <- contrast_acts %>% filter(statistic == "norm_wmean") %>% mutate(rnk = NA) # Filter top TFs in both signs msk <- f_contrast_acts$score > 0 f_contrast_acts[msk, "rnk"] <- rank(-f_contrast_acts[msk, "score"]) f_contrast_acts[!msk, "rnk"] <- rank(-abs(f_contrast_acts[!msk, "score"])) tfs <- f_contrast_acts %>% arrange(rnk) %>% head(n_tfs) %>% pull(source) f_contrast_acts <- f_contrast_acts %>% filter(source %in% tfs) # Plot ggplot(f_contrast_acts, aes(x = reorder(source, score), y = score)) + geom_bar(aes(fill = score), stat = "identity") + scale_fill_gradient2( low = "darkblue", high = "indianred", mid = "whitesmoke", midpoint = 0 ) + theme_minimal() + theme( axis.title = element_text(face = "bold", size = 12), axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) + xlab("TFs") |
238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 | viper_net <- net colnames(viper_net) <- colnames(dorothea_mm) tf_activities_stat <- dorothea::run_viper(diffexp_matrix, viper_net, options = list( minsize = 5, eset.filter = FALSE, cores = 1, verbose = F, nes = TRUE ) ) tf_activities_stat_top25 <- tf_activities_stat %>% as.data.frame() %>% rownames_to_column(var = "GeneID") %>% dplyr::rename(NES = "stat") %>% dplyr::top_n(25, wt = abs(NES)) %>% dplyr::arrange(NES) %>% dplyr::mutate(GeneID = factor(GeneID)) NES_plot <- ggplot(tf_activities_stat_top25, aes(x = reorder(GeneID, NES), y = NES)) + geom_bar(aes(fill = NES), stat = "identity") + scale_fill_gradient2( low = "darkblue", high = "indianred", mid = "whitesmoke", midpoint = 0 ) + theme_minimal() + theme( axis.title = element_text(face = "bold", size = 12), axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) + xlab("Transcription Factors") + ggtitle(contrast_name) ggsave(filename = file.path(plot_path, glue::glue("{contrast_name}_NES_plot.svg")), NES_plot) NES_plot |
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 | tf_activities_counts <- dorothea::run_viper(Normalized_counts, regulons, options = list( minsize = 5, eset.filter = FALSE, cores = 1, verbose = FALSE, method = c("scale") ) ) tf_activities_counts_filter <- tf_activities_counts %>% as.data.frame() %>% rownames_to_column(var = "GeneID") %>% dplyr::filter(GeneID %in% tf_activities_stat_top25$GeneID) %>% column_to_rownames(var = "GeneID") %>% as.matrix() tf_activities_vector <- as.vector(tf_activities_counts_filter) paletteLength <- 100 ha <- HeatmapAnnotation( group = dds_obj@colData$condition, col = list( group = as_vector(color_scheme) ) ) dorothea_hmap <- ComplexHeatmap::Heatmap(tf_activities_counts_filter, name = glue::glue("NES"), top_annotation = ha, show_column_names = F ) save_cheatmap_svg(x = dorothea_hmap, filename = file.path(plot_path, glue::glue("{contrast_name}_dorothea.svg"))) dorothea_hmap |
315 316 317 318 | tf_activities_CARNIVALinput <- tf_activities_stat %>% as.data.frame() %>% tibble::rownames_to_column(var = "TF") write_csv(tf_activities_CARNIVALinput, "../results/TFActivity_CARNIVALinput.csv") |
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, dev = "png", fig.width = 12, fig.height = 12 ) library(purrr) library(clusterProfiler) library(magrittr) library(ggplot2) library(org.Mm.eg.db) library(svglite) if (exists("snakemake")) { gsea_list <- snakemake@input[["gsea_result"]] plot_path <- snakemake@params[["plot_path"]] c_groups <- snakemake@params[["contrast_groups"]] write(unlist(c_groups), file = stderr()) names(gsea_list) <- names(x = c_groups) species_name <- snakemake@config[["organism"]] gene_name_type <- snakemake@config[["gene_name_type"]] gsea_conf <- snakemake@config[["gsea"]] save_plots <- TRUE } else { conf <- yaml::read_yaml("../../../configs/VascAge_Apelin_config.yaml") BASE_DIR <- file.path(conf$dirs$BASE_ANALYSIS_DIR) cond_id <- names(conf$diffexp$contrasts)[1] gsea_list <- as.list(file.path(BASE_DIR, glue::glue("results/diffexp/{cond_id}/{names(conf$diffexp$contrasts[[cond_id]])}.gseares.RDS"))) c_groups <- names(conf$diffexp$contrasts[[cond_id]]) names(gsea_list) <- c_groups # gsea_list <- gsea_list[1:3] plot_path <- "/home/heyer/tmp/" species_name <- "Mus musculus" gene_name_type <- "ENSEMBL" save_plots <- FALSE gsea_conf <- conf$gsea } limit <- 10 |
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | create_all_gsea_plots <- function(msig_list, contrast_n, limit = 30, plot_type, save_plots = F) { if (is.list(msig_list)) { msig_list <- msig_list[[1]] } if (nrow(msig_list) < limit) { sig_ids <- msig_list@result$ID } else { sig_ids <- msig_list@result$ID[1:limit] } all_plots <- purrr::map(sig_ids, function(gset_id) { p <- enrichplot::gseaplot2( x = msig_list, geneSetID = gset_id, title = msig_list[gset_id, ]$Description ) aplot::as.patchwork(p) }) names(all_plots) <- sig_ids if (save_plots) { gsea_plot_path <- glue::glue("{plot_path}/{contrast_n}/{plot_type}", recursive = T) dir.create(gsea_plot_path, recursive = T) purrr::map2(all_plots, sig_ids, function(p, n) { ggsave(plot = p, filename = file.path(gsea_plot_path, paste0(n, "_gseaplot.svgz")), device = svglite) }) } all_plots } gsea_result <- purrr::map(gsea_list, readRDS) |
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | build_gene_set_detail <- function(gset_res, gset_name, inkey) { ensembl_names <- gset_res@result[gset_name, ]$core_enrichment %>% stringr::str_split("/") %>% unlist() all_geneList <- gset_res@geneList %>% tibble::as_tibble() all_geneList[inkey] <- as.character(names(gset_res@geneList)) all_geneList <- all_geneList %>% dplyr::filter(!!rlang::sym(inkey) %in% gset_res@geneSets[[gset_name]]) %>% mutate(core_enrichment = ifelse(!!rlang::sym(inkey) %in% ensembl_names, "YES", "NO")) org_db <- RNAscripts::get_org_db(species_name) t_table <- clusterProfiler::bitr(all_geneList %>% dplyr::pull(inkey), inkey, "SYMBOL", OrgDb = org_db) yarp <- dplyr::inner_join(all_geneList, t_table) %>% dplyr::relocate(SYMBOL) %>% dplyr::arrange(dplyr::desc(value)) yarp } create_msig_entries <- function(gsea_res, msig_type, ktype, limit = 20) { stopifnot(!is.null(names(gsea_res))) for (set_n in names(gsea_res)) { cat("#### ", set_n, " {.tabset} \n") yote <- gsea_res[[set_n]][[msig_type]] if (is.null(yote)) { next } if (is.list(yote)) { yote <- yote[[1]] } org_db <- RNAscripts::get_org_db(species_name) if (is(gsea_result$basal_pos_vs_bas_neg$msig_gsea, "gseaResult")) { yote <- setReadable(yote, org_db, keyType = ktype) } msig_res <- gsea_res[[set_n]][[msig_type]] if (is.list(msig_res)) { msig_res <- msig_res[[1]] } all_plots <- create_all_gsea_plots(msig_res, plot_type = msig_type, contrast_n = set_n, limit = limit, save_plots = save_plots ) if (!is.null(msig_res)) { if (nrow(msig_res@result) < limit) { names(all_plots) <- msig_res@result$ID } else { names(all_plots) <- msig_res@result$ID[1:limit] } } print(knitr::kable(yote@result[, -c(1, length(yote@result))] %>% arrange(desc(NES)))) cat("\n") for (sig_gset in names(all_plots)) { cat("##### ", sig_gset, " \n") print(all_plots[[sig_gset]]) cat("\n") gene_enrich_table <- build_gene_set_detail(msig_res, gset_name = sig_gset, inkey = ktype ) print(knitr::kable(gene_enrich_table)) cat("\n") } cat("\n") cat("\n \n") } cat("\n") } |
190 191 192 193 194 195 196 197 198 199 200 201 202 203 | for (gsea_name in names(gsea_conf)) { if (gsea_conf[[gsea_name]]$use_gsea) { cat("\n") cat("### ", gsea_name , " {.tabset} \n") cat("\n") create_msig_entries(gsea_result, msig_type = gsea_name, ktype = gsea_conf[[gsea_name]][["id_class"]], limit = limit) cat("\n") cat("\n") gc() } } |
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, fig.width = 12, fig.height = 12 ) library(progeny) library(DESeq2) library(BiocParallel) library(readr) library(magrittr) library(tidyverse) library(RColorBrewer) library(ggplot2) library(decoupleR) library(ComplexHeatmap) if (!require(RNAscripts)) { devtools::install("./scripts/RNAscripts", upgrade = "never") } library(RNAscripts) if (exists("snakemake")) { dds_path <- snakemake@input[["dds_obj"]] diffexp_tb_path <- snakemake@input[["table"]] fpkm_path <- snakemake@input[["fpkm"]] cond_id <- snakemake@wildcards[["condition"]] contrast_groups <- snakemake@wildcards[["contrast"]] s_groups <- snakemake@params[["s_groups"]] register(MulticoreParam(snakemake@threads)) contrast_name <- contrast_groups plot_path <- snakemake@params[["plot_path"]] comp_groups <- snakemake@config[["comp_groups"]] color_scheme <- snakemake@config[["group_colors"]][[cond_id]] organism <- snakemake@config[["organism"]] } else { the_yaml <- yaml::read_yaml("../../../config/STAD.yaml") snakedir <- "/Users/heyechri/Documents/software/heyer/multicondition-deseq2-enrichment" BASE_ANALYSIS_DIR <- file.path(snakedir, "data/STAD") cond_id <- "SARIFA_Subtype" dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds") comp_id <- names(the_yaml$diffexp$contrasts)[1] diffexp_tb_path <- file.path( paste0(BASE_ANALYSIS_DIR), glue::glue("results/diffexp/{cond_id}/{names(the_yaml$diffexp$contrasts[[cond_id]])[1]}.diffexp.tsv") ) fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") contrast_groups <- the_yaml$diffexp$contrasts[[cond_id]][1] plot_path <- "." register(SerialParam()) s_groups <- names(the_yaml$group_colors[[cond_id]]) # s_groups<- c("d0-lung", "d15-lung", "d22-lung", "d36-lung", "18m-lung") contrast_name <- glue::glue("{contrast_groups[1]} vs {contrast_groups[2]}") comp_groups <- the_yaml$comp_groups color_scheme <- the_yaml$group_colors[[cond_id]] organism <- "Homo sapiens" } dir.create(plot_path, recursive = T) |
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | dds_obj <- readRDS(dds_path) diffexp_tb <- read_tsv(diffexp_tb_path, col_names = c( "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj" ), skip = 1 ) # Issue: we cant use vst for a small test dataset but getVarianceStabilzedData Works Normalized_counts <- getVarianceStabilizedData(dds_obj) # Normalized_counts<- assay(vst(dds_obj, blind = FALSE)) fpkm <- read_tsv(fpkm_path) filer <- fpkm %>% dplyr::filter(gene %in% rownames(Normalized_counts)) %>% dplyr::filter(!duplicated(gname)) joined_df <- join_tables(diffexp_tb, filer) %>% dplyr::filter(!duplicated(gname)) Normalized_counts <- Normalized_counts[filer$gene, ] rownames(Normalized_counts) <- filer$gname joined_df %>% dplyr::select(gname, stat) %>% dplyr::filter(!is.na(stat)) %>% column_to_rownames(var = "gname") %>% as.matrix() -> diffexp_matrix write("finished parsing", file = stderr()) |
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | org_name <- get_organism_omnipath_name(organism) prog_net <- decoupleR::get_progeny(organism = org_name, top = 100) PathwayActivity_counts <- progeny(Normalized_counts, scale = T, organism = org_name, top = 1000, verbose = F, perm = 1) # Test case has some nans in progeny, these are set to 0 PathwayActivity_counts[is.nan(PathwayActivity_counts)] <- 0 Activity_counts <- as.vector(PathwayActivity_counts) ann_col <- colData(dds_obj) %>% as_tibble() %>% pull(!!sym(cond_id)) %>% fct_relevel(names(color_scheme)) if (!is.null(color_scheme)) { top_anno <- ComplexHeatmap::HeatmapAnnotation( sample = ann_col, col = list("sample" = as_vector(color_scheme)) ) } else { top_anno <- ComplexHeatmap::HeatmapAnnotation(sample = ann_col) } progeny_hmap <- ComplexHeatmap::Heatmap(t(PathwayActivity_counts), top_annotation = top_anno, clustering_distance_columns = "euclidean", clustering_method_columns = "average", show_column_names = F, name = ) save_cheatmap_svg(x = progeny_hmap, filename = file.path(plot_path, "progeny_hmap.svg")) progeny_hmap |
145 146 147 148 149 150 151 152 | library(broom) as_tibble(PathwayActivity_counts) %>% t() -> transposed_pact kruskal_res <- apply(transposed_pact, 1, kruskal.test, g = as.factor(ann_col)) kruskal_test <- purrr::map_df(kruskal_res, tidy, .id = "pathway") %>% mutate(padj = p.adjust(p.value, method = "BH")) %>% arrange(desc(statistic)) |
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | plot_activity <- as_tibble(PathwayActivity_counts) %>% add_column(condition = fct_relevel(ann_col, s_groups)) for (pathway in kruskal_test$pathway) { pval <- kruskal_test %>% dplyr::filter(pathway == !!pathway) %>% pull(padj) p <- ggplot(plot_activity, aes(x = condition, y = !!sym(pathway), fill = condition)) + geom_boxplot(outlier.shape = NA, color = "black") + geom_jitter(width = 0.1) + theme_bw() + geom_hline(yintercept = 0) + ggtitle(pathway) + labs(caption = paste0("kruskal adj.p = ", round(pval, 4))) + ylab("progeny score") + ggsignif::geom_signif( comparisons = comp_groups, step_increase = 0.06, map_signif_level = T, show.legend = , color = "black" ) if (!is.null(color_scheme)) { p <- p + scale_fill_manual(values = as_vector(color_scheme)) } ggsave(filename = file.path(plot_path, glue::glue("{pathway}_boxplot.svg"))) plot(p) } |
197 198 199 200 201 | PathwayActivity_zscore <- progeny(diffexp_matrix, scale = TRUE, organism = org_name, top = 100, perm = 10000, z_scores = TRUE, verbose = T ) %>% t() colnames(PathwayActivity_zscore) <- "NES" |
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | PathwayActivity_zscore_df <- as.data.frame(PathwayActivity_zscore) %>% rownames_to_column(var = "Pathway") %>% dplyr::arrange(NES) %>% dplyr::mutate(Pathway = factor(Pathway)) NES_plot <- ggplot(PathwayActivity_zscore_df, aes(x = reorder(Pathway, NES), y = NES)) + geom_bar(aes(fill = NES), stat = "identity") + scale_fill_gradient2( low = "darkblue", high = "indianred", mid = "whitesmoke", midpoint = 0 ) + theme_minimal() + theme( axis.title = element_text(face = "bold", size = 12), axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) + xlab("Pathways") + ggtitle(contrast_name) ggsave(filename = file.path(plot_path, "NES_plot.svg"), NES_plot) NES_plot |
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 | prog_matrix <- getModel(org_name, top = 100) %>% as.data.frame() %>% tibble::rownames_to_column("GeneID") ttop_KOvsWT_df <- diffexp_matrix %>% as.data.frame() %>% tibble::rownames_to_column("GeneID") scatter_plots <- progeny::progenyScatter( df = ttop_KOvsWT_df, weight_matrix = prog_matrix, statName = "stat", verbose = T ) yeet <- purrr::map2(scatter_plots[[1]], names(scatter_plots[[1]]), function(x, n) ggsave(filename = file.path(plot_path, paste0(n, "_scatter.svg")), plot = x)) purrr::map(scatter_plots[[1]], plot) |
262 263 264 265 266 267 268 269 270 | net <- get_progeny(organism = org_name, top = 100) minsize <- ifelse(nrow(Normalized_counts) > 1000, 5, 2) times <- ifelse(nrow(Normalized_counts) > 1000, 1000, 100) sample_acts <- decoupleR::run_wmean( mat = Normalized_counts, net = net, .source = "source", .target = "target", .mor = "weight", times = times, minsize = minsize ) |
274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 | # Transform to wide matrix sample_acts_mat <- sample_acts %>% filter(statistic == "norm_wmean") %>% pivot_wider( id_cols = "condition", names_from = "source", values_from = "score" ) %>% column_to_rownames("condition") %>% as.matrix() # Scale per sample sample_acts_mat <- scale(sample_acts_mat) %>% t() sample_acts_mat <- sample_acts_mat[, rownames(dds_obj@colData)] if (!is.null(color_scheme)) { ha <- HeatmapAnnotation( group = dds_obj@colData[, cond_id], col = list( group = as_vector(color_scheme) ) ) } else { ha <- HeatmapAnnotation(group = dds_obj@colData[, cond_id]) } # Plot ComplexHeatmap::Heatmap(sample_acts_mat, top_annotation = ha, clustering_method_columns = "average", clustering_method_rows = "average", show_column_names = F ) |
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 | as_tibble(sample_acts_mat) -> out_yeet yeet <- apply(sample_acts_mat, 1, kruskal.test, g = as.factor(ann_col)) kruskal_test <- purrr::map_df(yeet, tidy, .id = "pathway") %>% mutate(padj = p.adjust(p.value, method = "BH")) %>% arrange(desc(statistic)) plot_activity <- sample_acts_mat %>% t() %>% as.data.frame() plot_activity[, cond_id] <- ann_col for (pathway in kruskal_test$pathway) { pval <- kruskal_test %>% dplyr::filter(pathway == !!pathway) %>% pull(padj) p <- ggplot(plot_activity, aes(x = !!sym(cond_id), y = !!sym(pathway), fill = !!sym(cond_id))) + geom_boxplot(outlier.shape = NA, color = "black") + geom_jitter(width = 0.1) + theme_bw() + geom_hline(yintercept = 0) + ggtitle(pathway) + labs(caption = paste0("kruskal adj.p = ", round(pval, 4))) + ylab("progeny score") + ggsignif::geom_signif( comparisons = comp_groups, step_increase = 0.06, map_signif_level = T, show.legend = , color = "black" ) if (!is.null(color_scheme)) { p <- p + scale_fill_manual(values = as_vector(color_scheme)) } ggsave(filename = file.path(plot_path, glue::glue("{pathway}_decoupler_boxplot.svg"))) plot(p) } |
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 | deg <- joined_df %>% dplyr::select(gname, stat) %>% dplyr::filter(!is.na(stat)) %>% column_to_rownames(var = "gname") %>% as.matrix() -> diffexp_matrix design <- dds_obj@colData contrast_acts <- run_wmean( mat = deg, net = net, .source = "source", .target = "target", .mor = "weight", times = times, minsize = minsize ) # Filter norm_wmean f_contrast_acts <- contrast_acts %>% filter(statistic == "norm_wmean") # Plot ggplot(f_contrast_acts, aes(x = reorder(source, score), y = score)) + geom_bar(aes(fill = score), stat = "identity") + scale_fill_gradient2( low = "darkblue", high = "indianred", mid = "whitesmoke", midpoint = 0 ) + theme_minimal() + theme( axis.title = element_text(face = "bold", size = 12), axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) + xlab("Pathways") |
382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | plot_pathway <- function(net, pathway, deg) { df <- net %>% filter(source == pathway) %>% arrange(target) %>% mutate(ID = target, color = "3") %>% column_to_rownames("target") inter <- sort(intersect(rownames(deg), rownames(df))) df <- df[inter, ] df["t_value"] <- deg[inter, ] df <- df %>% mutate(color = if_else(weight > 0 & t_value > 0, "1", color)) %>% mutate(color = if_else(weight > 0 & t_value < 0, "2", color)) %>% mutate(color = if_else(weight < 0 & t_value > 0, "2", color)) %>% mutate(color = if_else(weight < 0 & t_value < 0, "1", color)) ggplot(df, aes(x = weight, y = t_value, color = color)) + geom_point() + scale_colour_manual(values = c("red", "royalblue3", "grey")) + ggrepel::geom_label_repel(aes(label = ID)) + theme_minimal() + theme(legend.position = "none") + geom_vline(xintercept = 0, linetype = "dotted") + geom_hline(yintercept = 0, linetype = "dotted") + ggtitle(pathway) } purrr::map(unique(net$source), plot_pathway, net = net, deg = deg) |
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | library(tibble) if (!require(RNAscripts)) { devtools::install("./scripts/RNAscripts", upgrade = "never") } library(RNAscripts) if (packageVersion("mitch") < "1.1.8") { devtools::install_github("markziemann/mitch") } # library(org.Mm.eg.db) library(mitch) library(tidyverse) library(ensembldb) library(AnnotationHub) ## Load the annotation resource. ah <- AnnotationHub() Gtf <- ah["AH28753"] ## Create a EnsDb database file from this. DbFile <- ensDbFromAH(Gtf) ## We can either generate a database package, or directly load the data edb <- EnsDb(DbFile) library(msigdbr) if (exists("snakemake")) { # Input Files diffexp_tables_paths <- snakemake@input[["tables"]] contrast_list <- snakemake@params[["contrasts"]] fpkm_path <- snakemake@input[["fpkm"]] # Output Files mitch_table <- snakemake@output[["mitch_table"]] mitch_rds <- snakemake@output[["mitch_rds"]] report_path <- snakemake@output[["mitch_report"]] # Parameters threads <- snakemake@threads enrichment_term_type <- "SYMBOL" } else { BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/cre_2022/" contrast_list <- c( "basal_cre_pos_vs_basal_cre_neg", "tumor_cre_pos_vs_tumor_cre_neg", "tumor_cre_pos_vs_basal_cre_pos", "tumor_cre_neg_vs_basal_cre_neg" ) diffexp_tables_paths <- as.list(file.path(BASE_ANALYSIS_DIR, glue::glue("results/diffexp/{contrast_list}.diffexp.tsv"))) fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") threads <- 1 report_path <- "big_yeeter.html" enrichment_term_type <- "SYMBOL" } ## Read data diff_exp_tables <- purrr::map(diffexp_tables_paths, readr::read_tsv, col_names = c( "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj" ), skip = 1) names(diff_exp_tables) <- contrast_list diff_exp_frames <- purrr::map(diff_exp_tables, function(x) { gene_ids <- x$gene_id diff_exp_fr <- as.data.frame(x[, -1]) rownames(diff_exp_fr) <- gene_ids diff_exp_fr }) # names(diff_exp_frames) <- contrast_list Read FPKM file fpkm <- readr::read_tsv(fpkm_path) match_table <- fpkm %>% dplyr::select(c("gene", "gname")) match_vec <- setNames(object = match_table$gname, nm = match_table$gene) ### Setupt mitch input data prep_mitch_input <- function(deseq_list, e_term = "ENSEMBL") { # mitch_input_df <- mitch::mitch_import(diff_exp_frames, 'DESeq2', geneTable = as.data.frame(fpkm[,c('gene', # 'gname')])) mitch_input_df <- mitch::mitch_import(deseq_list, "DESeq2") gname_tb <- tibble::tibble(Transcript_id = rownames(mitch_input_df)) if (e_term == "ENSEMBL") { gname_tb["gene_id"] <- stringr::str_extract(gname_tb$Transcript_id, "ENSMUSG[0-9]*") } else if (e_term == "SYMBOL") { gname_tb["gene_id"] <- match_vec[gname_tb$Transcript_id] } rownames(mitch_input_df) <- gname_tb$gene_id colnames(mitch_input_df) <- stringr::str_remove_all(colnames(mitch_input_df), pattern = "-") ifelse(anyDuplicated(gname_tb$gene_id), warning("Duplicated genes in mitch input dataframe."), print("no duplicates")) mitch_input_df } #' Uniquify a vector #' #' @param x Input factor to check #' @return corrected facotr value #' @examples #' NULL uniquify <- function(x) { while (anyDuplicated(x)) { x[duplicated(x)] <- paste0(x[duplicated(x)], "_1") } x } mitch_input_df <- prep_mitch_input(diff_exp_frames) # mm_reactome <- buildReactome(output_type = enrichment_term_type) ensembl_reactome <- buildReactome(output_type = # 'ENSEMBL') if (enrichment_term_type == "SYMBOL") { # Get GENE ids new_ids <- ensembldb::select(edb, keys = rownames(mitch_input_df), keytype = "GENEID", columns = c("SYMBOL", "GENEID")) update_vector <- setNames(new_ids$SYMBOL, nm = new_ids$GENEID) # Stop if any duplicated genes in reactome gene set names(match_vec) <- stringr::str_extract(names(match_vec), "ENSMUSG[0-9]*") geneIDs <- match_vec[rownames(mitch_input_df)] geneIDs[names(update_vector)] <- update_vector col_names <- c("gs_name", "gene_symbol") mm_msigdbr <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>% dplyr::select(col_names) msigdb_gene_list <- split(x = mm_msigdbr$gene_symbol, f = mm_msigdbr$gs_name) ensembl_in_msigdb <- new_ids %>% dplyr::filter(SYMBOL %in% mm_msigdbr$gene_symbol) %>% pull(GENEID) ## Cleanup duplicates mitch_input_df <- mitch_input_df[(rownames(mitch_input_df) %in% ensembl_in_msigdb) | !(duplicated(geneIDs[rownames(mitch_input_df)]) | duplicated(geneIDs[rownames(mitch_input_df)], fromLast = T)), ] geneIDs <- match_vec[rownames(mitch_input_df)] # geneIDs[names(update_vector)] <- update_vector dup_mitch <- mitch_input_df[] %>% as_tibble(rownames = "ENSEMBL") dup_mitch["SYMBOL"] <- geneIDs[dup_mitch$ENSEMBL] # Remove samples which(dup_mitch$ENSEMBL %in% unlist(ensembl_reactome)) final_name_vec <- setNames(dup_mitch$SYMBOL, nm = dup_mitch$ENSEMBL) final_name_vec <- final_name_vec[-which(!(final_name_vec %in% mm_msigdbr$gene_symbol) & duplicated(final_name_vec))] # final_name_vec['ENSMUSG00000051396'] <- 'Gm45902' if (anyDuplicated(final_name_vec)) { stop("Duplicate in gene names") } # geneIDs <- uniquify(geneIDs) mitch_input_df <- mitch_input_df[names(final_name_vec), ] rownames(mitch_input_df) <- final_name_vec } ## Setup Databases to test msig_hallmarck <- msigdbr::msigdbr(species = 'Mus musculus') # bain <- msig_hallmarck %>% dplyr::select(c('gs_name', 'gene_symbol')) %>% group_split(gs_name) # more_bain <- purrr::map(bain, function(x){x$gene_symbol}) # mitch::mitch_calc(mitch_input_df, more_bain, cores = 6, priority ='significance') # mm_reactome <- buildReactome(output_type = enrichment_term_type) print(colnames(mitch_input_df)) # print(anyDuplicated(colnames(mitch_input_df))) reac_test <- mitch::mitch_calc(mitch_input_df, msigdb_gene_list, cores = threads) readr::write_tsv(reac_test$enrichment_result, mitch_table) saveRDS(reac_test, file = mitch_rds) mitch::mitch_report(reac_test, outfile = report_path) |
4
of
scripts/run_mitch.R
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | library(progeny) library(CARNIVAL) library(OmnipathR) library(readr) library(tibble) library(tidyr) library(dplyr) library(visNetwork) library(biomaRt) library(ggplot2) library(pheatmap) library(BiocParallel) library(DESeq2) library(decoupleR) library(RNAscripts) # library(biomaRt) Basic function to convert human to mouse gene names human = useMart(biomart= 'ensembl', dataset = # 'hsapiens_gene_ensembl') mouse = useMart(biomart = 'ensembl', dataset = 'mmusculus_gene_ensembl') mouse_human_homologs <- readr::read_tsv("http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt", col_names = c( "hgene", "hID", "mgene", "mID", "lcol" )) if (exists("snakemake")) { dds_path <- snakemake@input[["dds_obj"]] diffexp_tb_path <- snakemake@input[["table"]] fpkm_path <- snakemake@input[["fpkm"]] carnival_output <- snakemake@output[["carnival_out"]] contrast_groups <- snakemake@wildcards[["contrast"]] s_groups <- snakemake@params[["s_groups"]] register(MulticoreParam(snakemake@threads)) contrast_name <- contrast_groups plot_path <- snakemake@params[["plot_path"]] comp_groups <- snakemake@config[["signif_comp"]] color_scheme <- snakemake@config[["group_colors"]] cplex_path <- snakemake@config[["cplex_solver"]] stopifnot(`Cplexpath doesnt exist please give path` = file.exists(cplex_path)) temp_path <- snakemake@params[["temp_path"]] nr <- snakemake@resources[["nr"]] thread_num <- snakemake@threads mem_mb <- snakemake@resources[["mem_mb"]] time_limit <- (snakemake@resources[["time_min"]] - 20) * 60 run_vanilla <- snakemake@params[["run_vanilla"]] perturbation_gene <- snakemake@params[["perturbation_gene"]] progeny_data <- "../data/progenyMembers.RData" } else { BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/APLNR_KO" dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds") diffexp_tb_path <- file.path(paste0(BASE_ANALYSIS_DIR), "results/diffexp/condition/AplnKO_vs_basal.diffexp.tsv") fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") contrast_groups <- c("AplnKO", "basal") plot_path <- "." register(SerialParam()) s_groups <- c("AplnKO", "basal") contrast_name <- glue::glue("{contrast_groups[[1]]} vs {contrast_groups[[2]]}") the_yaml <- yaml::read_yaml("../configs/VascAge_APLN_KO.yaml") comp_groups <- the_yaml$comp_groups color_scheme <- the_yaml$group_colors carnival_output <- "./test_output.RDS.gz" cplex_path <- "/home/heyer/software/external/CPLEX_Studio201/cplex/bin/x86-64_linux/cplex" thread_num <- 6 temp_path <- "./" mem_mb <- 8192 time_limit <- 3600 run_vanilla <- TRUE progeny_data <- "./data/progenyMembers.RData" } # Read DESeq2 oobject and other tables dds_obj <- readRDS(dds_path) diffexp_tb <- read_tsv(diffexp_tb_path, col_names = c("gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj"), skip = 1 ) # Normalized_counts <- getVarianceStabilizedData(dds_obj) Normalized_counts <- assay(vst(dds_obj, blind = F)) fpkm <- read_tsv(fpkm_path) filer <- fpkm %>% dplyr::filter(gene %in% rownames(Normalized_counts)) %>% dplyr::filter(!duplicated(gname)) joined_df <- join_tables(diffexp_tb, filer) %>% dplyr::filter(!duplicated(gname)) Normalized_counts <- Normalized_counts[filer$gene, ] rownames(Normalized_counts) <- filer$gname joined_df %>% dplyr::select(gname, stat) %>% dplyr::filter(!is.na(stat)) %>% column_to_rownames(var = "gname") %>% as.matrix() -> diffexp_matrix # regulons <- dorothea_mm %>% dplyr::filter(confidence %in% c('A', 'B')) organism <- "mouse" doro_net <- decoupleR::get_dorothea(organism = organism, levels = c("A", "B", "C")) prog_net <- decoupleR::get_progeny(organism = organism, top = 100) PathwayActivity <- PathwayActivity_CARNIVALinput <- run_wmean( mat = diffexp_matrix, network = prog_net, .source = "source", .target = "target", .mor = "weight", times = 1000, minsize = 5 ) %>% dplyr::filter(statistic == "wmean") %>% as.data.frame() if (any(abs(PathwayActivity$score) > 1)) { PathwayActivity$score <- sign(PathwayActivity$score) * (1 - PathwayActivity$p_value) warning("decoupler based enriched failed, falling back on (1-pvalue) * sign(score)") } # PathwayActivity <- PathwayActivity_CARNIVALinput <- progeny(diffexp_matrix, scale = TRUE, organism = 'Mouse', top = # 100, perm = 10000, z_scores = F ) %>% t() %>% as.data.frame() %>% tibble::rownames_to_column(var = 'Pathway') # colnames(PathwayActivity)[2] <- 'score' progeny_key <- setNames(object = PathwayActivity$score, nm = PathwayActivity$source) prog_net %>% mutate(progeny_activity = recode(source, !!!progeny_key)) %>% mutate(carnival_score = sign(weight) * progeny_activity) -> prog_net prog_net %>% group_by(target) %>% summarise(carnival_score = mean(carnival_score)) -> prog_net_final tf_activities_stat <- decoupleR::run_wmean(diffexp_matrix, network = doro_net, times = 1000, minsize = 5) %>% filter(statistic == "norm_wmean") # options = list( minsize = 5, eset.filter = FALSE, cores = 1, verbose = FALSE, nes = TRUE ) ) prog_net_final %>% filter(!(target %in% tf_activities_stat$source)) -> prog_net_final tf_activities <- tf_activities_CARNIVALinput <- tf_activities_stat %>% dplyr::select(source, score) ## Get Omnipath omniR <- import_omnipath_interactions(organism = 10090) # omniR <- import_pathwayextra_interactions(organism = 10090) signed and directed omnipath_sd <- omniR %>% dplyr::filter(consensus_direction == 1 & (consensus_stimulation == 1 | consensus_inhibition == 1)) # changing 0/1 criteria in consensus_stimulation/inhibition to -1/1 omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1 # check consistency on consensus sign and select only those in a SIF format sif <- omnipath_sd[, c("source_genesymbol", "consensus_stimulation", "consensus_inhibition", "target_genesymbol")] %>% dplyr::filter(consensus_stimulation == consensus_inhibition) %>% unique.data.frame() sif$consensus_stimulation <- NULL colnames(sif) <- c("source", "interaction", "target") # remove complexes sif$source <- gsub(":", "_", sif$source) sif$target <- gsub(":", "_", sif$target) # dorothea for CARNIVAL tf_activities_carnival <- data.frame(tf_activities, stringsAsFactors = F) rownames(tf_activities_carnival) <- tf_activities$source tf_activities_carnival$source <- NULL tf_list <- generateTFList(tf_activities_carnival, top = 50, access_idx = 1) tf_vec <- tf_list$score[, ] # For mouse change trp53 to tp53 and trp63 to tp63 names(tf_vec) <- gsub("^Trp", "Tp", names(tf_vec)) # progeny for CARNIVAL load(file = progeny_data) progenyMembers$gene$p53 <- 'TP53' progmem_mouse <- # purrr::map(progenyMembers$gene, convertHumanGeneHomologs, jax_database = mouse_human_homologs) progenyMembers$gene <- # progmem_mouse progenyMembers$gene$p53 <- 'Tp53' # PathwayActivity_carnival <- data.frame(PathwayActivity, stringsAsFactors = F) rownames(PathwayActivity_carnival) <- # PathwayActivity_carnival$Pathway PathwayActivity_carnival$Pathway <- NULL progenylist <- assignPROGENyScores( progeny # = t(PathwayActivity_carnival), progenyMembers = progenyMembers, id = 'gene', access_idx = 1 ) progeny_vec <- # progenylist$score progeny_vec <- setNames(prog_net_final$carnival_score, nm = prog_net_final$target) # get initial nodes lp_opts <- CARNIVAL::defaultCplexCarnivalOptions( solverPath = cplex_path, cplexMemoryLimit = mem_mb, threads = thread_num, timelimit = time_limit, lpFilename = file.path(temp_path, "lptest.lp"), outputFolder = file.path(temp_path, "carnout") ) cplex_opts <- suggestedCplexSpecificOptions() lp_opts[names(cplex_opts)] <- cplex_opts # lp_opts$solverPath <- cplex_path lp_opts$cplexMemoryLimit <- mem_mb lp_opts$threads <- thread_num lp_opts$timelimit <- time_limit # lp_opts$lpFilename <- file.path(temp_path, 'lptest.lp') lp_opts$outputFolder <- file.path(temp_path, 'carnout') dir.create(lp_opts$outputFolder, showWarnings = F, recursive = T) # run carnival dir.create(file.path(temp_path, "carnout"), recursive = T, showWarnings = F) # setwd(file.path(temp_path, 'carnout')) if (run_vanilla) { carnival_result <- runVanillaCarnival( perturbations = c(perturbation_gene = 1), measurements = unlist(tf_vec), priorKnowledgeNetwork = sif, weights = unlist(progeny_vec), carnivalOptions = lp_opts ) } else { carnival_result <- runInverseCarnival( measurements = unlist(tf_vec), priorKnowledgeNetwork = sif, weights = unlist(progeny_vec), carnivalOptions = lp_opts ) } if (length(carnival_result$sifAll) < 50) { if (nr >= 2) { write(paste0("Failed to converge after 2 attempts"), file = stderr()) carnival_result <- list() } else { stop(glue::glue("Attempt {nr}. Failed to converge restarting")) } } saveRDS(carnival_result, file = carnival_output) |
10 | knitr::opts_chunk$set(echo = TRUE) |
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | library(readr) library(piano) library(dplyr) library(ggplot2) library(tibble) library(tidyr) library(dplyr) library(scales) library(plyr) library(GSEABase) library(network) library(reshape2) library(cowplot) library(pheatmap) library(ggraph) library(tidygraph) if (!require(RNAscripts)) { devtools::install("../scripts/RNAscripts", upgrade = "never", quiet = TRUE) } library(RNAscripts) # set working directory # setwd("~/Projects/transcriptutorial/scripts") # source("./support_networks.r") ## We also load the support functions if (exists("snakemake")) { carnival_sample_result <- snakemake@input[["carnival_sample_result"]] tt_basepath <- snakemake@params[["tutorial_source_path"]] names(carnival_sample_result) <- snakemake@params[["sample_names"]] stopifnot(length(carnival_sample_result) > 0) workflow_config <- snakemake@config sample_table <- readr::read_tsv(workflow_config$samples) organism <- workflow_config$organism } else { BASE_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/tec_aplnrKO/results/carnival/sample_resolution" workflow_config <- yaml::read_yaml("../../../configs/tec_aplnrKO.yaml") sample_table <- readr::read_tsv("../../../data/tumor_vs_ec/Aplnr_KO.tsv") sample_ids <- sample_table %>% pull(sample) carnival_sample_result <- file.path(BASE_DIR, paste0(sample_ids, "_carn_res.RDS.gz")) names(carnival_sample_result) <- sample_ids base_path <- "/desktop-home/heyer/projects/Vascular_Aging" tt_basepath <- file.path(base_path, "RNAseq/rna-seq-star-deseq2/workflow/scripts/transcriptutorial") organism <- "Mus musculus" } source(file.path(tt_basepath, "support_enrichment.r")) source(file.path(tt_basepath, "support_networks.r")) carnival_sample_resolution <- purrr::map(carnival_sample_result, readRDS) # Check if any are 0 length_index <- purrr::map(carnival_sample_resolution, length) # Check if any are null lgl_index <- purrr::map_int(carnival_sample_resolution, length) == 0 carnival_sample_resolution <- carnival_sample_resolution[!lgl_index] stopifnot(length(carnival_sample_resolution) > 0) process_carnival <- function(carnival_res) { carnival_res$weightedSIF <- carnival_res$weightedSIF %>% dplyr::filter(Weight != 0) # carnival_res$nodesAttributes <- carnival_res$nodesAttributes %>% dplyr::filter(AvgAct != 0) carnival_res } carnival_sample_resolution <- purrr::map(carnival_sample_resolution, process_carnival) |
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | # read CARNIVAL results omnipath_id <- RNAscripts::get_organism_omnipath_id(organism) omniR <- OmnipathR::import_omnipath_interactions(organism = omnipath_id) # signed and directed omnipath_sd <- omniR %>% dplyr::filter(consensus_direction == 1 & (consensus_stimulation == 1 | consensus_inhibition == 1 )) # changing 0/1 criteria in consensus_stimulation/inhibition to -1/1 omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1 # check consistency on consensus sign and select only those in a SIF format sif <- omnipath_sd[, c("source_genesymbol", "consensus_stimulation", "consensus_inhibition", "target_genesymbol")] %>% dplyr::filter(consensus_stimulation == consensus_inhibition) %>% unique.data.frame() sif$consensus_stimulation <- NULL colnames(sif) <- c("source", "interaction", "target") # remove complexes sif$source <- gsub(":", "_", sif$source) sif$target <- gsub(":", "_", sif$target) |
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 | # get only summary files from CARNIVAL results sifts <- lapply(carnival_sample_resolution, function(x) { x$weightedSIF }) nodos <- lapply(carnival_sample_resolution, function(x) { x$nodesAttributes }) write(names(sifts), file = stderr()) # Calculate the number of edges and nodes in the networks and its density node_edge <- do.call(rbind, lapply(sifts, count_edges_nodes_degree)) # Calculate degree distribution for a sample yeet <- do.call(rbind, sifts) %>% unique() count_degree <- yeet %>% degree_count() # degree distribution p <- data.frame(table(count_degree$total_count) / nrow(count_degree)) colnames(p) <- c("Var1", "total_degree") p <- merge.data.frame(p, data.frame(table(count_degree$in_count) / nrow(count_degree)), all = T) colnames(p) <- c("Var1", "total_degree", "in_degree") p <- merge.data.frame(p, data.frame(table(count_degree$out_count) / nrow(count_degree)), all = T) colnames(p) <- c("k", "total_degree", "in_degree", "out_degree") p <- melt(p, value.name = "p", id.vars = "k") p$k <- relevel(p$k, "0") # visualise ggdat <- as.data.frame(node_edge) %>% tibble::rownames_to_column(var = "sample") %>% dplyr::mutate(condition = gsub(".Rep[0-9]{1}", "", sample)) # Plotting # relation between number of edges and nodes ggplot(ggdat, aes(x = nodes, y = edges, color = as.factor(condition))) + geom_point() + geom_text( label = ggdat$sample, check_overlap = TRUE, vjust = 0, nudge_y = 0.5, show.legend = F ) + theme_bw(base_size = 15) + guides(color = guide_legend(title = "Conditions")) + ggtitle("Node-edge composition") |
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | # degree distribution levels(p$k) <- levels(p$k) %>% as.numeric() %>% sort() dd <- ggplot(data = p, aes(x = k, y = p, group = variable, color = variable)) + geom_point() + geom_line() + theme_bw() + theme(legend.position = "none") + guides(color = guide_legend(title = "degree type")) + ggtitle("Degree distribution") ddp <- ggplot(data = p, aes(x = as.numeric(k), y = p, group = variable, color = variable)) + geom_point() + geom_line() + scale_x_continuous( breaks = as.numeric(p$k), trans = scales::log_trans() ) + scale_y_log10( breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)) ) + annotation_logticks() + theme_bw() + guides(color = guide_legend(title = "degree type")) + ggtitle("Degree distribution (log scale)") + xlab("k (ln)") + ylab("p (log10)") plot_grid(dd, ddp, labels = "auto", rel_widths = c(1, 2)) |
261 | ```
|
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 | get_common_interactions <- function(interaction_list, psmpl_per = 95, s_table = sample_table, condition_sel) { sample_id_list <- s_table %>% filter(condition %in% condition_sel) %>% pull(sample) interaction_list <- interaction_list[, sample_id_list] if (!is.null(dim(interaction_list))) { shared_interactions_WT <- getCoreInteractions(topology = interaction_list, psmpl = psmpl_per) # Visualise the interactions colnames(shared_interactions_WT) <- c("from", "sign", "to") labels_edge <- c("-1" = "inhibition", "1" = "activation") nodes <- data.frame(union(shared_interactions_WT$from, shared_interactions_WT$to)) colnames(nodes) <- "nodes" nodes$label <- nodes$nodes gg_graph <- tidygraph::tbl_graph(nodes = nodes, edges = shared_interactions_WT) %>% ggraph::ggraph(layout = "auto") + geom_node_point(color = "#C0C0C0", size = 8) + geom_edge_link(arrow = arrow(), aes(edge_colour = as.factor(sign))) + theme_graph() + geom_node_text(aes(label = label), vjust = 0.4) } else { gg_graph <- NULL } gg_graph } |
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | # create a matrix of all interactions for all samples write(dim(sif), file = stderr()) interactions <- getTopology(networks = sifts, scafoldNET = sif) colnames(interactions) <- names(carnival_sample_resolution) # FIxes bug in topology function (To lazy to actually fix) ncol_interact <- ncol(interactions) # interactions <- interactions[rowSums(interactions) > 0,] # interactions <- interactions[,-c(1:ncol_interact/2)] # get the edges per sample # get the edges per sample net_int <- apply(interactions, 2, function(x, r) { r[which(!is.na(x))] }, rownames(interactions)) # calculate Jaccard indexes per pair combined <- expand.grid(1:length(names(sifts)), 1:length(names(sifts))) jac_index <- matrix( data = NA, nrow = length(names(sifts)), ncol = length(names(sifts)), dimnames = list(names(sifts), names(sifts)) ) for (i in 1:nrow(combined)) { n <- names(sifts)[combined[i, 1]] m <- names(sifts)[combined[i, 2]] jac_index[n, m] <- length(intersect(net_int[[n]], net_int[[m]])) / length(union(net_int[[n]], net_int[[m]])) } # Visualize the indexes in a heatmap pheatmap::pheatmap(jac_index, fontsize = 14, fontsize_row = 10, fontsize_col = 10, angle_col = 45, treeheight_col = 0 ) corrplot::corrplot(jac_index, order = "hclust") |
391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | groups_to_check <- c( workflow_config$comp_groups, as.list(setNames( object = sample_table$condition %>% unique(), nm = sample_table$condition %>% unique() )) ) yoted <- purrr::map(groups_to_check, get_common_interactions, interaction_list = interactions, psmpl_per = 60, s_table = sample_table[!lgl_index, ]) if (any(purrr::map(yoted, is.null))) { null_index <- purrr::map_lgl(yoted, is.null) yoted <- yoted[!null_index] } yoted <- purrr::map2(yoted, names(yoted), function(x, y) x + ggtitle(y)) yoted |
411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | build_matrix_from_nodes <- function(node_list, node_type = "AvgAct") { gene_index <- purrr::map(node_list, ~ pull(., Node)) %>% unlist() %>% unique() node_mat <- purrr::map(node_list, ~ dplyr::select(., Node, !!node_type)) %>% purrr::reduce(full_join, by = "Node") colnames(node_mat) <- c("Node", names(node_list)) node_mat } avg_mat <- build_matrix_from_nodes(nodos) rownames(avg_mat) <- avg_mat$Node avg_mat <- subset(avg_mat, select = -c(Node)) %>% as.matrix() non_zero_index <- apply(avg_mat, 1, function(x) length(which(x != 0)) >= 2) ComplexHeatmap::Heatmap(avg_mat[non_zero_index, ], column_names_rot = 45, row_names_gp = grid::gpar(fontsize = 6), cluster_columns = FALSE) |
432 | sessionInfo() |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | library(readr) library(CARNIVAL) if (!require(RNAscripts)) { devtools::install("../scripts/RNAscripts", upgrade = "never", quiet = TRUE) } library(RNAscripts) library(magrittr) library(dplyr) library(OmnipathR) # files if (exists("snakemake")) { tf_act_path <- snakemake@input[["tf_activities"]] path_act_path <- snakemake@input[["pathway_activities"]] cplex_path <- snakemake@config[["cplex_solver"]] carnival_sample_result <- snakemake@output[["carnival_sample_result"]] thread_num <- snakemake@threads time_limit <- (snakemake@resources[["time_min"]] - 20) * 60 mem_mb <- snakemake@resources[["mem_mb"]] run_vanilla <- snakemake@params[["run_vanilla"]] temp_path <- snakemake@params[["temp_path"]] nr <- snakemake@resources[["nr"]] sample_id <- snakemake@wildcards[["sample"]] progeny_data <- "./workflow/data/progenyMembers.RData" } else { base_path <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/APLN_KO" tf_act_path <- file.path(base_path, "dorothea/TF_act_sample_resolution.csv") path_act_path <- file.path(base_path, "progeny/sample_progney.csv") carnival_output <- "./test_output.RDS.gz" cplex_path <- "/home/heyer/software/external/CPLEX_Studio201/cplex/bin/x86-64_linux/cplex" thread_num <- 4 temp_path <- file.path(base_path, "results/carnival/temp/VascAge_Young-EC-Plus-5") mem_mb <- 8192 time_limit <- 3600 run_vanilla <- FALSE sample_id <- "VascAge_Young-EC-Plus-5" progeny_data <- "./workflow/data/progenyMembers.RData" nr <- 1 } if (nr == 4) { saveRDS(list(), file = carnival_sample_result) write("Failed to converge", file = stderr()) quit(save = "no") } sample_id <- stringr::str_replace_all(sample_id, pattern = "-", replacement = ".") tf_activities <- read_csv(tf_act_path) pathway_activity <- read_csv(path_act_path) mouse_human_homologs <- readr::read_tsv("http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt", col_names = c("hgene", "hID", "mgene", "mID", "lcol") ) ## Get Omnipath get_organism_number <- function(organism_name) { if (organism_name == "human") { organism_number <- 9606 } else if (organism_name == "mouse") { organism_number <- 10090 } organism_number } organism <- "mouse" org_nb <- get_organism_number(organism) omniR <- import_omnipath_interactions(organism = org_nb) # signed and directed omnipath_sd <- omniR %>% dplyr::filter(consensus_direction == 1 & (consensus_stimulation == 1 | consensus_inhibition == 1 )) # changing 0/1 criteria in consensus_stimulation/inhibition to -1/1 omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1 omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1 # check consistency on consensus sign and select only those in a SIF format sif <- omnipath_sd[, c( "source_genesymbol", "consensus_stimulation", "consensus_inhibition", "target_genesymbol" )] %>% dplyr::filter(consensus_stimulation == consensus_inhibition) %>% unique.data.frame() sif$consensus_stimulation <- NULL colnames(sif) <- c("source", "interaction", "target") # remove complexes sif$source <- gsub(":", "_", sif$source) sif$target <- gsub(":", "_", sif$target) # dorothea for CARNIVAL tf_activities_carnival <- data.frame(tf_activities, stringsAsFactors = F) rownames(tf_activities_carnival) <- tf_activities$source tf_activities_carnival$source <- NULL tfList <- generateTFList(tf_activities_carnival, top = 50, access_idx = 1:ncol(tf_activities_carnival) ) # progeny for CARNIVAL load(file = progeny_data) # progenyMembers$gene$p53 <- "TP53" if (organism == "mouse") { progmem_mouse <- purrr::map(progenyMembers$gene, convertHumanGeneHomologs, mouse_human_homologs) progenyMembers$gene <- progmem_mouse progenyMembers$gene$p53 <- "Tp53" names(progenyMembers$gene)[names(progenyMembers$gene) == "JAK.STAT"] <- "JAK-STAT" } else { progenyMembers$gene$p53 <- "TP53" } pathway_activity_carnival <- data.frame(pathway_activity, stringsAsFactors = F) rownames(pathway_activity_carnival) <- pathway_activity_carnival %>% dplyr::pull(pathways) # pathway_activity_carnival <- pathway_activity_carnival[,-c()] pathway_activity_carnival$pathways <- NULL progenylist <- assignPROGENyScores( progeny = t(pathway_activity_carnival), progenyMembers = progenyMembers, id = "gene", access_idx = 1:ncol(pathway_activity_carnival) ) # get initial nodes # run carnival for all samples dir.create(file.path(temp_path, "carnout"), recursive = T, showWarnings = F) # setwd(file.path(temp_path, "carnout")) lp_opts <- CARNIVAL::defaultCplexCarnivalOptions( solverPath = cplex_path, cplexMemoryLimit = mem_mb, threads = thread_num, timelimit = time_limit, lpFilename = file.path(temp_path, "lptest.lp"), outputFolder = file.path(temp_path, "carnout") ) cplex_opts <- suggestedCplexSpecificOptions() lp_opts[names(cplex_opts)] <- cplex_opts # lp_opts$solverPath <- cplex_path lp_opts$cplexMemoryLimit <- mem_mb lp_opts$threads <- thread_num lp_opts$timelimit <- time_limit if (run_vanilla) { carnival_result <- runVanillaCarnival( perturbations = c("Aplnr" = 1, "Apln" = 1), measurements = unlist(tfList[[sample_id]]), priorKnowledgeNetwork = sif, weights = unlist(progenylist[[sample_id]]), carnivalOptions = lp_opts ) } else { carnival_result <- runInverseCarnival( measurements = unlist(tfList[[sample_id]]), priorKnowledgeNetwork = sif, weights = unlist(progenylist[[sample_id]]), carnivalOptions = lp_opts ) # transoform to data.frame # carnival_result$weightedSIF <- data.frame(carnival_result$weightedSIF, # stringsAsFactors = F) # carnival_result$weightedSIF$Sign <- as.numeric(carnival_result$weightedSIF$Sign) # carnival_result$weightedSIF$Weight <- as.numeric(carnival_result$weightedSIF$Weight) # carnival_result$nodesAttributes <- data.frame(carnival_result$nodesAttributes, stringsAsFactors = F) # carnival_result$nodesAttributes$ZeroAct <- as.numeric(carnival_result$nodesAttributes$ZeroAct) # carnival_result$nodesAttributes$UpAct <- as.numeric(carnival_result$nodesAttributes$UpAct) # carnival_result$nodesAttributes$DownAct <- as.numeric(carnival_result$nodesAttributes$DownAct) # carnival_result$nodesAttributes$AvgAct <- as.numeric(carnival_result$nodesAttributes$AvgAct) } if (length(carnival_result$sifAll) < 50) { if (nr >= 2) { write(paste0("Failed to converge after 2 attempts"), file = stderr()) carnival_result <- list() } else { stop("Failed to converge restarting") } } saveRDS(carnival_result, carnival_sample_result) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | library(readr) library(viper) library(DESeq2) library(dorothea) library(RNAscripts) library(magrittr) # put whatever is your working directory here if (exists("snakemake")) { dds_obj_rlog <- snakemake@input[["dds_obj"]] fpkm_path <- snakemake@input[["fpkm"]] outpath <- snakemake@output[["sample_dorothea_table"]] threads <- snakemake@threads } else { BASE_PATH <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/apelin_exp" dds_obj_rlog <- file.path(BASE_PATH, "deseq2/rlog_transform.RDS.gz") fpkm_path <- file.path(BASE_PATH, "fpkm/all.tsv") threads <- 1 } # count_df_rlog <- as.data.frame(read_csv("data/count_df_rlog.csv")) count_df_rlog <- readRDS(dds_obj_rlog) %>% assay() # row.names(count_df_rlog) <- count_df_rlog$gene # count_df_rlog <- count_df_rlog[,-1] # count_df_rlog <- count_df_rlog[complete.cases(count_df_rlog),] fpkm <- read_tsv(fpkm_path) rename_count_rownames <- function(count_table, fpkm_table) { filer <- fpkm_table %>% dplyr::filter(gene %in% rownames(count_table)) %>% dplyr::filter(!duplicated(gname)) count_table <- count_table[filer$gene, ] rownames(count_table) <- filer$gname count_table } count_df_rlog <- rename_count_rownames( count_table = count_df_rlog, fpkm_table = fpkm ) # regulons <- dorothea_mm %>% dplyr::filter(confidence %in% c("A", "B", "C")) doro_net <- decoupleR::get_dorothea(organism = "mouse", levels = c("A", "B", "C")) TF_activity <- decoupleR::run_wmean(count_df_rlog, network = doro_net, times = 1000, minsize = 5) %>% dplyr::filter(statistic == "norm_wmean") %>% dplyr::select(source, condition, score) %>% tidyr::pivot_wider(names_from = condition, values_from = score) # TF_activity <- run_viper(count_df_rlog, # regulons = regulons, # options = list( # method = "scale", minsize = 4, # eset.filter = FALSE, cores = threads, # verbose = FALSE # ) # ) %>% as.data.frame() ### Preparing dorothea ## Dorothea/viper # TF_activity$TF <- row.names(TF_activity) readr::write_csv(TF_activity, outpath) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | library(DESeq2) library(readr) library(viper) library(progeny) library(RNAscripts) library(magrittr) library(decoupleR) if (exists("snakemake")) { dds_obj_rlog <- snakemake@input[["dds_obj"]] fpkm_path <- snakemake@input[["fpkm"]] outpath <- snakemake@output[["sample_progeny_table"]] threads <- snakemake@threads } else { BASE_PATH <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/cre_2022" dds_obj_rlog <- file.path(BASE_PATH, "deseq2/rlog_transform.RDS.gz") fpkm_path <- file.path(BASE_PATH, "fpkm/all.tsv") threads <- 1 } ## Read and import data count_df_rlog <- readRDS(dds_obj_rlog) %>% assay() %>% as.matrix() fpkm <- read_tsv(fpkm_path) # Rename rownames print(rownames(count_df_rlog)) count_df_rlog <- rename_count_rownames( count_table = count_df_rlog, fpkm_table = fpkm ) # Import progeny network and run decoupler wmean organism <- "mouse" prog_net <- decoupleR::get_progeny(organism = organism, top = 100) PathwayActivity_counts <- decoupleR::run_wmean(count_df_rlog, network = prog_net, .mor = "weight", times = 1000 ) %>% dplyr::filter(statistic == "norm_wmean") %>% dplyr::mutate(prog_score = (1 - p_value) * sign(score)) %>% tidyr::pivot_wider( id_cols = "condition", names_from = "source", values_from = "prog_score" ) %>% tibble::column_to_rownames("condition") %>% as.matrix() PathwayActivity_counts <- as.data.frame(t(PathwayActivity_counts)) PathwayActivity_counts$pathways <- rownames(PathwayActivity_counts) readr::write_csv(PathwayActivity_counts, outpath) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | __author__ = "Julian de Ruiter" __copyright__ = "Copyright 2017, Julian de Ruiter" __email__ = "julianderuiter@gmail.com" __license__ = "MIT" from os import path from snakemake.shell import shell extra = snakemake.params.get("extra", "") # Set this to False if multiqc should use the actual input directly # instead of parsing the folders where the provided files are located use_input_files_only = snakemake.params.get("use_input_files_only", False) if not use_input_files_only: input_data = set(path.dirname(fp) for fp in snakemake.input) else: input_data = set(snakemake.input) output_dir = path.dirname(snakemake.output[0]) output_name = path.basename(snakemake.output[0]) log = snakemake.log_fmt_shell(stdout=True, stderr=True) shell( "multiqc" " {extra}" " --force" " -o {output_dir}" " -n {output_name}" " {input_data}" " {log}" ) |
Support
- Future updates
Related Workflows





