This is the code accompanying the paper describing OLOGRAM-MODL. It was used to create supplementary files and the paper's figures.
To run it, first create a conda environment containing pygtftk with:
conda env create -f ologram_modl_env.yaml
This will download the latest pygtftk release from Bioconda and install Snakemake, along with all required dependencies.
Then, to run the workflow itself, use these commands:
# Full run, where -j is the number of jobs in parallel
snakemake -j4
# Dry run (nothing is done)
snakemake -n -p --reason
# Example: full run on a cluster, with qsub, on the "tagc" queue
snakemake --cluster 'qsub -q tagc -V -l nodes=1:ppn={threads}' -j32
To clean the working directory:
rm -rf output
rm -rf input
git reset --hard
This code is available under the GNU General Public License v3.
Code Snippets
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
from scipy.stats import beta

from pygtftk.stats.beta import fit_beta

# Graph paths
res_file_a = snakemake.output["a"]
res_file_b = snakemake.output["b"]
res_file_min = snakemake.output["mini"]
res_file_max = snakemake.output["maxi"]
res_file_v = snakemake.output["v"]

# Create directories if needed
for filepath in [res_file_a, res_file_b, res_file_v]:
    path = Path(filepath).parent  # Get parent directory
    Path(path).mkdir(parents=True, exist_ok=True)

# Fix the parameters to be fitted
a, b = 1., 2.

def beta_var(a, b):
    return (a*b) / ((a+b)**2 * (a+b+1))

true_variance = beta_var(a, b)

np.random.seed(seed=42)

# ---------------------------- Experiment ----------------------------- #

# Make many different fittings with many different sample sizes
K = 200
SAMPLE_SIZES = [100]*K + [200]*K + [1000]*K + [10000]*K + [20000]*K

result = list()

for size in SAMPLE_SIZES:
    obs = beta.rvs(a, b, size=size)         # Generate data
    ahat, bhat, mhat, chat = fit_beta(obs)  # Fit

    # Record result
    result += [{
        'samples': size,
        'ahat': ahat, 'bhat': bhat, 'mhat': mhat, 'chat': chat,
        'var': np.var(obs, ddof=1),
        'rebuilt_var': beta_var(ahat, bhat)
    }]

# ------------------------------- Plot figures ------------------------------- #

result_df = pd.DataFrame(result)
result_df['Number of samples'] = result_df['samples']  # Renaming

### For alpha
meanplot = result_df.boxplot(column=['ahat'], by='Number of samples')

# Add a line for the true parameter
plt.axhline(y=a, color='g', linestyle='-')
at = AnchoredText("True α = " + str(a), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)

plt.savefig(res_file_a)
plt.close()

### Same for beta
meanplot = result_df.boxplot(column=['bhat'], by='Number of samples')
plt.axhline(y=b, color='b', linestyle='-')
at = AnchoredText("True β = " + str(b), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)
plt.savefig(res_file_b)
plt.close()

### And min and max
meanplot = result_df.boxplot(column=['mhat'], by='Number of samples')
plt.axhline(y=0, color='r', linestyle='-')
at = AnchoredText("True minimum = " + str(0), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)
plt.savefig(res_file_min)
plt.close()

meanplot = result_df.boxplot(column=['chat'], by='Number of samples')
plt.axhline(y=1, color='r', linestyle='-')
at = AnchoredText("True maximum = " + str(1), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)
plt.savefig(res_file_max)
plt.close()

### And for variance, to show Neg Binom fitting would thus be better
# Renaming
result_df['Empirical'] = result_df['var']
result_df['Fitted'] = result_df['rebuilt_var']

sns.set_theme(style="whitegrid")
dd = pd.melt(result_df, id_vars=['Number of samples'], value_vars=['Empirical', 'Fitted'], var_name='Variance')
vplot = sns.boxplot(x='Number of samples', y='value', data=dd, hue='Variance', linewidth=1)
vplot.get_figure().savefig(res_file_v)
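As a sanity check that does not require pygtftk's fit_beta, the sketch below (a minimal, standalone example, not the paper's code) recovers the Beta parameters by the method of moments; it also shows why the fitted variance should converge to beta_var(a, b) as the sample size grows.

import numpy as np
from scipy.stats import beta

a, b = 1., 2.
obs = beta.rvs(a, b, size=20000, random_state=42)

m, v = np.mean(obs), np.var(obs, ddof=1)
common = m * (1 - m) / v - 1            # Moment estimate of (a + b)
a_mom, b_mom = m * common, (1 - m) * common

print(a_mom, b_mom)                     # Should be close to (1, 2)
print((a_mom * b_mom) / ((a_mom + b_mom)**2 * (a_mom + b_mom + 1)))  # ~ beta_var(1, 2)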
import numpy as np
import pandas as pd
import pandas.api.types as pdtypes

from pygtftk.stats.intersect.modl import tree
from functools import partial
import math
from collections import Counter

from plotnine import ggplot, aes, labs, scale_color_gradient, geom_point, geom_abline, scale_x_log10, scale_y_log10, geom_violin, geom_boxplot, position_dodge, xlab, ylab, ggtitle

ROOT_PATH = "./output/ologram_result_scatacseq_pbmc/"
DATA_ROOT_PATH = "./output/sc_atac_seq_pbmc_data/"

# For manual execution
# ROOT_PATH = "../output/ologram_result_scatacseq_pbmc/"
# DATA_ROOT_PATH = "../output/sc_atac_seq_pbmc_data/"

## Superclusters: how to regroup the cell types
# For now, this is hardcoded
SUPERCLUSTERS = {
    "CD14+ Monocytes": "cd14",
    "CD4 Naive": "cd48",
    "CD8 Naive": "cd48",
    "Quer": "NA",
    "...": "NA",
    "NA": "NA",
    "pre-B cell": "preB"
}

# ---------------------------------------------------------------------------- #

## Read the OLOGRAM-MODL result file

# Read dataframe to create the found_combis dictionary
df_res = pd.read_csv(ROOT_PATH + "merged_batches_result.tsv", sep='\t', header=0, index_col=None)

# P-values set to 0 or -1 are changed to 1e-320 and NaN respectively
df_res.loc[df_res['summed_bp_overlaps_pvalue'] == 0, 'summed_bp_overlaps_pvalue'] = 1e-320
df_res.loc[df_res['summed_bp_overlaps_pvalue'] == -1, 'summed_bp_overlaps_pvalue'] = np.nan

# Create a Library tree with the combinations, like in the MODL algorithm itself
print("Creating a Library for these words. This can be very long for longer words.")
print("In this particular Snakemake, it should take about 12 minutes.")
L = tree.Library()
L.build_nodes_for_words_from_ologram_result_df(df_res)
L.assign_nodes()

# ---------------------------------------------------------------------------- #

# Remember the features names are in L.features_names
# And the interesting attributes of each node are:
# node.word, node.children, node.s, node.pval and node.fc

def word_to_combi(word):
    return [L.features_names[i] for i in range(len(word)) if word[i]]

# Retrieves all combis and enrichment
def work_on_this_node(node, graph):
    return word_to_combi(node.word), node.fc, node.pval, node.s

# Iterate over all nodes
mygraphfunc = partial(work_on_this_node, graph=L)
global_results = {}
tree.apply_recursively_to_all_nodes(L.root_node, mygraphfunc, global_results)

df = pd.DataFrame(global_results.values(), columns=["combination", "fc", "pval", "s"])
df['combi_length'] = (df['combination'].apply(len)) - 2
# Subtract 2 from the length to discard the "Query" and "..." terms of the combinations

# Now, get the enrichments of all 2-wise combis, all 3-wise, etc.

def entropy(data):
    """
    Compute Shannon entropy.
    """
    if len(data) <= 1:
        return 0

    # Data counts
    counts = Counter()
    for d in data:
        counts[d] += 1

    ent = 0
    probs = [float(c) / len(data) for c in counts.values()]
    for p in probs:
        if p > 0.:
            ent -= p * math.log(p, 2)

    return ent

# ---------------------------------------------------------------------------- #

# Read the translation table as a dictionary and convert
# If df_map is a pandas dataframe with two columns, 'name' and 'category'
df_map = pd.read_csv(DATA_ROOT_PATH + "cell_to_class.tsv", sep='\t', header=0, index_col=None)
df_map.set_index('id', inplace=True)
df_map.transpose()
dict_translate = df_map['class'].to_dict()
dict_translate["Query"] = "NA"  # Add the 'Query' class
dict_translate["..."] = "NA"

def combination_to_combi_of_classes(combination):
    new_combination = []
    for element in combination:
        current_key = dict_translate[element]
        element_to_add = SUPERCLUSTERS[current_key]
        if element_to_add != "NA":
            new_combination += [element_to_add]
    return new_combination

df['combination_classes'] = df['combination'].apply(combination_to_combi_of_classes)

## Compute entropy
df['entropy'] = df['combination_classes'].apply(entropy)
# Round to 2 decimals and make a category
df['entropy'] = df['entropy'].apply(lambda x: round(x, 2)).astype(pdtypes.CategoricalDtype())

# Using only combis of each individual length
all_lengths = sorted(set(df['combi_length']))
# No point in going beyond, roughly, 12
all_lengths = [l for l in all_lengths if l <= 12]

for length in all_lengths:
    df_filtered = df.loc[df['combi_length'] == length, :]

    try:
        p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='fc'))
             + geom_violin(position=position_dodge(1), width=1)
             + geom_boxplot(position=position_dodge(1), width=0.25)
             + xlab("Entropy") + ylab("Fold change (log2)") + ggtitle("Order " + str(length)))
        p.save(filename=ROOT_PATH + "entropy_graph/entropy_length_" + str(length) + "_fc.png")

        p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='s'))
             + geom_violin(position=position_dodge(1), width=1)
             + geom_boxplot(position=position_dodge(1), width=0.25)
             + xlab("Entropy") + ylab("True total overlapping bp.") + ggtitle("Order " + str(length)))
        p.save(filename=ROOT_PATH + "entropy_graph/entropy_length_" + str(length) + "_s.png")
    except:
        print("Skipping length", length, "which caused an error.")
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.offsetbox import AnchoredText

# root_dir = './output/supp_fig4/'
# concat_file = root_dir + 'merged_ologram_test.tsv'
# res_file_1 = root_dir + 'S_mean_boxplot.pdf'
# res_file_2 = root_dir + 'S_var_boxplot.pdf'

concat_file = snakemake.input["merged"]  # Concatenated shuffles
truth_file = snakemake.input["truth"]    # Deep sampled empirical p-value
res_file = snakemake.output[0]           # Result file

# ---------------------------- Read result files ----------------------------- #

def result_ologram_to_list_of_lists(file, is_truth=False):

    # Skip even rows except for 0, they are the headers of other files (non-truth data only)
    def logic(index):
        if (index % 2 == 0) and index != 0:
            return True
        return False

    if not is_truth:
        df = pd.read_csv(file, sep='\t', header=0, index_col=None, skiprows=lambda x: logic(x))
    else:
        df = pd.read_csv(file, sep='\t', header=0, index_col=None)

    # Iterate over each row and build a result object
    result = list()

    for i, row in df.iterrows():
        header = row['feature_type'].split('_')

        if not is_truth:
            r = {
                'nb_minibatches': int(header[2]),
                'try_id': int(header[5]),
                'p_val': row['summed_bp_overlaps_pvalue'],
                'log_10(p_val)': np.log10(row['summed_bp_overlaps_pvalue'] + 1E-320),
                'log_10(empirical_pval)': np.log10(row["summed_bp_overlaps_empirical_pvalue"] + 1E-320)
            }
        else:
            r = {
                'p_val': row['summed_bp_overlaps_pvalue'],
                'log_10(p_val)': np.log10(row['summed_bp_overlaps_pvalue'] + 1E-320),
                'log_10(empirical_pval)': np.log10(row["summed_bp_overlaps_empirical_pvalue"] + 1E-320)
                # 'log_10(beta_pval)': np.log10(row["ad_hoc_beta_summed_bp_overlaps_pvalue"] + 1E-320)
            }

        result += [r]

    return result

result = result_ologram_to_list_of_lists(concat_file)
result_truth = result_ologram_to_list_of_lists(truth_file, is_truth=True)

# ------------------------------- Plot figure -------------------------------- #

result_df = pd.DataFrame(result)
MINIBATCH_SIZE = 10  # Hardcoded for now, keep it the same in the Snakefile
result_df['Number of shuffles'] = MINIBATCH_SIZE * result_df['nb_minibatches']
meanplot = result_df.boxplot(column=['log_10(p_val)'], by='Number of shuffles')

result_truth_df = pd.DataFrame(result_truth)

# Add a line for the true p-val (as estimated by very deep sampling)
true_pval = list(result_truth_df['log_10(empirical_pval)'])[0]  # Hardcoded to read the first line, since there should be only one
plt.axhline(y=true_pval, color='g', linestyle='-')

# beta_pval = list(result_truth_df['log_10(beta_pval)'])[0]  # Hardcoded to read the first line, since there should be only one
# plt.axhline(y=beta_pval, color='r', linestyle='-')

at = AnchoredText("True p-val = " + '{0:.4g}'.format(10**true_pval), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)

plt.savefig(res_file)
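To make the skiprows logic above concrete, here is a minimal sketch (indices only, no real OLOGRAM output) showing which rows of a concatenated result file would be dropped: every even index except 0, i.e. the repeated headers of the concatenated files.

def logic(index):
    return (index % 2 == 0) and index != 0

print([i for i in range(8) if logic(i)])       # [2, 4, 6] -> skipped (repeated headers)
print([i for i in range(8) if not logic(i)])   # [0, 1, 3, 5, 7] -> kept (first header + data rows)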
import numpy as np
import pandas as pd
import time
import subprocess
import warnings
import random

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

## ---------------------------- Parameters ---------------------------------- ##

np.random.seed(42)
random.seed(42)

utils.VERBOSITY = 3  # Force debug messages to appear
N_THREADS = 4

## Paths
# Hardcoded for now. It was necessary to launch this script in
# shell and not via snakemake.script to allow proper redirection of the log
OUTPUT_ROOT = "output/benchmark/comparison/"  # Output path
EXT_PATH = "ext/"                             # Path to external tools

# # For manual launch
# OUTPUT_ROOT = "../output/benchmark/comparison/"  # Output path
# EXT_PATH = "../ext/"                             # Path to external tools

# Debug: plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

# Data parameters
NB_SETS = 6
NFLAGS = 10000
NOISE = 0.12
NOISE_HIGH = 0.2
CORR_GROUPS = [(0,1), (0,1,2,3), (4,5)]

# MODL parameters
N_QUERIED_ATOMS = 3

# Other algorithms' parameters
MIN_SUPPORT = 1E-10  # This value is an epsilon for 0. Hence, return all itemsets ordered by support

## --------------------------- Found combinations --------------------------- ##

# Generate data with the AB, ABCD, EF combinations, adding uniform noise
names = [str(i) for i in range(NB_SETS)]
x = test_data_for_modl(nflags=NFLAGS, number_of_sets=NB_SETS, noise=NOISE, cor_groups=CORR_GROUPS)

# Convert to list of transactions
transactions = matrix_to_list_of_transactions(x, names)

## Run MODL
combi_miner = Modl(x,
    multiple_overlap_target_combi_size=-1,                        # Optional: limit the size of the combinations
    multiple_overlap_max_number_of_combinations=N_QUERIED_ATOMS,  # How many words to find?
    nb_threads=N_THREADS,                                         # Full multithreading
    smother=True,                                                 # Reduce each row's abundance to its square root. Helps find rarer combinations but magnifies the noise.
    step_1_factor_allowance=2,                                    # Optional: how many words to ask for in each step 1 rebuilding
    normalize_words=True,                                         # Normalize word sum of square in step 2
    step_2_alpha=None)                                            # Optional: override the sparsity control in step 2
modl_interesting_combis = combi_miner.find_interesting_combinations()

# Run MODL without smothering
x = test_data_for_modl(noise=NOISE_HIGH)
combi_miner = Modl(x, multiple_overlap_max_number_of_combinations=N_QUERIED_ATOMS, smother=False)
modl_interesting_combis_no_smother = combi_miner.find_interesting_combinations()

# Run MODL without word normalization
x = test_data_for_modl(noise=NOISE)
combi_miner = Modl(x, multiple_overlap_max_number_of_combinations=N_QUERIED_ATOMS, normalize_words=False)
modl_interesting_combis_not_normalized = combi_miner.find_interesting_combinations()

print("-- MODL INTERESTING COMBINATIONS --")
print("Standard =", modl_interesting_combis)
print("No smother, more noise =", modl_interesting_combis_no_smother)
print("No word normalization =", modl_interesting_combis_not_normalized)
print("-------------------------------")

## Run Apriori and FP-growth

# Apriori
myminer = Apriori(min_support=MIN_SUPPORT)
myminer.run_apriori(transactions)
results = myminer.produce_results()
apriori_results_df = apriori_results_to_matrix(results, names)

print("-- APRIORI ASSOCIATION RULES --")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(apriori_results_df)
print("-------------------------------")

# FP-Growth
from mlxtend.frequent_patterns import fpgrowth
x_as_dataframe = pd.DataFrame(x)
fpres = fpgrowth(x_as_dataframe, min_support=MIN_SUPPORT)

pd.set_option('display.max_rows', 1000)
print(fpres)

# Here, output the matrices to BED files
# from matrix_to_bed import intersection_matrix_to_bedfiles
# bedpaths = intersection_matrix_to_bedfiles(x, output_root)
#
# # Also print it to a tsv, and to a list of transactions
# np.savetxt(OUTPUT_ROOT+'artifmat.tsv', delimiter='\t')
# def format_list_brackets(alist): return '{'+','.join(alist)+'}'
# with open(OUTPUT_ROOT+'artiftransact_brackets.tsv', 'w+') as f:
#     for item in transactions:
#         f.write(format_list_brackets(item)+'\n')

# And in the specific SPMF format
with open(OUTPUT_ROOT + 'artiftransact.txt', 'w+') as f:
    for item in transactions:
        f.write(' '.join(item) + '\n')

## ---------- Manual launch of other itemset miner(s)

## Run LCM
command_line = ["java", "-jar", EXT_PATH + "spmf.jar",  # Java SPMF toolset
                "run", "LCM",                           # Run this algorithm
                OUTPUT_ROOT + "artiftransact.txt",      # Query file
                OUTPUT_ROOT + "output_lcm.txt",         # Output
                str(round(MIN_SUPPORT*100)) + "%"]      # Min support (parameter)
# NOTE Remember that in subprocess.run, the command line must be a list of
# arguments and not a string of the command
stdout_captured = subprocess.run(command_line, stdout=subprocess.PIPE, universal_newlines=True).stdout

## Run CL-Max (approximate itemset miner)
warnings.filterwarnings('ignore')  # Silence NumPy warnings for display

# Load it by running the code (kinda hacky but works for now)
exec(open(EXT_PATH + "cl-max.py").read())

# Run a variety of parameters
# Esp. rounding threshold. And random seeds seemed to change results a lot!
RANDOM_SEED = 1234
min_support = 0.1

for num_cluster in [3, 6]:
    for rounding_threshold in [0.5, 0.9]:

        print("-----------------")
        print("num_cluster =", num_cluster)
        print("rounding_threshold =", rounding_threshold)

        np.random.seed(RANDOM_SEED)
        random.seed(RANDOM_SEED)

        cl_max = CL_MAX(min_support, num_cluster, rounding_threshold)

        # Load dataset
        cl_max.dataset = pd.DataFrame(x)
        cl_max.sorted_items = names
        cl_max.transactions = [[int(i) for i in t] for t in transactions if t]  # Remove empties
        cl_max.maximum = NB_SETS
        cl_max.remove_non_frequent_single_items()

        # Run the algorithm
        MFIs = cl_max._CL_MAX()
QUERY_PATH = "input/as_ginom/query_som_trans_lung.bed"  # The query set
INCL_PATH = "input/as_ginom/mappability_human.bed"      # Inclusion: the analysis will be limited to the sub-genome designated in this file. Regions outside of it will be discarded.
GENOME_PATH = "input/hg19.genome"                       # Chromosome sizes

# Reference sets
MORE_PATHS = [
    "input/as_ginom/reference_1_hg19_Refseq_promoters_2000bp.bed",
    "input/as_ginom/reference_2_hg19_Refseq_gene_bodies.bed",
    "input/as_ginom/reference_3_Broad_A549_H3k04me3_Dex.bed",
    "input/as_ginom/reference_4_Broad_A549_H3k36me3_Dex.bed",
    "input/as_ginom/reference_5_Broad_A549_H3k79me2_Dex.bed"
]

# The meaning of the combinations returned will always be: query, followed by the reference sets in the order they are given.
# For example, '[101010]' means "Query + Reference 2 (gene bodies) + Reference 4 (H3K36me3)"

## Parameters

# Subsampling parameters
KEEP_N_TIMES_QUERY = 100  # For each '1' line with the query, how many '0' lines without it do we keep?
QUERY_WEIGHT = 100        # In the Naive Bayes classifier, how much more do we weigh the presence of the query?

# MODL parameters
DESIRED_ITEMSETS = 7      # How many interesting combinations should MODL return?

RANDOM_SEED = 42

# ---------------------------------------------------------------------------- #

def presence_vector(row, combis):
    """
    Given a list of combinations, returns a vector saying whether they are
    present or absent in this row. The order in the vector is the same as
    given in the combis list.

    This works exactly like an inexact (transitive) count, which is
    OLOGRAM-MODL's default operating mode.

    Example:
    >>> row = [1,0,1,1,0,1]
    >>> combis = [1,0,1,0,0,0],[0,1,1,0,0,0]
    >>> res = presence_vector(row, combis)
    >>> assert list(res) == [1,0]
    """

    result = []
    for combi in combis:
        mask = row - combi

        # If any element from the combi is missing, it's absent, so
        # add a 0. Otherwise add a 1.
        if min(mask) < 0:
            result += [0]
        else:
            result += [1]

    return result

import pandas as pd
import numpy as np
import sklearn
import pybedtools
from sklearn.naive_bayes import GaussianNB

from pygtftk.stats.intersect.overlap_stats_compute import compute_true_intersection
from pygtftk.stats.intersect.overlap.overlap_regions import does_combi_match_query
from pygtftk.stats.intersect.modl.dict_learning import Modl
from pygtftk import utils
from pygtftk.stats.intersect.read_bed import read_bed_as_list as read_bed
from pygtftk.utils import chrom_info_as_dict
from pygtftk import arg_formatter

np.random.seed(RANDOM_SEED)  # Random seed for reproducibility

## Prepare files
bedA = pybedtools.BedTool(QUERY_PATH).sort().merge()
bedsB = [pybedtools.BedTool(bedfilepath).sort().merge() for bedfilepath in MORE_PATHS]

# Do the exclusion manually. Generate a fake bed for the entire genome, using the chromsizes
bed_incl = pybedtools.BedTool(INCL_PATH)
chrom_len = chrom_info_as_dict(open(GENOME_PATH, 'r'))

full_genome_bed = [str(chrom) + '\t' + '0' + '\t' + str(chrom_len[chrom]) + '\n' for chrom in chrom_len if chrom != 'all_chrom']
full_genome_bed = pybedtools.BedTool(full_genome_bed)
bed_excl = full_genome_bed.subtract(bed_incl)

bedA = read_bed.exclude_concatenate(bedA, bed_excl)
bedsB = [read_bed.exclude_concatenate(bedB, bed_excl) for bedB in bedsB]
full_genome_bed_after_excl = read_bed.exclude_concatenate(full_genome_bed, bed_excl)

# Note that by definition, in this intersections' matrix only regions where at
# least two sets are open are given. For example {4} alone is not found.
# To fix it, use a fake full genome bed as query, so there is always one file
# open, then truncate it to get the flags_matrix.
# true_intersection = compute_true_intersection(bedA, bedsB)
true_intersection = compute_true_intersection(full_genome_bed_after_excl, [bedA] + bedsB)
flags_matrix = np.array([i[3] for i in true_intersection])

"""
# NOTE: The length of each intersection is also accessible, in case you want
# the final matrix to have one row per X base pairs instead of 1 row per intersection/event.
# For reference, the true_intersection object looks like this:
# [('chr1', 150, 180, np.array([1,1,0])), ('chr1', 180, 200, np.array([1,1,1])), ('chr1', 350, 380, np.array([1,1,0]))]

for i in true_intersection:  # For each observed intersection...
    inter_chr, inter_start, inter_stop = i[0], i[1], i[2]  # Get the chromosome, start and end of the observed intersection
    intersection_length = inter_stop - inter_start         # Deduce the length of the intersection
    intersection_flags = i[3]                              # Which combination was observed here?
    # Now you can process it.
"""

flags_matrix = flags_matrix[:, 1:]  # Remove the first column, which represents the fake, full-genome BED.

# Print unique rows with transitive (inexact) counting
uniqueValues, occurCount = np.unique(flags_matrix, return_counts=True, axis=0)
for uniqueRow in uniqueValues:
    count = 0
    for row in flags_matrix:
        if does_combi_match_query(tuple(row), tuple(uniqueRow), exact=False):
            count += 1
    print(uniqueRow, ' - freq = ', count)
print("------------------")

# Perform subsampling of the majority class ('0', meaning query is absent)
flags_matrix_with_query = flags_matrix[flags_matrix[:, 0] != 0]
draw_this_many = int(np.around(KEEP_N_TIMES_QUERY * len(flags_matrix_with_query)))
random_rows = flags_matrix[np.random.randint(flags_matrix.shape[0], size=draw_this_many), :]
flags_matrix = np.concatenate((flags_matrix_with_query, random_rows))

# This is the custom error function that we will use
def custom_error_function_bnb(X_true, X_rebuilt, encoded, dictionary):
    """
    This is a non-pure function that only cares about the dictionary and
    discards X_true in favor of the previous flags_matrix.
    """

    # Always assume that the first column is the query, so we split it.
    # Truncate the first column, with the query, since that is what we want to predict
    Y = flags_matrix[:, 0]
    X = flags_matrix[:, 1:]

    new_dictionary = []
    for atom in dictionary:
        new_dictionary += [atom[1:]]
    dictionary = np.array(new_dictionary)

    # Convert to a binary matrix giving, for each atom in the dictionary,
    # whether it is present at this row
    X_bydict_bin = [presence_vector(row, dictionary) for row in X]
    X_bydict_bin = np.array(X_bydict_bin)

    # Use Gaussian Naive Bayes on the data and retrieve accuracy
    bnb = GaussianNB()

    # Potential sample weights
    sample_weights = []
    for y in Y:
        if y == 1:
            sample_weights += [QUERY_WEIGHT]
        else:
            sample_weights += [1]

    bnb.fit(X_bydict_bin, Y, sample_weight=sample_weights)
    Y_pred = bnb.predict(X_bydict_bin)

    # Compute F1-score
    score = sklearn.metrics.f1_score(Y, Y_pred)
    # print(pd.crosstab(Y, Y_pred))

    return -score  # We want an error, higher means worse

## Run MODL with a custom error function based on the error of the BNB
utils.VERBOSITY = 3  # Ensure DEBUG messages are shown

# Keep only combis WITH query (first element is not 0)
flags_matrix_with_query = flags_matrix[flags_matrix[:, 0] != 0]

combi_miner = Modl(flags_matrix_with_query,
    multiple_overlap_max_number_of_combinations=DESIRED_ITEMSETS,  # How many words to find?
    error_function=custom_error_function_bnb)                      # Custom error function in step 2
interesting_combis = combi_miner.find_interesting_combinations()
The snippet above is taken from line 17 of scripts/modl_perspective.py and relies on Python with Pandas, numpy, sklearn, Pybedtools, BiocSklearn and pygtftk.
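To illustrate the scoring idea inside custom_error_function_bnb, here is a small self-contained sketch on toy data (the matrix, atoms and sizes below are made up for illustration, not the paper's data): each row is encoded by which candidate atoms it contains, then a Gaussian Naive Bayes classifier predicts the query column and the F1 score serves as the quality measure, whose negative MODL treats as an error.

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

rng = np.random.default_rng(0)
Y = rng.integers(0, 2, size=200)                  # Toy "query present?" labels
X = np.c_[Y, rng.integers(0, 2, size=(200, 2))]   # Column 0 is a copy of Y (the query)

atoms = np.array([[1, 0, 0], [0, 1, 1]])          # Two hypothetical candidate atoms

# Inexact (transitive) presence: an atom matches a row if the row contains all its sets
encoded = np.array([[int(np.all(row >= atom)) for atom in atoms] for row in X])

bnb = GaussianNB().fit(encoded, Y)
print(f1_score(Y, bnb.predict(encoded)))          # Higher F1 means a lower error (-score)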
import numpy as np
np.random.seed(42)
import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab
import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

# Debug: plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/"  # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log

## ---------------------------- Parameters ---------------------------------- ##

utils.VERBOSITY = 0  # We don't want to record DEBUG messages for these tests
REPEATS = range(5)   # Repeat all operations N times to get the average
N_THREADS = 4

## MODL
SIZES = [6, 9, 12, 18, 24, 30, 40, 50]             # Numbers of sets (columns)
STEPS = [3, 5, 8, 10, 12, 15, 18, 20, 22, 25, 30]  # Numbers of queried atoms

# When varying only one parameter, which is the default for the others?
DEFAULT_QUERIED_ATOMS = 8
DEFAULT_N_SETS = 8

# Data parameters
NFLAGS = 10000
NOISE = 0.1  # Previously 0.5

## ---------------------------- Time benchmarks ----------------------------- ##

## Number of sets
df_bench = pd.DataFrame(columns=['nb_sets', 'time'])  # Prepare dataframe

for _ in REPEATS:
    for size in SIZES:
        X = test_data_for_modl(nflags=NFLAGS, number_of_sets=size, noise=NOISE)

        start_time = time.time()
        combi_miner = Modl(X, multiple_overlap_max_number_of_combinations=DEFAULT_QUERIED_ATOMS, nb_threads=N_THREADS)
        modl_interesting_combis = combi_miner.find_interesting_combinations()
        stop_time = time.time()

        df_bench = df_bench.append({'nb_sets': size, 'time': stop_time - start_time}, ignore_index=True)

df_bench['nb_sets'] = df_bench['nb_sets'].astype(int)

p = (ggplot(df_bench) + aes('nb_sets', 'time')
     + geom_point() + geom_smooth(span=.3) + scale_x_continuous()
     + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig1")

## Number of queried words
df_bench = pd.DataFrame(columns=['step', 'time'])

X = test_data_for_modl(nflags=NFLAGS, number_of_sets=DEFAULT_N_SETS, noise=NOISE)

for _ in REPEATS:
    for step in STEPS:
        start_time = time.time()
        combi_miner = Modl(X, multiple_overlap_max_number_of_combinations=step, nb_threads=N_THREADS)
        modl_interesting_combis = combi_miner.find_interesting_combinations()
        stop_time = time.time()

        df_bench = df_bench.append({'step': step, 'time': stop_time - start_time}, ignore_index=True)

df_bench['step'] = df_bench['step'].astype(int)

p = (ggplot(df_bench) + aes('step', 'time')
     + geom_point() + geom_smooth(span=.3) + scale_x_continuous()
     + xlab("Queried nb. of atoms") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig2")
import numpy as np
np.random.seed(42)
import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab
import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

from mlxtend.frequent_patterns import fpgrowth, apriori

# Debug: plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/"  # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log

## ---------------------------- Parameters ---------------------------------- ##

utils.VERBOSITY = 0  # We don't want to record debug messages for these tests
REPEATS = range(5)   # Repeat all operations N times to get the average

## Elementary operation (DL) vs other itemset miners
# Number of words, and 1/min_support for the comparisons
SCALING_FACTORS = [1, 2, 5, 10, 25, 30, 40, 50, 75, 100, 150, 200, 250, 300, 350, 400, 500]

# Data generation parameters
NOISE = 0.5
NLINES = 5000
NSETS = 13

# DL parameters
ALPHA = 0.5

## -------------- Elementary operation benchmark: apriori vs fpgrowth vs dict learning

## Support and number of queried words
df_bench = pd.DataFrame(columns=['scaling_factor', 'algo', 'time'])  # Prepare df

for _ in REPEATS:
    for k in SCALING_FACTORS:

        # NOTE A scaling factor of k means:
        # - Apriori and FP-growth will have a min support of 1/k
        # - Dictionary learning will learn k atoms
        current_min_support = 1/k

        X = test_data_for_modl(nflags=NLINES, number_of_sets=NSETS, noise=NOISE)
        names = [str(i) for i in range(X.shape[1])]
        transactions = matrix_to_list_of_transactions(X, names)
        X_as_dataframe = pd.DataFrame(X)

        # Apriori - my pure Python implementation
        start_time = time.time()
        myminer = Apriori(min_support=current_min_support)
        myminer.run_apriori(transactions)
        results = myminer.produce_results()
        stop_time = time.time()
        df_bench = df_bench.append({'scaling_factor': k, 'algo': 'apriori_pure_python', 'time': stop_time - start_time}, ignore_index=True)

        # Apriori
        # It seems this particular apriori implementation reserves a very large amount of memory when support is very low, so cap it.
        if current_min_support >= 0.02:
            start_time = time.time()
            apriori(X_as_dataframe, min_support=current_min_support)
            stop_time = time.time()
            df_bench = df_bench.append({'scaling_factor': k, 'algo': 'apriori', 'time': stop_time - start_time}, ignore_index=True)

        # FP-Growth
        start_time = time.time()
        result = fpgrowth(X_as_dataframe, min_support=current_min_support)
        stop_time = time.time()
        df_bench = df_bench.append({'scaling_factor': k, 'algo': 'fpgrowth', 'time': stop_time - start_time}, ignore_index=True)

        # Dict learning
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms=k, alpha=ALPHA, n_jobs=1)
        stop_time = time.time()
        df_bench = df_bench.append({'scaling_factor': k, 'algo': 'DL', 'time': stop_time - start_time}, ignore_index=True)

df_bench['scaling_factor'] = df_bench['scaling_factor'].astype(int)

p = (ggplot(df_bench) + aes('scaling_factor', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
     + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig3")

p = (ggplot(df_bench) + aes('scaling_factor', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
     + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig3_log10")

# Time normalized to a scaling factor of 1
minimum_time = df_bench[df_bench['scaling_factor'] == 1][['algo', 'time']]
df_bench['time_relative'] = df_bench['time']  # Placeholder

for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index, 'time_relative'] = row['time'] / my_minimum_time

p = (ggplot(df_bench) + aes('scaling_factor', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
     + xlab("Scaling factor (k)") + ylab("Time (relative)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig3_relative")

p = (ggplot(df_bench) + aes('scaling_factor', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
     + xlab("Scaling factor (k)") + ylab("Time (relative)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig3_relative_log10")
import numpy as np
np.random.seed(42)
import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab
import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

from mlxtend.frequent_patterns import fpgrowth, apriori

# Debug: plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/"  # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log

## ---------------------------- Parameters ---------------------------------- ##

utils.VERBOSITY = 0  # We don't want to record debug messages for these tests
REPEATS = range(5)   # Repeat all operations N times to get the average

## Elementary operation (DL) vs other itemset miners
# Number of sets (columns), minimum of 6
SETS_NB = [6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 22, 24, 26]

# Data generation parameters
NOISE = 0.5
LOW_NOISE = 0.01
N_FLAGS = 5000

# DL parameters
ALPHA = 0
N_ATOMS = 120

# Other algorithms' parameters
MIN_SUPPORT = 1/100

## -------------- Elementary operation benchmark: apriori vs fpgrowth vs dict learning

## Number of sets
df_bench = pd.DataFrame(columns=['set_nb', 'algo', 'time'])  # Prepare df

for _ in REPEATS:
    for size in SETS_NB:

        # Generate data
        X = test_data_for_modl(nflags=N_FLAGS, number_of_sets=size, noise=NOISE)
        names = [str(i) for i in range(X.shape[1])]
        transactions = matrix_to_list_of_transactions(X, names)
        X_as_dataframe = pd.DataFrame(X)

        X_low_noise = test_data_for_modl(nflags=N_FLAGS, number_of_sets=size, noise=LOW_NOISE)
        X_noiseless = test_data_for_modl(nflags=N_FLAGS, number_of_sets=size, noise=0)

        # Apriori
        # Cap it to the max number that is not unreasonable
        if size <= 15:
            start_time = time.time()
            myminer = Apriori(min_support=MIN_SUPPORT)
            myminer.run_apriori(transactions)
            results = myminer.produce_results()
            stop_time = time.time()
            df_bench = df_bench.append({'set_nb': size, 'algo': 'apriori_pure_python', 'time': stop_time - start_time}, ignore_index=True)

        # Apriori
        # NOTE: this implementation has a high RAM cost for large data or low min_support, be careful
        if size <= 15:
            start_time = time.time()
            result = apriori(X_as_dataframe, min_support=MIN_SUPPORT)
            stop_time = time.time()
            df_bench = df_bench.append({'set_nb': size, 'algo': 'apriori', 'time': stop_time - start_time}, ignore_index=True)

        # FP-Growth
        start_time = time.time()
        result = fpgrowth(X_as_dataframe, min_support=MIN_SUPPORT)
        stop_time = time.time()
        df_bench = df_bench.append({'set_nb': size, 'algo': 'fpgrowth', 'time': stop_time - start_time}, ignore_index=True)

        # Dict learning
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms=N_ATOMS, alpha=ALPHA, n_jobs=1)
        stop_time = time.time()
        df_bench = df_bench.append({'set_nb': size, 'algo': 'DL', 'time': stop_time - start_time}, ignore_index=True)

        # Dict learning - low noise data
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X_low_noise, n_atoms=N_ATOMS, alpha=ALPHA, n_jobs=1)
        stop_time = time.time()
        df_bench = df_bench.append({'set_nb': size, 'algo': 'DL_low_noise_data', 'time': stop_time - start_time}, ignore_index=True)

        # Dict learning - noiseless data
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X_noiseless, n_atoms=N_ATOMS, alpha=ALPHA, n_jobs=1)
        stop_time = time.time()
        df_bench = df_bench.append({'set_nb': size, 'algo': 'DL_noiseless_data', 'time': stop_time - start_time}, ignore_index=True)

df_bench['set_nb'] = df_bench['set_nb'].astype(int)

p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
     + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4")

p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
     + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4_log10")

# Time normalized to the minimum number of sets
min_nb_sets = min(SETS_NB)
minimum_time = df_bench[df_bench['set_nb'] == min_nb_sets][['algo', 'time']]
df_bench['time_relative'] = df_bench['time']  # Placeholder

for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index, 'time_relative'] = row['time'] / my_minimum_time

p = (ggplot(df_bench) + aes('set_nb', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
     + xlab("Number of sets") + ylab("Time (relative)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4_relative")

p = (ggplot(df_bench) + aes('set_nb', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
     + xlab("Number of sets") + ylab("Time (relative)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4_relative_log10")
import numpy as np
np.random.seed(42)
import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab
import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

from mlxtend.frequent_patterns import fpgrowth, apriori

# Debug: plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/"  # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log

## ---------------------------- Parameters ---------------------------------- ##

utils.VERBOSITY = 0  # We don't want to record debug messages for these tests
REPEATS = range(5)   # Repeat all operations N times to get the average

## Elementary operation (DL) vs other itemset miners
# Number of lines (in thousands)
LINES_NB = [1, 2, 5, 8, 10, 15, 20, 25, 40, 50]
NB_SETS = 15

# Data generation parameters
NOISE = 0.5

# DL parameters
ALPHA = 0
N_ATOMS = 120

# Other algo parameters
MIN_SUPPORT = 1/100

## -------------- Elementary operation benchmark: apriori vs fpgrowth vs dict learning

## Number of lines
df_bench = pd.DataFrame(columns=['lines', 'algo', 'time'])  # Prepare df

for _ in REPEATS:
    for lines in LINES_NB:

        true_line_nb = lines * 1000

        # Generate data
        # Scaling factor is in the thousands
        X = test_data_for_modl(nflags=true_line_nb, number_of_sets=NB_SETS, noise=NOISE)
        names = [str(i) for i in range(X.shape[1])]
        transactions = matrix_to_list_of_transactions(X, names)
        X_as_dataframe = pd.DataFrame(X)

        # Apriori
        start_time = time.time()
        myminer = Apriori(min_support=MIN_SUPPORT)
        myminer.run_apriori(transactions)
        results = myminer.produce_results()
        stop_time = time.time()
        df_bench = df_bench.append({'lines': lines, 'algo': 'apriori_pure_python', 'time': stop_time - start_time}, ignore_index=True)

        # Apriori
        # NOTE: this implementation has a high RAM cost for large data or low min_support, be careful with scaling
        if true_line_nb <= 20000:
            start_time = time.time()
            result = apriori(X_as_dataframe, min_support=MIN_SUPPORT)
            stop_time = time.time()
            df_bench = df_bench.append({'lines': lines, 'algo': 'apriori', 'time': stop_time - start_time}, ignore_index=True)

        # FP-Growth
        start_time = time.time()
        result = fpgrowth(X_as_dataframe, min_support=MIN_SUPPORT)
        stop_time = time.time()
        df_bench = df_bench.append({'lines': lines, 'algo': 'fpgrowth', 'time': stop_time - start_time}, ignore_index=True)

        # Dict learning
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms=N_ATOMS, alpha=ALPHA, n_jobs=1)
        stop_time = time.time()
        df_bench = df_bench.append({'lines': lines, 'algo': 'DL', 'time': stop_time - start_time}, ignore_index=True)

df_bench['lines'] = df_bench['lines'].astype(int)

p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
     + xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig5")

p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
     + xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig5_log10")

# Time normalized to the minimum line number
min_nb_lines = min(LINES_NB)
minimum_time = df_bench[df_bench['lines'] == min_nb_lines][['algo', 'time']]
df_bench['time_relative'] = df_bench['time']  # Placeholder

for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index, 'time_relative'] = row['time'] / my_minimum_time

p = (ggplot(df_bench) + aes('lines', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
     + xlab("Number of lines") + ylab("Time (relative)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig5_relative")
args <- commandArgs(trailingOnly = TRUE)
this_dir <- toString(args[1])
setwd(this_dir)
print(getwd())

# ---------------------------------------------------------------------------- #

# Install packages
allpkgs <- c("UpSetR", "dplyr", "reshape2", "patchwork", "latex2exp")
for (p in allpkgs){
  if (!requireNamespace(p)){
    install.packages(c(p), repos="http://cran.us.r-project.org", dependencies = TRUE)
  }
}

library('UpSetR')
require(ggplot2)
require(plyr)
require(dplyr)
require(gridExtra)
require(grid)
library(stringr)
library(RColorBrewer)
library(reshape2)
library(patchwork)
library(latex2exp)

d <- read.table("all_ologram_results.txt", head=T, sep="\t")

# Compute -log10(pval)
d$summed_bp_overlaps_pvalue[d$summed_bp_overlaps_pvalue == 0] <- 1e-320
d$log10_pval <- -log10(d$summed_bp_overlaps_pvalue)

## Extract ("regexp with capture '()'") the query name
d$query <- str_match(d$query, "-([0-9A-Za-z]+)/00_ologram.*")[,2]

## Extract the combination
tmp <- lapply(strsplit(as.character(d$feature_type), "\\+"), str_match, "_([^_]+)_")  # Quick hack
tmp <- lapply(tmp, "[", i=, j=2)
tmp <- lapply(lapply(lapply(tmp, "[", -1), rev), "[", -1)
all_features <- unique(unlist(tmp))

feature_mat <- matrix(0, nrow=nrow(d), ncol=length(all_features))
colnames(feature_mat) <- all_features

for(i in 1:length(tmp)){
  for(j in tmp[[i]]){
    feature_mat[i, j] <- 1
  }
}

## Prepare a binary matrix with TF as column and combi as row
feature_mat <- as.data.frame(feature_mat)
feature_mat$degree <- rowSums(feature_mat) + 1
d <- cbind(d, as.matrix(feature_mat))

## Let's plot
d$color <- as.character(factor(d$query, levels=brewer.pal(3, "Paired")))
d$combination <- 1:nrow(d)

dm <- melt(data = d,
           id.vars = c("degree", "summed_bp_overlaps_true", "summed_bp_overlaps_pvalue", "query",
                       "combination", "log10_pval", "summed_bp_overlaps_log2_fold_change"),
           measure.vars=all_features)
dm$value[dm$value == 0] <- NA
dm$Factors <- dm$variable

## Order combination levels by *summed basepairs in the true overlaps*
dm$combination <- factor(dm$combination, ordered = T,
                         levels = unique(as.character(dm$combination[order(dm$summed_bp_overlaps_true, decreasing = T)])))

dm$Factors <- factor(dm$Factors,
                     levels=c("Ctcf", "Rad21", "Smc1a", "Smc3",
                              "Irf1", "Irf9", "Stat1", "Stat6",
                              "Nanog", "Pou5f1", "Klf4", "Sox2"), order=T)

### Point colors
dm$Dataset <- as.character(dm$variable)
dm$Dataset[dm$Dataset %in% c("Ctcf", "Rad21", "Smc1a", "Smc3")] <- "Insulators"
dm$Dataset[dm$Dataset %in% c("Irf1", "Irf9", "Stat1", "Stat6")] <- "Interferon\nresponse"
dm$Dataset[dm$Dataset %in% c("Nanog", "Pou5f1", "Klf4", "Sox2")] <- "Stem cells"

### Take the n best combis for each query
n_best <- 20
dm %>%
  arrange(desc(query)) %>%
  arrange(desc(summed_bp_overlaps_true)) %>%
  group_by(query) %>%
  slice(1:(n_best*length(all_features)-1)) %>%
  ungroup() -> dm_sub

### Order queries
dm_sub$query <- factor(dm_sub$query, levels = c("Nanog", "Ctcf", "Irf1"), ordered=TRUE)

### Plot
p1 <- ggplot(dm_sub, aes(y = Factors, x=combination, fill=Dataset)) +
  geom_point(aes(size=value), guide='none', shape=22, color="white") +
  theme_bw() +
  scale_size(guide="none") +
  theme(axis.text.x = element_text(angle=0, size=7),
        axis.text.y = element_text(size=7),
        legend.key.height = unit(1, "line"),
        axis.title.y = element_text(size=8)) +
  scale_color_manual(values=c("#D0AB58", "#3F4EF3", "#A63965")) +
  scale_fill_manual(values=c("#D0AB58", "#3F4EF3", "#A63965")) +
  facet_wrap(~query, scale="free_x") +
  scale_x_discrete(name ="Combinations", labels=c(1:n_best))

p2 <- ggplot(dm_sub, aes(x=combination, y = log10_pval, fill = factor(degree))) +
  geom_bar(stat="identity", position="dodge") +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.text.y = element_text(size=8),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.text = element_blank(),
        strip.background = element_blank(),
        axis.title.y = element_text(size=8)) +
  scale_fill_manual(name="Combi. order", values=c("#FFC30F", "#1D9A6C", "#104765", "#08062C")) +
  facet_wrap(~query, scale="free_x", strip.position = "bottom")

p3 <- ggplot(dm_sub, aes(x=combination, y = summed_bp_overlaps_log2_fold_change, fill = factor(degree))) +
  geom_bar(stat="identity", position="dodge") +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        strip.background = element_blank(),
        strip.text = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        axis.title.y = element_text(size=8)) +
  scale_fill_manual(values=c("#FFC30F", "#1D9A6C", "#104765", "#08062C")) +
  facet_wrap(~query, scale="free_x", strip.position = "bottom") +
  ylab(TeX('$log2(\\sum bp_{obs}/ \\sum bp_{sim})$'))

# Save the combined plot
combined_plot <- p3 + p2 + p1 + plot_layout(ncol = 1, heights = c(1, 1, 2))
ggsave("murine_fig.png", plot = combined_plot, width = 12, height = 8, dpi = 240)
this.dir = "./output/sc_atac_seq_pbmc_data/"
setwd(this.dir)
print(getwd())

## Libraries
if (!requireNamespace("BiocManager", quietly = TRUE)){
  install.packages("BiocManager", repos="http://cran.us.r-project.org", dependencies = TRUE)
}
BiocManager::install()
setRepositories(ind=1:5) # To automatically install Bioconductor dependencies

# Install Signac
if (!requireNamespace("Signac")){
  BiocManager::install(c("GenomeInfoDbData", 'BSgenome.Hsapiens.UCSC.hg19', 'EnsDb.Hsapiens.v75','BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86'))
  BiocManager::install(c("ggbio","chromVAR","TFBSTools","motifmatchr"))
  install.packages("Signac", repos="http://cran.us.r-project.org", dependencies = TRUE)
}

library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
library(parallel)
set.seed(1234)

## Download and read the data from the Signac PBMC vignette

# Download all
options(timeout = 3600)

# Counts
destfile = "atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5"
if (!file.exists(destfile)) {
  download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5", destfile = destfile)
}
counts <- Read10X_h5(filename = "atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5")
# From the vignette: each row of the matrix represents a region of the genome (a peak) that is predicted to be a region of open chromatin.
# Each value in the matrix represents the number of Tn5 integration sites for each single barcode (i.e. a cell) that map within each peak.

# Metadata
destfile = "atac_v1_pbmc_10k_singlecell.csv"
if (!file.exists(destfile)) {
  download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_singlecell.csv", destfile = destfile)
}
metadata <- read.csv(file = "atac_v1_pbmc_10k_singlecell.csv", header = TRUE, row.names = 1)

# Fragment file and index
destfile = "atac_v1_pbmc_10k_fragments.tsv.gz"
if (!file.exists(destfile)) {
  download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz", destfile = destfile)
}
destfile = "atac_v1_pbmc_10k_fragments.tsv.gz.tbi"
if (!file.exists(destfile)) {
  download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz.tbi", destfile = destfile)
}
# From the vignette: this is the full list of all unique fragments across all single cells (not just those associated with peaks).

destfile = "pbmc_10k_v3.rds"
if (!file.exists(destfile)) {
  download.file("https://www.dropbox.com/s/zn6khirjafoyyxl/pbmc_10k_v3.rds?dl=1", destfile = destfile)
}

## Create Signac objects
chrom_assay <- CreateChromatinAssay(
  counts = counts,
  sep = c(":", "-"),
  genome = 'hg19',
  fragments = "atac_v1_pbmc_10k_fragments.tsv.gz",
  min.cells = 10,
  min.features = 200
)

pbmc <- CreateSeuratObject(
  counts = chrom_assay,
  assay = "peaks",
  meta.data = metadata
)

# Add gene annotations
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)
seqlevelsStyle(annotations) <- 'UCSC' # change to UCSC style since the data was mapped to hg19
genome(annotations) <- "hg19"
Annotation(pbmc) <- annotations

# NOTE: to get the ATAC-seq data, use `pbmc[['peaks']]`
# To get the associated genomic ranges, use `granges(pbmc)`

## Quality control
pbmc <- NucleosomeSignal(object = pbmc)            # compute nucleosome signal score per cell
pbmc <- TSSEnrichment(object = pbmc, fast = FALSE) # compute TSS enrichment score per cell

# Add blacklist ratio and fraction of reads in peaks
pbmc$pct_reads_in_peaks <- pbmc$peak_region_fragments / pbmc$passed_filters * 100
pbmc$blacklist_ratio <- pbmc$blacklist_region_fragments / pbmc$peak_region_fragments
pbmc$high.tss <- ifelse(pbmc$TSS.enrichment > 2, 'High', 'Low')
pbmc$nucleosome_group <- ifelse(pbmc$nucleosome_signal > 4, 'NS > 4', 'NS < 4')

# Remove outliers
pbmc <- subset(
  x = pbmc,
  subset = peak_region_fragments > 3000 &
    peak_region_fragments < 20000 &
    pct_reads_in_peaks > 15 &
    blacklist_ratio < 0.05 &
    nucleosome_signal < 4 &
    TSS.enrichment > 2
)

## Perform normalization and feature selection, then UMAP and cluster computation
pbmc <- RunTFIDF(pbmc)
pbmc <- FindTopFeatures(pbmc, min.cutoff = 'q0')
pbmc <- RunSVD(pbmc)
pbmc <- RunUMAP(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindNeighbors(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindClusters(object = pbmc, verbose = FALSE, algorithm = 3)
#DimPlot(object = pbmc, label = TRUE) + NoLegend()

# Gene activity matrix, using chromatin accessibility as a proxy for activity
gene.activities <- GeneActivity(pbmc)
pbmc[['RNA']] <- CreateAssayObject(counts = gene.activities)
pbmc <- NormalizeData(
  object = pbmc,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(pbmc$nCount_RNA)
)
DefaultAssay(pbmc) <- 'RNA'

## Load cell-type labels from the companion scRNA-seq study
pbmc_rna <- readRDS("pbmc_10k_v3.rds") # Load the pre-processed scRNA-seq data for PBMCs

transfer.anchors <- FindTransferAnchors(
  reference = pbmc_rna,
  query = pbmc,
  reduction = 'cca'
)

predicted.labels <- TransferData(
  anchorset = transfer.anchors,
  refdata = pbmc_rna$celltype,
  weight.reduction = pbmc[['lsi']],
  dims = 2:30
)

pbmc <- AddMetaData(object = pbmc, metadata = predicted.labels)

#plot1 <- DimPlot(object = pbmc_rna, group.by = 'celltype', label = TRUE, repel = TRUE) + NoLegend() + ggtitle('scRNA-seq')
#plot2 <- DimPlot(object = pbmc, group.by = 'predicted.labels', label = TRUE, repel = TRUE) + NoLegend() + ggtitle('scATAC-seq')
#plot1 + plot2

# Change names
pbmc <- subset(pbmc, idents = 14, invert = TRUE)
pbmc <- RenameIdents(
  object = pbmc,
  '0' = 'CD14 Mono',
  '1' = 'CD4 Memory',
  '2' = 'CD8 Effector',
  '3' = 'CD4 Naive',
  '4' = 'CD14 Mono',
  '5' = 'DN T',
  '6' = 'CD8 Naive',
  '7' = 'NK CD56Dim',
  '8' = 'pre-B',
  '9' = 'CD16 Mono',
  '10' = 'pro-B',
  '11' = 'DC',
  '12' = 'NK CD56bright',
  '13' = 'pDC'
)

DefaultAssay(pbmc) <- 'peaks' # Important: change back to working with peaks instead of gene activities

## Find differentially accessible peaks between clusters
## May be useful to create a restriction for shuffling
# da_peaks <- FindMarkers(
#   object = pbmc,
#   ident.1 = "CD4 Naive",
#   ident.2 = "CD14 Mono",
#   min.pct = 0.2,
#   test.use = 'LR',
#   latent.vars = 'peak_region_fragments'
# )

# Translation between the peak matrix's column names and cell id + predicted type
translation = data.frame(class = pbmc$predicted.id, id = pbmc$cell_id)
write.table(translation, file="cell_to_class.tsv", sep = '\t')

## Convert to BED
ifelse(!dir.exists("bed"), dir.create("bed"), FALSE) # Create directory

# For each cell (i.e. tag)...
n_tags = ncol(pbmc[['peaks']])
#for (j in 1:n_tags) {}

message_parallel <- function(...){
  system(sprintf('echo "\n%s\n"', paste0(..., collapse="")))
}

signal_to_bed = function(j){
  # Get the vector of signal for this cell, giving the signal or absence thereof
  # at each candidate region
  signal_this_tag = data.frame(
    pbmc[['peaks']][1:nrow(pbmc[['peaks']]), j],
    check.names = FALSE
  )

  # Get the tag's id and predicted class
  tag = colnames(signal_this_tag)
  tag_id = translation[tag,"id"]
  tag_class = translation[tag,"class"]
  tag_class = chartr(" ", "_", tag_class)

  # Open a BED file
  bed_dir = paste("bed/",tag_class,"/", sep = '')
  bedpath = paste(bed_dir,tag_id,".bed", sep = '')
  ifelse(!dir.exists(bed_dir), dir.create(bed_dir), FALSE)

  # For each region (row), write it to the BED file if detected in this cell
  for(i in 1:nrow(signal_this_tag)) {
    signal = signal_this_tag[i,1]

    # Since we work on the peaks matrix and not on the fragments themselves,
    # the count values are binarized
    if(signal > 0){
      region = rownames(signal_this_tag)[i]
      region_list = strsplit(region, split='-', fixed=TRUE)[[1]]
      line = paste(region_list[1],'\t',region_list[2],'\t',region_list[3], sep = '')
      write(line, file = bedpath, append = TRUE)
    }
  }

  message_parallel("Cell tag ",j,"/",n_tags," complete.")
  return(TRUE)
}

res = mclapply(1:n_tags, signal_to_bed, mc.cores = 4)

# One BED file with all regions
bedpath_all = "bed/allmerged.bed"
all_regions = rownames(pbmc[['peaks']])
for (region in all_regions) {
  region_list = strsplit(region, split='-', fixed=TRUE)[[1]]
  line = paste(region_list[1],'\t',region_list[2],'\t',region_list[3], sep = '')
  write(line, file = bedpath_all, append = TRUE)
}

## ------ Final step: random selection
# Select 30 cells from CD14+ monocytes, and 15 each from CD4 naive, CD8 naive and pre-B cells
ifelse(!dir.exists("bed_selected"), dir.create("bed_selected"), FALSE)

draw_for_this_type = function(cell_type, size, randomize = FALSE){
  # Get file list with full path
  dirpath = paste("bed/",cell_type, sep = "")
  files <- list.files(dirpath, full.names = TRUE)

  # Select the desired number of files by simple random sampling,
  # or just take the top N if `randomize` is FALSE
  if (randomize) {
    randomize <- sample(seq(files))
    files2analyse <- files[randomize]
  } else {
    files2analyse <- files
  }
  files2analyse <- files2analyse[(1:size)]

  # Copy the selected files
  for(i in seq(files2analyse)){
    file.copy(from = files2analyse[i], to = "bed_selected/")
  }
}

draw_for_this_type("CD14+_Monocytes", 30, FALSE)
draw_for_this_type("CD4_Naive", 15, FALSE)
draw_for_this_type("CD8_Naive", 15, FALSE)
draw_for_this_type("pre-B_cell", 15, FALSE)
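The script above writes one BED file per cell barcode under bed/<predicted_class>/ and copies a subset into bed_selected/. The following sketch is not part of the workflow; it is a quick sanity check of that output layout, assuming it is run from the workflow root (the script itself setwd's into output/sc_atac_seq_pbmc_data/).

# Minimal sanity check for the per-cell BED export above (assumption: run from
# the workflow root, so the files live under output/sc_atac_seq_pbmc_data/).
from pathlib import Path

base = Path("output/sc_atac_seq_pbmc_data")

# One subdirectory per predicted cell type, one BED file per cell barcode
for class_dir in sorted((base / "bed").iterdir()):
    if class_dir.is_dir():
        n_cells = len(list(class_dir.glob("*.bed")))
        print(f"{class_dir.name}: {n_cells} cells")

# The selected subset used downstream: 30 + 15 + 15 + 15 = 75 files expected
selected = list((base / "bed_selected").glob("*.bed"))
print(f"bed_selected: {len(selected)} files (expected 75)")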
shell: """
    # Create incl file by merging all BEDs
    cat input/mcf7/*.bed | bedtools sort | bedtools merge > {output.incl}

    # And add another one
    cat input/mcf7_additional/*.bed > input/extended_all_mcf7.bed
    awk -F"\t" 'NF{{NF-=3}};3' < input/extended_all_mcf7.bed > input/extended_all_mcf7_3col.bed
    cat {output.incl} input/extended_all_mcf7_3col.bed | sed -e 's/ /\t/g' | bedtools sort | bedtools merge > {output.incl_extended}

    # Once done, move future query
    mv input/mcf7/foxa1.bed {output.query}
    """
shell: """
    gunzip input/*.gz
    gunzip input/*/*.gz
    """
shell: """
    mkdir -p output/artificial_data

    # Generate query
    bedtools random -n $((3*{params.size})) -l $(({params.length})) -seed 123456 -g input/hg38.genome > {output.query}

    # Generate A and B
    head -n $(({params.size})) {output.query} > {output.a}
    bedtools shuffle -i {output.a} -incl {output.query} -excl {output.a} -g input/hg38.genome -noOverlapping -seed 654321 > {output.b}   # Do not put all regions of query into A and B

    # Ensure there is at least one common line in A and B and C so they are displayed
    echo "chr1 1 2 0 1 -" >> {output.a}
    echo "chr1 1 2 0 1 -" >> {output.b}
    echo "chr1 1 2 0 1 -" >> {output.query}

    cp {output.a} {output.a2}

    # Generate D
    bedtools random -n $((2*{params.size})) -l $(({params.length})) -seed 654321 -g input/hg38.genome > {output.c}
    """
shell: """
    mkdir -p output/artificial_data_calibrate/data

    # Prepare query
    bedtools random -n $((100*{params.size})) -l $(({params.length})) -seed 123456 -g input/artificial_simple.genome > {output.query}

    # And filler regions that do NOT overlap with the query
    # Use a different set each time
    bedtools shuffle -i {output.query} -excl {output.query} -g input/artificial_simple.genome -noOverlapping -seed 654321 > {output.filler1}
    bedtools shuffle -i {output.query} -excl {output.query} -g input/artificial_simple.genome -noOverlapping -seed 789789 > {output.filler2}

    # Calibrating: get the first K lines (with different K) of the query file
    # and fill the rest with random regions that do NOT overlap with the query
    head -n $((21*{params.size})) {output.query} > {output.a}
    head -n $((79*{params.size})) {output.filler1} >> {output.a}

    tail -n $((26*{params.size})) {output.query} > {output.b}
    tail -n $((74*{params.size})) {output.filler2} >> {output.b}
    """
shell: '''
    # Prepare input and output dirs
    mkdir -p output/artificial_data_finesse
    mkdir -p output/ologram_result_artificial_finesse

    # Prepare genome
    printf "chr1\t100000000\nchr2\t100000000\nchr3\t100000000\nchr4\t100000000\nchr5\t100000000\n" > {output.genome}

    # Draw regions
    bedtools random -g {output.genome} -n 10000 -l 1000 -seed 42021958 | bedtools sort > {output.a}

    # Get some regions from A, and randomize the rest
    head -n 51 {output.a} > {output.b}.temp
    bedtools random -g {output.genome} -n 10000 -l 1000 -seed 98131240 >> {output.b}.temp
    bedtools sort -i {output.b}.temp > {output.b}
    '''
shell:"""
    mkdir -p output/sc_atac_seq_pbmc_data/
    Rscript scripts/pbmc_download_data.R

    # Prepare the incl files
    cat output/sc_atac_seq_pbmc_data/bed/*.bed | bedtools sort | bedtools merge > {output.allreg}
    bedtools slop -i {output.allreg} -g input/hg19.genome -b 250 > {output.allreg_slopped}
    """
shell:"""
    mkdir -p output/murine_data/selected_tf/

    # Download data
    for i in `cat {input.sources} | grep mm10 | awk '{{print $0,"\t","bla"}}' | sed 's/\"//g' | cut -f1,6,7 | perl -npe 's/\t/-/g'| perl -npe 's/ +/_/g'`
    do
        n=`echo $i | sed 's/\-.*//'`
        wget http://dbarchive.biosciencedbc.jp/kyushu-u/mm10/eachData/bed10/$n.10.bed -O output/murine_data/selected_tf/$i.bed --cipher 'DEFAULT:!DH'
    done

    # Prepare incl
    wget reftss.clst.riken.jp/datafiles/current/mouse/refTSS_v3.1_mouse_coordinate.mm10.bed.gz -O output/murine_data/refTSS_v3.1_mouse_coordinate.mm10.bed.gz --cipher 'DEFAULT:!DH'
    gunzip output/murine_data/refTSS_v3.1_mouse_coordinate.mm10.bed.gz
    cat output/murine_data/refTSS_v3.1_mouse_coordinate.mm10.bed | bedtools slop -l {params.incl_prom_left_slop} -r 1 -s -g input/mm10.genome | bedtools sort | bedtools merge > {output.incl}
    """
shell: """
    gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.trs} \
        -o output/ologram_result_mcf7 --force-chrom-peak --force-chrom-more-bed \
        -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
        --more-bed-multiple-overlap --bed-incl {input.incl} --no-date
    """
shell: """
    gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.trs} \
        -o output/ologram_result_mcf7_filtered --force-chrom-peak --force-chrom-more-bed --no-date \
        -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
        --more-bed-multiple-overlap --bed-incl {input.incl} \
        --multiple-overlap-max-number-of-combinations {params.max_combis}
    """
shell: """
    head -n 1 {input.res} > {output}
    grep -w -f {input.filtered} {input.res} >> {output}
    """
shell: """
    gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.trs} \
        -o output/ologram_result_mcf7_full_dhs --force-chrom-peak --force-chrom-more-bed \
        -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
        --more-bed-multiple-overlap --bed-incl {input.incl} --no-date
    """
shell:"""
    mkdir -p output/benchmark
    python scripts/modl_comparison.py 2> {log.err} 1> {log.out}

    # Signal we are done
    touch {output}
    """
shell:"""
    mkdir -p output/benchmark/perspective
    python scripts/modl_perspective.py 2> {log.err} 1> {log.out}

    # Signal we are done
    touch {output}
    """

"""
Rules to evaluate MODL time scaling, and the scaling of its elementary operations, in parallel
"""
shell:"""
    mkdir -p output/benchmark/scaling

    # Print information about the CPU
    lscpu > output/cpu_info.txt

    python scripts/modl_scaling_1_2.py 2> {log.err} 1> {log.out}
    """
shell:"""
    mkdir -p output/benchmark/scaling
    python scripts/modl_scaling_3.py 2> {log.err} 1> {log.out}
    """
shell:"""
    mkdir -p output/benchmark/scaling
    python scripts/modl_scaling_4.py 2> {log.err} 1> {log.out}
    """
shell:"""
    mkdir -p output/benchmark/scaling
    python scripts/modl_scaling_5.py 2> {log.err} 1> {log.out}
    """
shell: """
    mkdir -p output/ologram_result_ginom
    mkdir -p output/ologram_result_ginom_filtered

    # With MODL
    gtftk ologram -z -c hg19 -p {input.query} --more-bed {params.trs} \
        -o output/ologram_result_ginom_filtered --force-chrom-peak --force-chrom-more-bed --no-date \
        -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
        --more-bed-multiple-overlap --bed-incl {input.incl} \
        --multiple-overlap-max-number-of-combinations {params.max_combis}

    # Without MODL
    gtftk ologram -z -c hg19 -p {input.query} --more-bed {params.trs} \
        -o output/ologram_result_ginom --force-chrom-peak --force-chrom-more-bed --no-date \
        -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
        --more-bed-multiple-overlap --bed-incl {input.incl}
    """
shell:"""
    mkdir -p output/murine_result
    mkdir -p output/murine_result_restricted

    # Without restricting to estimated promoters only
    gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1815531-Ctcf-Blood.bed \
        --more-bed `ls output/murine_data/selected_tf/* | grep -vi ctcf` --more-bed-multiple-overlap --no-date \
        -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result/SRX1815531-Ctcf \
        -mn {params.minibatch_number} -ms {params.minibatch_size}

    gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1583885-Irf1-Blood.bed \
        --more-bed `ls output/murine_data/selected_tf/* | grep -vi Irf1` --more-bed-multiple-overlap --no-date \
        -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result/SRX1583885-Irf1 \
        -mn {params.minibatch_number} -ms {params.minibatch_size}

    gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/ERX1633247-Nanog-Pluripotent_stem_cell.bed \
        --more-bed `ls output/murine_data/selected_tf/* | grep -vi nanog` --more-bed-multiple-overlap --no-date \
        -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result/ERX1633247-Nanog \
        -mn {params.minibatch_number} -ms {params.minibatch_size}

    # With the restriction
    gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1815531-Ctcf-Blood.bed \
        --more-bed `ls output/murine_data/selected_tf/* | grep -vi ctcf` --more-bed-multiple-overlap --no-date \
        -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result_restricted/SRX1815531-Ctcf \
        -mn {params.minibatch_number} -ms {params.minibatch_size} --bed-incl {input.incl}

    gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1583885-Irf1-Blood.bed \
        --more-bed `ls output/murine_data/selected_tf/* | grep -vi Irf1` --more-bed-multiple-overlap --no-date \
        -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result_restricted/SRX1583885-Irf1 \
        -mn {params.minibatch_number} -ms {params.minibatch_size} --bed-incl {input.incl}

    gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/ERX1633247-Nanog-Pluripotent_stem_cell.bed \
        --more-bed `ls output/murine_data/selected_tf/* | grep -vi nanog` --more-bed-multiple-overlap --no-date \
        -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result_restricted/ERX1633247-Nanog \
        -mn {params.minibatch_number} -ms {params.minibatch_size} --bed-incl {input.incl}
    """
shell:"""
    mkdir -p output/ologram_result
    mkdir -p output/ologram_result_artificial
    mkdir -p output/ologram_result_artificial_calibrate

    # Run on artificial data
    gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.peaks} \
        -o output/ologram_result_artificial --force-chrom-peak --force-chrom-more-bed \
        -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
        --more-bed-multiple-overlap --no-date -K output/TMP_ologram_result_artificial

    # Run on calibrated artificial data
    gtftk ologram -z -c input/artificial_simple.genome -p {input.query_calibrate} --more-bed {params.peaks_calibrate} \
        -o output/ologram_result_artificial_calibrate --force-chrom-peak --force-chrom-more-bed \
        -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
        --more-bed-multiple-overlap --no-date -K output/TMP_ologram_result_artificial_calibrate
    """
shell: """
    mkdir -p output/multovl_result_artificial_calibrate

    # On full artificial data
    start=`date +%s.%N`

    {params.multovl_exec} {input.query} {params.peaks} --nointrack \
        --reshufflings {params.shuffle_numbers} --free input/hg38.genome.bed \
        > {output.artificial}

    end=`date +%s.%N`
    runtime=$( echo "$end - $start" | bc -l )
    echo "Duration of MULTOVL for full artificial data was (in seconds) : " $runtime > output/multovl_result_artificial_calibrate/timer.txt

    # On artificial data, negative control only
    {params.multovl_exec} {input.query} output/artificial_data/data/neg_control.bed --nointrack \
        --reshufflings {params.shuffle_numbers} --free input/hg38.genome.bed \
        > {output.artificial_neg_control}

    # On simpler, calibrated artificial data
    {params.multovl_exec} {input.query_calibrate} {params.peaks_calibrate} --nointrack \
        --reshufflings {params.shuffle_numbers} --free input/artificial_simple.genome.bed \
        > output/multovl_result_artificial_calibrate/partial1.txt

    {params.multovl_exec} {input.query_calibrate} {params.peaks_calibrate_as_list[0]} --nointrack \
        --reshufflings {params.shuffle_numbers} --free input/artificial_simple.genome.bed \
        > output/multovl_result_artificial_calibrate/partial2.txt

    {params.multovl_exec} {input.query_calibrate} {params.peaks_calibrate_as_list[1]} --nointrack \
        --reshufflings {params.shuffle_numbers} --free input/artificial_simple.genome.bed \
        > output/multovl_result_artificial_calibrate/partial3.txt

    # Concatenate results for calibrated, with a new line between each
    awk 'FNR==1{{print ""}}1' output/multovl_result_artificial_calibrate/partial* > {output.calibrated}
    rm output/multovl_result_artificial_calibrate/partial*   # Cleanup
    """
shell:'''
    cat output/ologram_result_artificial_finesse/ologram_output/*.tsv > {output.r}
    '''
shell:'''
    MINIBATCH_SIZE=10

    for MINIBATCH_NB in 1 2 5 10 15 20 35
    do
        for TRYNUMBER in $(seq 1 40)
        do
            gtftk ologram -ms $MINIBATCH_SIZE -mn $MINIBATCH_NB -p {input.a} \
                --more-bed {input.b} -z -c {input.genome} -V 1 -s ${{TRYNUMBER}} -k {threads} \
                --more-bed-labels artificial_minibatches_${{MINIBATCH_NB}}_try_num_${{TRYNUMBER}} \
                --force-chrom-peak --force-chrom-more-bed \
                -o output/ologram_result_artificial_finesse/ologram_output
        done
    done

    touch {output}
    '''
shell:'''
    MINIBATCH_SIZE=10

    for MINIBATCH_NB in 50 100 250
    do
        for TRYNUMBER in $(seq 1 8)
        do
            gtftk ologram -ms $MINIBATCH_SIZE -mn $MINIBATCH_NB -p {input.a} \
                --more-bed {input.b} -z -c {input.genome} -V 1 -k {threads} -s ${{TRYNUMBER}} \
                --more-bed-labels artificial_minibatches_${{MINIBATCH_NB}}_try_num_${{TRYNUMBER}} \
                --force-chrom-peak --force-chrom-more-bed \
                -o output/ologram_result_artificial_finesse/ologram_output
        done
    done

    touch {output}
    '''
shell:'''
    MINIBATCH_SIZE=10
    MINIBATCH_NB=500

    for TRYNUMBER in $(seq 1 3)
    do
        gtftk ologram -ms $MINIBATCH_SIZE -mn $MINIBATCH_NB -p {input.a} \
            --more-bed {input.b} -z -c {input.genome} -V 1 -k {threads} -s ${{TRYNUMBER}} \
            --more-bed-labels artificial_minibatches_${{MINIBATCH_NB}}_try_num_${{TRYNUMBER}} \
            --force-chrom-peak --force-chrom-more-bed \
            -o output/ologram_result_artificial_finesse/ologram_output
    done

    touch {output}
    '''
shell:'''
    gtftk ologram -p {input.a} --more-bed {input.b} \
        -mn {params.minibatch_number} -ms {params.minibatch_size} \
        -z -c {input.genome} -V 1 --no-date -k {threads} -s {wildcards.i} \
        --force-chrom-peak --force-chrom-more-bed \
        -o output/ologram_result_artificial_finesse/truth/run_{wildcards.i}
    '''
shell:'''
    gtftk ologram -p {input.a} --more-bed {input.b} \
        -mn {params.minibatch_number} -ms {params.minibatch_size} \
        -z -c {input.genome} -V 1 --no-date -k {threads} \
        --force-chrom-peak --force-chrom-more-bed \
        --display-fit-quality -K output/ologram_result_artificial_finesse/truth_fit \
        -o output/ologram_result_artificial_finesse/truth_histogram
    '''
shell:"""
    mkdir -p output/ologram_bad_fit

    printf "chr1\t100000000\nchr2\t100000000\nchr3\t100000000\nchr4\t100000000\nchr5\t100000000\n" > output/ologram_bad_fit/my.genome

    bedtools random -g output/ologram_bad_fit/my.genome -n 100 -l 100000 -seed 42021958 | bedtools sort > output/ologram_bad_fit/A.bed

    head -n 10 output/ologram_bad_fit/A.bed > output/ologram_bad_fit/B_tmp.bed
    bedtools random -g output/ologram_bad_fit/my.genome -n 90 -l 100000 -seed 98131240 >> output/ologram_bad_fit/B_tmp.bed
    bedtools sort -i output/ologram_bad_fit/B_tmp.bed > output/ologram_bad_fit/B.bed

    gtftk ologram -ms 250 -mn 100 -p output/ologram_bad_fit/A.bed --more-bed output/ologram_bad_fit/B.bed -z -c output/ologram_bad_fit/my.genome -V 1 --no-date -o output/ologram_bad_fit/ -s 42 -k 8 --display-fit-quality -K output/ologram_bad_fit/tmp_files

    touch output/ologram_bad_fit/bad_fit_done
    """
shell: """
    gtftk ologram_merge_runs --inputfiles `ls output/ologram_result_artificial_finesse/truth/*/*.tsv` -o {output} -V 3 --ori-shuffles 2000
    """
script:
    "scripts/depth_boxplot.py"
script:
    "scripts/beta_precision.py"
shell:"""
    mkdir -p output/ologram_result_scatacseq_pbmc/run_{wildcards.i}

    gtftk ologram -z -c hg19 -p {input} --more-bed {params.peaks} --no-date \
        -o output/ologram_result_scatacseq_pbmc/run_{wildcards.i} \
        --force-chrom-peak --force-chrom-more-bed \
        -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
        --more-bed-multiple-overlap --bed-incl {input}
    """
shell: """
    gtftk ologram_merge_runs --inputfiles `ls output/ologram_result_scatacseq_pbmc/*/*.tsv` -o {output} -V 3 --ori-shuffles 4
    """
shell: """
    mkdir -p output/ologram_result_scatacseq_pbmc/entropy_graph/

    # Run the Python script to produce the figures
    python scripts/combi_entropy_analysis.py

    # Signal we are done
    touch {output}
    """
shell: """
    ## Merging OLOGRAM results

    # Print the header, adding a supplementary column for the file name
    cat output/murine_{wildcards.kind}/*X*/*tsv | head -n 1 | awk 'BEGIN{{FS=OFS="\t"}}{{print $0,"query"}}' > output/murine_{wildcards.kind}/all_ologram_results.txt

    # Print the data without the header
    for i in `ls output/murine_{wildcards.kind}/*X*/*tsv`; do awk -v f=$i 'BEGIN{{FS=OFS="\t"}}{{print $0,f}}' $i ; done | grep -v nb_intersections_expectation_shuffled >> output/murine_{wildcards.kind}/all_ologram_results.txt

    ## Now call the R script, with the point of execution as argument
    Rscript scripts/murine_analysis.R "./output/murine_{wildcards.kind}/"
    """
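The merged file produced above is a tab-separated table whose last column ("query") holds the source file name added by the awk command. A minimal sketch of how it could be inspected is given below; it is not part of the workflow, and the concrete path (output/murine_result/) is an assumption standing in for the murine_{kind} wildcard.

# Minimal sketch (not part of the workflow): load the concatenated OLOGRAM results
# and rank combinations by the p-value column used elsewhere in the rules.
import pandas as pd

# Assumed path; the rule writes all_ologram_results.txt under output/murine_<kind>/
results = pd.read_csv("output/murine_result/all_ologram_results.txt", sep="\t")

# "summed_bp_overlaps_pvalue" is the statistic the ologram calls sort on (-j/-a flags);
# "query" is the file-name column appended during the merge.
top = results.sort_values("summed_bp_overlaps_pvalue").head(10)
print(top[["query", "summed_bp_overlaps_pvalue"]])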
shell: """
    gtftk ologram_modl_treeify -i {input} -o {output} -l {params.queryname}
    """
Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/qferre/ologram-modl_supp_mat
Name: ologram-modl_supp_mat
Version: 1
Downloaded: 0
Copyright: Public Domain
License: None
Keywords:
- Future updates