Propanet with KCCA GRN
Advantages of KCCA-based GRN construction
- Complex relationships in which multiple transcription factors (TFs) regulate multiple target genes (TGs) are modeled (illustrated in the sketch below).
- Different combinations of co-working TFs are considered.
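The KCCA (kernel canonical correlation analysis) step embeds a block of TF expression profiles and a block of TG expression profiles into a shared canonical space, so that groups of co-working TFs can be matched to the TG modules they regulate instead of scoring single TF-TG pairs in isolation. The sketch below illustrates the idea using scikit-learn's standard CCA as a stand-in for the weighted kernel CCA implemented in src/2_GRN_inference.py; all data, sizes, and gene names are illustrative placeholders.

```python
# Minimal sketch (not the workflow's own implementation): standard CCA as a
# stand-in for the weighted kernel CCA in src/2_GRN_inference.py. It shows how
# a TF expression block and a TG expression block are projected into a shared
# space where TF-TG similarity can be scored. All data below are toy placeholders.
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
n_samples, n_tfs, n_tgs = 20, 5, 8   # e.g. time points x genes (toy sizes)
TF_exp = pd.DataFrame(rng.normal(size=(n_samples, n_tfs)),
                      columns=[f"TF{i}" for i in range(n_tfs)])
TG_exp = pd.DataFrame(rng.normal(size=(n_samples, n_tgs)),
                      columns=[f"TG{i}" for i in range(n_tgs)])

# Fit CCA between the two blocks and take per-gene loadings on the shared components.
cca = CCA(n_components=2).fit(TF_exp.values, TG_exp.values)
TF_embed = pd.DataFrame(cca.x_weights_, index=TF_exp.columns)
TG_embed = pd.DataFrame(cca.y_weights_, index=TG_exp.columns)

# Score candidate TF-TG pairs by similarity (dot product) of their embeddings,
# analogous to how the workflow ranks edges before applying an edge cutoff.
scores = TF_embed @ TG_embed.T
print(scores.round(2))
```

In the workflow itself, this embedding is computed separately for each TF-module/TG-module pair, the resulting edge weights are rescaled with MinMaxScaler, and only pairs above the -thr cutoff are kept.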
Code Snippets
```
shell:
    "python src/6_make_result.py {params.result_dir} --kegg {params.kegg}"
```
```
shell:
    "python src/5_extract_target_genes.py --nwk {input.nwk} --deg {input.DEGli} --tf {params.tf_li_file} \
    --tfrank {input.tfranktrim} --out {params.output_dir}/subnetwork.{wildcards.i}"
```
```
shell:
    "python src/4_main_propanet.py -TFliFile {params.tf_li_file} \
    -nwkFile {params.temp}/{params.prefix}.tp -expFile {params.exp_file} \
    -binFile {params.seed_file} -N {params.n_sample} -p {params.n_threads} \
    -cond {params.prefix} -outD {params.temp}"
```
```
shell:
    "python src/3_network_weight.py -nwk {input} -exp {params.exp_file} -o {output} -p {params.n_threads}"
```
```
run:
    import pickle as pkl

    with open(input[0], "rb") as f:
        data = pkl.load(f)

    with open(output[0], "w") as f:
        for key, val in data.items():
            for start, end in val:
                f.write(f"{start}\t{end}\n")
```
```
shell:
    "python src/2_GRN_inference.py {input.template_network} \
    {input.gene_exp_profile} {input.deg_profile} \
    {output} -TFli {input.tf_list} -TFmodule {input.dict_cluster_TF} \
    -TGmodule {input.dict_cluster_TG} -nComp {params.n_comp} -thr {params.edge_thr} \
    -reg {params.reg} -nThreads {params.nThreads} {params.norm}"
```
```
run:
    import pandas as pd
    import pickle

    df_TG_community_groupBy = pd.read_csv(input.tsv, sep='\t')
    with open(output[0], 'wb') as f:
        pickle.dump(df_TG_community_groupBy.groupby('community')['gene'].apply(list).to_dict(), f)
```
```
shell:
    "Rscript src/1_clustering.R {input.inst_nwk_TF} {output.cluster} {output.edgelist}"
```
```
shell:
    "python src/0_instantiate_nwk.py {input.template_nwk} \
    {input.gene_exp} -corrCut {params.corrCut} -o {output.inst_nwk} -nThreads {params.n_threads}"
```
```
run:
    import pandas as pd
    import numpy as np

    df_TF = pd.read_csv(input.TF_li, sep='\t')
    df_nwk = pd.read_csv(input.PPI, sep='\t')

    df_nwk_TF = df_nwk.loc[lambda x: np.logical_and(x.protein1.isin(df_TF.Name), x.protein2.isin(df_TF.Name))]
    df_nwk_TF.to_csv(output.PPI_TF, sep='\t', index=False)

    df_nwk_TG = df_nwk.loc[lambda x: np.logical_and(~x.protein1.isin(df_TF.Name), ~x.protein2.isin(df_TF.Name))]
    df_nwk_TG.to_csv(output.PPI_TG, sep='\t', index=False)
```
```
run:
    import pandas as pd

    df = pd.read_csv(input.deg_binary, sep="\t", index_col=0)
    for i in range(0, n_timepoints):
        df[df.columns[i]].to_csv(join(intermediate_results_dir, "/timepoints/deg_profile.tp") + str(i+1), sep="\t")
```
```
run:
    import pandas as pd

    df = pd.read_csv(input.exp_file, sep="\t", index_col=0)
    for i, sample in enumerate(range(params.n_replicates - 1, params.n_timepoints, params.n_replicates)):
```
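The snippets above appear in Snakefile order, with the final summarization rule first; when the workflow runs, the numbered scripts execute from step 0 (network instantiation) through step 6 (result summary). The stand-alone driver below is only a hypothetical illustration of that order: every path and argument value is a placeholder, Snakemake normally wires these from its config, and the step-4 call follows the option names in 4_main_propanet.py's own argument parser rather than the rule shown above (which uses different flag names).

```python
# Hypothetical driver mirroring the rule order above (steps 0 -> 6), outside Snakemake.
# All file names and argument values are placeholders, not the workflow's real config.
import subprocess

steps = [
    "python src/0_instantiate_nwk.py template.nwk exp.tsv -corrCut 0.5 -o inst.nwk -nThreads 4",
    "Rscript src/1_clustering.R inst_TF.nwk clusters.tsv cluster_edges.tsv",
    "python src/2_GRN_inference.py template.nwk exp.tsv deg.tsv grn.pkl "
    "-TFli tf_list.tsv -TFmodule tf_modules.pkl -TGmodule tg_modules.pkl "
    "-nComp 1 -thr 0.5 -reg 0.5 -nThreads 4",
    "python src/3_network_weight.py -nwk grn.edgelist -exp exp.tsv -o weighted.nwk -p 4",
    "python src/4_main_propanet.py --tf tf_list.txt --nwk weighted.nwk "
    "--lvl deg_profile.tp1 --deg deg_list.txt --out results/tp1",
    "python src/5_extract_target_genes.py --nwk results/tp1.nwk --deg deg_list.txt "
    "--tf tf_list.txt --tfrank results/tp1.tf.rank.trim --out results/subnetwork.1",
    "python src/6_make_result.py results --kegg data/kegg_genes.txt",
]

for cmd in steps:
    print(">>", cmd)                      # show the command being run
    subprocess.run(cmd, shell=True, check=True)
```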
```python
import pandas as pd
import argparse
import numpy as np
import time
import sys
from multiprocessing import Pool
from scipy.stats.stats import pearsonr


def isNum(x):
    try:
        float(x)
        return True
    except:
        return False


def corrCut(nwk, cutoff=None):
    '''correlation cutoff, positive sorting'''
    nwk.dropna(subset=['correlation'], inplace=True)
    nwk.sort_values(by=['correlation'], inplace=True, ascending=False)
    if cutoff != None:
        return nwk.loc[lambda x: x.correlation >= cutoff]
    else:
        return nwk


def setMinExp(nwk, exp, expCut):
    '''remove gene whose expression is lower than expCut'''
    filtered_gene = exp[exp.max(axis=1) > 1]['Hybridization REF']
    boolean_mask = np.logical_and(nwk['protein1'].isin(filtered_gene), nwk['protein2'].isin(filtered_gene))
    return nwk[boolean_mask]


def expCut(nwk, exp, sample_list, expCut):
    '''remove gene whose mean of group(mutated/not-mutated) expression is lower than expCut'''
    with open(sample_list) as f:
        mutSamples = f.read().strip().split()
    exp['no_mut'] = exp.loc[:, ~exp.columns.isin(mutSamples)].mean(axis=1)
    exp['mut'] = exp.loc[:, exp.columns.isin(mutSamples)].mean(axis=1)
    boolean_mask = np.logical_or(exp['no_mut'] >= 1, exp['mut'] >= 1)
    gene_selected = exp[boolean_mask]['Hybridization REF']
    boolean_mask2 = np.logical_and(nwk['protein1'].isin(gene_selected), nwk['protein2'].isin(gene_selected))
    return nwk[boolean_mask2]


def FCcut(nwk, FC_df, FCcut):
    keys = FC_df.iloc[:, 0]
    values = FC_df.iloc[:, 1]
    dictionary = dict(zip(keys, values))
    dictionary.pop('?', 'Not Found')
    first_col = np.array([dictionary[i] for i in nwk.iloc[:, 0]])
    second_col = np.array([dictionary[i] for i in nwk.iloc[:, 1]])
    boolean_mask = np.logical_and(abs(first_col) >= FCcut, abs(second_col) >= FCcut)
    boolean_mask2 = nwk['protein1'].apply(lambda x: dictionary[x]) * nwk['protein2'].apply(lambda x: dictionary[x]) > 0
    bigFC_nwk = nwk[boolean_mask & boolean_mask2]
    return bigFC_nwk


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='python 0_instantiate_nwk.py nwkFile exp -corrCut [] -nThreads [] -o []')
    parser.add_argument('nwk', help='network')
    parser.add_argument('exp', help='exp File')
    parser.add_argument('-corrCut', type=float, required=False, help='corealtion cutoff')
    parser.add_argument('-nThreads', type=int, required=False, default=1)
    parser.add_argument('-o', required=True, help='output')
    args = parser.parse_args()

    #### correaltion score combined with string score
    start = time.time()
    exp = pd.read_csv(args.exp, sep='\t', header=0, index_col=0)

    # remove duplicates
    data = []
    with open(args.nwk) as f:
        for line in f.readlines():
            tmp = line.strip().split('\t')
            data.append(sorted(tmp))
    df_nwk = pd.DataFrame(data[1:], columns=['Gene_A', 'Gene_B'], dtype=float)
    df_nwk.drop_duplicates(subset=['Gene_A', 'Gene_B'], inplace=True)
    df_nwk_filt = df_nwk.loc[lambda x: np.logical_and(x.Gene_A.isin(exp.index), x.Gene_B.isin(exp.index))].loc[lambda x: x.Gene_A != x.Gene_B]

    # make exp dictionary to calculate correlation
    lst_exps = dict()
    with open(args.exp) as f:
        lines = f.readlines()
        for line in lines:
            s = line.strip().split('\t')
            if not isNum(s[1]):
                continue
            else:
                gene, exps = s[0], list(map(float, s[1:]))
                lst_exps[gene] = exps

    lst_pairs2 = zip(df_nwk_filt['Gene_A'], df_nwk_filt['Gene_B'])

    def myCorr(x):
        g1, g2 = sorted(x)
        if np.all(np.array(lst_exps[g1]) == lst_exps[g1][0]) or np.all(np.array(lst_exps[g2]) == lst_exps[g2][0]):
            val, pval = (0, 1)
        else:
            val, pval = pearsonr(lst_exps[g1], lst_exps[g2])
        return (g1, g2, val)

    p = Pool(args.nThreads)
    res2 = p.imap_unordered(myCorr, lst_pairs2)
    p.close()
    p.join()

    corr_res2 = []
    for g1, g2, val in res2:
        if g1 == g2:
            continue
        corr_res2.append([g1, g2, val])

    df_nwk_corr = pd.DataFrame(corr_res2, columns=['Gene_A', 'Gene_B', 'correlation'])
    df_nwk_corrCut = corrCut(df_nwk_corr, args.corrCut)

    end = time.time()
    time_elapsed = end - start
    df_nwk_corrCut.to_csv(args.o, sep='\t', header=True, index=False)
    print(args.o, 'time_elapsed', time_elapsed)
```
```r
library(igraph)
library(dplyr)

args = commandArgs(trailingOnly=TRUE)
file_name = args[1]
out_file = args[2]
out_file2 = args[3]

## Data import and cleaning
print('START')
parsed_genes = read.csv(file_name, sep='\t', header=TRUE, stringsAsFactors = FALSE)

## Creating igraph object
edgelist = as.matrix(parsed_genes[,1:2])
g = graph.edgelist(edgelist, directed=FALSE)

## Community detection : Multilevel algorithm
mlc <- multilevel.community(g)

## Writing the results to a file
mlc_community_list <- as.data.frame(as.matrix(membership(mlc)))
mlc_community_list$gene <- rownames(mlc_community_list)
mlc_community_member <- arrange(mlc_community_list, V1) %>% select(gene, V1)
colnames(mlc_community_member)[2] <- 'community'

community_info = mlc_community_member

# Lookup Table
vals <- community_info[,2]
keys <- community_info[,1]
lookup <- setNames(vals, keys)

index = which(lookup[parsed_genes[,1]] != lookup[parsed_genes[,2]])
filtered_edgelist <- parsed_genes[-index,]

write.table(mlc_community_member, row.names=FALSE, col.names=TRUE, file=out_file, sep='\t', quote=FALSE)
write.table(filtered_edgelist, row.names=FALSE, col.names=TRUE, file=paste(out_file2, sep=''), sep='\t', quote=FALSE)
```
```python
import itertools
import pickle
import argparse
import operator
import pickle
from multiprocessing import Pool

import networkx as nx
import numpy as np
import pandas as pd
from scipy.linalg import norm, eigh
from scipy.stats import fisher_exact
from sklearn.preprocessing import MinMaxScaler

from utils import rcca


def weighted_kcca(data, reg=0.5, numCC=None, kernelcca=True, ktype='linear',
                  gausigma=1.0, degree=2):
    """Set up and solve the kernel CCA eigenproblem"""
    if kernelcca:
        kernel = [rcca._make_kernel(d, ktype=ktype, gausigma=gausigma, degree=degree) for d in data]
    else:
        kernel = [d.T for d in data]

    nDs = len(kernel)
    nFs = [k.shape[0] for k in kernel]
    numCC = min([k.shape[1] for k in kernel]) if numCC is None else numCC

    # Get the auto- and cross-covariance matrices
    crosscovs = [np.dot(ki, kj.T) for ki in kernel for kj in kernel]

    # Allocate left-hand side (LH) and right-hand side (RH):
    LH = np.zeros((sum(nFs), sum(nFs)))
    RH = np.zeros((sum(nFs), sum(nFs)))

    # Fill the left and right sides of the eigenvalue problem
    for i in range(nDs):
        RH[sum(nFs[:i]):sum(nFs[:i+1]), sum(nFs[:i]):sum(nFs[:i+1])] = (crosscovs[i * (nDs + 1)] + reg * np.eye(nFs[i]))
        for j in range(nDs):
            if i != j:
                LH[sum(nFs[:j]):sum(nFs[:j+1]), sum(nFs[:i]):sum(nFs[:i+1])] = crosscovs[nDs * j + i]

    LH = (LH + LH.T) / 2.
    RH = (RH + RH.T) / 2.

    maxCC = LH.shape[0]
    r, Vs = eigh(LH, RH, eigvals=(maxCC - numCC, maxCC - 1))
    r[np.isnan(r)] = 0
    rindex = np.argsort(r)[::-1]
    comp = []
    Vs = Vs[:, rindex]
    for i in range(nDs):
        comp.append(Vs[sum(nFs[:i]):sum(nFs[:i + 1]), :numCC])
    return np.sort(r)[::-1], comp


class _CCABase(rcca._CCABase):
    def __init__(self, numCV=None, reg=None, regs=None, numCC=None, numCCs=None,
                 kernelcca=True, ktype=None, verbose=False, select=0.2,
                 cutoff=1e-15, gausigma=1.0, degree=2):
        self.numCV = numCV
        self.reg = reg
        self.regs = regs
        self.numCC = numCC
        self.numCCs = numCCs
        self.kernelcca = kernelcca
        self.ktype = ktype
        self.cutoff = cutoff
        self.select = select
        self.gausigma = gausigma
        self.degree = degree
        if self.kernelcca and self.ktype == None:
            self.ktype = 'linear'
        self.verbose = verbose

    def train(self, data):
        nT = data[0].shape[0]
        if self.verbose:
            print('Training CCA, kernel = %s, regularization = %0.4f, '
                  '%d components' % (self.ktype, self.reg, self.numCC))

        eigenV, comps = weighted_kcca(data, self.reg, self.numCC,
                                      kernelcca=self.kernelcca, ktype=self.ktype,
                                      gausigma=self.gausigma, degree=self.degree)

        ### weighted sample weight, weighted by eigenvalue
        comps = [np.array([eigenV]*nT)*comps[i] for i in range(len(data))]

        self.cancorrs, self.ws, self.comps = rcca.recon(data, comps, kernelcca=self.kernelcca)
        if len(data) == 2:
            self.cancorrs = self.cancorrs[np.nonzero(self.cancorrs)]
        return self


class weighted_CCA(_CCABase):
    """Attributes:
        reg (float): regularization parameter. Default is 0.1.
        numCC (int): number of canonical dimensions to keep. Default is 10.
        kernelcca (bool): kernel or non-kernel CCA. Default is True.
        ktype (string): type of kernel used if kernelcca is True.
            Value can be 'linear' (default) or 'gaussian'.
        verbose (bool): default is True.
    Returns:
        ws (list): canonical weights
        comps (list): canonical components
        cancorrs (list): correlations of the canonical components on the training dataset
        corrs (list): correlations on the validation dataset
        preds (list): predictions on the validation dataset
        ev (list): explained variance for each canonical dimension
    """
    def __init__(self, reg=0.5, numCC=10, kernelcca=True, ktype=None,
                 verbose=False, cutoff=1e-15):
        super(weighted_CCA, self).__init__(reg=reg, numCC=numCC, kernelcca=kernelcca,
                                           ktype=ktype, verbose=verbose, cutoff=cutoff)

    def train(self, data):
        return super(weighted_CCA, self).train(data)


def kcca_embedding(TF_exp, TG_exp, normalize, n_comp=1, kernel='gaussian', reg=0.5):
    kcca = weighted_CCA(kernelcca=True, ktype=kernel, reg=reg, numCC=n_comp)
    kcca.train([TF_exp.to_numpy(), TG_exp.to_numpy()])
    #print('x_weight_dim', kcca.ws[0].shape)
    #print('y_weight_dim', kcca.ws[1].shape)

    TF_embed = pd.DataFrame(kcca.ws[0], index=TF_exp.columns)
    TG_embed = pd.DataFrame(kcca.ws[1], index=TG_exp.columns)

    if normalize == True:
        TF_embed_norm = TF_embed.apply(lambda x: x/norm(x), axis=1).fillna(0)  # L2 norm
        TG_embed_norm = TG_embed.apply(lambda x: x/norm(x), axis=1).fillna(0)  # L2 norm
        return (kcca, TF_embed_norm, TG_embed_norm)
    else:
        return (kcca, TF_embed, TG_embed)


def dotProduct(instance1, instance2):
    res = 0
    for x in range(len(instance1)):
        res += instance1[x] * instance2[x]
    return res


def getNeighbors(instance):
    '''Generate k-nearest neighbors of testInstance among trainingSet
    =================================================================
    '''
    test, idx1, trainingSet = instance
    testInstance = test.loc[idx1, :]
    li_distances = []
    length = len(testInstance)-1
    for i in range(len(trainingSet)):
        #dist = euclideanDistance(testInstance, trainingSet.iloc[i])
        #dist = correlation(testInstance, trainingSet.iloc[i])
        dist = dotProduct(testInstance, trainingSet.iloc[i])
        li_distances.append((trainingSet.index[i], dist))
    #sort list by the second item in each element from list
    #e.g.) li_distances = [(tg1,dist), (tg2,dist)]
    li_distances.sort(key=operator.itemgetter(1), reverse=True)
    dict_res = dict()
    dict_res[idx1] = li_distances
    #e.g.) dict_res = {tf1:[(tg1,dist),(tg2,dist),(tg3,dist)]}
    return dict_res


def inp_pair(df1, df2):
    li1 = set(df1.index)
    for tf in li1:
        yield (df1, tf, df2)


def TFTG_nwk(df_TF, df_TG):
    outs = {}
    for tf in set(df_TF.index):
        inst = (df_TF, tf, df_TG)
        outs.update(getNeighbors(inst))
    return outs


def inp_pair_modularized_TFTG_nwk(nx_graph, dict_tf_comm, dict_tg_comm, edge_cutoff, df_exp, n_comp, reg, normalize):
    for comID_tf, comID_tg in itertools.product(dict_tf_comm.keys(), dict_tg_comm.keys()):
        yield (nx_graph, dict_tf_comm, dict_tg_comm, comID_tf, comID_tg, edge_cutoff, df_exp, n_comp, reg, normalize)


def mergeDicts(iter_dicts):
    dict_all = dict()
    for each_dict in iter_dicts:
        dict_all.update(each_dict)
    return dict_all


def modularized_TFTG_nwk(instance):
    def all_tftg_candidates(nx_obj, tfs, tgs):
        tfs_common, tgs_common = set(tfs).intersection(nx_obj.nodes()), set(tgs).intersection(nx_obj.nodes())
        dict_cent_allnodes = nx.betweenness_centrality_subset(nx_obj, tfs_common, tgs_common)
        set_nodes = set(pd.DataFrame.from_dict(dict_cent_allnodes, orient='index', columns=['paths']).loc[lambda x: x.paths != 0].index.to_list())
        ##########
        nx_obj = nx_obj.subgraph(set_nodes.union(tfs).union(tgs))
        ###########
        dict_tftgs = nx.to_dict_of_lists(nx_obj)
        dict_tftgs_filtered = {}
        for key in dict_tftgs:
            if len(dict_tftgs[key]) == 0:
                continue
            else:
                dict_tftgs_filtered[key] = dict_tftgs[key]
        return nx_obj, dict_tftgs_filtered

    def addEdges(dict_pair):
        li_res = []
        for tf in dict_pair.keys():
            for tg, dist in dict_pair[tf]:
                li_res.append((tf, tg, dist))
        return li_res

    def filterEdges(dict_pair, cutoff):
        dict_res = {}
        for tf in dict_pair.keys():
            dict_res[tf] = [(dist) for tg, dist in dict_pair[tf] if dist > cutoff]
        return dict_res

    def list2dict(li_pair):
        dict_res = {}
        for tf, tg in li_pair:
            if tf in dict_res:
                dict_res[tf].append(tg)
            else:
                dict_res[tf] = [tg]
        return dict_res

    def rearrange_dict(dict1):
        dict2 = {}
        for i1 in dict1:
            for i2, i3 in dict1[i1]:
                dict2[(i1, i2)] = i3
        return dict2

    nx_graph, dict_tf_comm, dict_tg_comm, comID_tf, comID_tg, edge_cutoff, df_exp, n_comp, reg, normalize = instance[0], instance[1], instance[2], instance[3], instance[4], instance[5], instance[6], instance[7], instance[8], instance[9]

    res = {}
    # print('TF_cluster: {}, \t TG_cluster:{}'.format(comID_tf,comID_tg))
    nx_graph_tmp, dict_tftg_candidates_tmp = all_tftg_candidates(nx_graph, dict_tf_comm[comID_tf], dict_tg_comm[comID_tg])

    if len(nx_graph_tmp.edges) == 0:
        res[(comID_tf, comID_tg)] = []
        return res
    else:
        df_graph_tmp = pd.DataFrame(nx_graph_tmp.edges, columns=['TF', 'TG'])

        ### detecting valid TFTG pair with kCCA
        set_allNodes = set(nx_graph_tmp.nodes())
        set_outDegreeNodes = set(df_graph_tmp.loc[:, 'TF'].to_list())
        set_visited = set()

        TFs = set(dict_tf_comm[comID_tf]).intersection(set_outDegreeNodes)
        set_visited.update(TFs)

        TGs = []
        for TF in TFs:
            if TF in dict_tftg_candidates_tmp:
                TGs.extend(dict_tftg_candidates_tmp[TF])
        TGs = set(TGs).intersection(set_allNodes)

        if len(TGs) == 0:
            res[(comID_tf, comID_tg)] = []
            return res

        kcca, TF_embed, TG_embed = kcca_embedding(df_exp.loc[:, df_exp.columns.isin(TFs)], df_exp.loc[:, df_exp.columns.isin(TGs)], normalize=normalize, n_comp=n_comp, reg=reg)

        df_TFTG_pair_final = pd.DataFrame()
        dict_TFTG_pair = rearrange_dict(TFTG_nwk(TF_embed, TG_embed))

        if len(dict_TFTG_pair) == 0:
            res[(comID_tf, comID_tg)] = []
            return res
        else:
            df_TFTG_pair = pd.DataFrame.from_dict(dict_TFTG_pair, orient='index', columns=['weight_kCCA'])
            li_TFTG_pair = list(set(nx_graph_tmp.edges()).intersection(set(df_TFTG_pair.index.to_list())))  ######
            df_TFTG_pair = df_TFTG_pair.loc[li_TFTG_pair]
            df_TFTG_pair_final = df_TFTG_pair
            df_TFTG_pair_2nd = df_TFTG_pair

            while True:
                TFs_2nd = set([tg for tf, tg in df_TFTG_pair_2nd.index.to_list()]).intersection(set_outDegreeNodes)
                set_visited.update(TFs_2nd)
                TGs_2nd = set(sum([dict_tftg_candidates_tmp[TF] for TF in TFs_2nd if TF in dict_tftg_candidates_tmp.keys()], [])) - set_visited

                if len(TGs_2nd) == 0:
                    break
                else:
                    kcca_2nd, TF_embed_2nd, TG_embed_2nd = kcca_embedding(df_exp.loc[:, df_exp.columns.isin(TFs_2nd)], df_exp.loc[:, df_exp.columns.isin(TGs_2nd)], normalize=normalize, n_comp=n_comp, reg=reg)
                    dict_TFTG_pair_2nd = rearrange_dict(TFTG_nwk(TF_embed_2nd, TG_embed_2nd))
                    if len(dict_TFTG_pair_2nd) == 0:
                        break
                    else:
                        df_TFTG_pair_2nd = pd.DataFrame.from_dict(dict_TFTG_pair_2nd, orient='index', columns=['weight_kCCA'])
                        li_TFTG_pair_2nd = list(set(nx_graph_tmp.edges()).intersection(set(df_TFTG_pair_2nd.index.to_list())))
                        if len(li_TFTG_pair_2nd) == 0:
                            break
                        df_TFTG_pair_2nd = df_TFTG_pair_2nd.loc[li_TFTG_pair_2nd]
                        df_TFTG_pair_final = pd.concat([df_TFTG_pair_final, df_TFTG_pair_2nd], axis=0)
                        li_TFTG_pair += li_TFTG_pair_2nd

            #####################################################################################################################################
            if df_TFTG_pair_final.shape[0] != 0:
                scaler = MinMaxScaler()
                df_TFTG_pair_final['weight_kCCA_rescaled'] = scaler.fit_transform(df_TFTG_pair_final['weight_kCCA'].values.reshape(-1, 1))
                ####### edge_cutoff with rescaled edge weight
                li_TFTG_pair_final = df_TFTG_pair_final.loc[lambda x: x['weight_kCCA_rescaled'] > edge_cutoff].index.to_list()
                res[(comID_tf, comID_tg)] = li_TFTG_pair_final
            #####################################################################################################################################
            else:
                res[(comID_tf, comID_tg)] = []

            with open('./report.pkl', 'wb') as f:
                pickle.dump(res, f)

            return res


def all_tftg_candidates(instance):
    nx_obj, dict_tf_comm, dict_tg_comm, tf_id, tg_id = instance[0], instance[1], instance[2], instance[3], instance[4]
    tfs, tgs = dict_tf_comm[tf_id], dict_tg_comm[tg_id]
    tfs_common, tgs_common = set(tfs).intersection(nx_obj.nodes()), set(tgs).intersection(nx_obj.nodes())
    dict_cent_allnodes = nx.betweenness_centrality_subset(nx_obj, tfs_common, tgs_common)
    set_nodes = set(pd.DataFrame.from_dict(dict_cent_allnodes, orient='index', columns=['paths']).loc[lambda x: x.paths != 0].index.to_list())
    nx_obj = nx_obj.subgraph(set_nodes.union(tfs).union(tgs))
    ###########
    dict_res = {}
    li_li_edges = pd.DataFrame(nx_obj.edges).values.tolist()
    set_tup_edges = [(i1, i2) for i1, i2 in li_li_edges]
    dict_res[(tf_id, tg_id)] = set_tup_edges
    return dict_res


def inp_pair_all_tftg_candidates(nx_GRN, dict_cluster_1, dict_cluster_2):
    for tf_id, tg_id in itertools.product(dict_cluster_1.keys(), dict_cluster_2.keys()):
        yield (nx_GRN, dict_cluster_1, dict_cluster_2, tf_id, tg_id)


def edgelist2nodeset(dict_edges):
    dict_nodeSet = {}
    for key in dict_edges:
        dict_nodeSet[key] = set(np.array(dict_edges[key]).flatten())
    return dict_nodeSet


def fisher_exact_test(dict_nodeSet, set_ref, nodeSet):
    dict_jaccard = {}
    set_ref = set_ref.intersection(nodeSet)
    for key in dict_nodeSet:
        if len(set(dict_nodeSet[key])) == 0:
            continue
        numDEGs = len(set(dict_nodeSet[key]).intersection(set_ref))
        coeff, pval = fisher_exact([[numDEGs, len(set_ref)-numDEGs], [len(dict_nodeSet[key])-numDEGs, len(nodeSet)-len(set_ref)+numDEGs]])
        dict_jaccard[key] = pval
    return dict_jaccard


def GRN_inference(nx_GRN, df_exp, li_deg, tfModule, tgModule, nComp, thr, reg, normalize, nThreads=20):
    df_exp_filt = df_exp.loc[:, set(df_exp.columns.to_list()).intersection(set(li_deg))]

    set_nodes_tmp = set()
    for node in df_exp_filt.columns.to_list():
        if node in nx_GRN.nodes:
            set_nodes_tmp = set_nodes_tmp.union(node)
            set_nodes_tmp = set_nodes_tmp.union(set(nx_GRN.neighbors(node)))

    numDEGs = len(set_nodes_tmp.intersection(set(li_deg)))
    numTotal = len(set_nodes_tmp)

    nx_GRN_tmp = nx_GRN.subgraph(set_nodes_tmp).copy()

    tf_filt_keys = [key for key in dict_tf_community if len(set(nx_GRN_tmp.nodes()).intersection(set(dict_tf_community[key]))) > 1]
    tg_filt_keys = [key for key in dict_tg_community if len(set(nx_GRN_tmp.nodes()).intersection(set(dict_tg_community[key]))) > 1]

    dict_tf_community_filt = {}
    dict_tg_community_filt = {}
    for key in tf_filt_keys:
        dict_tf_community_filt[key] = dict_tf_community[key]
    for key in tg_filt_keys:
        dict_tg_community_filt[key] = dict_tg_community[key]

    inps = inp_pair_modularized_TFTG_nwk(nx_GRN_tmp, dict_tf_community_filt, dict_tg_community_filt, thr, df_exp, nComp, reg, normalize)

    with Pool(nThreads) as p:
        outs = p.map(modularized_TFTG_nwk, inps)

    return mergeDicts(outs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='python 2_GRN_inference.py nwk_GRN exp deg out -TFli [TF cataolog] -TFmodule [] -TGmodule [] -nComp [# components] -thr [thr] -nThreads [] -normalize []')
    parser.add_argument('nwk_GRN', help="GRN to be utilized as a guide TF-TG relation")
    parser.add_argument('exp', help='exp profile of genes')
    parser.add_argument('deg', help='deg list of genes')
    parser.add_argument('out', help='out file name')
    parser.add_argument('-TFli', required=True, help="TF catalog")
    parser.add_argument('-TFmodule', required=True)
    parser.add_argument('-TGmodule', required=True)
    parser.add_argument('-nComp', required=True, type=int, help='# components')
    parser.add_argument('-thr', required=True, type=float, help='')
    parser.add_argument('-reg', type=float)
    parser.add_argument('-nThreads', required=False, type=int)
    parser.add_argument('-normalize', required=False, default=True, action='store_true')
    args = parser.parse_args()

    df_tfs = pd.read_csv(args.TFli, sep='\t')
    li_TFs = df_tfs.iloc[:, 0].to_list()

    with open(args.TFmodule, 'rb') as f:
        dict_tf_community = pickle.load(f)
    with open(args.TGmodule, 'rb') as f:
        dict_tg_community = pickle.load(f)

    GRN = pd.read_csv(args.nwk_GRN, sep='\t')
    GRN = nx.from_pandas_edgelist(GRN, 'TF', 'TG', create_using=nx.DiGraph())

    exp = pd.read_csv(args.exp, sep='\t', index_col=0).T
    deg = pd.read_csv(args.deg, sep='\t', names=['gene']).iloc[:, 0].to_list()

    dict_res = GRN_inference(GRN, exp, deg, dict_tf_community, dict_tg_community, args.nComp, args.thr, args.reg, args.normalize, args.nThreads)

    with open(args.out, 'wb') as f:
        pickle.dump(dict_res, f)
```
```python
import argparse
import sys
import os

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr
from multiprocessing import Pool, Manager


def calculate_corr(x):
    g1, g2 = x

    try:
        exp1, exp2 = gene_exp.loc[g1, :], gene_exp.loc[g2, :]
    except KeyError:
        print(g1, g2)

    corr, _ = corr_function(exp1, exp2)
    q.put((g1, g2, corr))

    return


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='python %(prog)s -nwk nwkFile -exp expFile -o out')
    parser.add_argument('-nwk', required=True, help='Network file')
    parser.add_argument('-exp', required=True, help='gene expression File')
    parser.add_argument('-o', required=True, help="output file")
    parser.add_argument('-p', required=True, help="number of threads")
    args = parser.parse_args()

    with open(args.nwk) as f:
        edge_list = list(map(lambda x: x.strip().split(), f.readlines()))

    m = Manager()
    q = m.Queue()

    gene_exp = pd.read_csv(args.exp, sep="\t", index_col=0)
    corr_function = pearsonr

    print("calculate correlation coefficent...")
    print(len(edge_list))

    with Pool(processes=int(args.p)) as pool:
        with tqdm(total=len(edge_list)) as pbar:
            for _ in pool.imap_unordered(calculate_corr, edge_list):
                pbar.update()

    with open(args.o, 'w') as fw:
        while not q.empty():
            start, end, corr = q.get()
            fw.write('\t'.join([start, end, str(corr)])+'\n')

    print('end...')
```
```python
import pandas as pd
import numpy as np
import os
import argparse
import networkx as nx
import time
from tqdm import tqdm

import utils.propagation as NP
import utils.target_gene as TG


def infByNode(TF, g, TFset):
    visited = set()
    profit = 0
    stack = []
    stack.append(TF)
    while len(stack) > 0:
        v = stack.pop()
        if v not in visited:
            visited.add(v)
            if v not in TFset:
                profit += abs(g.nodes[v]['weight'])
            for s in g.successors(v):
                if s not in visited:
                    stack.append(s)
    visitedTGs = visited - TFset
    return profit, len(visited)-1, len(visitedTGs)


def IM(nwk, TFset, repeat):
    '''
    Influence Maximization
    ==============================
    Input
        nwk (nx.object)
        TFset (set)
        repeat (int)
    Output
        infNo (dict): reachability for each TF
        TFRank (list): sorted TF list
    '''
    infNo = {}
    for n in TFset:
        infNo[n] = 0.0
    for _ in tqdm(range(repeat)):
        # Produce G'
        g = nwk.copy()
        #for (a,b) in network.edges() :
        #    if np.random.randint(1,1000)>abs(g[a][b]['weight']*1000) :
        #        g.remove_edge(a, b)
        # Calculate influcence (profit) for TF
        for TF in TFset:
            profit, lenInf, _ = infByNode(TF, g, TFset)
            #print TF,profit,lenInf,lenInfTG
            if lenInf > 0 and not np.isnan(profit/float(lenInf)):
                infNo[TF] += profit/float(lenInf)
    for n in TFset:
        infNo[n] = infNo[n]/float(repeat)
    TFRank = sorted(infNo.keys(), key=lambda x: infNo[x], reverse=True)
    for key in infNo.keys():
        if (infNo[key] == 0.0):
            TFRank.remove(key)
    return TFRank, infNo


def TF_adding_NP(DEGli, geneSet, TFli, TFrankFile, DEGnetFile, seed, coverNo=200, coverage=None):
    '''
    Trim TF list with improving correlation using Network propagation
    =================================================================
    Input
        DEGli (li)
        geneSet (set)
        TFli (li)
        TFrankFile (str)
        DEGnetFile (str)
        seed (pd.DataFrame)
        coverNo (int)
        coverage (float)
    Output
        spearmanLi (li)
        cover (li)
        TF_trimmed (li)
        lst_node_weight (li)
    '''
    DEGnet = nx.read_edgelist(DEGnetFile, data=(('weight', float),), create_using=nx.DiGraph())
    TFset = set(TFli)
    nodeCnt = len(set(DEGnet.nodes())-TFset)
    TFli_rank = pd.read_csv(TFrankFile, sep='\t', header=None)[0].tolist()
    corr = []
    cover = []
    TF_trimmed = []
    TG_set = set()
    prevCor = 0
    currTol = 0
    spearmanLi = []
    for iTF, TF in tqdm(enumerate(TFli_rank)):
        #seedFile: only selected TF( TFli[:iTF] ) is marked, otherwise 0
        TF_trimmed.append(TFli_rank[iTF])
        seed2weight = seed.loc[TF_trimmed, :].T.to_dict('records')[0]
        wk = NP.Walker(DEGnetFile, absWeight=True)
        spearman, lst_node_weight = wk.run_exp(seed2weight, TFset, 0.1, normalize=False)
        spearmanLi.append(spearman)
        corTmp = 0
        corTmp = spearman[0]
        corr.append(corTmp)
        #TG_set |= TG.Target_genes(TF,DEGnet,DEGli,TFset)
        edges, TGalltmp, TGtmp = TG.Target_genes(TF, DEGnet, DEGli, TFset, geneSet)
        if TG_set == (TG_set | TGtmp):
            TF_trimmed.pop()
            continue
        TG_set |= TGtmp
        c = float(len(TG_set))/nodeCnt
        cover.append(c)
        if prevCor > corTmp:
            currTol += 1
        else:
            currTol = 0
        if coverage != None and (cover[-1] > coverage or len(TG_set) > coverNo):
            TF_trimmed.pop()
            break
    seed2weight = seed.loc[TF_trimmed, :].T.to_dict('records')[0]
    _, lst_node_weight = wk.run_exp(seed2weight, TFset, 0.1, normalize=False)
    return spearmanLi, cover, TF_trimmed, lst_node_weight


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--tf', help='TF list File')
    parser.add_argument('--nwk', help='Network file')
    parser.add_argument('--lvl', help='DE level of genes')
    parser.add_argument('--deg', help='deg list file')
    parser.add_argument('--out', help='project (output directory) name')
    parser.add_argument('--imround', help="influence maximization round", default=1000)
    parser.add_argument('-p', default='10', type=int, help='# process for multiprocessing')
    parser.add_argument('-c', default='0.5', type=float, help='coverage threshold', choices=range(0, 1))
    parser.add_argument('-coverNo', default='200', type=float)
    args = parser.parse_args()

    if not os.path.exists(os.path.dirname(args.out)):
        os.makedirs(os.path.dirname(args.out))

    with open(args.tf) as f:
        tfli = f.read().strip().split()
    with open(args.deg) as f:
        degli = set(f.read().strip().split())

    #networkFile --> networkx object, expFile --> pd.DataFrame
    print('Number of transcription factors', len(set(tfli)))
    print('Number of differenitial expressed genes', len(set(degli)))

    network = nx.read_edgelist(args.nwk, data=(('weight', float),), create_using=nx.DiGraph())

    #IM, NP for each timepoint
    print(network)

    #step0: parsing exp data, network for each timepoint
    start = time.time()
    weight_tp = pd.read_csv(args.lvl, sep="\t")
    DEG_tp = weight_tp.where(weight_tp["gene"].isin(set(degli)), other=0)
    DEGliFile = args.deg

    ##Make nwk with DEGs in specific timepoint
    network = network.subgraph(set(degli)|set(tfli))
    DEGnetFile = args.out + ".nwk"
    nx.write_edgelist(network, DEGnetFile, data=['weight'], delimiter='\t')

    ##Assign new weight to timepoint-specific network
    weight_DEG_nwk = weight_tp[weight_tp["gene"].isin(network.nodes())].set_index('gene')
    weight_dict = weight_DEG_nwk.T.to_dict('records')[0]
    nx.set_node_attributes(network, weight_dict, 'weight')

    #step1: Influence maximization
    start = time.time()
    ##preprocess
    nodes = pd.Series(list(set([x for x in network.nodes()])))
    TFset = set(nodes[nodes.isin(tfli)].tolist())
    ##influence maximization
    TFrank, infNo = IM(network, TFset, int(args.imround))
    ##Output into file
    TFrankFile = args.out + '.tf.rank'
    with open(TFrankFile, 'w') as TFrankF:
        for TF in TFrank:
            TFrankF.write(TF+'\n')  #IM result--> cold.D2.TF_rank.1
    print('TF ranking done', time.strftime("%H:%M:%S", time.gmtime(time.time()-start)), '\n')

    #step2: NP based on TF rank
    start = time.time()
    ##TF adding NP
    seed = weight_tp.set_index('gene')
    spearmanLi, coverage, TF_trimmed, lst_node_weight = TF_adding_NP(degli, degli, tfli, TFrankFile, DEGnetFile, seed, args.coverNo, coverage=args.c)

    ##Save Output as file
    TFtrimF = TFrankFile + '.trim'
    with open(TFtrimF, 'w') as f:
        f.write('\n'.join(TF_trimmed))

    print(DEGliFile, TFrankFile, TFtrimF)
    print("main process end")
```
```python
import os
import argparse

import networkx as nx
import numpy as np

from utils.target_gene import Target_genes


def makeGeneSet(DEGFile, geneSetF):
    ##-control
    DEGSet = set(np.genfromtxt(DEGFile, dtype=np.str))
    if geneSetF == None:
        geneSet = DEGSet
    else:
        geneSet = set(np.genfromtxt(geneSetF, dtype=np.str))
    return set(DEGSet) & geneSet


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--nwk', help='network file')
    parser.add_argument('--deg')
    parser.add_argument('--tf')
    parser.add_argument('--tfrank')
    parser.add_argument('--geneSetFile')
    parser.add_argument('--out')
    args = parser.parse_args()

    if not os.path.exists(os.path.dirname(args.out)):
        os.mkdir(os.path.dirname(args.out))

    geneSet = makeGeneSet(args.deg, args.geneSetFile)
    network = nx.read_edgelist(args.nwk, data=(('weight', float),), create_using=nx.DiGraph())
    TFSet = set(np.genfromtxt(args.tf, dtype=np.str, delimiter='\t'))

    with open(args.deg) as f1, open(args.tf) as f2:
        DEGli = f1.read().strip().split()
        TFli = f2.read().strip().split()

    edgeSet = set()
    for idx, TF in enumerate(TFli):
        edges, TGenes, TGs = Target_genes(TF, network, DEGli, TFSet, geneSet)
        edgeSet |= edges

    with open(args.out, 'w') as f4:
        for (a, b, c) in edgeSet:
            f4.write(a+'\t'+b+'\t'+str(c)+'\n')
```
```python
import pandas as pd
import argparse
import sys
import os


def findDETG(resPath, cond):
    # cond =='cold'
    bins = pd.read_csv('exp/exp.'+cond+'.DEG.binary', sep='\t')
    for f in os.listdir(resPath):
        if f[-4:] != 'trim':
            continue
        tp = f.split('.')[5]
        TFSet = set(open(resPath+'/'+f, 'r').read().strip().split()[2::2])
        DEGSet = set(bins['gene'][bins.iloc[:, 44+int(tp)] != 0])
        print(tp, len(DEGSet & TFSet), DEGSet & TFSet)
    return


def makeDict(keggF):
    symDict = {}
    descDict = {}
    for line in open(keggF, 'r').readlines():
        tp = line.split('\t')
        kid = tp[0].split(':')[1]
        sd = tp[1].split(';')
        sym = '' if len(sd) == 1 else sd[0].split(', ')[0]
        desc = sd[0] if len(sd) == 1 else sd[1]
        symDict[kid] = sym.rstrip()
        descDict[kid] = desc.rstrip()
    return symDict, descDict


def main_TG():
    parser = argparse.ArgumentParser(usage='python %(prog)s resPath')
    parser.add_argument('resPath')
    parser.add_argument('--kegg', required=True)
    args = parser.parse_args()
    symDict, descDict = makeDict(args.kegg)
    outF = open(args.resPath+'/TGdesc.txt', 'w')
    for f in os.listdir(args.resPath):
        if os.path.isdir(args.resPath+'/'+f) or f[-2:] != 'TG':
            continue
        tp = f.split('.')[2].upper()
        TF = f.split('.')[4]
        for line in open(args.resPath+'/'+f, 'r').readlines():
            TG = line.rstrip()
            TFsym = '' if TF not in symDict else symDict[TF]
            TFdesc = '' if TF not in descDict else descDict[TF]
            if TG not in symDict:
                outF.write('{}\t{}\t{}\t{}\t{}\t\t\n'.format(tp, TF, TFsym, TFdesc, TG))
            else:
                outF.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(tp, TF, TFsym, TFdesc, TG, symDict[TG], descDict[TG]))


def main():
    parser = argparse.ArgumentParser(usage='python %(prog)s resPath')
    parser.add_argument('resPath')
    args = parser.parse_args()
    symDict, descDict = makeDict('data/kegg_ath_gene.txt')
    outF = open(args.resPath+'/timeseries_network.txt', 'w')
    for f in os.listdir(args.resPath+'/TG'):
        if os.path.isdir(args.resPath+'/TG/'+f) or f[-2:] == 'TG' or f[-3:] == 'txt':
            continue
        print(f)
        tp = 'T'+f.split('.')[1]
        for line in open(args.resPath+'/TG/'+f, 'r').readlines():
            temp = line.split('\t')
            source = temp[0]
            target = temp[1]
            TFsym = '' if source not in symDict else symDict[source]
            TFdesc = '' if source not in descDict else descDict[source]
            if target not in symDict:
                outF.write('{}\t{}\t{}\t{}\t{}\t\t\n'.format(tp, source, TFsym, TFdesc, target))
            else:
                outF.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(tp, source, TFsym, TFdesc, target, symDict[target], descDict[target]))


if __name__ == '__main__':
    main()
```
Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/Junhyeong02/propanet
Name: propanet
Version: 1
Downloaded: 0
Copyright: Public Domain
License: None
Keywords: None