Using Snakemake for Parameterized Workflow Execution in Python
Disruption
This repository contains the code for: https://munjungkim.github.io/files/ic2s2_2023_03_02_disruptiveness.pdf
Snakemake
Since the current Python code requires many parameters, it is better to use the Snakemake tool. Once a container has been created, you can execute the following command in the workflow directory:
snakemake '{working_directory}/data/original/150_5_q_1_ep_5_bs_4096_embedding/distance.npy' -j
This command will locate the rule for generating the {working_directory}/data/original/150_5_q_1_ep_5_bs_4096_embedding/distance.npy file, which corresponds to the calculating_distance rule in our Snakefile.
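If you first want to preview the jobs this target will schedule without running anything, Snakemake's standard dry-run flag can be used with the same target path (shown here with the same placeholder as above):

snakemake -n '{working_directory}/data/original/150_5_q_1_ep_5_bs_4096_embedding/distance.npy'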
The calculating_distance rule takes the {working_directory}/data/original/150_5_q_1_ep_5_bs_4096_embedding/in.npy and {working_directory}/data/original/150_5_q_1_ep_5_bs_4096_embedding/out.npy files as inputs, and Snakemake will first execute the rule responsible for generating these files, named embedding_all_network in our Snakefile.
The embedding_all_network rule executes the following command:

python3 scripts/Embedding.py {input} {params.Dsize} {params.Window} {params.Device1} {params.Device2} {params.Name} {params.q} {params.epoch} {params.batch} {params.work_dir}

The parameters for this command are defined within the embedding_all_network rule as follows (the {params.work_dir} argument is the data directory passed to Embedding.py; see the shell directive in the Code Snippets section below):

params: Name = "{network_name}", Dsize = "{dim}", Window = "{win}", Device1 = "6", Device2 = "7", q = "{q}", epoch = "{ep}", batch = "{bs}"
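For orientation, the two rules could be wired together roughly as sketched below. The shell commands and the params line are the ones quoted above and in the Code Snippets section; the wildcard patterns, the input/output paths, and the work_dir and Device values are assumptions and may differ from the repository's actual Snakefile.

rule embedding_all_network:
    input:
        "{work_dir}/data/{network_name}/citation_net.npz"  # assumed location of the citation network
    output:
        invec = "{work_dir}/data/{network_name}/{dim}_{win}_q_{q}_ep_{ep}_bs_{bs}_embedding/in.npy",
        outvec = "{work_dir}/data/{network_name}/{dim}_{win}_q_{q}_ep_{ep}_bs_{bs}_embedding/out.npy"
    params:
        Name = "{network_name}", Dsize = "{dim}", Window = "{win}",
        Device1 = "6", Device2 = "7", q = "{q}", epoch = "{ep}", batch = "{bs}",
        work_dir = "{work_dir}/data"  # assumed: the data directory given to Embedding.py
    shell:
        'python3 scripts/Embedding.py {input} {params.Dsize} {params.Window} {params.Device1} {params.Device2} {params.Name} {params.q} {params.epoch} {params.batch} {params.work_dir}'

rule calculating_distance:
    input:
        invec = "{work_dir}/data/{network_name}/{dim}_{win}_q_{q}_ep_{ep}_bs_{bs}_embedding/in.npy",
        outvec = "{work_dir}/data/{network_name}/{dim}_{win}_q_{q}_ep_{ep}_bs_{bs}_embedding/out.npy",
        net = "{work_dir}/data/{network_name}/citation_net.npz"
    output:
        "{work_dir}/data/{network_name}/{dim}_{win}_q_{q}_ep_{ep}_bs_{bs}_embedding/distance.npy"
    params:
        Device = "cuda:0"  # assumed torch device string
    shell:
        'python3 scripts/Distance_disruption.py distance {input.invec} {input.outvec} {input.net} {params.Device}'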
Without Snakemake
Without Snakemake, you can run the two scripts manually as described below.
Embedding Calculation
To calculate the embedding vectors, you can use the following command line (the final argument is the data directory under which the results are saved; it is read as the tenth argument by Embedding.py, shown in the Code Snippets section):

python3 scripts/Embedding.py {path/to/citation_network_file} {embedding_dimension} {window_size} {device1} {device2} {citation_network_name} {q_value} {epoch_size} {batch_size} {data_directory}
For example, you can run the command as shown below:
python3 scripts/Embedding.py /data/original/citation_net.npz 200 5 6 7 original 1 5 1024 /data
Embedding.py will train the node2vec model and save the in- and out-vectors under {data_directory}/{citation_network_name}/{DIM}_{WIN}_q_{Q}_ep_{EPOCH}_bs_{BATCH}_embedding/. For instance, the command above will save the in- and out-vectors to /data/original/200_5_q_1_ep_5_bs_1024_embedding/in.npy and /data/original/200_5_q_1_ep_5_bs_1024_embedding/out.npy.
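As a quick sanity check (a minimal sketch, assuming the example paths above), the saved arrays can be loaded with NumPy; each should have one row per node and 200 columns, matching the embedding dimension passed on the command line:

import numpy as np

in_vec = np.load("/data/original/200_5_q_1_ep_5_bs_1024_embedding/in.npy")
out_vec = np.load("/data/original/200_5_q_1_ep_5_bs_1024_embedding/out.npy")

# Expect (n_nodes, 200) for both arrays.
print(in_vec.shape, out_vec.shape)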
Distance Calculation
Based on the embedding vectors calculated in the step above, you can execute the following command to calculate the distance between the in-vector and out-vector of each node.
python3 scripts/Distance_Disruption.py distance {path/to/invectors} {path/to/outvectors} {path/to/citation_network_file} {device name}
For example, you can run the command as shown below (the device name is a PyTorch device string such as cpu or cuda:0):

python3 scripts/Distance_Disruption.py distance /data/original/200_5_q_1_ep_5_bs_1024_embedding/in.npy /data/original/200_5_q_1_ep_5_bs_1024_embedding/out.npy /data/original/citation_net.npz cuda:0
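Conceptually, the distance stored for each node is the cosine distance between its in-vector and out-vector. A minimal NumPy equivalent of the batched PyTorch loop in the script (sketch only; the actual script runs the computation on the chosen device in 100 batches) is:

import numpy as np

def cosine_distance_per_node(in_vec, out_vec):
    # 1 - cos(in_i, out_i) for every node i
    numerator = (in_vec * out_vec).sum(axis=1)
    denominator = np.linalg.norm(in_vec, axis=1) * np.linalg.norm(out_vec, axis=1)
    return 1 - numerator / denominator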
Code Snippets
scripts/Distance_Disruption.py:

import scipy
import node2vecs
import torch
import numpy as np
import pickle
import os
import argparse
import configparser
import sys
import logging
import tqdm
import utils  # local helper module providing calc_disruption_index (needed by the 'disruption' branch below)

if __name__ == "__main__":
    MEASURE = sys.argv[1]        # 'distance' or 'disruption'
    EMBEDDING_IN = sys.argv[2]   # path to in.npy
    EMBEDDING_OUT = sys.argv[3]  # path to out.npy
    NETWORK = sys.argv[4]        # path to the citation network (.npz)
    DEVICE = sys.argv[5]         # torch device used for the distance computation

    if MEASURE == 'disruption':
        net = scipy.sparse.load_npz(NETWORK)
        di = utils.calc_disruption_index(net, batch_size=None)
        NET_FOLDER = os.path.abspath(os.path.join(NETWORK, os.pardir))
        SAVE_DIR = os.path.join(NET_FOLDER, 'disruption.npy')
        np.save(SAVE_DIR, di)

    elif MEASURE == 'distance':
        logging.basicConfig(filename='Distance.log', level=logging.INFO,
                            format='%(asctime)s %(message)s')
        # net = scipy.sparse.load_npz(NETWORK)
        in_vec = np.load(EMBEDDING_IN)
        out_vec = np.load(EMBEDDING_OUT)
        EMBEDDING_FOLDER = os.path.abspath(os.path.join(EMBEDDING_IN, os.pardir))
        in_vec_torch = torch.from_numpy(in_vec).to(DEVICE)
        out_vec_torch = torch.from_numpy(out_vec).to(DEVICE)

        n = len(out_vec_torch)
        distance = []
        batch_size = int(n / 100) + 1

        logging.info('Starting calculating the distances')
        for i in tqdm.tqdm(range(100)):
            X = in_vec_torch[i * batch_size: (i + 1) * batch_size]
            Y = out_vec_torch[i * batch_size: (i + 1) * batch_size]
            # cosine distance between each in-vector and its corresponding out-vector
            numerator = torch.diag(torch.matmul(X, torch.transpose(Y, 0, 1)))
            norms_X = torch.sqrt((X * X).sum(axis=1))
            norms_Y = torch.sqrt((Y * Y).sum(axis=1))
            denominator = norms_X * norms_Y
            cs = 1 - torch.divide(numerator, denominator)
            distance.append(cs.tolist())

        distance_lst = np.array([dis for sublist in distance for dis in sublist])
        logging.info('Saving the files.')
        SAVE_DIR = os.path.join(EMBEDDING_FOLDER, 'distance.npy')
        np.save(SAVE_DIR, distance_lst)
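The same script also has a 'disruption' branch (shown above) that computes the disruption index directly from the citation network via utils.calc_disruption_index and writes disruption.npy next to the network file. A hedged invocation sketch, since all five positional arguments are still read even though the embedding paths and device are not used by this branch:

python3 scripts/Distance_Disruption.py disruption unused_in unused_out /data/original/citation_net.npz cpu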
scripts/Embedding.py:

import scipy
import sys
# import utils
import node2vecs
import logging
import matplotlib.pyplot as plt
import torch
import numpy as np
import pickle
import os
import argparse
import pandas as pd
import torch.nn as nn

if __name__ == "__main__":
    logging.basicConfig(filename='Embedding.log', level=logging.INFO,
                        format='%(asctime)s %(message)s')

    # DATA_DIR = '/home/munjkim/SoS/Disruption/data'
    NET = sys.argv[1]         # citation network file (.npz)
    DIM = int(sys.argv[2])    # dimension of the embedding
    WIN = int(sys.argv[3])    # window size of the node2vec model
    DEV1 = sys.argv[4]        # device to use for in-vectors
    DEV2 = sys.argv[5]        # device to use for out-vectors
    NAME = sys.argv[6]        # name of the network (one of 'original', 'original/Restricted_{year}', 'random/random_i', 'destroyed/destroyed_i')
    Q = int(sys.argv[7])      # the value of q for the biased random walk
    EPOCH = int(sys.argv[8])  # the number of epochs
    BATCH = int(sys.argv[9])  # the size of batches
    DATA_DIR = sys.argv[10]   # data directory under which results are saved

    logging.info('Arg Parse Success.')
    logging.info(NET)

    net = scipy.sparse.load_npz(NET)
    sampler = node2vecs.RandomWalkSampler(net, walk_length=160)
    noise_sampler = node2vecs.utils.node_sampler.ConfigModelNodeSampler(ns_exponent=1.0)
    noise_sampler.fit(net)

    n_nodes = net.shape[0]
    dim = DIM
    logging.info("Dimension: " + str(dim))

    logging.info('gpu')
    os.environ["CUDA_VISIBLE_DEVICES"] = DEV1 + ',' + DEV2

    model = node2vecs.Word2Vec(vocab_size=n_nodes, embedding_size=dim, padding_idx=n_nodes)
    loss_func = node2vecs.Node2VecTripletLoss(n_neg=1)

    MODEL_FOLDER = f"{DIM}_{WIN}_q_{Q}_ep_{EPOCH}_bs_{BATCH}_embedding"
    SAVE_DIR = os.path.join(DATA_DIR, NAME, MODEL_FOLDER)

    dataset = node2vecs.TripletDataset(
        adjmat=net,
        window_length=WIN,
        num_walks=25,
        noise_sampler=noise_sampler,
        padding_id=n_nodes,
        buffer_size=1e4,
        context_window_type="right",  # "double" covers both sides of a focal node; "left" or "right" restricts the window to one side
        epochs=EPOCH,                 # number of epochs
        negative=1,                   # number of negative nodes per context
        p=1,                          # (inverse) weight for the probability of backtracking walks
        q=Q,                          # (inverse) weight for the probability of depth-first walks
        walk_length=160,              # length of walks
    )

    logging.info('Start training: Dim' + str(DIM) + '_Win' + str(WIN))
    node2vecs.train(
        model=model,
        dataset=dataset,
        loss_func=loss_func,
        batch_size=BATCH,
        learning_rate=1e-3,
        num_workers=15,
    )
    model.eval()
    logging.info('Finish training')

    in_vec = model.ivectors.weight.data.cpu().numpy()[:n_nodes, :]   # in-vectors
    out_vec = model.ovectors.weight.data.cpu().numpy()[:n_nodes, :]  # out-vectors

    SAVE_DIR_IN = os.path.join(SAVE_DIR, "in.npy")
    SAVE_DIR_OUT = os.path.join(SAVE_DIR, "out.npy")
    np.save(SAVE_DIR_IN, in_vec)
    np.save(SAVE_DIR_OUT, out_vec)
Snakefile, embedding_all_network rule (shell directive):

shell:
    'python3 scripts/Embedding.py {input} {params.Dsize} {params.Window} {params.Device1} {params.Device2} {params.Name} {params.q} {params.epoch} {params.batch} {params.work_dir}'
Snakefile, calculating_distance rule (shell directive):

shell:
    'python3 scripts/Distance_disruption.py distance {input.invec} {input.outvec} {input.net} {params.Device}'