Source code for markovCluster

import re, copy, sys, os
from collections import defaultdict
import pickle
import logging

def runMarkovCluster(out_dir, ext_edges, base_model, coef):
    """
    Run the Markov clustering (MCL) pipeline and pickle the grouped extensions.

    The function builds the extended graph, writes it out and runs MCL on it,
    writes the baseline model edge file, groups the extracted interactions by
    cluster, and stores the grouped result as a pickle file named
    'grouped_ext' inside ``out_dir``.

    Parameters
    ----------
    out_dir : str
        Path of the directory that will include all output files
        (including intermediate and final results)
    ext_edges : set
        Holds the interactions in the reading output file (extracted events).
        Each interaction is in the form:
        (regulator element, regulated element, type of interaction (+/-))
    base_model : dict
        Dictionary that holds baseline model elements and corresponding
        regulator elements
    coef : int
        The inflation parameter of the Markov clustering algorithm

    Returns
    -------
    res : list
        Each item in this list is a grouped extension (i.e., this indivisible
        group is one candidate for model extension). It is also stored in the
        'grouped_ext' file.
    new_base_model : dict
        The baseline model elements are the keys of this dict and the values
        are the corresponding regulator elements, as returned by
        buildExtGraph() (no new nodes are introduced).
    """
    ext_model, new_base_model = buildExtGraph(ext_edges, base_model)
    clusteringAlgo(out_dir, ext_model, coef)
    ModelNetwork(out_dir, base_model)
    res = getGroupedExt(os.path.join(out_dir, 'markov_cluster'), ext_edges)

    # A context manager guarantees the pickle file is flushed and closed,
    # even if serialization raises.
    with open(os.path.join(out_dir, 'grouped_ext'), 'wb') as grouped_file:
        pickle.dump(res, grouped_file)

    return res, new_base_model
def buildExtGraph(ext_edges, base_model=None):
    """
    A utility function for runMarkovCluster(), this function constructs two
    graph models: one with the whole extension information (i.e., with both
    new edges and new nodes), another for the baseline model alone
    (i.e., without introducing new nodes or edges).

    Parameters
    ----------
    ext_edges : set
        Holds the interactions in the reading output file (extracted events).
        Each interaction is in the form:
        (regulator element, regulated element, type of interaction (+/-))
    base_model : dict, optional
        Dictionary that holds baseline model elements; each value must provide
        a 'regulators' entry with the regulator elements. Defaults to an
        empty model.

    Returns
    -------
    ext_model : dict
        This dict contains the elements of both the baseline model and the
        reading output file. Those elements are the keys and the values are
        the corresponding regulator elements.
    new_base_model : dict
        The baseline model elements are keys of this dict and the values are
        the corresponding regulator elements.
    """
    # None sentinel instead of a mutable default shared between calls.
    if base_model is None:
        base_model = {}

    new_base_model = {
        element: base_model[element]['regulators'] for element in base_model
    }

    # Deep copy so that adding extension edges never touches the baseline
    # regulator collections.
    ext_model = copy.deepcopy(new_base_model)
    for edge in ext_edges:
        regulator, regulated = edge[0], edge[1]
        ext_model.setdefault(regulated, set()).add(regulator)

    return ext_model, new_base_model
def clusteringAlgo(MCL_result_folder, ext_model, coef):
    """
    A utility function for runMarkovCluster(), this function is designed to
    run the Markov Clustering Algorithm (MCL) obtained at
    https://micans.org/mcl/, built on its latest stable release
    /mcl/src/mcl-14-137

    The extended model is first written to the file 'abc_model' (one
    "regulator target 1" line per edge, self-loops skipped) inside
    ``MCL_result_folder``; the ``mcl`` executable is then invoked on it.

    Parameters
    ----------
    MCL_result_folder : str
        Directory that will store the intermediate and final result files of
        the MCL algorithm. Inside the folder, the 'markov_cluster' file is
        the final clustering result, with each row in this file being a
        cluster.
    ext_model : dict
        This dict contains the elements of both the baseline model and the
        reading output file. Those elements are the keys and the values are
        the corresponding regulator elements.
    coef : int
        The inflation parameter of the Markov Clustering Algorithm (MCL)
    """
    # Imported locally: subprocess is not among this module's top-level imports.
    import subprocess

    # translate ext_model into the abc format supported by MCL, stored inside
    # 'abc_model' (an intermediate output)
    abc_model = os.path.join(MCL_result_folder, 'abc_model')
    with open(abc_model, 'w') as output_stream:
        for tgt in sorted(ext_model):
            for reg in sorted(ext_model[tgt]):
                if tgt == reg:
                    continue
                output_stream.write(reg + ' ' + tgt + ' 1\n')

    MCL_result_file = os.path.join(MCL_result_folder, 'markov_cluster')

    # An argument list needs no shell quoting, so paths containing spaces or
    # other shell metacharacters are passed to mcl verbatim.
    cmd = ['mcl', abc_model, '--abc', '-I', str(coef), '-o', MCL_result_file]
    logging.info('Running the following command through MCL algorithm:\n{}\n'.format(' '.join(cmd)))

    # Failures are reported through logging rather than raised, so a missing
    # or failing mcl executable does not abort the caller at this point.
    try:
        exit_code = subprocess.call(cmd)
    except OSError as err:
        logging.error('Could not launch MCL: {}'.format(err))
        return
    if exit_code != 0:
        logging.warning('MCL exited with non-zero status {}'.format(exit_code))
def ModelNetwork(out_dir, base_model):
    """
    A utility function for runMarkovCluster(), this function translates the
    baseline model to a file with edges of interactions (serves as an
    intermediate result file).

    The file is named 'abc_model_network' and holds one "regulator target"
    line per baseline edge; self-loops are skipped.

    Parameters
    ----------
    out_dir : str
        Path of the directory that will include the output file
    base_model : dict
        Dictionary that holds baseline model elements; each value must
        provide a 'regulators' entry with the regulator elements
    """
    new_base_model = {
        element: base_model[element]['regulators'] for element in base_model
    }

    abc_model = os.path.join(out_dir, 'abc_model_network')
    # The context manager closes the file even if a write fails.
    with open(abc_model, 'w') as output_stream:
        for tgt in sorted(new_base_model):
            for reg in sorted(new_base_model[tgt]):
                if tgt == reg:
                    continue
                output_stream.write(reg + ' ' + tgt + '\n')
def getGroupedExt(cluster_file, ext_edges):
    """
    A utility function for runMarkovCluster(), this function summarizes the
    clustering result file and interaction information to generate the list
    of candidate extensions.

    Each interaction is assigned to the lowest-numbered cluster (1-based row
    of ``cluster_file``) that contains either of its two elements;
    interactions with neither element in any cluster are dropped. The
    resulting groups are renumbered consecutively starting from 1.

    Parameters
    ----------
    cluster_file : str
        The path of the markov_cluster file (the result of MCL algorithm),
        each row in this file is a whitespace-separated list of elements
        that is classified as a cluster
    ext_edges : set
        Holds the interactions in the reading output file (extracted events).
        Each interaction is in the form:
        (regulator element, regulated element, type of interaction (+/-))

    Returns
    -------
    res : list
        Each item in this list is a grouped extension (i.e., this indivisible
        group is one candidate for model extension), in the form
        [group number, interaction, interaction, ...] with each interaction
        converted to a list.
    """
    # Map every element to the 1-based row (cluster) it appears in.
    get_group = {}
    with open(cluster_file) as f:
        for idx, line in enumerate(f, start=1):
            for ele in line.split():
                get_group[ele] = idx

    get_ext = defaultdict(list)
    for edge in ext_edges:
        # sys.maxsize stands for "not part of any cluster".
        group = min(get_group.get(edge[0], sys.maxsize),
                    get_group.get(edge[1], sys.maxsize))
        if group == sys.maxsize:
            continue
        get_ext[group].append(list(edge))

    # Renumber the non-empty clusters consecutively, in cluster order.
    return [
        [group_num] + get_ext[key]
        for group_num, key in enumerate(sorted(get_ext), start=1)
    ]