"""
@author: Yasmine
"""
import pandas as pd
import re
import os
import argparse
import openpyxl
import networkx as nx
import pickle
import time
import logging
from markovCluster import runMarkovCluster
# define regex for valid characters in variable names
_VALID_CHARS = r'a-zA-Z0-9\@\_\/'
# This function and the following function are inherited from DySE framework
[docs]def get_model(model_file):
"""
This function reads the baseline model of BioRECIPES format and returns two useful dictionaries
Parameters
----------
model_file : str
The path of the baseline model file
Returns
-------
model_dict : dict
Dictionary that holds critical information of each baseline model element
regulators : dict
Contains baseline model elements and corresponding regulator elements
"""
global _VALID_CHARS
regulators = dict()
model_dict = dict()
# Load the input file containing elements and regulators
df_model = pd.read_excel(model_file, na_values='NaN', keep_default_na = False)
# check model format
if df_model.columns[0].lower() == 'element attributes':
df_model = df_model.reset_index()
df_model = df_model.rename(columns=df_model.iloc[1]).drop([0,1]).set_index('#')
input_col_name = [x.strip() for x in df_model.columns if ('element name' in x.lower())]
input_col_ids = [x.strip() for x in df_model.columns if ('element ids' in x.lower())]
input_col_type = [x.strip() for x in df_model.columns if ('element type' in x.lower())]
input_col_X = [x.strip() for x in df_model.columns if ('variable' in x.lower())]
input_col_A = [x.strip() for x in df_model.columns if ('positive regulation rule' in x.lower())]
input_col_I = [x.strip() for x in df_model.columns if ('negative regulation rule' in x.lower())]
# set index to variable name column
# remove empty variable names
# append cols with the sets of regulators using .apply
for curr_row in df_model.index:
element_name = df_model.loc[curr_row,input_col_name[-1]].strip()
ids = str(df_model.loc[curr_row,input_col_ids[0]]).strip().upper().split(',')
#print(ids)
element_type = df_model.loc[curr_row,input_col_type[0]].strip()
var_name = df_model.loc[curr_row,input_col_X[0]].strip()
pos_regulators = df_model.loc[curr_row,input_col_A[0]].strip()
neg_regulators = df_model.loc[curr_row,input_col_I[0]].strip()
if var_name == '':
continue
curr = []
if pos_regulators != '':
curr += re.findall('['+_VALID_CHARS+']+',pos_regulators)
if neg_regulators != '':
curr += re.findall('['+_VALID_CHARS+']+',neg_regulators)
# returning regulators separately for compatibility with runMarkovCluster
regulators[var_name] = set(curr)
model_dict[var_name] = {
'name' : element_name,
'ids' : ids,
'type' : element_type,
'regulators' : set(curr)}
return model_dict, regulators
[docs]def getVariableName(model_dict, curr_map, ext_element_info):
"""
A utility function for parseExtension(), which matches the element name from the extracted event to an element in the baseline model
Parameters
----------
model_dict : dict
Dictionary that holds critical information of each baseline model element
curr_map: dict
Temporary dictionary that contains already matched pairs
ext_element_info: list
List of information for certain element in the extracted event, starting with element name
Returns
-------
match : str
The most likely matched element name in model_dict, to the element represented by ext_element_info; Otherwise, return the extended element name suffix by "_ext"
"""
global _VALID_CHARS
ext_element_name = ext_element_info[0]
#FIXME: hard-coded here and lines 123/124,
#element name is the first in the tuple, followed by element_type, then five columns later element_id
# Check for valid element name
if ext_element_name=='':
#logging.warn('Missing element name in extensions')
return ''
elif re.search('[^'+_VALID_CHARS+']+',ext_element_name):
#logging.warn(('Skipping due to invalid characters in variable name: %s') % str(ext_element_name))
return ''
ext_element_id = ext_element_info[5]
ext_element_type = ext_element_info[1]
if ext_element_name in curr_map:
return curr_map[ext_element_name]
# from the location and type
match = ext_element_name + '_ext'
confidence = 0.0
# Iterate all names in the dictionary and find the most likely match
for key,value in model_dict.items():
curr_conf = 0.0
if ext_element_id.upper() in value['ids']:
curr_conf = 1
elif ext_element_name.upper().startswith(value['name'].upper()) \
or value['name'].upper().startswith(ext_element_name.upper()):
curr_conf = 0.8
if curr_conf>0 and value['type'].lower().startswith(ext_element_type):
curr_conf += 1
if curr_conf > confidence:
match = key
confidence = curr_conf
if curr_conf==2: break
curr_map[ext_element_name] = match
return match
[docs]def parseExtension(model_dict, ext_file):
"""
This function parses the interactions within the reading output file (extracted events) into a set object, each component in the set has compatible format for BioRECIPES.
Parameters
----------
model_dict : dict
Dictionary that holds critical information of each baseline model element
ext_file : str
The path of the reading output file (extracted events)
Returns
-------
ext_edges : set
Each interaction within the reading output file will have the form:
(regulator element, regulated element, type of interaction (+/-))
"""
regulator_col = 0
regulated_col = 8
interaction_sign_col = 16
#FIXME: hard coded here, the 1st, 9th, 17th columns of ReadingOutput file
#have to be 'Regulator Name', 'Regulated Name' and 'Sign'
ext_edges, curr_map = set(), dict()
with open(ext_file) as f:
for line in f:
if line.startswith('Regulator Name'): continue
line = line.strip()
s = re.split(',',line)
name1, name2 = getVariableName(model_dict,curr_map,s[regulator_col:regulated_col]), getVariableName(model_dict,curr_map,s[regulated_col:interaction_sign_col])
if name1=="" or name2=="": continue
pos = '+' if s[interaction_sign_col]=='positive' else '-'
if (name2 in model_dict and name1 in model_dict[name2]):
continue
ext_edges.add((name1, name2, pos))
return ext_edges
[docs]def merge_clusters(regulators, path, ReturnTh):
"""
This function records indices of clusters to be merged based on the existence of return paths.
It generates the grouped_ext_Merged pickle file that contains the merged clusters.
Parameters
----------
regulators : dict
Contains baseline model elements and corresponding regulator elements
path : str
The path of the directory that contains the grouped_ext file
ReturnTh : int
A user-defined integer threshold for the number of return paths, beyond which clusters will be merged
"""
# Merge clusters if there is one or more return paths
G = nx.DiGraph()
G = makeDiGraphBase(regulators)
com_edges = list()
group_num = 1
extensions = pickle.load(open(os.path.join(path,"grouped_ext"),'rb'))
for ii in range(0,len(extensions)):
for jj in range(ii+1,len(extensions)):
count = 0
cluster1 = extensions[ii]
cluster2 = extensions[jj]
G1 = nx.DiGraph()
G2 = nx.DiGraph()
for e in cluster1[1:]:
#ee=e[1].split('->')
if e[2] == '+':
G1.add_edge(e[0], e[1],weight=1)
elif e[2] == '-':
G1.add_edge(e[0], e[1],weight=0)
for e in cluster2[1:]:
#ee=e[1].split('->')
if e[2] == '+':
G2.add_edge(e[0], e[1],weight=1)
elif e[2] == '-':
G2.add_edge(e[0], e[1],weight=0)
Gall = nx.compose(G1,G2)
for g in G.edges:
if g[0] in G1:
for ne in G.successors(g[1]):
if ne in G2:
for ne1 in G2.successors(ne):
if ne1 in G:
count = count+1
if g[0] in G1:
for ne in G.successors(g[0]):
if ne in G2:
for ne1 in G2.successors(ne):
if ne1 in G:
count = count+1
if count > int(ReturnTh): #set threshold for the number of return paths
logging.info('Merge clusters NO.{} and NO.{}'.format(str(ii+1),str(jj+1)))
#print(str(ii) + " and " + str(jj))
Gall = nx.compose(G1,G2)
NODESS = list()
for (node1,node2,data) in Gall.edges(data=True):
temp=list()
temp.append(node1)
temp.append(node2)
if data['weight'] == 0:
temp.append('-')
elif data['weight'] == 1:
temp.append('+')
NODESS.append(temp)
com_edges.append([group_num] + NODESS)
group_num = group_num+1
pickle.dump(com_edges, open(os.path.join(path, "grouped_ext_Merged"),'wb')) #Merged clusters
return
[docs]def makeDiGraphBase(regulators):
"""
A utility function for merge_clusters(), this function converts the baseline model into a directed graph.
Parameters
----------
regulators : dict
Contains baseline model elements and corresponding regulator elements
Returns
-------
G : DiGraph()
Directed graph of the baseline model
"""
G = nx.DiGraph()
G.clear()
for key, values in regulators.items():
G.add_node(key)
for value in values:
G.add_edge(value, key)
return G
[docs]def getRow(mdl):
"""
A utility function for extend_model(), this function returns a dict indicating the row of each element of the parsed model.
Parameters
----------
mdl : str
The path that will be used to store new extended model spreadsheet
Returns
-------
res : dict
A dict indicating the row number of each element
"""
df = pd.read_excel(mdl)
res = dict()
for i in df.index:
el=df['Element Name'][i].strip()
res[el]=i+2
return res
[docs]def extend_model(base_mdl,clusters,ext_mdl):
"""
This function creates a new xlsx for each candidate model (i.e. baseline model + cluster)
Parameters
----------
base_mdl : str
The path that contains the baseline model spreadsheet to be extended
clusters : list
It holds the edges inside each grouped extension
ext_mdl : str
The path that will be used to store new extended model spreadsheet
"""
os.system('cp '+base_mdl+' '+ext_mdl)
df = pd.read_excel(ext_mdl)
pos=df.columns.get_loc("Positive Regulation Rule")+1
neg=df.columns.get_loc("Negative Regulation Rule")+1
var_name=df.columns.get_loc("Variable")+1
ini0=df.columns.get_loc("State List 0")+1 #uncomment the following lines if you have more than one scenario in the properties
ini1=df.columns.get_loc("State List 1")+1
ini2=df.columns.get_loc("State List 2")+1
# ini3=df.columns.get_loc("State List 3")+1
# ini4=df.columns.get_loc("State List 4")+1
# ini5=df.columns.get_loc("State List 5")+1
el_name=df.columns.get_loc("Element Name")+1
name_to_row = getRow(ext_mdl)
curr_row = len(name_to_row)+2
wb = openpyxl.load_workbook(ext_mdl)
ws = wb.active
for e in clusters[1:]:
if e[0] not in name_to_row:
ws.cell(row=curr_row,column=el_name,value=e[0])
ws.cell(row=curr_row,column=var_name,value=e[0])
ws.cell(row=curr_row,column=ini0,value=1) #uncomment the following lines if you have more than one scenario in the properties
ws.cell(row=curr_row,column=ini1,value=1)
ws.cell(row=curr_row,column=ini2,value=1)
# ws.cell(row=curr_row,column=ini3,value=1)
# ws.cell(row=curr_row,column=ini4,value=1)
# ws.cell(row=curr_row,column=ini5,value=1)
name_to_row[e[0]] = curr_row
curr_row += 1
if e[1] not in name_to_row:
ws.cell(row=curr_row,column=el_name,value=e[1])
ws.cell(row=curr_row,column=var_name,value=e[1])
ws.cell(row=curr_row,column=ini0,value=1) #uncomment the following lines if you have more than one scenario in the properties
ws.cell(row=curr_row,column=ini1,value=1)
ws.cell(row=curr_row,column=ini2,value=1)
# ws.cell(row=curr_row,column=ini3,value=1)
# ws.cell(row=curr_row,column=ini4,value=1)
# ws.cell(row=curr_row,column=ini5,value=1)
name_to_row[e[1]] = curr_row
curr_row += 1
col = pos if e[2]=='+' else neg
original = ws.cell(row=name_to_row[e[1]],column=col).value
original = original+',' if original!=None else ''
ws.cell(row=name_to_row[e[1]],column=col,value=original+e[0])
wb.save(ext_mdl)
def get_args():
parser = argparse.ArgumentParser(description="Network model extension using ACCORDION")
parser.add_argument('ReadingOutput', type=str,help="Reading output spreadsheet")
parser.add_argument('Baseline', type=str,help="Baseline model in BioRECIPES format")
parser.add_argument('Inflation', type=str,help="Inflation parameter for Markov clustering")
parser.add_argument('ReturnTh', type=str,help="Return path threshold")
parser.add_argument('out', type=str,help="Output directory")
args = parser.parse_args()
return(args)
def main():
args = get_args()
t0 = time.time()
#Reading output .csv file. File format(RegulatedName,RegulatedID,RegulatedType,RegulatorName,RegulatorID,RegulatorType,PaperID)
interaction_filename = args.ReadingOutput
#Baseline model
model_dict, regulators = get_model(args.Baseline)
#use parseExtension if Reading output format is (RegulatedName,RegulatedID,RegulatedType,RegulatorName,RegulatorID,RegulatorType,PaperID)
exttt=parseExtension(model_dict, interaction_filename)
res,new_base_model = runMarkovCluster(args.out,exttt,model_dict,args.Inflation) # try 1.5,2,4,6
merge_clusters(regulators, args.out, args.ReturnTh)
# for example, extend the baseline model to include the first cluster from the unmerged result 'res' and generate 'extension.xlsx'
#extend_model(args.Baseline,res[0],args.out+'extension.xlsx')
t1 = time.time()
total = t1-t0
logging.info("Time to run ACCORDION in seconds: {}".format(str(total)))
if __name__ == '__main__':
main()