Source code for runAccordion

"""
@author: Yasmine
"""

import pandas as pd
import re
import os
import argparse
import openpyxl
import networkx as nx
import pickle
import time
import logging
from markovCluster import runMarkovCluster

# define regex for valid characters in variable names
_VALID_CHARS = r'a-zA-Z0-9\@\_\/'

# This function and the following function are inherited from DySE framework
[docs]def get_model(model_file):
	"""
	This function reads the baseline model of BioRECIPES format and returns two useful dictionaries

	Parameters
	----------
	model_file : str
		The path of the baseline model file

	Returns
	-------
	model_dict : dict
		Dictionary that holds critical information of each baseline model element
	regulators : dict
		 Contains baseline model elements and corresponding regulator elements
	"""

	global _VALID_CHARS

	regulators = dict()
	model_dict = dict()


	# Load the input file containing elements and regulators
	df_model = pd.read_excel(model_file, na_values='NaN', keep_default_na = False)
	# check model format
	if df_model.columns[0].lower() == 'element attributes':
		df_model = df_model.reset_index()
		df_model = df_model.rename(columns=df_model.iloc[1]).drop([0,1]).set_index('#')

	input_col_name = [x.strip() for x in df_model.columns if ('element name' in x.lower())]
	input_col_ids = [x.strip() for x in df_model.columns if ('element ids' in x.lower())]
	input_col_type = [x.strip() for x in df_model.columns if ('element type' in x.lower())]
	input_col_X = [x.strip() for x in df_model.columns if ('variable' in x.lower())]
	input_col_A = [x.strip() for x in df_model.columns if ('positive regulation rule' in x.lower())]
	input_col_I = [x.strip() for x in df_model.columns if ('negative regulation rule' in x.lower())]

	# set index to variable name column
	# remove empty variable names
	# append cols with the sets of regulators using .apply
	for curr_row in df_model.index:
		element_name = df_model.loc[curr_row,input_col_name[-1]].strip()
		ids = str(df_model.loc[curr_row,input_col_ids[0]]).strip().upper().split(',')
		#print(ids)
		element_type = df_model.loc[curr_row,input_col_type[0]].strip()
		var_name = df_model.loc[curr_row,input_col_X[0]].strip()
		pos_regulators = df_model.loc[curr_row,input_col_A[0]].strip()
		neg_regulators = df_model.loc[curr_row,input_col_I[0]].strip()

		if var_name == '':
			continue

		curr = []

		if pos_regulators != '':
			curr += re.findall('['+_VALID_CHARS+']+',pos_regulators)

		if neg_regulators != '':
			curr += re.findall('['+_VALID_CHARS+']+',neg_regulators)

		# returning regulators separately for compatibility with runMarkovCluster
		regulators[var_name] = set(curr)
		model_dict[var_name] = {
			'name' : element_name,
			'ids' : ids,
			'type' : element_type,
			'regulators' : set(curr)}

	return model_dict, regulators

[docs]def getVariableName(model_dict, curr_map, ext_element_info):
	"""
	A utility function for parseExtension(), which matches the element name from the extracted event to an element in the baseline model

	Parameters
	----------
	model_dict : dict
		Dictionary that holds critical information of each baseline model element
	curr_map: dict
		Temporary dictionary that contains already matched pairs
	ext_element_info: list
		List of information for certain element in the extracted event, starting with element name

	Returns
	-------
	match : str
		The most likely matched element name in model_dict, to the element represented by ext_element_info; Otherwise, return the extended element name suffix by "_ext"
	"""

	global _VALID_CHARS

	ext_element_name = ext_element_info[0]
	#FIXME: hard-coded here and lines 123/124,
    #element name is the first in the tuple, followed by element_type, then five columns later element_id

	# Check for valid element name
	if ext_element_name=='':
		#logging.warn('Missing element name in extensions')
		return ''
	elif re.search('[^'+_VALID_CHARS+']+',ext_element_name):
		#logging.warn(('Skipping due to invalid characters in variable name: %s') % str(ext_element_name))
		return ''

	ext_element_id = ext_element_info[5]
	ext_element_type = ext_element_info[1]

	if ext_element_name in curr_map:
		return curr_map[ext_element_name]

	# from the location and type
	match = ext_element_name + '_ext'
	confidence = 0.0
	# Iterate all names in the dictionary and find the most likely match
	for key,value in model_dict.items():

		curr_conf = 0.0
		if ext_element_id.upper() in value['ids']:
			curr_conf = 1
		elif ext_element_name.upper().startswith(value['name'].upper()) \
			or value['name'].upper().startswith(ext_element_name.upper()):
			curr_conf = 0.8

		if curr_conf>0 and value['type'].lower().startswith(ext_element_type):
			curr_conf += 1

		if curr_conf > confidence:
			match = key
			confidence = curr_conf
			if curr_conf==2: break

	curr_map[ext_element_name] = match
	return match

[docs]def parseExtension(model_dict, ext_file):
	"""
     This function parses the interactions within the reading output file (extracted events) into a set object, each component in the set has compatible format for BioRECIPES.

     Parameters
     ----------
     model_dict : dict
 		Dictionary that holds critical information of each baseline model element
     ext_file : str
         The path of the reading output file (extracted events)

     Returns
     -------
     ext_edges : set
         Each interaction within the reading output file will have the form:
         (regulator element, regulated element, type of interaction (+/-))
	"""

	regulator_col = 0
	regulated_col = 8
	interaction_sign_col = 16
	#FIXME: hard coded here, the 1st, 9th, 17th columns of ReadingOutput file
	#have to be 'Regulator Name', 'Regulated Name' and 'Sign'
	ext_edges, curr_map = set(),  dict()

	with open(ext_file) as f:
		for line in f:
			if line.startswith('Regulator Name'): continue
			line = line.strip()
			s = re.split(',',line)
			name1, name2 = getVariableName(model_dict,curr_map,s[regulator_col:regulated_col]), getVariableName(model_dict,curr_map,s[regulated_col:interaction_sign_col])
			if name1=="" or name2=="": continue
			pos = '+' if s[interaction_sign_col]=='positive' else '-'

			if (name2 in model_dict and name1 in model_dict[name2]):
				continue
			ext_edges.add((name1, name2, pos))
	return ext_edges

[docs]def merge_clusters(regulators, path, ReturnTh):
    """
    This function records indices of clusters to be merged based on the existence of return paths.
    It generates the grouped_ext_Merged pickle file that contains the merged clusters.

    Parameters
    ----------
    regulators : dict
		Contains baseline model elements and corresponding regulator elements
    path : str
		The path of the directory that contains the grouped_ext file
    ReturnTh : int
		A user-defined integer threshold for the number of return paths, beyond which clusters will be merged
    """
    # Merge clusters if there is one or more return paths

    G = nx.DiGraph()
    G = makeDiGraphBase(regulators)
    com_edges = list()
    group_num = 1
    extensions = pickle.load(open(os.path.join(path,"grouped_ext"),'rb'))

    for ii in range(0,len(extensions)):
        for jj in range(ii+1,len(extensions)):
            count = 0
            cluster1 = extensions[ii]
            cluster2 = extensions[jj]
            G1 = nx.DiGraph()
            G2 = nx.DiGraph()

            for e in cluster1[1:]:
                #ee=e[1].split('->')
                if e[2] == '+':
                    G1.add_edge(e[0], e[1],weight=1)
                elif e[2] == '-':
                    G1.add_edge(e[0], e[1],weight=0)

            for e in cluster2[1:]:
                #ee=e[1].split('->')
                if e[2] == '+':
                    G2.add_edge(e[0], e[1],weight=1)
                elif e[2] == '-':
                    G2.add_edge(e[0], e[1],weight=0)
            Gall = nx.compose(G1,G2)
            for g in G.edges:
                if g[0] in G1:
                    for ne in G.successors(g[1]):
                        if ne in G2:
                            for ne1 in G2.successors(ne):
                                if ne1 in G:
                                    count = count+1
                if g[0] in G1:
                    for ne in G.successors(g[0]):
                        if ne in G2:
                            for ne1 in G2.successors(ne):
                                if ne1 in G:
                                    count = count+1
            if count > int(ReturnTh): #set threshold for the number of return paths
                logging.info('Merge clusters NO.{} and NO.{}'.format(str(ii+1),str(jj+1)))
                #print(str(ii) + " and " + str(jj))
                Gall = nx.compose(G1,G2)
                NODESS = list()
                for (node1,node2,data) in Gall.edges(data=True):
                    temp=list()
                    temp.append(node1)
                    temp.append(node2)
                    if data['weight'] == 0:
                        temp.append('-')
                    elif data['weight'] == 1:
                        temp.append('+')
                    NODESS.append(temp)
                com_edges.append([group_num] + NODESS)
                group_num = group_num+1

    pickle.dump(com_edges, open(os.path.join(path, "grouped_ext_Merged"),'wb')) #Merged clusters

    return


[docs]def makeDiGraphBase(regulators):
    """
    A utility function for merge_clusters(), this function converts the baseline model into a directed graph.

    Parameters
    ----------
    regulators : dict
		Contains baseline model elements and corresponding regulator elements

    Returns
    -------
    G : DiGraph()
        Directed graph of the baseline model
    """
    G = nx.DiGraph()
    G.clear()
    for key, values in regulators.items():
        G.add_node(key)
        for value in values:
            G.add_edge(value, key)
    return G

[docs]def getRow(mdl):
	"""
 	A utility function for extend_model(), this function returns a dict indicating the row of each element of the parsed model.

	Parameters
	----------
	mdl : str
		The path that will be used to store new extended model spreadsheet

	Returns
	-------
	res : dict
        A dict indicating the row number of each element
	"""

	df = pd.read_excel(mdl)
	res = dict()
	for i in df.index:
		el=df['Element Name'][i].strip()
		res[el]=i+2
	return res

[docs]def extend_model(base_mdl,clusters,ext_mdl):
	"""
 	This function creates a new xlsx for each candidate model (i.e. baseline model + cluster)

	Parameters
	----------
	base_mdl : str
		The path that contains the baseline model spreadsheet to be extended
	clusters : list
		It holds the edges inside each grouped extension
	ext_mdl :	 str
		The path that will be used to store new extended model spreadsheet
	"""

	os.system('cp '+base_mdl+' '+ext_mdl)
	df = pd.read_excel(ext_mdl)
	pos=df.columns.get_loc("Positive Regulation Rule")+1
	neg=df.columns.get_loc("Negative Regulation Rule")+1
	var_name=df.columns.get_loc("Variable")+1
	ini0=df.columns.get_loc("State List 0")+1 #uncomment the following lines if you have more than one scenario in the properties
	ini1=df.columns.get_loc("State List 1")+1
	ini2=df.columns.get_loc("State List 2")+1
#	ini3=df.columns.get_loc("State List 3")+1
#	ini4=df.columns.get_loc("State List 4")+1
#	ini5=df.columns.get_loc("State List 5")+1
	el_name=df.columns.get_loc("Element Name")+1
	name_to_row = getRow(ext_mdl)
	curr_row = len(name_to_row)+2
	wb = openpyxl.load_workbook(ext_mdl)
	ws = wb.active
	for e in clusters[1:]:
		if e[0] not in name_to_row:
			ws.cell(row=curr_row,column=el_name,value=e[0])
			ws.cell(row=curr_row,column=var_name,value=e[0])
			ws.cell(row=curr_row,column=ini0,value=1) #uncomment the following lines if you have more than one scenario in the properties
			ws.cell(row=curr_row,column=ini1,value=1)
			ws.cell(row=curr_row,column=ini2,value=1)
#			ws.cell(row=curr_row,column=ini3,value=1)
#			ws.cell(row=curr_row,column=ini4,value=1)
#			ws.cell(row=curr_row,column=ini5,value=1)
			name_to_row[e[0]] = curr_row
			curr_row += 1
		if e[1] not in name_to_row:
			ws.cell(row=curr_row,column=el_name,value=e[1])
			ws.cell(row=curr_row,column=var_name,value=e[1])
			ws.cell(row=curr_row,column=ini0,value=1) #uncomment the following lines if you have more than one scenario in the properties
			ws.cell(row=curr_row,column=ini1,value=1)
			ws.cell(row=curr_row,column=ini2,value=1)
#			ws.cell(row=curr_row,column=ini3,value=1)
#			ws.cell(row=curr_row,column=ini4,value=1)
#			ws.cell(row=curr_row,column=ini5,value=1)
			name_to_row[e[1]] = curr_row
			curr_row += 1
		col = pos if e[2]=='+' else neg
		original = ws.cell(row=name_to_row[e[1]],column=col).value
		original = original+',' if original!=None else ''
		ws.cell(row=name_to_row[e[1]],column=col,value=original+e[0])
	wb.save(ext_mdl)

def get_args():

    parser = argparse.ArgumentParser(description="Network model extension using ACCORDION")
    parser.add_argument('ReadingOutput', type=str,help="Reading output spreadsheet")
    parser.add_argument('Baseline', type=str,help="Baseline model in BioRECIPES format")
    parser.add_argument('Inflation', type=str,help="Inflation parameter for Markov clustering")
    parser.add_argument('ReturnTh', type=str,help="Return path threshold")
    parser.add_argument('out', type=str,help="Output directory")
    args = parser.parse_args()
    return(args)

def main():

    args = get_args()

    t0 = time.time()
    #Reading output .csv file. File format(RegulatedName,RegulatedID,RegulatedType,RegulatorName,RegulatorID,RegulatorType,PaperID)
    interaction_filename = args.ReadingOutput

    #Baseline model
    model_dict, regulators = get_model(args.Baseline)

    #use parseExtension if Reading output format is (RegulatedName,RegulatedID,RegulatedType,RegulatorName,RegulatorID,RegulatorType,PaperID)
    exttt=parseExtension(model_dict, interaction_filename)

    res,new_base_model = runMarkovCluster(args.out,exttt,model_dict,args.Inflation) # try 1.5,2,4,6

    merge_clusters(regulators, args.out, args.ReturnTh)

	# for example, extend the baseline model to include the first cluster from the unmerged result 'res' and generate 'extension.xlsx'
	#extend_model(args.Baseline,res[0],args.out+'extension.xlsx')

    t1 = time.time()
    total = t1-t0
    logging.info("Time to run ACCORDION in seconds: {}".format(str(total)))

if __name__ == '__main__':
    main()