import numpy as np
import re
from pathlib import Path
from fast_forward.interaction_distribution import INTERACTIONS, interaction_distribution
from fast_forward.itp_to_ag import find_mol_indices
from collections import defaultdict
[docs]
def hellinger(p,q):
return np.round(np.sqrt(np.sum(np.power((np.sqrt(p) - np.sqrt(q)),2))) / np.sqrt(2),2)
[docs]
def std_dev_hist(hist, bins):
bin_centers = (bins[:-1] + bins[1:]) / 2
return np.sqrt(np.cov(bin_centers, aweights=hist, bias=True))
[docs]
def calc_score(ref, test, weights=None, interaction_type='distances'):
'''
Compute the score between two distributions.
The score is a weighted sum of the Hellinger distance and the difference in means of the distributions
normalized by the standard deviation of the reference distribution. If mean is outside of the standard deviation
of the reference distribution, it is capped at 1.
ref : np.ndarray
Reference distribution.
test : np.ndarray
Test distribution.
weights : list
Weights with which to calculate the final score. Should be ordered [Hellinger weight, non-Hellinger weight]
interaction_type : str
interaction type to calculate the score for.
Returns
-------
float
Score between the two distributions, between [0, 1].
'''
if weights is None:
weights = [0.7, 0.3]
bins=INTERACTIONS[interaction_type]['bins']
ref = ref / np.sum(ref) if np.sum(ref) > 0 else ref # normalize distributions
test = test / np.sum(test) if np.sum(test) > 0 else test
bin_centers = (bins[:-1] + bins[1:]) / 2
mean_diff = np.average(bin_centers, weights=ref) - np.average(bin_centers, weights=test)
mean_diff_norm = mean_diff / (2 * std_dev_hist(ref, bins)) # normalize mean difference by standard deviation of reference distribution
mean_diff_norm = np.min([np.abs(mean_diff_norm), 1]) # 1 as maximum penalty
score = hellinger(ref, test) * weights[0] + mean_diff_norm * weights[1] # score is a weighted sum of Hellinger distance and mean difference normalized by standard deviation
return np.round(score, 2)
[docs]
def score_matrix(molname, block, universe, file_map: dict, file_prefix: str, hellinger_weight=0.7, include_constraints=False):
"""
Compute a pairwise distance score matrix by comparing simulated
distributions from the trajectory with reference distributions
loaded from disk.
Parameters
----------
molname : str
Name of the molecule in the universe.
block : :class:`~vermouth.molecule.Block`
Molecule block containing atom definitions and constraints.
universe : :class:`~MDAnalysis.core.universe.Universe`
Universe used to compute simulated pairwise distance distributions.
file_map : dict
Map of reference file names to their paths.
file_prefix : str
Prefix used to construct expected reference filenames.
hellinger_weight : float
Optional. Weight for the Hellinger distance contribution.
include_constraints : bool
Optional. Whether constrained atom pairs should use normal weights.
Returns
-------
score_matrix : ndarray
Symmetric matrix of pairwise scores.
plot_data : dict
Reference and simulated distributions for plotting.
"""
plot_data = defaultdict(dict)
natoms = len(block.nodes)
score_matrix = np.zeros((natoms, natoms))
constraints = []
for constraint in block.interactions['constraints']:
constraints.append(set(constraint.atoms))
for node1, name1 in block.nodes(data='atomname'):
for node2, name2 in list(block.nodes(data='atomname'))[node1+1:]:
resid1 = block.nodes[node1]['resid']
resid2 = block.nodes[node2]['resid']
atoms = np.array([node1, node2])
group_name = f'{resid1}_{name1}_{resid2}_{name2}' # following the naming convention introduced in ITPInteractionMapper
indices = find_mol_indices(universe,
atoms,
molname)
distr = interaction_distribution(universe, 'distances', indices)
# calculate simulation distribution
probs = distr[0].T[1]
reference_name = f"{file_prefix}{group_name}_distances_distr.dat"
try:
ref_file = file_map[reference_name]
except KeyError:
print(f"{group_name} file not found! If your prefix ends with numbers, this could be the reason.")
if file_prefix == "":
print("your prefix is set to the default value: consider changing it to the actual prefix.")
continue
reference_data = np.genfromtxt(ref_file)
# if the distance is constrained, the mean difference is weighted more
if {node1, node2} in constraints and not include_constraints:
weights = [0.2, 0.8] # can be adjusted in the future
else:
weights = [hellinger_weight, 1-hellinger_weight]
# calculate score and populate matrix
score = calc_score(reference_data.T[1], probs, weights, interaction_type='distances')
score_matrix[node1, node2] = float(score)
score_matrix[node2, node1] = float(score)
plot_data['distances'][group_name] = {"x": reference_data.T[0],
"Reference": reference_data.T[1],
'Simulated': probs}
return score_matrix, plot_data