Source code for fast_forward.score

import numpy as np
import re
from pathlib import Path
from fast_forward.interaction_distribution import INTERACTIONS, interaction_distribution
from fast_forward.itp_to_ag import find_mol_indices
from collections import defaultdict

[docs] def hellinger(p,q): return np.round(np.sqrt(np.sum(np.power((np.sqrt(p) - np.sqrt(q)),2))) / np.sqrt(2),2)
[docs] def std_dev_hist(hist, bins): bin_centers = (bins[:-1] + bins[1:]) / 2 return np.sqrt(np.cov(bin_centers, aweights=hist, bias=True))
[docs] def calc_score(ref, test, weights=None, interaction_type='distances'): ''' Compute the score between two distributions. The score is a weighted sum of the Hellinger distance and the difference in means of the distributions normalized by the standard deviation of the reference distribution. If mean is outside of the standard deviation of the reference distribution, it is capped at 1. ref : np.ndarray Reference distribution. test : np.ndarray Test distribution. weights : list Weights with which to calculate the final score. Should be ordered [Hellinger weight, non-Hellinger weight] interaction_type : str interaction type to calculate the score for. Returns ------- float Score between the two distributions, between [0, 1]. ''' if weights is None: weights = [0.7, 0.3] bins=INTERACTIONS[interaction_type]['bins'] ref = ref / np.sum(ref) if np.sum(ref) > 0 else ref # normalize distributions test = test / np.sum(test) if np.sum(test) > 0 else test bin_centers = (bins[:-1] + bins[1:]) / 2 mean_diff = np.average(bin_centers, weights=ref) - np.average(bin_centers, weights=test) mean_diff_norm = mean_diff / (2 * std_dev_hist(ref, bins)) # normalize mean difference by standard deviation of reference distribution mean_diff_norm = np.min([np.abs(mean_diff_norm), 1]) # 1 as maximum penalty score = hellinger(ref, test) * weights[0] + mean_diff_norm * weights[1] # score is a weighted sum of Hellinger distance and mean difference normalized by standard deviation return np.round(score, 2)
[docs] def score_matrix(molname, block, universe, file_map: dict, file_prefix: str, hellinger_weight=0.7, include_constraints=False): """ Compute a pairwise distance score matrix by comparing simulated distributions from the trajectory with reference distributions loaded from disk. Parameters ---------- molname : str Name of the molecule in the universe. block : :class:`~vermouth.molecule.Block` Molecule block containing atom definitions and constraints. universe : :class:`~MDAnalysis.core.universe.Universe` Universe used to compute simulated pairwise distance distributions. file_map : dict Map of reference file names to their paths. file_prefix : str Prefix used to construct expected reference filenames. hellinger_weight : float Optional. Weight for the Hellinger distance contribution. include_constraints : bool Optional. Whether constrained atom pairs should use normal weights. Returns ------- score_matrix : ndarray Symmetric matrix of pairwise scores. plot_data : dict Reference and simulated distributions for plotting. """ plot_data = defaultdict(dict) natoms = len(block.nodes) score_matrix = np.zeros((natoms, natoms)) constraints = [] for constraint in block.interactions['constraints']: constraints.append(set(constraint.atoms)) for node1, name1 in block.nodes(data='atomname'): for node2, name2 in list(block.nodes(data='atomname'))[node1+1:]: resid1 = block.nodes[node1]['resid'] resid2 = block.nodes[node2]['resid'] atoms = np.array([node1, node2]) group_name = f'{resid1}_{name1}_{resid2}_{name2}' # following the naming convention introduced in ITPInteractionMapper indices = find_mol_indices(universe, atoms, molname) distr = interaction_distribution(universe, 'distances', indices) # calculate simulation distribution probs = distr[0].T[1] reference_name = f"{file_prefix}{group_name}_distances_distr.dat" try: ref_file = file_map[reference_name] except KeyError: print(f"{group_name} file not found! If your prefix ends with numbers, this could be the reason.") if file_prefix == "": print("your prefix is set to the default value: consider changing it to the actual prefix.") continue reference_data = np.genfromtxt(ref_file) # if the distance is constrained, the mean difference is weighted more if {node1, node2} in constraints and not include_constraints: weights = [0.2, 0.8] # can be adjusted in the future else: weights = [hellinger_weight, 1-hellinger_weight] # calculate score and populate matrix score = calc_score(reference_data.T[1], probs, weights, interaction_type='distances') score_matrix[node1, node2] = float(score) score_matrix[node2, node1] = float(score) plot_data['distances'][group_name] = {"x": reference_data.T[0], "Reference": reference_data.T[1], 'Simulated': probs} return score_matrix, plot_data