Source code for fast_forward.score

import numpy as np
import re
from pathlib import Path
from fast_forward.interaction_distribution import INTERACTIONS, interaction_distribution
from fast_forward.itp_to_ag import find_mol_indices
from collections import defaultdict


[docs]
def hellinger(p,q):
    return np.round(np.sqrt(np.sum(np.power((np.sqrt(p) - np.sqrt(q)),2))) / np.sqrt(2),2)



[docs]
def std_dev_hist(hist, bins):
    bin_centers = (bins[:-1] + bins[1:]) / 2
    return np.sqrt(np.cov(bin_centers, aweights=hist, bias=True))



[docs]
def calc_score(ref, test, weights=None, interaction_type='distances'):
    '''
    Compute the score between two distributions.
    The score is a weighted sum of the Hellinger distance and the difference in means of the distributions
    normalized by the standard deviation of the reference distribution. If mean is outside of the standard deviation
    of the reference distribution, it is capped at 1.

    ref : np.ndarray
        Reference distribution.
    test : np.ndarray
        Test distribution.
    weights : list
        Weights with which to calculate the final score. Should be ordered [Hellinger weight, non-Hellinger weight]
    interaction_type : str
        interaction type to calculate the score for.

    Returns
    -------
    float
        Score between the two distributions, between [0, 1].
    '''
    if weights is None:
        weights = [0.7, 0.3]

    bins=INTERACTIONS[interaction_type]['bins']
    ref = ref / np.sum(ref) if np.sum(ref) > 0 else ref # normalize distributions
    test = test / np.sum(test) if np.sum(test) > 0 else test

    bin_centers = (bins[:-1] + bins[1:]) / 2

    mean_diff = np.average(bin_centers, weights=ref) - np.average(bin_centers, weights=test)
    mean_diff_norm = mean_diff / (2 * std_dev_hist(ref, bins)) # normalize mean difference by standard deviation of reference distribution
    mean_diff_norm = np.min([np.abs(mean_diff_norm), 1]) # 1 as maximum penalty

    score = hellinger(ref, test) * weights[0] + mean_diff_norm * weights[1] # score is a weighted sum of Hellinger distance and mean difference normalized by standard deviation
    return np.round(score, 2)



[docs]
def score_matrix(molname, block, universe, file_map: dict, file_prefix: str, hellinger_weight=0.7, include_constraints=False):
    """
    Compute a pairwise distance score matrix by comparing simulated
    distributions from the trajectory with reference distributions
    loaded from disk.

    Parameters
    ----------
    molname : str
        Name of the molecule in the universe.
    block : :class:`~vermouth.molecule.Block`
        Molecule block containing atom definitions and constraints.
    universe : :class:`~MDAnalysis.core.universe.Universe`
        Universe used to compute simulated pairwise distance distributions.
    file_map : dict
        Map of reference file names to their paths.
    file_prefix : str
        Prefix used to construct expected reference filenames.
    hellinger_weight : float
        Optional. Weight for the Hellinger distance contribution.
    include_constraints : bool
        Optional. Whether constrained atom pairs should use normal weights.

    Returns
    -------
    score_matrix : ndarray
        Symmetric matrix of pairwise scores.
    plot_data : dict
        Reference and simulated distributions for plotting.
    """

    plot_data = defaultdict(dict)
    natoms = len(block.nodes)
    score_matrix = np.zeros((natoms, natoms))

    constraints = []
    for constraint in block.interactions['constraints']:
        constraints.append(set(constraint.atoms))

    for node1, name1 in block.nodes(data='atomname'):
        for node2, name2 in list(block.nodes(data='atomname'))[node1+1:]:
            resid1 = block.nodes[node1]['resid']
            resid2 = block.nodes[node2]['resid']
            atoms = np.array([node1, node2])
            group_name = f'{resid1}_{name1}_{resid2}_{name2}' # following the naming convention introduced in ITPInteractionMapper
            indices = find_mol_indices(universe,
                            atoms,
                            molname)
            distr = interaction_distribution(universe, 'distances', indices)
            # calculate simulation distribution
            probs = distr[0].T[1]

            reference_name = f"{file_prefix}{group_name}_distances_distr.dat"
            try:
                ref_file = file_map[reference_name]
            except KeyError:
                print(f"{group_name} file not found! If your prefix ends with numbers, this could be the reason.")
                if file_prefix == "":
                    print("your prefix is set to the default value: consider changing it to the actual prefix.")
                continue

            reference_data = np.genfromtxt(ref_file)
            
            # if the distance is constrained, the mean difference is weighted more
            if {node1, node2} in constraints and not include_constraints:
                weights = [0.2, 0.8] # can be adjusted in the future
            else:
                weights = [hellinger_weight, 1-hellinger_weight]

            # calculate score and populate matrix
            score = calc_score(reference_data.T[1], probs, weights, interaction_type='distances')
            score_matrix[node1, node2] = float(score)
            score_matrix[node2, node1] = float(score)
            plot_data['distances'][group_name] = {"x": reference_data.T[0],
                                                    "Reference": reference_data.T[1],
                                                    'Simulated': probs}
    return score_matrix, plot_data
Source code for fast_forward.score

FastForward

Navigation

Related Topics