Source code for fast_forward.cgsmiles

# Copyright 2020 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import numba
import networkx as nx
import pysmiles
from cgsmiles.resolve import MoleculeResolver
from .map_file_parers import Mapping
from .universe_handler import ResidueIter



[docs]
def remove_explicit_hydrogens(graph):
    """
    Returns a subgraph of a molecule sans hydrogen atoms and
    annotates the number of hydrogen atoms on each remaining node.
    """
    not_hnodes = [node for node in graph.nodes if graph.nodes[node]["element"] != "H"]
    not_h_graph = graph.subgraph(not_hnodes)
    for node in not_hnodes:
        hcount = sum(
            1 for neighbor in nx.neighbors(graph, node)
            if graph.nodes[neighbor]["element"] == "H"
        )
        not_h_graph.nodes[node]["hcount"] = hcount
    return not_h_graph




[docs]
def load_cgsmiles_library(filepath):
    """
    Given a file that lists cgsmiles strings, read and return them as a list.

    Parameters
    ----------
    filepath: str

    Returns
    -------
    list[str]
    """
    cgsmiles_strs = []
    with open(filepath) as _file:
        for line in _file:
            cgsmiles_strs.append(line.strip())
    return cgsmiles_strs




[docs]
def find_one_graph_match(graph1, graph2):
    """
    Returns one ISMAGS match when graphs are isomorphic, otherwise [].
    Matches are done based on element and bond order, which is sufficient
    to also account for charges implicitly via the orders.

    Parameters
    ----------
    graph1: networkx.Graph
    graph2: networkx.Graph

    Returns
    -------
    abc.Iterator
    """

    def node_match(n1, n2):
        for attr in ["element", "hcount"]:
            if n1[attr] != n2[attr]:
                return False
        return True

    if len(graph1) != len(graph2):
        raw_matches = iter([])
    else:
        graph1_no_hatoms = remove_explicit_hydrogens(graph1)
        graph2_no_hatoms = remove_explicit_hydrogens(graph2)
        GM = nx.isomorphism.GraphMatcher(
            graph1_no_hatoms,
            graph2_no_hatoms,
            node_match=node_match,
        )
        raw_matches = GM.subgraph_isomorphisms_iter()

    try:
        mapping = next(raw_matches)
        # Add hydrogen atoms to the mapping
        for node in graph1.nodes:
            if graph1.nodes[node].get("element") == "H" and node not in mapping:
                anchor = list(graph1.neighbors(node))[0]
                hatoms_target = [
                    n for n in graph1.neighbors(anchor)
                    if graph1.nodes[n]["element"] == "H"
                ]
                hatoms_ref = [
                    n for n in graph2.neighbors(mapping[anchor])
                    if graph2.nodes[n]["element"] == "H"
                ]
                # Enforced by graph matching, but assert to catch any zip-masking issues
                assert len(hatoms_target) == len(hatoms_ref)
                mapping.update(dict(zip(hatoms_target, hatoms_ref)))
    except StopIteration:
        mapping = []

    return mapping



def _most_common(_list):
    return max(set(_list), key=_list.count)



[docs]
def get_mappings(cg, univ, _match, mappings):
    """
    Assigns each coarse CG node a resname and resid based on the all-atom
    universe. If a bead is split between residues, the resname and resid of
    the majority of atoms is used.

    Parameters
    ----------
    cg: networkx.Graph
        The coarse molecule graph.
    univ: :class:`fast_forward.universe_handler.UniverseHandler`
        The universe storing the fine-grained molecule information.
    _match: dict
        A dict mapping the coarse atoms to fine-grained atoms.
    mappings: dict[:class:`fast_forward.map_file_parser.Mapping`]
        A dict of mapping objects.

    Returns
    -------
    dict[:class:`fast_forward.map_file_parser.Mapping`]
        Updated mappings.
    """
    target_resids = {}
    for bead in cg.nodes:
        atoms = cg.nodes[bead]['graph'].nodes
        # if no resname mapping is provided everything is defaulted
        # to RES
        resname_default = "RES"
        # we cannot simply take the resids provided in the input
        # because we allow mapping across residues in which case
        # there is no unique resid and the most common one is not
        # safe enough
        resid_default = 1 
        resname = cg.nodes[bead].get('resname', resname_default)
        resid = cg.nodes[bead].get('resid', resid_default)
        cg.nodes[bead]['resid'] = resid
        cg.nodes[bead]['resname'] = resname
        mapping = mappings.get(resname, Mapping(resname, resname))
        target_resid = target_resids.get(resname, resid)
        target_resids[resname] = target_resid
        if resid != target_resids[resname]:
            continue
        for atom in atoms:
            weight = np.float32(cg.nodes[bead]['graph'].nodes[atom].get('weight', 1))
            mapping.add_atom(
                # we need to append the bead index here because in a cgsmiles string
                # the bead names do not need to be unique but the mapping infrastructure
                # assumes this. Making this more flexible should be a future objective
                cg.nodes[bead]["fragname"]+f"{bead}",
                _match[atom],
                atom=univ.atoms[_match[atom]].name,
                weight=weight,
            )
        mappings[resname] = mapping

    return mappings



def _annotate_vs(cg_graph):
    """
    Identifies virtual sites in the CG graph and annotates them.

    If a node in cg_graph has no mapped atom positions, it is treated as a
    virtual site. Its mapped position is composed as the union of fragment
    graphs of neighboring atoms, which is equivalent to a virtual_sitesn
    mapping.

    Parameters
    ----------
    cg_graph: networkx.Graph
        The graph describing a coarse molecule.
    """
    for node in cg_graph.nodes:
        if len(cg_graph.nodes[node].get('graph', [])) == 0:
            g = nx.Graph()
            for neigh in cg_graph.neighbors(node):
                g.add_nodes_from(list(cg_graph.nodes[neigh]['graph'].nodes))
            cg_graph.nodes[node]['graph'] = g


def _annotate_residues(cg, res):
    """
    Propagates resname and resid from a residue-level graph down to CG nodes.

    Parameters
    ----------
    cg: networkx.Graph
        The coarse-grained molecule graph whose nodes will be annotated.
    res: networkx.Graph
        The residue-level graph produced by an extra resolver pass.
    """
    for node in res:
        resname = res.nodes[node]["fragname"]
        resid = res.nodes[node]["fragid"]
        for cgnode in res.nodes[node]['graph'].nodes:
            cg.nodes[cgnode]["resname"] = resname
            cg.nodes[cgnode]["resid"] = resid


def _resolve_cg_match(cgs_str, mol_graph):
    """
    Resolves a cgsmiles string and returns the matched (cg, aa, match) triple,
    or None if the string does not match mol_graph.

    Parameters
    ----------
    cgs_str: str
        A cgsmiles string encoding the CG mapping.
    mol_graph: networkx.Graph
        The all-atom molecule graph to match against.

    Returns
    -------
    tuple[networkx.Graph, networkx.Graph, dict] or None
    """
    resolver = MoleculeResolver.from_string(cgs_str, last_all_atom=True)
    nres = cgs_str.count("{")
    assert nres > 1, "cgsmiles string must contain at least two resolution levels."

    if nres > 2:
        res, _ = resolver.resolve()

    cg, aa = resolver.resolve()

    if nres > 2:
        _annotate_residues(cg, res)

    _annotate_vs(cg)
    match = find_one_graph_match(aa, mol_graph)
    return (cg, aa, match) if match else None


def _collect_bead_data(cg, match, mol_idxs, mol_graph, bead_count, res_iter):
    """
    Iterates over all molecule instances and collects per-bead mapped atoms,
    weights, and bead indices.

    Parameters
    ----------
    cg: networkx.Graph
    match: dict
        Mapping from CG atom indices to all-atom indices (for one molecule instance).
    mol_idxs: list[int]
        Indices of all instances of this molecule in the universe.
    mol_graph: networkx.Graph
    bead_count: int
        Running bead index to continue from.
    res_iter: ResidueIter

    Returns
    -------
    tuple[list, list, list, int, ResidueIter]
        mapped_atoms, weights, bead_idxs, updated bead_count, res_iter
    """
    mapped_atoms, weights, bead_idxs = [], [], []
    prev_resid = -1
    offset = 0

    for _ in mol_idxs:
        for bead in cg.nodes:
            resid = cg.nodes[bead]['resid']
            resname = cg.nodes[bead]['resname']
            if prev_resid != resid:
                prev_resid = resid
                res_iter.add_residue(resid=resid, resname=resname)

            atoms = cg.nodes[bead]['graph'].nodes
            mapped_atoms.append(
                numba.typed.List([match[atom] + offset for atom in atoms])
            )
            bead_weights = nx.get_node_attributes(cg.nodes[bead]['graph'], 'weight')
            weights.append(
                np.array([bead_weights.get(atom, 1.0) for atom in atoms], dtype=np.float32)
            )
            bead_idxs.append(bead_count)
            bead_count += 1

        offset += len(mol_graph)

    return mapped_atoms, weights, bead_idxs, bead_count, res_iter



[docs]
def cgsmiles_to_mapping(univ, cgsmiles_strs, mol_names, mol_matching=True):
    """
    Given a list of mappings described by cgsmiles strings, maps atom indices
    to CG beads using graph isomorphism.

    Parameters
    ----------
    univ: :class:`fast_forward.universe_handler.UniverseHandler`
    cgsmiles_strs: list[str]
    mol_names: list[str]
    mol_matching: bool
        If False, the order of cgsmiles strings is assumed to match the order
        of molecules.

    Returns
    -------
    list, list, dict
    """
    all_mapped_atoms, all_bead_idxs, all_weights = [], [], []
    mappings = {}
    bead_count = 0
    res_iter = ResidueIter()

    for idx, mol_name in enumerate(mol_names):
        mol_graph = univ.molecule_graphs[mol_name]
        candidates = [cgsmiles_strs[idx]] if mol_matching else cgsmiles_strs

        # Find the first cgsmiles string that matches this molecule
        resolved = None
        for cgs_str in candidates:
            resolved = _resolve_cg_match(cgs_str, mol_graph)
            if resolved is not None:
                break
        if resolved is None:
            raise SyntaxError(
                f'No matching cgsmiles string found for molecule {mol_name}.'
            )
        cg, aa, match = resolved

        mappings = get_mappings(cg, univ, match, mappings)
        mol_idxs = univ.mol_idxs_by_name[mol_name]
        mapped_atoms, weights, bead_idxs, bead_count, res_iter = _collect_bead_data(
            cg, match, mol_idxs, mol_graph, bead_count, res_iter
        )
        all_mapped_atoms.extend(mapped_atoms)
        all_weights.extend(weights)
        all_bead_idxs.extend(bead_idxs)

    return (
        numba.typed.List(all_mapped_atoms),
        numba.typed.List(all_bead_idxs),
        mappings,
        numba.typed.List(all_weights),
        res_iter,
    )
Source code for fast_forward.cgsmiles

FastForward

Navigation

Related Topics