Source code for fast_forward.cgsmiles

# Copyright 2020 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import numba
import networkx as nx
import pysmiles
from cgsmiles.resolve import MoleculeResolver
from .map_file_parers import Mapping
from .universe_handler import ResidueIter


[docs] def remove_explicit_hydrogens(graph): """ Returns a subgraph of a molecule sans hydrogen atoms and annotates the number of hydrogen atoms on each remaining node. """ not_hnodes = [node for node in graph.nodes if graph.nodes[node]["element"] != "H"] not_h_graph = graph.subgraph(not_hnodes) for node in not_hnodes: hcount = sum( 1 for neighbor in nx.neighbors(graph, node) if graph.nodes[neighbor]["element"] == "H" ) not_h_graph.nodes[node]["hcount"] = hcount return not_h_graph
[docs] def load_cgsmiles_library(filepath): """ Given a file that lists cgsmiles strings, read and return them as a list. Parameters ---------- filepath: str Returns ------- list[str] """ cgsmiles_strs = [] with open(filepath) as _file: for line in _file: cgsmiles_strs.append(line.strip()) return cgsmiles_strs
[docs] def find_one_graph_match(graph1, graph2): """ Returns one ISMAGS match when graphs are isomorphic, otherwise []. Matches are done based on element and bond order, which is sufficient to also account for charges implicitly via the orders. Parameters ---------- graph1: networkx.Graph graph2: networkx.Graph Returns ------- abc.Iterator """ def node_match(n1, n2): for attr in ["element", "hcount"]: if n1[attr] != n2[attr]: return False return True if len(graph1) != len(graph2): raw_matches = iter([]) else: graph1_no_hatoms = remove_explicit_hydrogens(graph1) graph2_no_hatoms = remove_explicit_hydrogens(graph2) GM = nx.isomorphism.GraphMatcher( graph1_no_hatoms, graph2_no_hatoms, node_match=node_match, ) raw_matches = GM.subgraph_isomorphisms_iter() try: mapping = next(raw_matches) # Add hydrogen atoms to the mapping for node in graph1.nodes: if graph1.nodes[node].get("element") == "H" and node not in mapping: anchor = list(graph1.neighbors(node))[0] hatoms_target = [ n for n in graph1.neighbors(anchor) if graph1.nodes[n]["element"] == "H" ] hatoms_ref = [ n for n in graph2.neighbors(mapping[anchor]) if graph2.nodes[n]["element"] == "H" ] # Enforced by graph matching, but assert to catch any zip-masking issues assert len(hatoms_target) == len(hatoms_ref) mapping.update(dict(zip(hatoms_target, hatoms_ref))) except StopIteration: mapping = [] return mapping
def _most_common(_list): return max(set(_list), key=_list.count)
[docs] def get_mappings(cg, univ, _match, mappings): """ Assigns each coarse CG node a resname and resid based on the all-atom universe. If a bead is split between residues, the resname and resid of the majority of atoms is used. Parameters ---------- cg: networkx.Graph The coarse molecule graph. univ: :class:`fast_forward.universe_handler.UniverseHandler` The universe storing the fine-grained molecule information. _match: dict A dict mapping the coarse atoms to fine-grained atoms. mappings: dict[:class:`fast_forward.map_file_parser.Mapping`] A dict of mapping objects. Returns ------- dict[:class:`fast_forward.map_file_parser.Mapping`] Updated mappings. """ target_resids = {} for bead in cg.nodes: atoms = cg.nodes[bead]['graph'].nodes # if no resname mapping is provided everything is defaulted # to RES resname_default = "RES" # we cannot simply take the resids provided in the input # because we allow mapping across residues in which case # there is no unique resid and the most common one is not # safe enough resid_default = 1 resname = cg.nodes[bead].get('resname', resname_default) resid = cg.nodes[bead].get('resid', resid_default) cg.nodes[bead]['resid'] = resid cg.nodes[bead]['resname'] = resname mapping = mappings.get(resname, Mapping(resname, resname)) target_resid = target_resids.get(resname, resid) target_resids[resname] = target_resid if resid != target_resids[resname]: continue for atom in atoms: weight = np.float32(cg.nodes[bead]['graph'].nodes[atom].get('weight', 1)) mapping.add_atom( # we need to append the bead index here because in a cgsmiles string # the bead names do not need to be unique but the mapping infrastructure # assumes this. Making this more flexible should be a future objective cg.nodes[bead]["fragname"]+f"{bead}", _match[atom], atom=univ.atoms[_match[atom]].name, weight=weight, ) mappings[resname] = mapping return mappings
def _annotate_vs(cg_graph): """ Identifies virtual sites in the CG graph and annotates them. If a node in cg_graph has no mapped atom positions, it is treated as a virtual site. Its mapped position is composed as the union of fragment graphs of neighboring atoms, which is equivalent to a virtual_sitesn mapping. Parameters ---------- cg_graph: networkx.Graph The graph describing a coarse molecule. """ for node in cg_graph.nodes: if len(cg_graph.nodes[node].get('graph', [])) == 0: g = nx.Graph() for neigh in cg_graph.neighbors(node): g.add_nodes_from(list(cg_graph.nodes[neigh]['graph'].nodes)) cg_graph.nodes[node]['graph'] = g def _annotate_residues(cg, res): """ Propagates resname and resid from a residue-level graph down to CG nodes. Parameters ---------- cg: networkx.Graph The coarse-grained molecule graph whose nodes will be annotated. res: networkx.Graph The residue-level graph produced by an extra resolver pass. """ for node in res: resname = res.nodes[node]["fragname"] resid = res.nodes[node]["fragid"] for cgnode in res.nodes[node]['graph'].nodes: cg.nodes[cgnode]["resname"] = resname cg.nodes[cgnode]["resid"] = resid def _resolve_cg_match(cgs_str, mol_graph): """ Resolves a cgsmiles string and returns the matched (cg, aa, match) triple, or None if the string does not match mol_graph. Parameters ---------- cgs_str: str A cgsmiles string encoding the CG mapping. mol_graph: networkx.Graph The all-atom molecule graph to match against. Returns ------- tuple[networkx.Graph, networkx.Graph, dict] or None """ resolver = MoleculeResolver.from_string(cgs_str, last_all_atom=True) nres = cgs_str.count("{") assert nres > 1, "cgsmiles string must contain at least two resolution levels." if nres > 2: res, _ = resolver.resolve() cg, aa = resolver.resolve() if nres > 2: _annotate_residues(cg, res) _annotate_vs(cg) match = find_one_graph_match(aa, mol_graph) return (cg, aa, match) if match else None def _collect_bead_data(cg, match, mol_idxs, mol_graph, bead_count, res_iter): """ Iterates over all molecule instances and collects per-bead mapped atoms, weights, and bead indices. Parameters ---------- cg: networkx.Graph match: dict Mapping from CG atom indices to all-atom indices (for one molecule instance). mol_idxs: list[int] Indices of all instances of this molecule in the universe. mol_graph: networkx.Graph bead_count: int Running bead index to continue from. res_iter: ResidueIter Returns ------- tuple[list, list, list, int, ResidueIter] mapped_atoms, weights, bead_idxs, updated bead_count, res_iter """ mapped_atoms, weights, bead_idxs = [], [], [] prev_resid = -1 offset = 0 for _ in mol_idxs: for bead in cg.nodes: resid = cg.nodes[bead]['resid'] resname = cg.nodes[bead]['resname'] if prev_resid != resid: prev_resid = resid res_iter.add_residue(resid=resid, resname=resname) atoms = cg.nodes[bead]['graph'].nodes mapped_atoms.append( numba.typed.List([match[atom] + offset for atom in atoms]) ) bead_weights = nx.get_node_attributes(cg.nodes[bead]['graph'], 'weight') weights.append( np.array([bead_weights.get(atom, 1.0) for atom in atoms], dtype=np.float32) ) bead_idxs.append(bead_count) bead_count += 1 offset += len(mol_graph) return mapped_atoms, weights, bead_idxs, bead_count, res_iter
[docs] def cgsmiles_to_mapping(univ, cgsmiles_strs, mol_names, mol_matching=True): """ Given a list of mappings described by cgsmiles strings, maps atom indices to CG beads using graph isomorphism. Parameters ---------- univ: :class:`fast_forward.universe_handler.UniverseHandler` cgsmiles_strs: list[str] mol_names: list[str] mol_matching: bool If False, the order of cgsmiles strings is assumed to match the order of molecules. Returns ------- list, list, dict """ all_mapped_atoms, all_bead_idxs, all_weights = [], [], [] mappings = {} bead_count = 0 res_iter = ResidueIter() for idx, mol_name in enumerate(mol_names): mol_graph = univ.molecule_graphs[mol_name] candidates = [cgsmiles_strs[idx]] if mol_matching else cgsmiles_strs # Find the first cgsmiles string that matches this molecule resolved = None for cgs_str in candidates: resolved = _resolve_cg_match(cgs_str, mol_graph) if resolved is not None: break if resolved is None: raise SyntaxError( f'No matching cgsmiles string found for molecule {mol_name}.' ) cg, aa, match = resolved mappings = get_mappings(cg, univ, match, mappings) mol_idxs = univ.mol_idxs_by_name[mol_name] mapped_atoms, weights, bead_idxs, bead_count, res_iter = _collect_bead_data( cg, match, mol_idxs, mol_graph, bead_count, res_iter ) all_mapped_atoms.extend(mapped_atoms) all_weights.extend(weights) all_bead_idxs.extend(bead_idxs) return ( numba.typed.List(all_mapped_atoms), numba.typed.List(all_bead_idxs), mappings, numba.typed.List(all_weights), res_iter, )