# Source code for scm.plams.mol.identify

```
from collections import OrderedDict
from itertools import combinations
import numpy as np
try:
import networkx
has_networkx = True
except ImportError:
has_networkx = False
from scm.plams.core.functions import add_to_class
from scm.plams.core.private import sha256
from scm.plams.mol.molecule import Molecule
from scm.plams.tools.units import Units
__all__ = ["label_atoms"]
possible_flags = ["BO", "RS", "EZ", "DH", "CO", "H2"]
[docs]def twist(v1, v2, v3, tolerance=None):
"""
Given 3 vectors in 3D space measure their "chirality" with *tolerance*.
Returns a pair. The first element is an integer number measuring the orientation (clockwise vs counterclockwise) of *v1* and *v3* while looking along *v2*. Values 1 and -1 indicate this case and the second element of returned pair is ``None``. Value 0 indicates that *v1*, *v2*, and *v3* are coplanar, and the second element of the returned pair is indicating if two turns made by going *v1*->*v2*->*v3* are the same (left-left, right-right) or the opposite (left-right, right-left).
"""
tolerance = 1e-2 if tolerance is None else tolerance
v1 /= np.linalg.norm(v1)
v2 /= np.linalg.norm(v2)
v3 /= np.linalg.norm(v3)
x = np.dot(v3, np.cross(v1, v2))
if abs(x) <= tolerance: # v1, v2, v3 are coplanar
return 0, int(np.sign(np.dot(np.cross(v1, v2), np.cross(v2, v3))))
return int(np.sign(x)), None
[docs]def bend(v1, v2, tolerance=None):
"""Check if two vectors in 3D space are parallel or perpendicular, with *tolerance* (in degrees).
Returns 1 if *v1* and *v2* are collinear, 2 if they are perpendicular, 0 otherwise."""
tolerance = 7.5 if tolerance is None else tolerance
v1 /= np.linalg.norm(v1)
v2 /= np.linalg.norm(v2)
angle = Units.convert(abs(np.arccos(np.dot(v1, v2))), "rad", "deg")
if angle <= tolerance:
return 1
if abs(angle - 90.0) <= tolerance:
return 2
return 0
[docs]def unique_atoms(atomlist):
"""Filter *atomlist* (list or |Molecule|) for atoms with unique ``IDname``."""
d = {}
for atom in atomlist:
if atom.IDname not in d:
d[atom.IDname] = 0
d[atom.IDname] += 1
return [atom for atom in atomlist if d[atom.IDname] == 1]
[docs]def initialize(molecule):
"""Initialize atom labeling algorithm by setting ``IDname`` and ``IDdone`` attributes for all atoms in *molecule*."""
for at in molecule:
at.IDname = at.symbol
at.IDdone = False
[docs]def clear(molecule):
"""Remove ``IDname`` and ``IDdone`` attributes from all atoms in *molecule*."""
for at in molecule:
if hasattr(at, "IDname"):
del at.IDname
if hasattr(at, "IDdone"):
del at.IDdone
[docs]def iterate(molecule, flags):
"""Perform one iteration of atom labeling alogrithm.
First, mark all atoms that are unique and have only unique neighbors as "done". Then calculate new label for each atom that is not done. Return True if the number of different atom labels increased during this iteration.
"""
names = len(set(at.IDname for at in molecule))
unique = set(unique_atoms(molecule))
for atom in molecule:
if atom in unique and all(N in unique for N in atom.neighbors()):
atom.IDdone = True
if not atom.IDdone:
atom.IDnew = new_name(atom, flags)
for atom in molecule:
if not atom.IDdone:
atom.IDname = atom.IDnew
new_names = len(set(atom.IDname for atom in molecule))
return new_names > names # True means this iteration increased the number of distinct names
[docs]def new_name(atom, flags):
"""Compute new label for *atom*.
The new label is based on the existing label of *atom*, labels of all its neighbors and (possibly) some additional conformational information. The labels of neighbors are not obtained directly by reading neighbor's ``IDname`` but rather by a process called "knocking". The *atom* knocks all its bonds. Each knocked bond returns an identifier describing the atom on the other end of the bond. The identifier is composed of knocked atom's ``IDname`` together with some additional information desribing the character of the bond and knocked atom's spatial environment. The exact behavior of this mechanism is adjusted by the contents of *flags* dictionary (see :func:`label_atoms` for details).
"""
knocks = [knock(atom, bond, flags) for bond in atom.bonds]
knocks.sort(key=lambda x: x[0])
labels = set(i[0] for i in knocks) # types of neighbors
more = []
if flags["RS"] and len(knocks) == 4 and len(labels) == 4: # 4 different neighbors
v1 = atom.vector_to(knocks[0][1])
v2 = atom.vector_to(knocks[1][1])
v3 = atom.vector_to(knocks[2][1])
more.append("RS" + str(twist(v1, v2, v3, flags.get("twist_tol"))))
if flags["CO"] and len(knocks) >= 4:
d = OrderedDict()
for label, at in knocks:
if label not in d:
d[label] = []
d[label].append(at)
for label in d:
if len(d[label]) > 1:
angles = [
bend(atom.vector_to(a), atom.vector_to(b), flags.get("bend_tol"))
for a, b in combinations(d[label], 2)
]
angles.sort()
else: # len(d[label]) == 1
v1 = atom.vector_to(d[label][0])
angles = []
for k in d:
if k != label:
angles.append(sorted(bend(v1, atom.vector_to(a), flags.get("bend_tol")) for a in d[k]))
more.append("CO" + str(angles))
return sha256("|".join([atom.IDname] + [i[0] for i in knocks] + more))
[docs]def knock(A, bond, flags):
"""Atom *A* knocks one of its bonds.
*bond* has to be a bond formed by atom *A*. The other end of this bond (atom S) returns its description, consisting of its ``IDname`` plus, possibly, some additional information. If *BO* flag is set, the description includes the bond order of *bond*. If *EZ* flag is set, the description includes additional bit of information whenever E/Z isomerism is possible. If *DH* flag is set, the description includes additional information for all dihedrals A-S-N-F such that A is a unique neighbor of S and F is a unique neighbor of N.
"""
S = bond.other_end(A)
ret = S.IDname
if flags["BO"] and bond.order != 1:
ret += "BO" + str(bond.order)
if flags["H2"]:
S_nbors = S.neighbors()
S_nbors.remove(A)
if len(S_nbors) == 3 and len(unique_atoms(S_nbors)) == 3:
S_nbors.sort(key=lambda x: x.IDname)
v1, v2, v3 = [S.vector_to(i) for i in S_nbors]
t = twist(v1, v2, v3, flags.get("twist_tol"))
ret += "H2" + str(t)
if flags["EZ"] or flags["DH"]:
S_unique = unique_atoms(S.neighbors())
if A in S_unique: # *A* is a unique neighbor of *S*
S_unique.remove(A)
S_unique.sort(key=lambda x: x.IDname)
for b in S.bonds:
N = b.other_end(S)
if N in S_unique:
N_unique = unique_atoms(N.neighbors())
if S in N_unique:
N_unique.remove(S)
N_unique.sort(key=lambda x: x.IDname)
if N_unique:
F = N_unique[0]
v1 = A.vector_to(S)
v2 = S.vector_to(N)
v3 = N.vector_to(F)
t = twist(v1, v2, v3, flags.get("twist_tol"))
if flags["DH"]:
ret += "DH" + str(t)
elif flags["EZ"] and b.order == 2 and t[0] == 0:
# A-S=N-F are coplanar
ret += "EZ" + str(t[1])
break
return (ret, S)
[docs]def label_atoms(molecule, **kwargs):
"""Label atoms in *molecule*.
Boolean keyword arguments:
* *BO* -- include bond orders
* *RS* -- include R/S stereoisomerism
* *EZ* -- include E/Z stereoisomerism
* *DH* -- include some dihedrals to detect alkane rotamers and syn-anti conformers in cycloalkanes
* *CO* -- include more spatial info to detect different conformation of coordination complexes (flat square, octaedr etc.)
Numerical keyword arguments:
* *twist_tol* -- tolerance for :func:`twist` function
* *bend_tol* -- tolerance for :func:`bend` function
Diherdals considered with *DH* are all the dihedrals A-B-C-D such that A is a unique neighbor or B and D is a unique neighbor of C.
For atoms with 4 or more neighbors, the *CO* flag includes information about relative positions of equivalent/non-equivalent neighbors by checking if vectors from the central atom to the neighbors form 90 or 180 degrees angles.
"""
initialize(molecule)
while iterate(molecule, kwargs):
pass
return molecule
[docs]def molecule_name(molecule):
"""Compute the label of the whole *molecule* based on ``IDname`` attributes of all the atoms."""
names = [atom.IDname for atom in molecule]
names.sort()
return sha256(" ".join(names))
@add_to_class(Molecule)
def label(self, level=1, keep_labels=False, flags=None):
"""Compute the label of this molecule using chosen *level* of detail.
Possible levels are:
* **0**: does not pefrom any atom labeling, returns empirical formula (see :meth:`~scm.plams.mol.molecule.Molecule.get_formula`)
* **1**: only direct connectivity is considered, without bond orders (in other words, treats all the bonds as single bonds)
* **2**: use connectivity and bond orders
* **3**: use connectivity, bond orders and some spatial information to distinguish R/S and E/Z isomers
* **4**: use all above, plus more spatial information to distinguish different rotamers and different types of coordination complexes
If you need more precise control of what is taken into account while computing the label (or adjust the tolerance for geometrical operations) you can use the *flags* argument. It should be a dictionary of parameters recognized by :func:`~scm.plams.mol.identify.label_atoms`. Each of two letter boolean flags has to be present in *flags*. If you use *flags*, *level* is ignored.
The *level* argument can also be a tuple of integers. In that case the labeling algorithm is run multiple times and the returned value is a tuple (with the same length as *level*) containing labels calculated with given levels of detail.
This function, by default, erases ``IDname`` attributes of all atoms at the end. You can change this behavior with *keep_labels* argument.
If the molecule does not contain bonds, :meth:`~scm.plams.mol.molecule.Molecule.guess_bonds` is used to determine them.
.. note::
This method is a new PLAMS feature and its still somewhat experimental. The exact details of the algorithm can, and probably will, change in future. You are more than welcome to provide any feedback or feature requests.
"""
if isinstance(level, (tuple, list)):
return tuple(self.label(i) for i in level)
if flags is None:
if level == 0:
return self.get_formula()
flags = {i: False for i in possible_flags}
if level >= 2:
flags["BO"] = True
if level >= 3:
flags["RS"] = True
flags["EZ"] = True
if level >= 4:
flags["DH"] = True
flags["CO"] = True
if len(self.bonds) == 0:
self.guess_bonds()
clear(self)
label_atoms(self, **flags)
ret = molecule_name(self)
if not keep_labels:
clear(self)
return ret
@add_to_class(Molecule)
def set_local_labels(self, niter=2, flags=None):
"""
Set atomic labels (IDnames) that are unique for local structures of a molecule
* ``niter`` -- The number of iterations in the atom labeling scheme
The idea of this method is that the number of iterations can be specified.
If kept low (default niter), local structures over different molecules will have the same label.
"""
if flags is None:
flags = {i: False for i in possible_flags}
initialize(self)
for i in range(niter):
iterate(self, flags)
if has_networkx:
def get_graph(mol, dic, level=1):
"""
Create a networkx graph for this molecule that can be used to compare (all info is in the edge.weight attribute)
"""
nats = len(mol)
mol = mol.copy()
if len(mol.bonds) == 0:
mol.guess_bonds()
if not hasattr(mol.atoms[0], "IDname"):
mol.label(level=1, keep_labels=True)
# Get the connectivigty matrix (remove bond orders)
matrix = mol.bond_matrix()
matrix[matrix > 0] = 1
matrix = matrix.astype(np.int32)
# Multilpy the graph entries with the unique labels for each atom
identifiers = np.array([dic[at.IDname] if at.IDname in dic.keys() else None for at in mol.atoms])
if None in identifiers:
return None
identifiers = identifiers.astype(np.int32)
matrix *= identifiers.reshape((1, nats))
matrix *= identifiers.reshape((nats, 1))
# Create the graph
graph = networkx.from_numpy_matrix(matrix)
return graph
@add_to_class(Molecule)
def find_permutation(self, other, level=1):
"""
Reorder atoms in this molecule to match the order in some *other* molecule. The reordering is applied only if the perfect match is found. Returned value is the applied permutation (as a list of integers) or ``None``, if no reordering was performed.
"""
# Get bonds and unique atomIDs if needed
if len(self.bonds) == 0:
self.guess_bonds()
if not hasattr(self.atoms[0], "IDname"):
self.label(level=1, keep_labels=True)
# Link atom IDs to integers
dic = {}
for at in self.atoms:
if not at.IDname in dic.keys():
dic[at.IDname] = max([v for v in dic.values()]) + 1 if len(dic) > 0 else 1
# Create the graphs
graph = get_graph(self, dic, level=1)
graph2 = get_graph(other, dic, level=1)
if graph2 is None:
return None
# Match
GM = networkx.isomorphism.GraphMatcher(
graph, graph2, edge_match=networkx.isomorphism.categorical_edge_match("weight", 1)
)
isomorphic = GM.is_isomorphic()
if not isomorphic:
return None
# Invert the solution dictionary, to be able to reorder the first graph
dic = {}
for k, v in GM.mapping.items():
dic[v] = k
keys = sorted([key for key in dic.keys()])
indices = [dic[key] for key in keys]
return indices
@add_to_class(Molecule)
def reorder(self, other, level=1):
"""
Reorder atoms in this molecule to match the order in some *other* molecule. The reordering is applied only if the perfect match is found. Returned value is a new ``Molecule`` object, or ``None`` if no reordering was performed. See also :func:`~scm.plams.mol.identify.find_permutation`.
"""
indices = self.find_permutation(other, level=level)
if indices is None:
return None
mol = self.get_fragment(indices)
return mol
```