"""
A re-implementation of DSSP algorithm :footcite:p:`Kabsch1983`, taken from
*pydssp* v.0.9.0 (https://github.com/ShintaroMinami/PyDSSP) by Shintaro Minami,
distributed under MIT license.
Current implementation doesn't use `einops` as a dependency, instead directly
using `numpy` operations for axis rearrangement. However, this implementation
does not allow for batch computation, in contrast with `pydssp`, since it's
designed to be used in per-frame manner in protein trajectories.
"""
import numpy as np
from MDAnalysis.lib.distances import capped_distance, minimize_vectors
CONST_Q1Q2 = 0.084
CONST_F = 332
DEFAULT_CUTOFF = -0.5
DEFAULT_MARGIN = 1.0
HBOND_SEARCH_CUTOFF = 5.0
def _upsample(a: np.ndarray, window: int) -> np.ndarray:
"""Performs array upsampling with given window along given axis.
Example
-------
.. code-block:: python
hbmap = np.arange(4*4).reshape(4,4)
print(hbmap)
# [[ 0 1 2 3]
# [ 4 5 6 7]
# [ 8 9 10 11]
# [12 13 14 15]]
print(_upsample(hbmap))
# [[[[ 0 1 2]
# [ 4 5 6]
# [ 8 9 10]]
# [[ 1 2 3]
# [ 5 6 7]
# [ 9 10 11]]]
# [[[ 4 5 6]
# [ 8 9 10]
# [12 13 14]]
# [[ 5 6 7]
# [ 9 10 11]
# [13 14 15]]]]
Parameters
----------
a : np.ndarray
input array
window : int
upsample window
Returns
-------
np.ndarray
unfolded array
"""
return _unfold(_unfold(a, window, -2), window, -2)
def _unfold(a: np.ndarray, window: int, axis: int):
"Helper function for 2D array upsampling"
idx = (
np.arange(window)[:, None]
+ np.arange(a.shape[axis] - window + 1)[None, :]
)
unfolded = np.take(a, idx, axis=axis)
return np.moveaxis(unfolded, axis - 1, -1)
def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
"""Fills in hydrogen atoms positions if they are absent, under the
assumption that C-N-H and H-N-CA angles are perfect 120 degrees,
and N-H bond length is 1.01 A.
Parameters
----------
coord : np.ndarray
input coordinates in Angstrom, shape (n_residues, 4, 3),
where second axes corresponds to (N, CA, C, O) atom coordinates
Returns
-------
np.ndarray
coordinates of additional hydrogens, shape (n_residues-1, 3)
.. versionadded:: 2.8.0
"""
# C_i, N_i, H_i and CA_{i+1} are all in the peptide bond plane
# we wanna get C_{i+1} - N_{i} vectors and normalize them
# ---------
# v1 = vec(C_i, N_i)
# v2 = vec(CA_{i+1}, N_i)
# v3 = vec(N_i, H_i) = ?
# we use the assumption that all the angles are 120 degrees,
# and |v3| = 1.01, hence
# we can derive v3 = (v1/|v1| + v2/|v2|)*|v3|
# get v1 = vec(C_i, N_i)
vec_cn = coord[1:, 0] - coord[:-1, 2]
vec_cn = vec_cn / np.linalg.norm(vec_cn, axis=-1, keepdims=True)
# get v2 = vec(CA_{i+1}, N_{i})
vec_can = coord[1:, 0] - coord[1:, 1]
vec_can = vec_can / np.linalg.norm(vec_can, axis=-1, keepdims=True)
vec_nh = vec_cn + vec_can
vec_nh = vec_nh / np.linalg.norm(vec_nh, axis=-1, keepdims=True)
# vec_(0, H) = vec(0, N) + vec_nh
return coord[1:, 0] + 1.01 * vec_nh
def get_hbond_map(
coord: np.ndarray,
donor_mask: np.ndarray | None = None,
cutoff: float = DEFAULT_CUTOFF,
margin: float = DEFAULT_MARGIN,
return_e: bool = False,
box: np.ndarray | None = None,
) -> np.ndarray:
"""Returns hydrogen bond map
Parameters
----------
coord : np.ndarray
input coordinates in either (n, 4, 3) or (n, 5, 3) shape
(without or with hydrogens respectively), where ``n`` is number of residues.
If hydrogens are not present, then ideal positions (see :func:_get_hydrogen_atom_positions)
are used.
donor_mask : np.array
Mask out any hydrogens that should not be considered (in particular HN
in PRO). If ``None`` then all H will be used (behavior up to 2.10.0).
.. versionadded:: 2.10.0
cutoff : float, optional
cutoff, by default DEFAULT_CUTOFF
margin : float, optional
margin, by default DEFAULT_MARGIN
return_e : bool, optional
if to return energy instead of hbond map, by default False
box : array_like, optional
The unitcell dimensions of the system, which can be orthogonal or
triclinic and must be provided in the same format as returned by
:attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`:
``[lx, ly, lz, alpha, beta, gamma]``.
.. versionadded:: 2.11.0
Returns
-------
np.ndarray
output hbond map or energy depending on return_e param
.. versionadded:: 2.8.0
.. versionchanged:: 2.10.0
Support masking of hydrogen donors via `donor_mask` (especially needed
for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1.
.. versionchanged:: 2.11.0
Speedup with KDTree and support periodic boundary checking via `box` param.
"""
n_residues, n_atom_types, _xyz = coord.shape
assert n_atom_types in (
4,
5,
), "Number of atoms should be 4 (N,CA,C,O) or 5 (N,CA,C,O,H)"
if n_atom_types == 4:
h_1 = _get_hydrogen_atom_position(coord)
elif n_atom_types == 5:
h_1 = coord[1:, 4]
coord = coord[:, :4]
else: # pragma: no cover
raise ValueError(
"Number of atoms should be 4 (N,CA,C,O) or 5 (N,CA,C,O,H)"
)
# after this:
# h.shape == (n_residues, 3)
# coord.shape == (n_residues, 4, 3)
n_atoms, c_atoms, o_atoms = coord[1:, 0], coord[:-1, 2], coord[:-1, 3]
# We can reduce the need for a complete pairwise distance matrix by first searching for
# candidate pairs within a certain distance cutoff and only computing the energy
# for these relevant pairs rather than "potential" HBonds between far apart residues
pairs, d_on = capped_distance(
n_atoms,
o_atoms,
max_cutoff=HBOND_SEARCH_CUTOFF,
box=box,
)
# Exclude local pairs (i, i), (i, i+1), (i, i+2) that are too close for SS HBonds
local_mask = abs(pairs[:, 0] - pairs[:, 1]) >= 2
pairs = pairs[local_mask]
# Exclude donor H absence (Proline)
if donor_mask is not None:
donor_indices = np.where(np.array(donor_mask) == 0)[0]
mask = ~np.isin(pairs[:, 0], donor_indices - 1)
pairs = pairs[mask]
# compute distances and energy as previously but only for the potential pairs
# still returning the same energy matrix that would have otherwise been made
o_indices = pairs[:, 1]
n_indices = pairs[:, 0]
# d_on = d_on[local_mask]
def _distances(x, y, box=None):
if box is None:
return np.linalg.norm(x[o_indices] - y[n_indices], axis=-1)
else:
return np.linalg.norm(
minimize_vectors(x[o_indices] - y[n_indices], box=box),
axis=-1,
)
d_on = _distances(o_atoms, n_atoms, box)
d_ch = _distances(c_atoms, h_1, box)
d_oh = _distances(o_atoms, h_1, box)
d_cn = _distances(c_atoms, n_atoms, box)
# electrostatic interaction energy
# e[i, j] = e(CO_i) - e(NH_j)
e = np.zeros((n_residues, n_residues))
e[n_indices + 1, o_indices] = (
CONST_Q1Q2
* (1.0 / d_on + 1.0 / d_ch - 1.0 / d_oh - 1.0 / d_cn)
* CONST_F
)
if return_e: # pragma: no cover
return e
hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin)
hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2
return hbond_map
[docs]def assign(
coord: np.ndarray,
donor_mask: np.ndarray | None = None,
box: np.ndarray | None = None,
) -> np.ndarray:
"""Assigns secondary structure for a given coordinate array,
either with or without assigned hydrogens
Parameters
----------
coord : np.ndarray
input coordinates in either (n, 4, 3) or (n, 5, 3) shape,
without or with hydrogens, respectively. Second dimension `k` represents
(N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates
(when k=5).
donor_mask : np.array | None, optional
Mask out any hydrogens that should not be considered (in particular HN
in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).
.. versionadded:: 2.10.0
box : array_like, optional
The unitcell dimensions of the system, which can be orthogonal or
triclinic and must be provided in the same format as returned by
:attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`:
``[lx, ly, lz, alpha, beta, gamma]``.
.. versionadded:: 2.11.0
Returns
-------
np.ndarray
output (n,) array with one-hot labels in C3 notation ('-', 'H', 'E'),
representing loop, helix and sheet, respectively.
.. versionadded:: 2.8.0
.. versionchanged:: 2.10.0
Support masking of donors.
.. versionchanged:: 2.11.0
Speedup with KDTree and support periodic boundary checking via `box` param.
"""
# get hydrogen bond map
hbmap = get_hbond_map(coord, donor_mask=donor_mask, box=box)
hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form
# identify turn 3, 4, 5
turn3 = np.diagonal(hbmap, offset=3) > 0.0
turn4 = np.diagonal(hbmap, offset=4) > 0.0
turn5 = np.diagonal(hbmap, offset=5) > 0.0
# assignment of helical secondary structures
h3 = np.pad(turn3[:-1] * turn3[1:], [[1, 3]])
h4 = np.pad(turn4[:-1] * turn4[1:], [[1, 4]])
h5 = np.pad(turn5[:-1] * turn5[1:], [[1, 5]])
# helix4 first, as alpha helix
helix4 = h4 + np.roll(h4, 1, 0) + np.roll(h4, 2, 0) + np.roll(h4, 3, 0)
h3 = h3 * ~np.roll(helix4, -1, 0) * ~helix4 # helix4 is higher prioritized
h5 = h5 * ~np.roll(helix4, -1, 0) * ~helix4 # helix4 is higher prioritized
helix3 = h3 + np.roll(h3, 1, 0) + np.roll(h3, 2, 0)
helix5 = (
h5
+ np.roll(h5, 1, 0)
+ np.roll(h5, 2, 0)
+ np.roll(h5, 3, 0)
+ np.roll(h5, 4, 0)
)
# identify bridge
unfoldmap = _upsample(hbmap, 3) > 0.0
unfoldmap_rev = np.swapaxes(unfoldmap, 0, 1)
p_bridge = (unfoldmap[:, :, 0, 1] * unfoldmap_rev[:, :, 1, 2]) + (
unfoldmap_rev[:, :, 0, 1] * unfoldmap[:, :, 1, 2]
)
p_bridge = np.pad(p_bridge, [[1, 1], [1, 1]])
a_bridge = (unfoldmap[:, :, 1, 1] * unfoldmap_rev[:, :, 1, 1]) + (
unfoldmap[:, :, 0, 2] * unfoldmap_rev[:, :, 0, 2]
)
a_bridge = np.pad(a_bridge, [[1, 1], [1, 1]])
# ladder
ladder = (p_bridge + a_bridge).sum(-1) > 0.0
# H, E, L of C3
helix = (helix3 + helix4 + helix5) > 0.0
strand = ladder
loop = ~helix * ~strand
onehot = np.stack([loop, helix, strand], axis=-1)
return onehot