DADApy¶
DADApy is a Python package for distance-based analysis of high-dimensional spaces. Find more details on the documentation and GitHub repository.
This tutorial covers three progressively more complex case studies:
- Carbamazepine — a flexible organic molecule; introduces pairwise-distance features, structural alignment, density estimation, clustering, and information imbalance.
- Zeolite–water (simple Al site) — extends the workflow to a periodic system with water adsorption.
- AWB zeolite (scaled) — scales to larger supercells with different Al:Si ratios using a reusable helper pipeline.
All trajectory loading and distance calculations use ASE (Atomic Simulation Environment) native functions.
1. Carbamazepine: Molecular Conformational Analysis¶
Carbamazepine (CBZ) is a pharmaceutical compound whose conformational flexibility is clinically relevant. Here we analyse an NVT MD trajectory at 1000 K and identify the dominant conformational states.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from ase.io import read, write
from ase.visualize import view
from dadapy import Data
from dadapy import plot as dpl
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
import platform
if platform.system() == "Windows":
# DADApy 0.3.1 bug: kstar.py passes dist_indices as int64 to a Cython
# function compiled to expect int32. Patch it here.
import time as _t, warnings as _w
import dadapy._cython.cython_density as _cd
from dadapy.kstar import KStar
def _kstar_fixed(self, Dthr=23.92812698):
if self.intrinsic_dim is None:
_w.warn("Intrinsic dimension undefined, computing with compute_id_2NN()")
self.compute_id_2NN()
if self.verb: print(f"kstar estimation started, Dthr = {Dthr}")
t0 = _t.time()
self.set_kstar(_cd._compute_kstar(
self.intrinsic_dim, self.N, self.maxk, Dthr,
self.dist_indices.astype(np.int32),
self.distances.astype(np.float64),
))
if self.verb: print(f"{_t.time()-t0:.2f}s computing kstar")
KStar.compute_kstar = _kstar_fixed
1.1 Loading the Trajectory¶
ase.io.read with index=':' reads all frames from an extended xyz, returning a list of Atoms objects, and carrying positions, symbols, cell parameters, and atomic.
traj_ase = read("../../data/dadapy/carbamazepine-nvt-T1000.0-traj.extxyz", index=":")
n_frames = len(traj_ase)
n_atoms = len(traj_ase[0])
print(f"Frames : {n_frames}")
print(f"Atoms/frame : {n_atoms}")
print(f"Elements : {sorted(set(traj_ase[0].get_chemical_symbols()))}")
Frames : 332 Atoms/frame : 30 Elements : ['C', 'H', 'N', 'O']
# Quick 3-D snapshot of the first 6 frames
fig = plt.figure(figsize=(18, 12))
for i in range(6):
atoms = traj_ase[i]
pos = atoms.get_positions()
symbols = atoms.get_chemical_symbols()
ax = fig.add_subplot(2, 3, i + 1, projection='3d')
ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2], s=40)
for sym, p in zip(symbols, pos):
ax.text(*p, sym, fontsize=8, color='red')
ax.set_title(f'Frame {i}')
ax.set_xlabel('X (\u00c5)'); ax.set_ylabel('Y (\u00c5)'); ax.set_zlabel('Z (\u00c5)')
plt.tight_layout()
plt.show()
1.2 Building the Feature Matrix¶
We represent each frame as the vector of all unique pairwise atomic distances. For 30 atoms this gives $\binom{30}{2} = 435$ features.
Atoms.get_all_distances() returns an $N \times N$ distance matrix; np.triu_indices(N, k=1) selects the upper triangle (unique pairs).
n_pairs = n_atoms * (n_atoms - 1) // 2
print(f"Unique atom pairs: {n_pairs}")
# Shape: (n_frames, n_pairs)
distance_vectors = np.array([
atoms.get_all_distances()[np.triu_indices(n_atoms, k=1)]
for atoms in traj_ase
])
print(f"Feature matrix shape: {distance_vectors.shape}")
Unique atom pairs: 435 Feature matrix shape: (332, 435)
# Heatmap of the pairwise distance matrix for frame 0
atom_labels = traj_ase[0].get_chemical_symbols()
dist_matrix0 = traj_ase[0].get_all_distances()
df0 = pd.DataFrame(dist_matrix0, index=atom_labels, columns=atom_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(df0, mask=np.tril(np.ones_like(df0, dtype=bool)),
annot=False, cmap='viridis', square=True)
plt.title("Pairwise Distance Matrix - Frame 0 (\u00c5)")
plt.show()
1.3 Structural Alignment¶
To remove rigid-body motion from the trajectory, each frame is:
- Centred at its centre of mass.
- Rotated onto the principal axes of inertia of the reference frame (frame 0).
- Refined with the Kabsch algorithm to minimise RMSD to the reference.
def _inertia_tensor(coords, masses):
com = np.average(coords, axis=0, weights=masses)
r = coords - com
I = np.zeros((3, 3))
for m, (x, y, z) in zip(masses, r):
I[0, 0] += m * (y*y + z*z)
I[1, 1] += m * (x*x + z*z)
I[2, 2] += m * (x*x + y*y)
I[0, 1] -= m * x * y
I[0, 2] -= m * x * z
I[1, 2] -= m * y * z
I[1, 0] = I[0, 1]; I[2, 0] = I[0, 2]; I[2, 1] = I[1, 2]
return I, com
def _kabsch(P, Q):
"""Rotation matrix R minimising RMSD between P and Q."""
V, _, Wt = np.linalg.svd(P.T @ Q)
d = np.sign(np.linalg.det(V @ Wt))
return V @ np.diag([1.0, 1.0, d]) @ Wt
def align_trajectory(traj_pos, masses, ref_idx=0):
"""Align all frames to ref_idx via principal-axes + Kabsch."""
I_ref, com_ref = _inertia_tensor(traj_pos[ref_idx], masses)
evals, evecs = np.linalg.eigh(I_ref)
axes = evecs[:, np.argsort(evals)]
if np.linalg.det(axes) < 0:
axes[:, 2] *= -1
ref_aligned = (traj_pos[ref_idx] - com_ref) @ axes
aligned = []
for pos in traj_pos:
_, com = _inertia_tensor(pos, masses)
centered = pos - com
R = _kabsch(centered, ref_aligned)
aligned.append(centered @ R)
return np.array(aligned)
masses = traj_ase[0].get_masses()
traj_pos = np.array([a.get_positions() for a in traj_ase])
traj_aligned = align_trajectory(traj_pos, masses)
print(f"Aligned trajectory shape: {traj_aligned.shape}")
Aligned trajectory shape: (332, 30, 3)
1.4 Intrinsic Dimensionality¶
Before density estimation it is useful to know the intrinsic dimension (ID) — the effective number of degrees of freedom needed to describe the data.
DADApy provides two estimators:
- 2NN (Two-Nearest-Neighbours): fast, scale-dependent.
- Gride: more robust multi-scale estimator.
Both are plotted against the neighbourhood scale. A plateau in the ID curve indicates the true dimensionality.
data_id = Data(distance_vectors)
ids_2nn, errs_2nn, scales_2nn = data_id.return_id_scaling_2NN()
ids_gride, errs_gride, scales_gride = data_id.return_id_scaling_gride(range_max=100)
plt.figure(figsize=(7, 4))
plt.errorbar(scales_2nn, ids_2nn, yerr=errs_2nn, fmt='o-', label='2NN', alpha=0.8)
plt.errorbar(scales_gride, ids_gride, yerr=errs_gride, fmt='s-', label='Gride', alpha=0.8)
plt.xscale('log')
plt.xlabel('Scale (neighbours k)')
plt.ylabel('Intrinsic Dimension')
plt.title('ID Scaling - Carbamazepine')
plt.legend(); plt.grid(True, alpha=0.3); plt.tight_layout(); plt.show()
1.5 Density Estimation and Clustering¶
DADApy estimates local log-density with the PAk estimator (adaptive kNN), then clusters the landscape with ADP (Automatic Density Peaks). The parameter Z sets the minimum density-peak significance; halo=True labels uncertain boundary points as noise (cluster = -1).
data = Data(distance_vectors, verbose=True)
data.compute_id_2NN()
data.compute_distances(maxk=20)
# Windows: DADApy's Cython backend expects int32 dist_indices, not int64
data.dist_indices = data.dist_indices.astype(np.int32)
data.distances = data.distances.astype(np.float64)
log_den, log_den_err = data.compute_density_PAk()
clusters = data.compute_clustering_ADP(Z=1.65, halo=True)
unique_clusters, counts = np.unique(clusters, return_counts=True)
print("\nCluster populations:")
for c, n in zip(unique_clusters, counts):
label = 'noise' if c == -1 else f'Cluster {c}'
print(f" {label}: {n} frames ({n / len(clusters) * 100:.1f}%)")
Computation of distances started Computation of the distances up to 100 NNs started 0.02 seconds for computing distances ID estimation finished: selecting ID of 13.641089433088126 Computation of distances started Computation of the distances up to 20 NNs started 0.01 seconds for computing distances kstar estimation started, Dthr = 23.92812698 0.00s computing kstar PAk density estimation started 0.01 seconds optimizing the likelihood for all the points PAk density estimation finished Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 7.557868957519531e-05 sec Clustering finished, 3 clusters found total time is, 0.0017993450164794922 Cluster populations: noise: 216 frames (65.1%) Cluster 0: 49 frames (14.8%) Cluster 1: 24 frames (7.2%) Cluster 2: 42 frames (12.7%) Cluster 11: 1 frames (0.3%)
xy = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(distance_vectors)
cmap_c = plt.cm.get_cmap('tab10', len(unique_clusters))
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
sc = ax1.scatter(xy[:, 0], xy[:, 1], c=log_den, cmap='viridis', s=10)
ax1.set_title('PAk log-density'); ax1.set_xticks([]); ax1.set_yticks([])
plt.colorbar(sc, ax=ax1, label='log density')
for i, c in enumerate(unique_clusters):
mask = clusters == c
ax2.scatter(xy[mask, 0], xy[mask, 1],
color='lightgray' if c == -1 else cmap_c(i),
s=10, alpha=0.3 if c == -1 else 0.9,
label='noise' if c == -1 else f'Cluster {c}')
ax2.set_title('ADP Clusters'); ax2.set_xticks([]); ax2.set_yticks([])
ax2.legend(markerscale=2)
plt.suptitle('Carbamazepine - t-SNE projection', y=1.01)
plt.tight_layout(); plt.show()
# Free-energy landscape: F = -log(density), projected onto PCA
pca2d = PCA(n_components=2).fit_transform(distance_vectors)
free_energy = -log_den
plt.figure(figsize=(8, 6))
sc = plt.scatter(pca2d[:, 0], pca2d[:, 1], c=free_energy, cmap='viridis', s=10, alpha=0.7)
plt.tricontourf(pca2d[:, 0], pca2d[:, 1], free_energy, levels=50, cmap='viridis', alpha=0.3)
plt.colorbar(sc, label='Relative Free Energy (k$_B$T)')
plt.title('Free Energy Landscape - Carbamazepine (PCA)')
plt.xlabel('PC1'); plt.ylabel('PC2')
plt.show()
# Cluster hierarchy dendrogram
dpl.get_dendrogram(data, cmap='Set2', logscale=False)
1.6 Cluster Characterisation¶
For each cluster we compute structural descriptors:
- Planarity of the ring system (RMS out-of-plane deviation, in \u00c5)
- Radius of gyration $R_g$
- Dihedral angles (ring twist and substituent)
- N-O hydrogen-bond donor distance
DIHEDRALS = {
'ring_twist' : (10, 4, 1, 3),
'substituent': (2, 13, 1, 3),
}
HBONDS = [(2, 0)] # N-O distance
def dihedral(p0, p1, p2, p3):
b0 = p0 - p1
b1 = p2 - p1
b2 = p3 - p2
b1 /= np.linalg.norm(b1)
v = b0 - np.dot(b0, b1) * b1
w = b2 - np.dot(b2, b1) * b1
return np.degrees(np.arctan2(np.dot(np.cross(b1, v), w), np.dot(v, w)))
def planarity(coords):
X = coords - coords.mean(axis=0)
_, _, V = np.linalg.svd(X)
return np.sqrt(np.mean(np.dot(X, V[-1]) ** 2))
def radius_of_gyration(coords):
c = coords.mean(axis=0)
return np.sqrt(((coords - c) ** 2).sum(axis=1).mean())
def analyze_cluster(frames):
plan, rg = [], []
dih = {k: [] for k in DIHEDRALS}
hb = {i: [] for i in range(len(HBONDS))}
for c in frames:
plan.append(planarity(c))
rg.append(radius_of_gyration(c))
for name, (a, b, d, e) in DIHEDRALS.items():
dih[name].append(dihedral(c[a], c[b], c[d], c[e]))
for i, (a, b) in enumerate(HBONDS):
hb[i].append(np.linalg.norm(c[a] - c[b]))
return {
'Planarity (\u00c5)' : (np.mean(plan), np.std(plan)),
'Rg (\u00c5)' : (np.mean(rg), np.std(rg)),
**{f'Dih_{k} (\u00b0)' : (np.mean(v), np.std(v)) for k, v in dih.items()},
**{f'HBond_{i} (\u00c5)': (np.mean(v), np.std(v)) for i, v in hb.items()},
}
cluster_frames_dict = {c: np.where(clusters == c)[0] for c in unique_clusters}
rows = []
for cid, frame_ids in cluster_frames_dict.items():
if cid == -1:
continue
props = analyze_cluster(traj_aligned[frame_ids])
row = {'Cluster': cid}
row.update({k: f"{v[0]:.3f} \u00b1 {v[1]:.3f}" for k, v in props.items()})
rows.append(row)
pd.DataFrame(rows).set_index('Cluster')
| Planarity (Å) | Rg (Å) | Dih_ring_twist (°) | Dih_substituent (°) | HBond_0 (Å) | |
|---|---|---|---|---|---|
| Cluster | |||||
| 0 | 1.092 ± 0.121 | 3.312 ± 0.036 | -17.684 ± 117.320 | -1.301 ± 162.168 | 2.312 ± 0.077 |
| 1 | 1.062 ± 0.152 | 3.326 ± 0.038 | -60.328 ± 107.955 | -14.791 ± 150.580 | 2.312 ± 0.068 |
| 2 | 1.011 ± 0.154 | 3.345 ± 0.041 | -62.745 ± 112.511 | -3.490 ± 45.854 | 2.298 ± 0.090 |
| 11 | 0.946 ± 0.000 | 3.371 ± 0.000 | 141.067 ± 0.000 | 27.144 ± 0.000 | 2.124 ± 0.000 |
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for cid, frame_ids in cluster_frames_dict.items():
if cid == -1:
continue
frames = traj_aligned[frame_ids]
label = f'Cluster {cid}'
axes[0].hist([planarity(f) for f in frames], alpha=0.5, label=label, bins=30)
axes[1].hist([radius_of_gyration(f) for f in frames], alpha=0.5, label=label, bins=30)
axes[2].hist([dihedral(f[10], f[4], f[1], f[3]) for f in frames], alpha=0.5, label=label, bins=30)
axes[0].set_xlabel('Planarity (\u00c5)'); axes[0].set_title('Ring planarity')
axes[1].set_xlabel('Rg (\u00c5)'); axes[1].set_title('Radius of gyration')
axes[2].set_xlabel('Angle (\u00b0)'); axes[2].set_title('Ring-twist dihedral')
for ax in axes:
ax.legend()
plt.tight_layout(); plt.show()
# Inter-cluster transitions along the trajectory
transitions = Counter(
(clusters[i], clusters[i + 1])
for i in range(len(clusters) - 1)
if clusters[i] != clusters[i + 1]
)
print('Most frequent inter-cluster transitions:')
for (a, b), count in transitions.most_common(10):
print(f' {a} \u2192 {b}: {count}')
Most frequent inter-cluster transitions: -1 → 2: 31 0 → -1: 30 -1 → 0: 30 2 → -1: 30 -1 → 1: 16 1 → -1: 16 0 → 2: 4 2 → 0: 4 2 → 11: 1 11 → -1: 1
1.7 Z-Parameter Sensitivity¶
The number of clusters depends on Z. Scanning Z and plotting the resulting cluster count reveals stable plateaus that are robust to the choice of Z.
Z_values = np.linspace(0.5, 3.0, 15)
n_clusters_Z = [len(set(data.compute_clustering_ADP(Z=Z, halo=True)) - {-1})
for Z in Z_values]
plt.figure(figsize=(6, 4))
plt.plot(Z_values, n_clusters_Z, 'o-')
plt.xlabel('Z')
plt.ylabel('Number of clusters')
plt.title('Cluster count vs Z - Carbamazepine')
plt.grid(True, alpha=0.3); plt.tight_layout(); plt.show()
Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 6.937980651855469e-05 sec Clustering finished, 7 clusters found total time is, 0.0010287761688232422 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.001 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 5.2928924560546875e-05 sec Clustering finished, 4 clusters found total time is, 0.0012669563293457031 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 3.933906555175781e-05 sec Clustering finished, 4 clusters found total time is, 0.0008783340454101562 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 6.604194641113281e-05 sec Clustering finished, 3 clusters found total time is, 0.0009827613830566406 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 4.076957702636719e-05 sec Clustering finished, 3 clusters found total time is, 0.0008940696716308594 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 3.4332275390625e-05 sec Clustering finished, 3 clusters found total time is, 0.0008411407470703125 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 4.029273986816406e-05 sec Clustering finished, 3 clusters found total time is, 0.0008304119110107422 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 3.695487976074219e-05 sec Clustering finished, 3 clusters found total time is, 0.0008194446563720703 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 3.2901763916015625e-05 sec Clustering finished, 2 clusters found total time is, 0.0008249282836914062 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 4.649162292480469e-05 sec Clustering finished, 2 clusters found total time is, 0.0009522438049316406 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 3.24249267578125e-05 sec Clustering finished, 2 clusters found total time is, 0.0009181499481201172 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 2.2649765014648438e-05 sec Clustering finished, 1 clusters found total time is, 0.0008497238159179688 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 2.7418136596679688e-05 sec Clustering finished, 1 clusters found total time is, 0.0009016990661621094 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 2.5272369384765625e-05 sec Clustering finished, 1 clusters found total time is, 0.0008828639984130859 Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.000 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.000 sec Number of clusters before multimodality test= 7 Identification of the saddle points: 0.001 sec Multimodality test finished: 0.000 sec Final operations: 2.4318695068359375e-05 sec Clustering finished, 1 clusters found total time is, 0.0008893013000488281
1.8 Information Imbalance and Greedy Feature Selection¶
The information imbalance $\Delta(A \to B)$ measures how well the nearest-neighbour structure in space $A$ predicts that in space $B$. $\Delta = 0$ means $A$ perfectly captures $B$; $\Delta = 1$ means they are independent.
We use it to:
- Compare the full 435-dimensional distance space with the top 14 PCA components.
- Find the ~20 most informative pairwise distances via greedy forward selection.
X_pca = PCA(n_components=14).fit_transform(distance_vectors)
X_full = Data(distance_vectors)
delta_full_pca, delta_pca_full = X_full.return_inf_imb_two_selected_coords(
coords1=list(range(distance_vectors.shape[1])),
coords2=list(range(14))
)
print(f'\u0394(full \u2192 PCA-14) = {delta_full_pca:.3f}')
print(f'\u0394(PCA-14 \u2192 full) = {delta_pca_full:.3f}')
Δ(full → PCA-14) = 0.179 Δ(PCA-14 → full) = 0.435
# Greedy forward selection of the most informative pairwise distances
best_tuples, best_imbalances, _ = X_full.greedy_feature_selection_full(
n_coords=20, k=10, n_best=5, symm=True
)
final_features = best_tuples[-1]
print(f'Selected {len(final_features)} features')
print(f'Symmetric \u0394 at last step: {best_imbalances[-1]}')
taking full space as the target representation total number of computations is: 435 total number of computations is: 2160 total number of computations is: 2161 total number of computations is: 2150 total number of computations is: 2148 total number of computations is: 2143 total number of computations is: 2135 total number of computations is: 2136 total number of computations is: 2132 total number of computations is: 2128 total number of computations is: 2123 total number of computations is: 2117 total number of computations is: 2105 total number of computations is: 2103 total number of computations is: 2100 total number of computations is: 2096 total number of computations is: 2091 total number of computations is: 2085 total number of computations is: 2080 total number of computations is: 2075 Selected 20 features Symmetric Δ at last step: [0.04 0.04]
# Map feature indices back to readable atom-pair labels
atom_pairs = [(i, j) for i in range(n_atoms) for j in range(i + 1, n_atoms)]
symbols_cbz = traj_ase[0].get_chemical_symbols()
selected_labels = [
f"{symbols_cbz[atom_pairs[k][0]]}{atom_pairs[k][0]}-"
f"{symbols_cbz[atom_pairs[k][1]]}{atom_pairs[k][1]}"
for k in final_features
]
frame0_dists = distance_vectors[0, final_features]
plt.figure(figsize=(12, 4))
plt.bar(range(len(final_features)), frame0_dists)
plt.xticks(range(len(final_features)), selected_labels, rotation=45, ha='right')
plt.ylabel('Distance (\u00c5)')
plt.title('Top 20 most informative pairwise distances - Frame 0')
plt.tight_layout(); plt.show()
# Cluster with reduced feature space and compare
data_red = Data(distance_vectors[:, final_features])
data_red.compute_id_2NN()
data_red.compute_distances(maxk=100)
data_red.dist_indices = data_red.dist_indices.astype(np.int32)
data_red.distances = data_red.distances.astype(np.float64)
_, _ = data_red.compute_density_PAk()
cluster_red = data_red.compute_clustering_ADP(Z=1.65, halo=True)
ari = adjusted_rand_score(clusters, cluster_red)
print(f'ARI (full vs 20-feature reduced): {ari:.3f}')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.scatter(xy[:, 0], xy[:, 1], c=clusters, cmap='tab10', s=10)
ax2.scatter(xy[:, 0], xy[:, 1], c=cluster_red, cmap='tab10', s=10)
ax1.set_title('Full 435D clustering')
ax2.set_title('Reduced 20D clustering')
for ax in (ax1, ax2):
ax.set_xticks([]); ax.set_yticks([])
plt.tight_layout(); plt.show()
ARI (full vs 20-feature reduced): 0.578
2. Zeolite-Water Analysis¶
Now we analyse a NVT trajectory of a zeolite framework with adsorbed water, focusing on the local coordination environment of the Al site. The key difference from the molecular case is that the system is periodic all distances must be computed with minimum-image convention (mic=True).
2.1 Loading the Trajectory¶
traj_z = read("../../data/dadapy/T1.traj", index=":")
print(f"Frames : {len(traj_z)}")
print(f"Atoms/frame : {len(traj_z[0])}")
print(f"Elements : {sorted(set(traj_z[0].get_chemical_symbols()))}")
print(f"PBC : {traj_z[0].get_pbc()}")
Frames : 20000 Atoms/frame : 112 Elements : ['Al', 'H', 'O', 'Si'] PBC : [ True True True]
2.2 Identifying Water Molecules¶
Water oxygens are distinguished from framework oxygens by checking whether an O atom has two H neighbours closer than 1.3 \u00c5, using get_distances with mic=True for correctness with periodic boundary conditions.
symbols_z = np.array(traj_z[0].get_chemical_symbols())
O_idx = np.where(symbols_z == 'O')[0]
H_idx = np.where(symbols_z == 'H')[0]
Al_idx_z = np.where(symbols_z == 'Al')[0]
water_O_z = [
o for o in O_idx
if np.sort(traj_z[0].get_distances(o, H_idx, mic=True))[1] < 1.3
]
water_O_z = np.array(water_O_z)
print(f"Al sites : {len(Al_idx_z)}")
print(f"Water oxygens : {len(water_O_z)}")
Al sites : 1 Water oxygens : 5
2.3 Feature Construction: Sorted Al-O Distances¶
For each frame the descriptor is the sorted vector of distances from the single Al site to all water oxygens. Sorting makes the descriptor invariant to the labelling (permutation) of water molecules.
k_z = len(water_O_z)
X_z = np.array([
np.sort(atoms.get_distances(Al_idx_z[0], water_O_z, mic=True))[:k_z]
for atoms in traj_z
])
print(f"Feature matrix shape: {X_z.shape}")
print(f"First frame distances (\u00c5): {np.round(X_z[0], 2)}")
Feature matrix shape: (20000, 5) First frame distances (Å): [3.53 4.3 5.55 5.66 6.18]
2.4 DADApy Clustering¶
data_z = Data(X_z, verbose=True)
data_z.compute_id_2NN()
data_z.compute_distances(maxk=150)
data_z.dist_indices = data_z.dist_indices.astype(np.int32)
data_z.distances = data_z.distances.astype(np.float64)
log_den_z, _ = data_z.compute_density_PAk()
clusters_z = data_z.compute_clustering_ADP(Z=1.6, halo=True)
print('\nCluster populations:')
for c in np.unique(clusters_z):
n = np.sum(clusters_z == c)
print(f" {'noise' if c == -1 else f'Cluster {c}'}: {n} ({n / len(clusters_z) * 100:.1f}%)")
Computation of distances started Computation of the distances up to 100 NNs started 0.37 seconds for computing distances ID estimation finished: selecting ID of 4.888936744578951 Computation of distances started Computation of the distances up to 150 NNs started 0.60 seconds for computing distances kstar estimation started, Dthr = 23.92812698 0.07s computing kstar PAk density estimation started 2.04 seconds optimizing the likelihood for all the points PAk density estimation finished Clustering started init succeded Raw identification of the putative centers: 0.000 sec Further checking on centers: 0.074 sec Pruning of the centers wrongly identified in part one: 0.000 sec Preliminary assignation finished: 0.013 sec Number of clusters before multimodality test= 90 Identification of the saddle points: 0.034 sec Multimodality test finished: 0.002 sec Final operations: 0.0016279220581054688 sec Clustering finished, 4 clusters found total time is, 0.1274576187133789 Cluster populations: noise: 15879 (79.4%) Cluster 0: 85 (0.4%) Cluster 1: 3 (0.0%) Cluster 2: 2719 (13.6%) Cluster 3: 1314 (6.6%)
xy_z = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(X_z)
unique_z = np.unique(clusters_z)
cmap_z = plt.cm.get_cmap('tab10', len(unique_z))
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
sc = ax1.scatter(xy_z[:, 0], xy_z[:, 1], c=log_den_z, cmap='viridis', s=8)
ax1.set_title('PAk log-density'); ax1.set_xticks([]); ax1.set_yticks([])
plt.colorbar(sc, ax=ax1, label='log density')
for i, c in enumerate(unique_z):
mask = clusters_z == c
ax2.scatter(xy_z[mask, 0], xy_z[mask, 1],
color='lightgray' if c == -1 else cmap_z(i), s=8,
alpha=0.3 if c == -1 else 0.9,
label='noise' if c == -1 else f'Cluster {c}')
ax2.set_title('ADP Clusters'); ax2.set_xticks([]); ax2.set_yticks([])
ax2.legend(markerscale=2)
plt.suptitle('Zeolite-water - t-SNE projection', y=1.01)
plt.tight_layout(); plt.show()
2.5 Cluster Analysis¶
# Nearest Al-O distance and cluster label over time
d1_z = X_z[:, 0]
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 6), sharex=True)
ax1.plot(d1_z, color='black', lw=0.5, alpha=0.5)
for i, c in enumerate(unique_z):
mask = clusters_z == c
ax1.scatter(np.where(mask)[0], d1_z[mask], s=5,
color='lightgray' if c == -1 else cmap_z(i),
label='noise' if c == -1 else f'Cluster {c}')
ax1.set_ylabel('Nearest Al-O distance (\u00c5)')
ax1.set_title('Al-O distance time-series, coloured by cluster')
ax1.legend(markerscale=2, loc='upper right')
ax2.scatter(np.arange(len(clusters_z)), clusters_z, s=4, c=clusters_z, cmap='tab10')
ax2.set_ylabel('Cluster'); ax2.set_xlabel('Frame')
plt.tight_layout(); plt.show()
# Mean feature vector per cluster
for c in np.unique(clusters_z):
mean_d = X_z[clusters_z == c].mean(axis=0)
label = 'noise' if c == -1 else f'Cluster {c}'
print(f"{label}: mean Al-O distances = {np.round(mean_d, 2)} \u00c5")
noise: mean Al-O distances = [3.5 3.72 4.11 4.7 5.67] Å Cluster 0: mean Al-O distances = [3.48 3.6 3.75 4.49 5.56] Å Cluster 1: mean Al-O distances = [3.53 4.33 5.34 5.47 8.01] Å Cluster 2: mean Al-O distances = [3.52 3.68 3.98 4.27 4.48] Å Cluster 3: mean Al-O distances = [3.63 3.78 4.07 4.59 6.83] Å
# Representative structure: frame closest to each cluster centroid
rep_indices_z = []
for c in np.unique(clusters_z):
if c == -1:
continue
frames = np.where(clusters_z == c)[0]
Xc = X_z[frames]
idx = frames[np.argmin(np.linalg.norm(Xc - Xc.mean(axis=0), axis=1))]
rep_indices_z.append(idx)
rep_structures_z = [traj_z[i] for i in rep_indices_z]
write('../../data/dadapy/zeolite_cluster_representatives.extxyz', rep_structures_z, format='extxyz')
view(rep_structures_z, viewer='ngl')
HBox(children=(NGLWidget(max_frame=3), VBox(children=(Dropdown(description='Show', options=('All', 'O', 'Al', …
2.6 Extended Features: Al-O + O-O Distances¶
Adding water-water O-O distances captures information about the hydrogen-bond network beyond the Al coordination shell.
atoms.get_all_distances(mic=True) handles PBC correctly for both distance blocks.
# O-O distances between water oxygens (PBC-aware)
OO_features_z = np.array([
np.sort(
atoms.get_all_distances(mic=True)
[np.ix_(water_O_z, water_O_z)]
[np.triu_indices(len(water_O_z), k=1)]
)
for atoms in traj_z
])
X_new_z = np.hstack([X_z, OO_features_z])
print(f"Al-O features : {X_z.shape[1]}")
print(f"O-O features : {OO_features_z.shape[1]}")
print(f"Combined shape : {X_new_z.shape}")
Al-O features : 5 O-O features : 10 Combined shape : (20000, 15)
data_new_z = Data(X_new_z, verbose=False)
data_new_z.compute_id_2NN()
data_new_z.compute_distances(maxk=150)
data_new_z.dist_indices = data_new_z.dist_indices.astype(np.int32)
data_new_z.distances = data_new_z.distances.astype(np.float64)
_, _ = data_new_z.compute_density_PAk()
# Scan Z to find stable cluster count
Z_scan, Z_results = np.linspace(0, 20, 20), []
for Z in Z_scan:
c_ = data_new_z.compute_clustering_ADP(Z=Z, halo=True)
Z_results.append(len(np.unique(c_[c_ != -1])))
plt.figure(figsize=(6, 4))
plt.plot(Z_scan, Z_results, 'o-')
plt.xlabel('Z'); plt.ylabel('Clusters')
plt.title('Cluster count vs Z — Al–O + O–O features')
plt.grid(True, alpha=0.3); plt.tight_layout(); plt.show()
clusters_new_z = data_new_z.compute_clustering_ADP(Z=8, halo=True)
print('Populations:', {c: int(np.sum(clusters_new_z == c)) for c in np.unique(clusters_new_z)})
Populations: {0: 19998, 1: 2}
3. AWB Zeolite - Scaled Analysis¶
We apply the same workflow to two larger AWB zeolite supercells:
- System 1: Al:Si = 1:4 - 16 Al, 64 Si, 32 H2O (~500 atoms)
- System 2: Al:Si = 1:1 - 64 Al, 64 Si, 32 H2O (~544 atoms)
Reusable helper functions keep the per-system code concise.
3.0 Helper Functions¶
import umap
def identify_water_oxygens(atoms):
"""Return indices of water oxygens.
Criterion: O with two H neighbours closer than 1.3 Angstrom (mic=True).
"""
sym = np.array(atoms.get_chemical_symbols())
O_idx = np.where(sym == 'O')[0]
H_idx = np.where(sym == 'H')[0]
return np.array([
o for o in O_idx
if np.sort(atoms.get_distances(o, H_idx, mic=True))[1] < 1.3
])
def build_feature_matrix(traj, Al_idx, water_O_idx, Si_idx=None):
"""Build sorted Al-O, [Si-O,] O-O distance feature vectors.
Uses atoms.get_all_distances(mic=True) for PBC-correct distances.
"""
features = []
for atoms in traj:
D = atoms.get_all_distances(mic=True)
AlO = np.sort(D[np.ix_(Al_idx, water_O_idx)].flatten())
D_OO = D[np.ix_(water_O_idx, water_O_idx)]
OO = np.sort(D_OO[np.triu_indices(len(water_O_idx), k=1)])
parts = [AlO]
if Si_idx is not None:
SiO = np.sort(D[np.ix_(Si_idx, water_O_idx)].flatten())
parts.append(SiO)
parts.append(OO)
features.append(np.hstack(parts))
return np.array(features)
def dadapy_pipeline(X, maxk=200, Z=1.6, verbose=False):
"""Run full DADApy pipeline: ID -> distances -> PAk density -> ADP clustering.
Note: dist_indices are cast to int32 to work around a Windows/Cython dtype issue.
"""
data = Data(X, verbose=verbose)
data.compute_id_2NN()
data.compute_distances(maxk=maxk)
data.dist_indices = data.dist_indices.astype(np.int32)
data.distances = data.distances.astype(np.float64)
log_den, _ = data.compute_density_PAk()
clusters = data.compute_clustering_ADP(Z=Z, halo=True)
return data, log_den, clusters
def plot_umap_clusters(X, clusters, log_den=None, title='',
n_neighbors=50, min_dist=0.08):
"""UMAP projection coloured by cluster (and optionally by density)."""
reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors,
min_dist=min_dist, metric='euclidean', random_state=0)
xy = reducer.fit_transform(X)
unique_c = np.unique(clusters)
cmap = plt.cm.get_cmap('tab10', len(unique_c))
ncols = 2 if log_den is not None else 1
fig, axes = plt.subplots(1, ncols, figsize=(6 * ncols, 5))
axes = np.atleast_1d(axes)
if log_den is not None:
sc = axes[0].scatter(xy[:, 0], xy[:, 1], c=log_den, cmap='viridis', s=8)
axes[0].set_title('PAk log-density')
axes[0].set_xticks([]); axes[0].set_yticks([])
plt.colorbar(sc, ax=axes[0], label='log density')
ax = axes[-1]
for i, c in enumerate(unique_c):
mask = clusters == c
ax.scatter(xy[mask, 0], xy[mask, 1],
color='lightgray' if c == -1 else cmap(i),
s=8, alpha=0.3 if c == -1 else 0.9,
label='noise' if c == -1 else f'Cluster {c}')
ax.set_title(f'ADP Clusters — {title}')
ax.set_xticks([]); ax.set_yticks([])
ax.legend(markerscale=2)
plt.tight_layout(); plt.show()
return xy
def get_representative_structures(traj, X, clusters):
"""Return the frame closest to each non-noise cluster centroid."""
reps = []
for c in np.unique(clusters):
if c == -1:
continue
frames = np.where(clusters == c)[0]
Xc = X[frames]
idx = frames[np.argmin(np.linalg.norm(Xc - Xc.mean(axis=0), axis=1))]
reps.append(traj[idx])
return reps
3.1 System 1: Al:Si = 1:4 (Al16, Si64, 32 H\u2082O)¶
# Load every 10th frame to reduce memory and compute time
traj16 = read("../../data/dadapy/nvt_al16_32H2O.traj", index="::10")
print(f"Frames: {len(traj16)}, Atoms: {len(traj16[0])}")
sym16 = np.array(traj16[0].get_chemical_symbols())
Al_idx16 = np.where(sym16 == 'Al')[0]
Si_idx16 = np.where(sym16 == 'Si')[0]
wO16 = identify_water_oxygens(traj16[0])
print(f"Al: {len(Al_idx16)}, Si: {len(Si_idx16)}, Water O: {len(wO16)}")
Frames: 2200, Atoms: 496 Al: 16, Si: 112, Water O: 32
X16 = build_feature_matrix(traj16, Al_idx16, wO16, Si_idx=Si_idx16)
print(f"Feature matrix: {X16.shape}")
Feature matrix: (2200, 4592)
data16, log_den16, clusters16 = dadapy_pipeline(X16, maxk=200, Z=1.6)
print('Cluster populations:')
for c in np.unique(clusters16):
n = np.sum(clusters16 == c)
print(f" {'noise' if c == -1 else f'Cluster {c}'}: {n} ({n / len(clusters16) * 100:.1f}%)")
Cluster populations: noise: 1161 (52.8%) Cluster 0: 479 (21.8%) Cluster 1: 364 (16.5%) Cluster 2: 196 (8.9%)
xy16 = plot_umap_clusters(X16, clusters16, log_den=log_den16, title='Al16 (Al:Si = 1:4)')
# Shortest Al-O distance over time
n_AlO_feats16 = len(Al_idx16) * len(wO16)
AlO_block16 = X16[:, :n_AlO_feats16]
plt.figure(figsize=(10, 4))
plt.plot(AlO_block16[:, 0], lw=0.8, label='min Al-O')
plt.axhline(2.5, linestyle='--', color='gray', alpha=0.7, label='2.5 \u00c5 threshold')
plt.xlabel('Frame'); plt.ylabel('Distance (\u00c5)')
plt.title('Shortest Al-O distance - Al16 system')
plt.legend(); plt.tight_layout(); plt.show()
reps16 = get_representative_structures(traj16, X16, clusters16)
write('../../data/dadapy/AWB_Al16_representatives.extxyz', reps16, format='extxyz')
view(reps16, viewer='ngl')
HBox(children=(NGLWidget(max_frame=2), VBox(children=(Dropdown(description='Show', options=('All', 'O', 'Al', …
3.2 System 2: Al:Si = 1:1 (Al64, Si64, 32 H2O)¶
For the larger system we omit the Si-O block to keep the feature vector manageable.
traj64 = read("../../data/dadapy/nvt_al64_32H2O.traj", index="::10")
print(f"Frames: {len(traj64)}, Atoms: {len(traj64[0])}")
sym64 = np.array(traj64[0].get_chemical_symbols())
Al_idx64 = np.where(sym64 == 'Al')[0]
Si_idx64 = np.where(sym64 == 'Si')[0]
wO64 = identify_water_oxygens(traj64[0])
print(f"Al: {len(Al_idx64)}, Si: {len(Si_idx64)}, Water O: {len(wO64)}")
Frames: 2200, Atoms: 544 Al: 64, Si: 64, Water O: 32
# Al-O + O-O only (Si-O omitted to limit feature dimensionality)
X64 = build_feature_matrix(traj64, Al_idx64, wO64, Si_idx=None)
print(f"Feature matrix: {X64.shape}")
Feature matrix: (2200, 2544)
data64, log_den64, clusters64 = dadapy_pipeline(X64, maxk=200, Z=1.6)
print('Cluster populations:')
for c in np.unique(clusters64):
n = np.sum(clusters64 == c)
print(f" {'noise' if c == -1 else f'Cluster {c}'}: {n} ({n / len(clusters64) * 100:.1f}%)")
Cluster populations: noise: 700 (31.8%) Cluster 0: 1301 (59.1%) Cluster 1: 199 (9.0%)
xy64 = plot_umap_clusters(X64, clusters64, log_den=log_den64, title='Al64 (Al:Si = 1:1)')
reps64 = get_representative_structures(traj64, X64, clusters64)
write('../../data/dadapy/AWB_Al64_representatives.extxyz', reps64, format='extxyz')
view(reps64, viewer='ngl')
HBox(children=(NGLWidget(max_frame=1), VBox(children=(Dropdown(description='Show', options=('All', 'O', 'Al', …
# Cluster label over time for both systems
fig, axes = plt.subplots(2, 1, figsize=(14, 6), sharex=False)
for ax, (c_, title_) in zip(axes, [(clusters16, 'Al16 (Al:Si = 1:4)'),
(clusters64, 'Al64 (Al:Si = 1:1)')]):
ax.scatter(np.arange(len(c_)), c_, s=4, c=c_, cmap='tab10')
ax.set_ylabel('Cluster'); ax.set_title(title_)
axes[-1].set_xlabel('Frame')
plt.tight_layout(); plt.show()