Source code for tests.test_groupers
import os
import shutil
import tempfile
import numpy as np
import pytest
from chemsmart.io.xyz.xyzfile import XYZFile
from chemsmart.jobs.grouper.base import MoleculeGrouper
from chemsmart.jobs.grouper.connectivity import ConnectivityGrouper
from chemsmart.jobs.grouper.energy import EnergyGrouper
from chemsmart.jobs.grouper.formula import FormulaGrouper
from chemsmart.jobs.grouper.isomorphism import RDKitIsomorphismGrouper
from chemsmart.jobs.grouper.rmsd import (
BasicRMSDGrouper,
HungarianRMSDGrouper,
IRMSDGrouper,
PymolRMSDGrouper,
RMSDGrouper,
SpyRMSDGrouper,
)
from chemsmart.jobs.grouper.tanimoto import TanimotoSimilarityGrouper
from chemsmart.jobs.grouper.tfd import TorsionFingerprintGrouper
from chemsmart.utils.grouper import StructureGrouperFactory
from chemsmart.utils.utils import find_irmsd_command, kabsch_align
[docs]
@pytest.fixture
def temp_working_dir():
"""
Pytest fixture to create a temporary directory and change to it for testing.
This prevents group_result folders from being created in the project directory.
"""
original_dir = os.getcwd()
temp_dir = tempfile.mkdtemp()
try:
os.chdir(temp_dir)
yield temp_dir
finally:
os.chdir(original_dir)
# Clean up the temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_BasicRMSD_grouper_and_basic_functionality:
NUM_PROCS = 4
[docs]
def test_rmsd_grouper(
self, methanol_molecules, methanol_and_ethanol, temp_working_dir
):
methanol = methanol_molecules[0]
methanol_rot1 = methanol_molecules[1]
assert np.any(
methanol.positions != methanol_rot1.positions
), "Rotated molecule should have different positions."
grouper = BasicRMSDGrouper(methanol_molecules)
groups, group_indices = grouper.group()
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(
rmsd, 0.0, rtol=1e-3
), "RMSD should be close to zero."
rmsd = grouper._calculate_rmsd((1, 2))
assert np.isclose(
rmsd, 0.0, rtol=1e-3
), "RMSD should be close to zero."
_, _, _, _, rmsd_kabsch = kabsch_align(
methanol.positions, methanol_rot1.positions
)
assert np.isclose(rmsd_kabsch, 0.0, rtol=1e-3)
assert (
len(groups) == 1
), "Molecules should form one group based on geometry."
assert (
len(group_indices) == 1
), "Molecules should form one group based on geometry."
unique_structures = grouper.unique()
assert (
len(unique_structures) == 1
), "Molecules should form one group based on geometry."
grouper2 = BasicRMSDGrouper(methanol_and_ethanol)
groups, group_indices = grouper2.group()
assert (
len(groups) == 2
), "Molecules should form two groups based on geometry."
assert (
len(group_indices) == 2
), "Molecules should form two groups based on geometry."
rmsd = grouper2._calculate_rmsd((0, 1))
assert (
rmsd is np.inf
), "RMSD is set to be infinity for different molecules."
unique_structures = grouper2.unique()
assert (
len(unique_structures) == 2
), "Molecules should form two groups based on geometry."
[docs]
def test_rmsd_grouper_for_crest_conformers(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = BasicRMSDGrouper(
molecules, threshold=0.2, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper.group()
assert len(groups) == 18
assert len(group_indices) == 18
unique_structures = grouper.unique()
assert len(unique_structures) == 18
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.409, rtol=1e-3)
_, _, _, _, rmsd = kabsch_align(
molecules[0].positions, molecules[1].positions
)
assert np.isclose(rmsd, 0.409, rtol=1e-3)
grouper2 = BasicRMSDGrouper(
molecules, threshold=0.5, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper2.group()
assert len(groups) == 12
assert len(group_indices) == 12
unique_structures = grouper2.unique()
assert len(unique_structures) == 12
grouper3 = BasicRMSDGrouper(
molecules, threshold=1.0, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper3.group()
assert len(groups) == 11
assert len(group_indices) == 11
unique_structures = grouper3.unique()
assert len(unique_structures) == 11
grouper4 = BasicRMSDGrouper(
molecules, threshold=1.5, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper4.group()
assert len(groups) == 9
assert len(group_indices) == 9
unique_structures = grouper4.unique()
assert len(unique_structures) == 9
grouper5 = BasicRMSDGrouper(
molecules, threshold=2.0, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper5.group()
assert len(groups) == 8
assert len(group_indices) == 8
unique_structures = grouper5.unique()
assert len(unique_structures) == 8
grouper6 = BasicRMSDGrouper(
molecules, threshold=2.5, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper6.group()
assert len(groups) == 4
assert len(group_indices) == 4
unique_structures = grouper6.unique()
assert len(unique_structures) == 4
[docs]
def test_num_groups_parameter(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = BasicRMSDGrouper(
molecules, threshold=None, num_groups=17, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper.group()
assert len(groups) == 17
assert len(group_indices) == 17
unique_structures = grouper.unique()
assert len(unique_structures) == 17
[docs]
def test_pick_the_lowestenergy_conformers(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = BasicRMSDGrouper(
molecules, threshold=None, num_groups=3, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper.group()
assert len(groups) == 3
assert len(group_indices) == 3
unique_structures = grouper.unique()
assert len(unique_structures) == 3
energies = [mol.energy for mol in unique_structures]
assert -126.2575508 in energies
assert -126.25153216 in energies
assert -126.24909661 in energies
[docs]
def test_rmsd_grouper_for_crest_conformers_ignore_Hs(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = BasicRMSDGrouper(
molecules,
threshold=0.2,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper.group()
assert len(groups) == 17
assert len(group_indices) == 17
unique_structures = grouper.unique()
assert len(unique_structures) == 17
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.301, rtol=1e-3) # removed H atoms
_, _, _, _, rmsd = kabsch_align(
molecules[0].positions, molecules[1].positions
)
assert np.isclose(rmsd, 0.409, rtol=1e-3) # did not remove H atoms
grouper2 = BasicRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper2.group()
assert len(groups) == 12
assert len(group_indices) == 12
unique_structures = grouper2.unique()
assert len(unique_structures) == 12
grouper3 = BasicRMSDGrouper(
molecules,
threshold=1.0,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper3.group()
assert len(groups) == 10
assert len(group_indices) == 10
unique_structures = grouper3.unique()
assert len(unique_structures) == 10
grouper4 = BasicRMSDGrouper(
molecules,
threshold=1.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper4.group()
assert len(groups) == 7
assert len(group_indices) == 7
unique_structures = grouper4.unique()
assert len(unique_structures) == 7
grouper5 = BasicRMSDGrouper(
molecules,
threshold=2.0,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper5.group()
assert len(groups) == 5
assert len(group_indices) == 5
unique_structures = grouper5.unique()
assert len(unique_structures) == 5
grouper6 = BasicRMSDGrouper(
molecules,
threshold=2.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper6.group()
assert len(groups) == 5
assert len(group_indices) == 5
unique_structures = grouper6.unique()
assert len(unique_structures) == 5
[docs]
def test_rmsd_grouper_for_rotated_molecules(
self, two_rotated_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 2
grouper = BasicRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 2
assert len(group_indices) == 2
unique_structures = grouper.unique()
assert len(unique_structures) == 2
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.611, rtol=1e-3)
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_HungarianRMSD_grouper:
NUM_PROCS = 4
[docs]
def test_hrmsd_grouper_for_rotated_molecules(
self, two_rotated_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 2
grouper = HungarianRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 1
assert len(group_indices) == 1
unique_structures = grouper.unique()
assert len(unique_structures) == 1
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.2294, rtol=1e-3)
[docs]
def test_hrmsd_grouper_for_crest_molecules(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = HungarianRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 12
assert len(group_indices) == 12
unique_structures = grouper.unique()
assert len(unique_structures) == 12
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.4091, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 2))
assert np.isclose(rmsd, 0.5899, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 3))
assert np.isclose(rmsd, 1.8891, rtol=1e-3)
[docs]
def test_ignore_hydrogen(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = HungarianRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper.group()
assert len(groups) == 12
assert len(group_indices) == 12
unique_structures = grouper.unique()
assert len(unique_structures) == 12
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 4))
assert np.isclose(rmsd, 1.1915, rtol=1e-3)
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_SpyRMSD_grouper:
NUM_PROCS = 4
[docs]
def test_spyrmsd_grouper_for_rotated_molecules(
self, two_rotated_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 2
grouper = SpyRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 1
assert len(group_indices) == 1
unique_structures = grouper.unique()
assert len(unique_structures) == 1
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.2125, rtol=1e-3)
[docs]
def test_spyrmsd_grouper_for_crest_molecules(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = SpyRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 12
assert len(group_indices) == 12
unique_structures = grouper.unique()
assert len(unique_structures) == 12
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.4091, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 2))
assert np.isclose(rmsd, 1.3925, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 3))
assert np.isclose(rmsd, 2.1789, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 4))
assert np.isclose(rmsd, 1.8202, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 5))
assert np.isclose(rmsd, 2.0029, rtol=1e-3)
[docs]
def test_ignore_hydrogen(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = SpyRMSDGrouper(
molecules,
threshold=2.5981,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper.group()
assert len(groups) == 3
assert len(group_indices) == 3
unique_structures = grouper.unique()
assert len(unique_structures) == 3
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 5))
assert np.isclose(rmsd, 1.7034, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 6))
assert np.isclose(rmsd, 2.6183, rtol=1e-3)
[docs]
@pytest.mark.skipif(
not find_irmsd_command(), reason="irmsd command not available"
)
@pytest.mark.usefixtures("temp_working_dir")
class Test_IRMSD_grouper:
NUM_PROCS = 4
[docs]
def test_irmsd_grouper_for_rotated_molecules(
self, two_rotated_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 2
grouper = IRMSDGrouper(
molecules,
threshold=0.125,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 2
assert len(group_indices) == 2
unique_structures = grouper.unique()
assert len(unique_structures) == 2
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.2294, rtol=1e-3)
[docs]
def test_irmsd_grouper_for_crest_molecules(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = IRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 12
assert len(group_indices) == 12
unique_structures = grouper.unique()
assert len(unique_structures) == 12
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.4091, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 2))
assert np.isclose(rmsd, 1.3925, rtol=1e-3)
rmsd = grouper._calculate_rmsd((2, 10))
assert np.isclose(rmsd, 2.2390, rtol=1e-3)
rmsd = grouper._calculate_rmsd((8, 16))
assert np.isclose(rmsd, 3.4209, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 13))
assert np.isclose(rmsd, 0.8411, rtol=1e-3)
[docs]
def test_ignore_hydrogen(
self, two_rotated_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 2
grouper = IRMSDGrouper(
molecules,
threshold=0.125,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper.group()
assert len(groups) == 2
assert len(group_indices) == 2
unique_structures = grouper.unique()
assert len(unique_structures) == 2
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.2294, rtol=1e-3)
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_PymolRMSD_grouper:
NUM_PROCS = 1
[docs]
@classmethod
def setup_class(cls):
"""Initialize PyMOL once for all tests in this class."""
try:
import pymol
from pymol import cmd
pymol.finish_launching(["pymol", "-qc"])
cmd.reinitialize()
except Exception:
# PyMOL may not be installed; tests will be skipped via @pytest.mark.skipif
pass
[docs]
def teardown_method(self, method):
"""Clean up PyMOL objects after each test method to prevent slowdown."""
try:
from pymol import cmd
# Delete all objects instead of reinitialize to keep session alive
cmd.delete("all")
except Exception:
# Ignore cleanup errors if PyMOL is not available
pass
[docs]
def test_pymolrmsd_grouper_for_rotated_molecules(
self, two_rotated_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 2
grouper = PymolRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 1
assert len(group_indices) == 1
unique_structures = grouper.unique()
assert len(unique_structures) == 1
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.0000, rtol=1e-3)
# Explicitly cleanup to prevent __del__ from calling quit()
if hasattr(grouper, "_temp_dir") and grouper._temp_dir:
shutil.rmtree(grouper._temp_dir, ignore_errors=True)
grouper._temp_dir = None
grouper.cmd = None # Prevent __del__ from calling quit()
[docs]
def test_pymolrmsd_grouper_for_crest_molecules(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = PymolRMSDGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 7
assert len(group_indices) == 7
unique_structures = grouper.unique()
assert len(unique_structures) == 7
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 1))
assert np.isclose(rmsd, 0.074175, rtol=1e-3)
rmsd = grouper._calculate_rmsd((0, 2))
assert np.isclose(rmsd, 0.023745, rtol=1e-3)
rmsd = grouper._calculate_rmsd((3, 4))
assert np.isclose(rmsd, 1.725241, rtol=1e-3)
rmsd = grouper._calculate_rmsd((7, 8))
assert np.isclose(rmsd, 2.309451, rtol=1e-3)
rmsd = grouper._calculate_rmsd((14, 16))
assert np.isclose(rmsd, 1.837319, rtol=1e-3)
# Explicitly cleanup to prevent __del__ from calling quit()
if hasattr(grouper, "_temp_dir") and grouper._temp_dir:
shutil.rmtree(grouper._temp_dir, ignore_errors=True)
grouper._temp_dir = None
grouper.cmd = None # Prevent __del__ from calling quit()
[docs]
def test_ignore_hydrogen(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = PymolRMSDGrouper(
molecules,
threshold=0.8203818,
num_procs=self.NUM_PROCS,
ignore_hydrogens=True,
)
groups, group_indices = grouper.group()
assert len(groups) == 5
assert len(group_indices) == 5
unique_structures = grouper.unique()
assert len(unique_structures) == 5
# rmsd calculation from grouper
rmsd = grouper._calculate_rmsd((0, 2))
assert np.isclose(rmsd, 0.0211, rtol=1e-3)
rmsd = grouper._calculate_rmsd((1, 4))
assert np.isclose(rmsd, 0.7669, rtol=1e-3)
# Explicitly cleanup to prevent __del__ from calling quit()
if hasattr(grouper, "_temp_dir") and grouper._temp_dir:
shutil.rmtree(grouper._temp_dir, ignore_errors=True)
grouper._temp_dir = None
grouper.cmd = None # Prevent __del__ from calling quit()
[docs]
def test_pymol_grouper_rejects_multiproc(
self, methanol_molecules, temp_working_dir
):
"""Test that PyMOL grouper raises error when num_procs > 1."""
with pytest.raises(ValueError) as excinfo:
PymolRMSDGrouper(
methanol_molecules,
threshold=0.5,
num_procs=4, # Should raise error
)
assert (
"single" in str(excinfo.value).lower()
or "num_procs" in str(excinfo.value).lower()
)
[docs]
@classmethod
def teardown_class(cls):
"""Clean up PyMOL after all tests in this class are done."""
try:
from pymol import cmd
cmd.reinitialize()
except Exception:
# Ignore cleanup errors if PyMOL is not available
pass
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_Tanimoto_similarity_grouper:
NUM_PROCS = 4
[docs]
def test_tanimoto_similarity_grouper(
self, methanol_molecules, methanol_and_ethanol, temp_working_dir
):
grouper = TanimotoSimilarityGrouper(methanol_molecules)
groups, group_indices = grouper.group()
assert (
len(groups) == 1
), "Molecules should form one group based on RCM similarity."
assert (
len(group_indices) == 1
), "Molecules should form one group based on RCM similarity."
unique_structures = grouper.unique()
assert (
len(unique_structures) == 1
), "Molecules should form one group based on RCM similarity."
grouper2 = TanimotoSimilarityGrouper(methanol_and_ethanol)
groups, group_indices = grouper2.group()
assert (
len(groups) == 2
), "Molecules should form two groups based on RCM similarity."
assert (
len(group_indices) == 2
), "Molecules should form two groups based on RCM similarity."
unique_structures = grouper2.unique()
assert (
len(unique_structures) == 2
), "Molecules should form two groups based on RCM similarity."
[docs]
def test_tanimoto_grouper_for_crest_conformers(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = TanimotoSimilarityGrouper(
molecules,
threshold=0.98,
fingerprint_type="usrcat",
num_procs=self.NUM_PROCS,
)
groups, group_indices = grouper.group()
assert len(groups) == 4
assert len(group_indices) == 4
unique_structures = grouper.unique()
assert len(unique_structures) == 4
grouper = TanimotoSimilarityGrouper(
molecules,
threshold=0.999,
fingerprint_type="usrcat",
num_procs=self.NUM_PROCS,
)
groups, group_indices = grouper.group()
assert len(groups) == 14
assert len(group_indices) == 14
unique_structures = grouper.unique()
assert len(unique_structures) == 14
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_TorsionFingerprint_grouper:
NUM_PROCS = 4
[docs]
def test_torsionfingerprint_grouper_for_rotated_molecules(
self, two_rotated_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 2
grouper = TorsionFingerprintGrouper(
molecules,
threshold=0.1,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 1
assert len(group_indices) == 1
unique_structures = grouper.unique()
assert len(unique_structures) == 1
[docs]
def test_torsionfingerprint_grouper_for_crest_molecules(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = TorsionFingerprintGrouper(
molecules,
threshold=0.05,
num_procs=self.NUM_PROCS,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 10
assert len(group_indices) == 10
unique_structures = grouper.unique()
assert len(unique_structures) == 10
[docs]
def test_use_weights_parameter(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = TorsionFingerprintGrouper(
molecules,
threshold=0.05,
num_procs=self.NUM_PROCS,
use_weights=False,
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 15
assert len(group_indices) == 15
unique_structures = grouper.unique()
assert len(unique_structures) == 15
tfd = grouper._calculate_tfd((0, 2))
assert np.isclose(tfd, 0.08229, rtol=1e-3)
tfd = grouper._calculate_tfd((0, 7))
assert np.isclose(tfd, 0.07727, rtol=1e-3)
[docs]
def test_count_groups_matches_complete_linkage_iteration_order(
self, methanol_molecules
):
grouper = TorsionFingerprintGrouper(
methanol_molecules[:3],
threshold=0.1,
num_procs=1,
ignore_hydrogens=False,
)
adj_matrix = np.zeros((3, 3), dtype=bool)
adj_matrix[0, 1] = adj_matrix[1, 0] = True
adj_matrix[1, 2] = adj_matrix[2, 1] = True
_, index_groups = grouper._complete_linkage_grouping(adj_matrix, 3)
num_groups = grouper._count_groups(adj_matrix, 3)
assert len(index_groups) == 2
assert num_groups == len(index_groups)
[docs]
def test_use_maxdev_parameter(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = TorsionFingerprintGrouper(
molecules,
threshold=0.26965,
num_procs=self.NUM_PROCS,
use_weights=True,
max_dev="spec",
ignore_hydrogens=False,
)
groups, group_indices = grouper.group()
assert len(groups) == 2
assert len(group_indices) == 2
unique_structures = grouper.unique()
assert len(unique_structures) == 2
tfd = grouper._calculate_tfd((0, 1))
assert np.isclose(tfd, 0.02027, rtol=1e-3)
tfd = grouper._calculate_tfd((3, 4))
assert np.isclose(tfd, 0.24365, rtol=1e-3)
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_other_groupers:
NUM_PROCS = 4
[docs]
def test_base_record_template_method(self, methanol_molecules):
class DummyGrouper(MoleculeGrouper):
def group(self):
return [], []
def _record_results(self, **kwargs):
return kwargs
grouper = DummyGrouper(methanol_molecules)
payload = grouper.record(example_key="example_value")
assert payload["example_key"] == "example_value"
[docs]
def test_record_writes_rmsd_matrix_with_headers(
self, methanol_molecules, temp_working_dir
):
"""Detailed record test: verify matrix file and key header lines are written."""
from openpyxl import load_workbook
molecules = methanol_molecules[:2]
grouper = BasicRMSDGrouper(
molecules,
threshold=0.5,
num_procs=1,
label="record_detail",
energy_type="E",
)
# Populate cache first so Groups sheet can be generated.
groups, group_indices = grouper.group()
assert len(groups) >= 1
assert len(group_indices) >= 1
# Call unified record() explicitly to validate template-method path.
rmsd_matrix = np.zeros((len(molecules), len(molecules)))
grouper.record(rmsd_matrix=rmsd_matrix, grouping_time=0.01)
xlsx_file = os.path.join(
temp_working_dir,
"record_detail_group_result",
"record_detail_BasicRMSDGrouper_T0.5.xlsx",
)
assert os.path.exists(xlsx_file)
wb = load_workbook(xlsx_file, data_only=True)
assert "RMSD_Matrix" in wb.sheetnames
assert "Groups" in wb.sheetnames
ws = wb["RMSD_Matrix"]
header_lines = [str(ws[f"A{i}"].value) for i in range(1, 25)]
assert any("Energy Type: E" in line for line in header_lines if line)
assert any(
"Used Molecules: 2" in line for line in header_lines if line
)
assert any(
"Skipped Molecules: 0" in line for line in header_lines if line
)
assert any("Grouping Time:" in line for line in header_lines if line)
[docs]
def test_record_writes_formula_outputs_with_headers(
self, methanol_and_ethanol, temp_working_dir
):
"""Detailed record test: verify non-matrix output sheet and header lines."""
from openpyxl import load_workbook
grouper = FormulaGrouper(
methanol_and_ethanol,
num_procs=1,
label="formula_record_detail",
energy_type="E",
)
groups, group_indices = grouper.group()
assert len(groups) == 2
assert len(group_indices) == 2
xlsx_file = os.path.join(
temp_working_dir,
"formula_record_detail_group_result",
"formula_record_detail_FormulaGrouper.xlsx",
)
assert os.path.exists(xlsx_file)
wb = load_workbook(xlsx_file, data_only=True)
assert "Formulas" in wb.sheetnames
assert "Groups" in wb.sheetnames
ws = wb["Formulas"]
header_lines = [str(ws[f"A{i}"].value) for i in range(1, 25)]
assert any(
"Total Molecules: 2" in line for line in header_lines if line
)
assert any(
"Unique Formulas: 2" in line for line in header_lines if line
)
assert any("Energy Type: E" in line for line in header_lines if line)
[docs]
def test_formula_grouper(
self,
methanol_molecules,
methanol_and_ethanol,
conformers_from_rdkit,
temp_working_dir,
):
grouper = FormulaGrouper(methanol_molecules)
groups, group_indices = grouper.group()
unique_structures = grouper.unique()
assert (
len(groups) == 1
), "Molecules should form one group based on formula."
assert (
len(unique_structures) == 1
), "Molecules should form one group based on formula."
grouper2 = FormulaGrouper(methanol_and_ethanol)
groups, group_indices = grouper2.group()
unique_structures = grouper2.unique()
assert (
len(groups) == 2
), "Molecules should form two groups based on formula."
assert (
len(unique_structures) == 2
), "Molecules should form two groups based on formula."
grouper3 = FormulaGrouper(conformers_from_rdkit)
# based on Formula, should all be the same even for 300 conformers
groups, group_indices = grouper3.group()
unique_structures = grouper3.unique()
assert len(groups) == 1
assert len(unique_structures) == 1
[docs]
@pytest.mark.slow
def test_connectivity_grouper(
self, methanol_molecules, methanol_and_ethanol, temp_working_dir
):
grouper = ConnectivityGrouper(methanol_molecules)
groups, group_indices = grouper.group()
assert (
len(groups) == 1
), "Molecules should form one group based on connectivity."
assert (
len(group_indices) == 1
), "Molecules should form one group based on connectivity."
unique_structures = grouper.unique()
assert (
len(unique_structures) == 1
), "Molecules should form one group based on connectivity."
grouper2 = ConnectivityGrouper(methanol_and_ethanol)
groups, group_indices = grouper2.group()
assert (
len(groups) == 2
), "Molecules should form two groups based on connectivity."
assert (
len(group_indices) == 2
), "Molecules should form two groups based on connectivity."
unique_structures = grouper2.unique()
assert (
len(unique_structures) == 2
), "Molecules should form two groups based on connectivity."
[docs]
def test_connectivity_grouper_for_crest_conformers(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
grouper = ConnectivityGrouper(
molecules,
num_procs=self.NUM_PROCS,
)
groups, group_indices = grouper.group()
assert len(groups) == 4
assert len(group_indices) == 4
unique_structures = grouper.unique()
assert len(unique_structures) == 4
[docs]
def test_rdkit_isomorphism_grouper(
self, methanol_molecules, methanol_and_ethanol, temp_working_dir
):
grouper = RDKitIsomorphismGrouper(methanol_molecules)
groups, group_indices = grouper.group()
assert (
len(groups) == 1
), "Molecules should form one group based on RCM similarity."
assert (
len(group_indices) == 1
), "Molecules should form one group based on RCM similarity."
unique_structures = grouper.unique()
assert (
len(unique_structures) == 1
), "Molecules should form one group based on RCM similarity."
grouper2 = RDKitIsomorphismGrouper(methanol_and_ethanol)
groups, group_indices = grouper2.group()
assert (
len(groups) == 2
), "Molecules should form two groups based on RCM similarity."
assert (
len(group_indices) == 2
), "Molecules should form two groups based on RCM similarity."
unique_structures = grouper2.unique()
assert (
len(unique_structures) == 2
), "Molecules should form two groups based on RCM similarity."
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_EnergyGrouper:
NUM_PROCS = 4
[docs]
def test_energy_grouper_raises_error_for_missing_energy(
self, methanol_molecules, temp_working_dir
):
# methanol_molecules from pubchem don't have energy information
with pytest.raises(ValueError) as excinfo:
EnergyGrouper(methanol_molecules)
assert "missing energy information" in str(excinfo.value)
[docs]
def test_energy_grouper_for_crest_conformers(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test EnergyGrouper with molecules that have energy information."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
assert len(molecules) == 18
# All CREST conformers should have energy
for mol in molecules:
assert mol.energy is not None
assert len(molecules) == 18
grouper = EnergyGrouper(
molecules,
threshold=0.5,
num_procs=self.NUM_PROCS,
)
groups, group_indices = grouper.group()
assert len(groups) == 7
assert len(group_indices) == 7
unique_structures = grouper.unique()
assert len(unique_structures) == 7
expected1 = -1.7839 / 627.509474
expected2 = 1.4580 / 627.509474
relative_diff, abs_diff = grouper._calculate_energy_diff((1, 0))
assert np.isclose(relative_diff, expected1, rtol=1e-3)
relative_diff, abs_diff = grouper._calculate_energy_diff((1, 2))
assert np.isclose(abs_diff, expected2, rtol=1e-3)
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = EnergyGrouper(
molecules, num_groups=5, num_procs=self.NUM_PROCS
)
groups, group_indices = grouper.group()
assert len(groups) == 5
assert len(group_indices) == 5
[docs]
def test_energy_grouper_for_log_conformers(
self, ts_conformers_log_directory, temp_working_dir
):
"""Test EnergyGrouper with molecules loaded from log files using Gibbs energy."""
import glob
import re
from chemsmart.io.gaussian.output import Gaussian16Output
from chemsmart.io.molecules.structure import Molecule
# Load molecules from log files and set Gibbs energy (like CLI does)
log_files = sorted(
glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
)
molecules = []
conformer_ids = []
for log_file in log_files:
mol = Molecule.from_filepath(
filepath=log_file, index="-1", return_list=False
)
if mol is not None:
# Extract and set Gibbs energy using gibbs_free_energy property
g16_output = Gaussian16Output(filename=log_file)
gibbs_energy = g16_output.gibbs_free_energy
if gibbs_energy is not None:
mol._energy = gibbs_energy
molecules.append(mol)
basename = os.path.basename(log_file)
match = re.search(r"_c(\d+)\.log$", basename)
if match:
conformer_ids.append(f"c{match.group(1)}")
else:
conformer_ids.append(basename)
assert len(molecules) == 5
assert len(conformer_ids) == 5
# All log conformers should have Gibbs energy
for mol in molecules:
assert mol.energy is not None
grouper = EnergyGrouper(
molecules,
threshold=1.0,
num_procs=self.NUM_PROCS,
conformer_ids=conformer_ids,
)
groups, group_indices = grouper.group()
assert len(groups) == 2
assert len(group_indices) == 2
unique_structures = grouper.unique()
assert len(unique_structures) == 2
expected1 = -4.1429 / 627.509474
expected2 = 4.1429 / 627.509474
relative_diff, abs_diff = grouper._calculate_energy_diff((1, 4))
assert np.isclose(relative_diff, expected1, rtol=1e-2)
assert np.isclose(abs_diff, expected2, rtol=1e-2)
[docs]
def test_energy_extraction_from_xyz_file(
self, multiple_molecules_xyz_file, temp_working_dir
):
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
# Test specific energy value for 3rd structure (index 2)
expected_energy_mol3 = -126.25238449
actual_energy_mol3 = molecules[2].energy
assert actual_energy_mol3 is not None, "3rd molecule energy is None"
assert np.isclose(
actual_energy_mol3, expected_energy_mol3, rtol=1e-8
), (
f"Energy mismatch for 3rd molecule: got {actual_energy_mol3}, "
f"expected {expected_energy_mol3}"
)
[docs]
def test_gibbs_energy_extraction_function(
self, ts_conformers_log_directory, temp_working_dir
):
"""Test that Gaussian16Output.gibbs_free_energy extracts correct value."""
from chemsmart.io.gaussian.output import Gaussian16Output
log_file = os.path.join(
ts_conformers_log_directory, "ch_1c_para_c1.log"
)
g16_output = Gaussian16Output(filename=log_file)
# Known values
expected_scf = -1401.99431519
expected_gibbs_correction = 0.311863
expected_gibbs_energy = (
expected_scf + expected_gibbs_correction
) # -1401.68245219
gibbs_energy = g16_output.gibbs_free_energy
assert np.isclose(gibbs_energy, expected_gibbs_energy, rtol=1e-6), (
f"Gibbs energy mismatch: got {gibbs_energy}, "
f"expected {expected_gibbs_energy}"
)
[docs]
def test_energy_extraction_from_ts_log_files(
self, ts_conformers_log_directory, temp_working_dir
):
"""Test that energy is correctly extracted from TS log files as SCF Done energy."""
import glob
from chemsmart.io.gaussian.output import Gaussian16Output
from chemsmart.io.molecules.structure import Molecule
log_files = sorted(
glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
)
molecules = []
for log_file in log_files:
mol = Molecule.from_filepath(
filepath=log_file, index="-1", return_list=False
)
if mol is not None:
molecules.append(mol)
g16_output = Gaussian16Output(filename=log_file)
scf_energy = (
g16_output.scf_energies[-1]
if g16_output.scf_energies
else None
)
assert np.isclose(mol.energy, scf_energy, rtol=1e-6), (
f"Energy mismatch for {log_file}: "
f"mol.energy={mol.energy}, expected SCF={scf_energy}"
)
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Testfactory:
[docs]
def test_structure_grouper_factory_energy(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test factory creation of energy grouper (requires molecules with energy)."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
factory = StructureGrouperFactory()
energy_grouper = factory.create(molecules, strategy="energy")
assert isinstance(energy_grouper, EnergyGrouper)
[docs]
def test_structure_grouper_factory(
self, methanol_molecules, temp_working_dir
):
factory = StructureGrouperFactory()
rmsd_grouper = factory.create(methanol_molecules, strategy="rmsd")
assert isinstance(rmsd_grouper, RMSDGrouper)
hrmsd_grouper = factory.create(methanol_molecules, strategy="hrmsd")
assert isinstance(hrmsd_grouper, RMSDGrouper)
spyrmsd_grouper = factory.create(
methanol_molecules, strategy="spyrmsd"
)
assert isinstance(spyrmsd_grouper, RMSDGrouper)
# irmsd requires external command, skip if not available
if find_irmsd_command():
irmsd_grouper = factory.create(
methanol_molecules, strategy="irmsd"
)
assert isinstance(irmsd_grouper, RMSDGrouper)
pymolrmsd_grouper = factory.create(
methanol_molecules, strategy="pymolrmsd"
)
assert isinstance(pymolrmsd_grouper, RMSDGrouper)
# Explicitly cleanup pymolrmsd_grouper to prevent __del__ from calling quit()
if (
hasattr(pymolrmsd_grouper, "_temp_dir")
and pymolrmsd_grouper._temp_dir
):
shutil.rmtree(pymolrmsd_grouper._temp_dir, ignore_errors=True)
pymolrmsd_grouper._temp_dir = None
pymolrmsd_grouper.cmd = None # Prevent __del__ from calling quit()
torsion_grouper = factory.create(
methanol_molecules, strategy="torsion"
)
assert isinstance(torsion_grouper, TorsionFingerprintGrouper)
formula_grouper = factory.create(
methanol_molecules, strategy="formula"
)
assert isinstance(formula_grouper, FormulaGrouper)
connectivity_grouper = factory.create(
methanol_molecules, strategy="connectivity"
)
assert isinstance(connectivity_grouper, ConnectivityGrouper)
tanimoto_grouper = factory.create(
methanol_molecules, strategy="tanimoto"
)
assert isinstance(tanimoto_grouper, TanimotoSimilarityGrouper)
rdkit_isomorphism_grouper = factory.create(
methanol_molecules, strategy="isomorphism"
)
assert isinstance(rdkit_isomorphism_grouper, RDKitIsomorphismGrouper)
[docs]
@classmethod
def teardown_class(cls):
"""Clean up PyMOL after all tests in this class are done."""
try:
from pymol import cmd
cmd.reinitialize()
except Exception:
# Ignore cleanup errors if PyMOL is not available
pass
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_grouper_utility_functions:
"""Test utility functions and helper methods in groupers."""
NUM_PROCS = 1
[docs]
def test_rmsd_matrix_with_num_groups(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that RMSD matrix filename reflects num_groups when used."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = BasicRMSDGrouper(
molecules,
num_groups=3,
num_procs=1,
label="test_num_groups",
)
_, _ = grouper.group()
# Check that Excel file was created in the temp_working_dir
expected_file = os.path.join(
temp_working_dir,
"test_num_groups_group_result",
"test_num_groups_BasicRMSDGrouper_N3.xlsx",
)
assert os.path.exists(
expected_file
), f"Matrix file not found: {expected_file}"
[docs]
def test_grouping_result_caching(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that grouping results are cached and reused."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = BasicRMSDGrouper(
molecules,
threshold=0.5,
num_procs=1,
)
groups1, indices1 = grouper.group()
assert grouper._cached_groups is not None
assert grouper._cached_group_indices is not None
# unique() should use cached results
unique_mols = grouper.unique()
assert len(unique_mols) == len(groups1)
[docs]
def test_unique_returns_lowest_energy_representative(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that unique() returns lowest energy molecule from each group."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
# CREST conformers have energy information
grouper = BasicRMSDGrouper(
molecules,
threshold=2.0,
num_procs=1,
)
groups, group_indices = grouper.group()
unique_mols = grouper.unique()
# For each group, verify the representative has lowest energy
for i, (group, indices) in enumerate(zip(groups, group_indices)):
mols_with_energy = [
(mol, idx)
for mol, idx in zip(group, indices)
if mol.energy is not None
]
if mols_with_energy:
min_energy_mol = min(
mols_with_energy, key=lambda x: x[0].energy
)[0]
assert unique_mols[i].energy == min_energy_mol.energy
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_grouper_complete_linkage:
"""Test complete linkage clustering behavior."""
NUM_PROCS = 4
[docs]
def test_complete_linkage_prevents_chaining(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that complete linkage prevents chaining effect in grouping."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
# With a moderate threshold, complete linkage should create
# groups where ALL members are within threshold of each other
grouper = BasicRMSDGrouper(
molecules,
threshold=1.0,
num_procs=self.NUM_PROCS,
)
groups, group_indices = grouper.group()
# Verify complete linkage property: within each group,
# all pairs should have RMSD < threshold
for group_idx, indices in enumerate(group_indices):
if len(indices) > 1:
for i, idx_i in enumerate(indices):
for j, idx_j in enumerate(indices):
if i < j:
rmsd = grouper._calculate_rmsd((idx_i, idx_j))
assert rmsd < 1.0, (
f"Group {group_idx}: RMSD between {idx_i} and {idx_j} "
f"is {rmsd}, exceeds threshold 1.0"
)
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_conformer_ids_functionality:
"""Test conformer_ids parameter functionality."""
NUM_PROCS = 1
[docs]
def test_conformer_ids_from_log_directory(
self, ts_conformers_log_directory, temp_working_dir
):
"""Test loading conformer IDs from a directory of log files."""
import glob
import re
from chemsmart.io.molecules.structure import Molecule
# Load molecules from log files in the ts_conformers_log_directory
log_files = sorted(
glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
)
assert (
len(log_files) == 5
), f"Expected 5 log files, found {len(log_files)}"
# Extract conformer IDs from filenames (e.g., ch_1c_para_c1.log -> c1)
molecules = []
conformer_ids = []
for log_file in log_files:
mol = Molecule.from_filepath(
filepath=log_file, index="-1", return_list=False
)
if mol is not None:
molecules.append(mol)
# Extract conformer ID from filename
basename = os.path.basename(log_file)
match = re.search(r"_c(\d+)\.log$", basename)
if match:
conformer_ids.append(f"c{match.group(1)}")
else:
conformer_ids.append(basename)
assert len(molecules) >= 2, "Need at least 2 molecules for grouping"
grouper = BasicRMSDGrouper(
molecules,
threshold=1.0,
num_procs=1,
label="ts_conformers_test",
conformer_ids=conformer_ids,
)
_, _ = grouper.group()
# Verify conformer_ids are stored
assert grouper.conformer_ids == conformer_ids
# Check Excel output
import pandas as pd
excel_file = os.path.join(
"ts_conformers_test_group_result",
"ts_conformers_test_BasicRMSDGrouper_T1.0.xlsx",
)
assert os.path.exists(
excel_file
), f"Excel file not found: {excel_file}"
# Matrix data starts at row 8 (0-indexed: skiprows=13), first column is index
df = pd.read_excel(
excel_file, sheet_name="RMSD_Matrix", skiprows=12, index_col=0
)
# Check that conformer IDs are used as labels
assert "c1" in str(df.columns[0]) or "c1" in str(df.index[0])
[docs]
def test_traj_conformer_ids_original_indices(self, temp_working_dir):
"""Test that traj job correctly sets original conformer indices."""
from chemsmart.io.molecules.structure import Molecule
# Create test molecules simulating a trajectory
mol = Molecule.from_pubchem(identifier="CO")
molecules = [mol.copy() for _ in range(18)]
# Simulate traj behavior: select last 50% (indices 10-18 in original)
proportion = 0.5
total = len(molecules)
last_num = int(round(total * proportion, 1))
_ = molecules[-last_num:] # selected_molecules - not used in this test
# Calculate expected original indices (1-based)
start_original_index = total - last_num + 1
expected_ids = [str(start_original_index + i) for i in range(last_num)]
# Verify expected IDs
assert expected_ids[0] == "10" # First selected is original index 10
assert expected_ids[-1] == "18" # Last selected is original index 18
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_output_file_generation:
"""Test that grouper generates correct output files."""
NUM_PROCS = 1
[docs]
def test_group_xyz_files_contain_energy_and_index_info(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that group XYZ files contain energy and original index information."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = BasicRMSDGrouper(
molecules[:6],
threshold=1.0,
num_procs=1,
label="info_test",
)
_, _ = grouper.group()
grouper.unique() # Generates xyz files
output_dir = "info_test_group_result"
first_group_file = os.path.join(output_dir, "info_test_group_1.xyz")
with open(first_group_file, "r") as f:
content = f.read()
# Check for expected information in comments
assert "Original_Index:" in content
assert "E" in content
[docs]
def test_group_xyz_files_sorted_by_energy(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that molecules in group XYZ files are sorted by energy (lowest first)."""
import re
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = BasicRMSDGrouper(
molecules,
threshold=2.0, # Higher threshold to get more molecules per group
num_procs=1,
label="sort_test",
)
groups, group_indices = grouper.group()
grouper.unique() # Generates xyz files
output_dir = "sort_test_group_result"
# Check each group file
for group_num in range(len(groups)):
group_file = os.path.join(
output_dir, f"sort_test_group_{group_num + 1}.xyz"
)
with open(group_file, "r") as f:
content = f.read()
# Extract energies from file
lines = content.strip().split("\n")
i = 0
energies = []
while i < len(lines):
num_atoms = int(lines[i].strip())
comment_line = lines[i + 1]
energy_match = re.search(
r"E\(Hartree\):\s*([-\d.]+)", comment_line
)
if energy_match:
energies.append(float(energy_match.group(1)))
i += num_atoms + 2
# Verify sorted by energy (lowest first)
for j in range(len(energies) - 1):
assert energies[j] <= energies[j + 1], (
f"Group {group_num + 1} not sorted: "
f"energy[{j}]={energies[j]} > energy[{j+1}]={energies[j+1]}"
)
[docs]
def test_group_xyz_files_energy_from_log_files(
self, ts_conformers_log_directory, temp_working_dir
):
"""Test that energy is correctly extracted from log files and written to group XYZ files."""
import glob
import re
from chemsmart.io.molecules.structure import Molecule
# Load molecules from log files
log_files = sorted(
glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
)
assert (
len(log_files) >= 2
), f"Need at least 2 log files, found {len(log_files)}"
molecules = []
conformer_ids = []
original_energies = {}
for idx, log_file in enumerate(log_files):
mol = Molecule.from_filepath(
filepath=log_file, index="-1", return_list=False
)
if mol is not None:
molecules.append(mol)
# Store original energy for verification
original_energies[idx] = mol.energy
# Extract conformer ID from filename
basename = os.path.basename(log_file)
match = re.search(r"_c(\d+)\.log$", basename)
if match:
conformer_ids.append(f"c{match.group(1)}")
else:
conformer_ids.append(basename)
assert len(molecules) >= 2, "Need at least 2 molecules for grouping"
grouper = BasicRMSDGrouper(
molecules,
threshold=5.0, # High threshold to group all together
num_procs=1,
label="log_energy_test",
conformer_ids=conformer_ids,
)
_, _ = grouper.group()
grouper.unique() # Generates xyz files
# Check group file
output_dir = "log_energy_test_group_result"
first_group_file = os.path.join(
output_dir, "log_energy_test_group_1.xyz"
)
assert os.path.exists(
first_group_file
), f"Group file not found: {first_group_file}"
with open(first_group_file, "r") as f:
content = f.read()
# Verify energy values in the output
assert "E(Hartree):" in content
# Parse and verify each energy value
lines = content.strip().split("\n")
i = 0
energies_found = 0
while i < len(lines):
try:
num_atoms = int(lines[i].strip())
except ValueError:
break
comment_line = lines[i + 1]
# Extract energy from comment
energy_match = re.search(
r"E\(Hartree\):\s*([-\d.]+)", comment_line
)
if energy_match:
extracted_energy = float(energy_match.group(1))
# Verify the energy is a reasonable value (not zero or nan)
assert (
extracted_energy < 0
), f"Energy should be negative: {extracted_energy}"
energies_found += 1
i += num_atoms + 2
assert energies_found > 0, "No energies found in output file"
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_edge_cases:
"""Test edge cases and boundary conditions."""
[docs]
def test_two_molecules_grouping(self, temp_working_dir):
"""Test grouping with minimum number of molecules (2)."""
from chemsmart.io.molecules.structure import Molecule
mol1 = Molecule.from_pubchem(identifier="CO")
mol2 = mol1.copy()
grouper = BasicRMSDGrouper([mol1, mol2], threshold=0.5)
groups, group_indices = grouper.group()
# Two identical molecules should form one group
assert len(groups) == 1
assert len(groups[0]) == 2
[docs]
def test_num_groups_equals_num_molecules(
self, methanol_molecules, temp_working_dir
):
"""Test requesting same number of groups as molecules."""
grouper = BasicRMSDGrouper(
methanol_molecules,
num_groups=len(methanol_molecules),
)
groups, group_indices = grouper.group()
# Should create one group per molecule (or fewer if some are identical)
assert len(groups) <= len(methanol_molecules)
[docs]
def test_num_groups_exceeds_num_molecules(
self, methanol_molecules, temp_working_dir
):
"""Test requesting more groups than molecules."""
n_mols = len(methanol_molecules)
grouper = BasicRMSDGrouper(
methanol_molecules,
num_groups=n_mols + 5,
)
groups, group_indices = grouper.group()
# Should create at most n_mols groups
assert len(groups) <= n_mols
[docs]
def test_very_low_threshold(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test with very low threshold (should create many groups)."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = BasicRMSDGrouper(
molecules,
threshold=0.001, # Very low threshold
num_procs=1,
)
groups, group_indices = grouper.group()
# With very low threshold, most molecules should be in separate groups
# (unless they're truly identical)
assert len(groups) >= len(molecules) - 1
[docs]
def test_very_high_threshold(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test with very high threshold (should create few groups)."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = BasicRMSDGrouper(
molecules,
threshold=100.0, # Very high threshold
num_procs=1,
)
groups, group_indices = grouper.group()
# With very high threshold, all molecules with same formula should be in one group
assert len(groups) <= 3 # Likely 1-2 groups
[docs]
def test_different_formulas_always_separate(
self, methanol_and_ethanol, temp_working_dir
):
"""Test that molecules with different formulas are always in separate groups."""
grouper = BasicRMSDGrouper(
methanol_and_ethanol,
threshold=100.0, # Even with high threshold
)
groups, group_indices = grouper.group()
# Methanol and ethanol should always be separate
assert len(groups) == 2
[docs]
def test_rmsd_infinity_for_different_molecules(
self, methanol_and_ethanol, temp_working_dir
):
"""Test that RMSD returns infinity for molecules with different atom counts."""
grouper = BasicRMSDGrouper(methanol_and_ethanol, threshold=0.5)
rmsd = grouper._calculate_rmsd((0, 1))
assert rmsd == np.inf or rmsd == float("inf")
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_label_and_append_label:
"""Test -l (label) and -a (append_label) parameter functionality."""
NUM_PROCS = 1
[docs]
def test_get_label_function(self, temp_working_dir):
"""Test _get_label function logic."""
from chemsmart.cli.grouper.grouper import _get_label
# Test with label only
result = _get_label(
label="custom", append_label=None, base_label="base"
)
assert result == "custom"
# Test with append_label only
result = _get_label(
label=None, append_label="suffix", base_label="base"
)
assert result == "base_suffix"
# Test with neither (use base_label)
result = _get_label(label=None, append_label=None, base_label="base")
assert result == "base"
# Test with both should raise error
with pytest.raises(ValueError) as excinfo:
_get_label(
label="custom", append_label="suffix", base_label="base"
)
assert "Only give label or append_label" in str(excinfo.value)
[docs]
def test_label_in_output_directory(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that label parameter affects output directory name."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)[:5]
# Test with custom label
grouper = BasicRMSDGrouper(
molecules,
threshold=1.0,
num_procs=self.NUM_PROCS,
label="my_custom_label",
)
_, _ = grouper.group()
# Check output directory uses custom label
output_dir = "my_custom_label_group_result"
assert os.path.exists(
output_dir
), f"Output dir not found: {output_dir}"
# Check Excel file uses custom label
excel_file = os.path.join(
output_dir, "my_custom_label_BasicRMSDGrouper_T1.0.xlsx"
)
assert os.path.exists(
excel_file
), f"Excel file not found: {excel_file}"
[docs]
def test_label_in_group_xyz_files(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that label parameter affects group XYZ file names."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)[:5]
grouper = BasicRMSDGrouper(
molecules,
threshold=1.0,
num_procs=self.NUM_PROCS,
label="test_label",
)
groups, group_indices = grouper.group()
grouper.unique() # Generates xyz files
output_dir = "test_label_group_result"
# Check group XYZ files use label
for i in range(len(groups)):
xyz_path = os.path.join(output_dir, f"test_label_group_{i+1}.xyz")
assert os.path.exists(xyz_path), f"Group XYZ not found: {xyz_path}"
[docs]
def test_different_labels_create_different_outputs(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that different labels create separate output directories."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)[:5]
# First grouper with label "run1"
grouper1 = BasicRMSDGrouper(
molecules,
threshold=1.0,
num_procs=self.NUM_PROCS,
label="run1",
)
grouper1.group()
# Second grouper with label "run2"
grouper2 = BasicRMSDGrouper(
molecules,
threshold=1.0,
num_procs=self.NUM_PROCS,
label="run2",
)
grouper2.group()
# Both output directories should exist
assert os.path.exists("run1_group_result")
assert os.path.exists("run2_group_result")
# Both should have their own Excel files
assert os.path.exists(
"run1_group_result/run1_BasicRMSDGrouper_T1.0.xlsx"
)
assert os.path.exists(
"run2_group_result/run2_BasicRMSDGrouper_T1.0.xlsx"
)
[docs]
def test_label_with_num_groups(
self, multiple_molecules_xyz_file, temp_working_dir
):
"""Test that label works correctly with num_groups parameter."""
xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
molecules = xyz_file.get_molecules(index=":", return_list=True)
grouper = BasicRMSDGrouper(
molecules,
num_groups=5,
num_procs=self.NUM_PROCS,
label="numgroups_test",
)
_, _ = grouper.group()
output_dir = "numgroups_test_group_result"
assert os.path.exists(output_dir)
# Excel file should reflect num_groups (N5) instead of threshold
excel_file = os.path.join(
output_dir, "numgroups_test_BasicRMSDGrouper_N5.xlsx"
)
assert os.path.exists(
excel_file
), f"Excel file not found: {excel_file}"
[docs]
class TestConformerIdExtraction:
"""Test conformer ID extraction from filenames."""
@staticmethod
def _sort_key(x):
"""Sort key for file_info: cXX files by number first, then others by filename."""
if x[2] is not None:
return (0, x[2], x[3])
else:
return (1, 0, x[3])
[docs]
def test_extract_conformer_id_with_pattern(self):
"""Test extraction with _cXX_ pattern."""
from chemsmart.cli.grouper.grouper import _extract_conformer_id
assert _extract_conformer_id("structure_c1_opt.log") == "c1"
assert _extract_conformer_id("mol_c12.log") == "c12"
assert (
_extract_conformer_id("/path/to/structure_c123_ts.log") == "c123"
)
[docs]
def test_extract_conformer_id_without_pattern(self):
"""Test extraction without _cXX_ pattern returns None."""
from chemsmart.cli.grouper.grouper import _extract_conformer_id
assert _extract_conformer_id("mol_opt.log") is None
assert _extract_conformer_id("structure.log") is None
assert _extract_conformer_id("/path/to/calc.log") is None
[docs]
def test_conformer_id_fallback_to_filename(self):
"""Test that filename is used as conformer ID when no _cXX_ pattern."""
from chemsmart.cli.grouper.grouper import _extract_conformer_id
test_files = [
"/path/to/mol_opt.log",
"/path/to/structure_ts.out",
]
conformer_ids = []
for f in test_files:
conf_id = _extract_conformer_id(f)
if conf_id is None:
basename = os.path.basename(f)
conf_id = os.path.splitext(basename)[0]
conformer_ids.append(conf_id)
assert conformer_ids == ["mol_opt", "structure_ts"]
[docs]
def test_conformer_ids_molecules_correspondence(
self, ts_conformers_log_directory, temp_working_dir
):
"""Test that conformer_ids and molecules are strictly one-to-one corresponding.
This verifies that after sorting, each molecule's energy matches the energy
from its corresponding file (identified by conf_id).
"""
import glob
import re
from chemsmart.cli.grouper.grouper import _extract_conformer_id
from chemsmart.io.gaussian.output import Gaussian16Output
from chemsmart.io.molecules.structure import Molecule
# First, build a mapping from conf_id to expected gibbs_energy
log_files = glob.glob(
os.path.join(ts_conformers_log_directory, "*.log")
)
expected_energies = {}
for f in log_files:
conf_id = _extract_conformer_id(f)
basename = os.path.basename(f)
name_without_ext = os.path.splitext(basename)[0]
if conf_id is None:
conf_id = name_without_ext
g16_output = Gaussian16Output(filename=f)
gibbs_energy = g16_output.gibbs_free_energy
if gibbs_energy is not None:
expected_energies[conf_id] = gibbs_energy
# Now simulate the actual loading logic with sorting
file_info = []
for f in log_files:
conf_id = _extract_conformer_id(f)
basename = os.path.basename(f)
name_without_ext = os.path.splitext(basename)[0]
if conf_id:
num = int(re.search(r"\d+", conf_id).group())
file_info.append((f, conf_id, num, name_without_ext))
else:
file_info.append((f, name_without_ext, None, name_without_ext))
file_info.sort(key=self._sort_key)
# Load molecules
molecules = []
conformer_ids = []
for filepath, conf_id, _, _ in file_info:
try:
mol = Molecule.from_filepath(
filepath=filepath, index="-1", return_list=False
)
if mol is not None:
g16_output = Gaussian16Output(filename=filepath)
gibbs_energy = g16_output.gibbs_free_energy
if gibbs_energy is not None:
mol._energy = gibbs_energy
molecules.append(mol)
conformer_ids.append(conf_id)
except Exception:
pass
# Verify one-to-one correspondence
assert len(molecules) == len(conformer_ids)
# Verify each molecule's energy matches the expected energy for its conf_id
for i, (mol, conf_id) in enumerate(zip(molecules, conformer_ids)):
if conf_id in expected_energies:
expected_energy = expected_energies[conf_id]
assert np.isclose(mol.energy, expected_energy, rtol=1e-10), (
f"Index {i}: energy mismatch for {conf_id}, "
f"got {mol.energy}, expected {expected_energy}"
)
[docs]
def test_mixed_pattern_files_correspondence(self):
"""Test correspondence when mixing files with and without _cXX_ pattern."""
from chemsmart.cli.grouper.grouper import _extract_conformer_id
test_files = [
("mol_c1_opt.log", "c1"),
("mol_c2_opt.log", "c2"),
("other_calc.log", None),
("mol_c3_opt.log", "c3"),
]
file_info = []
for filename, expected_pattern in test_files:
conf_id = _extract_conformer_id(filename)
basename = os.path.basename(filename)
name_without_ext = os.path.splitext(basename)[0]
if conf_id:
assert conf_id == expected_pattern
num = int(conf_id.replace("c", ""))
file_info.append((filename, conf_id, num, name_without_ext))
else:
assert expected_pattern is None
file_info.append(
(filename, name_without_ext, None, name_without_ext)
)
file_info.sort(key=self._sort_key)
sorted_conf_ids = [info[1] for info in file_info]
# Files with pattern first (sorted by number), then others by filename
assert sorted_conf_ids == ["c1", "c2", "c3", "other_calc"]
[docs]
def test_all_files_without_cxx_pattern(self):
"""Test that a folder with no _cXX_ pattern files works correctly."""
from chemsmart.cli.grouper.grouper import _extract_conformer_id
test_files = [
"mol_opt.log",
"structure_ts.log",
"calc.log",
"another_mol.log",
]
file_info = []
for filename in test_files:
conf_id = _extract_conformer_id(filename)
basename = os.path.basename(filename)
name_without_ext = os.path.splitext(basename)[0]
if conf_id:
num = int(conf_id.replace("c", ""))
file_info.append((filename, conf_id, num, name_without_ext))
else:
file_info.append(
(filename, name_without_ext, None, name_without_ext)
)
# All files should have None as num
for info in file_info:
assert info[2] is None
file_info.sort(key=self._sort_key)
sorted_conf_ids = [info[1] for info in file_info]
# All files sorted alphabetically by filename
assert sorted_conf_ids == [
"another_mol",
"calc",
"mol_opt",
"structure_ts",
]
[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_energy_extraction_function:
"""Framework tests for energy extraction by file type."""
[docs]
def test_energy_extraction_gaussian(
self, gaussian_dppeFeCl2_link_opt_outputfile
):
from chemsmart.analysis.thermochemistry import Thermochemistry
from chemsmart.cli.grouper.grouper import (
_extract_energy_based_on_energy_type,
)
thermo = Thermochemistry(
filename=gaussian_dppeFeCl2_link_opt_outputfile
)
extracted = _extract_energy_based_on_energy_type(thermo, "E")
assert np.isclose(extracted, -3869.01351827, rtol=1e-7)
thermo = Thermochemistry(
filename=gaussian_dppeFeCl2_link_opt_outputfile
)
extracted = _extract_energy_based_on_energy_type(thermo, "H")
assert np.isclose(extracted, -3868.552084, rtol=1e-7)
thermo = Thermochemistry(
filename=gaussian_dppeFeCl2_link_opt_outputfile
)
extracted = _extract_energy_based_on_energy_type(thermo, "G")
assert np.isclose(extracted, -3868.648944, rtol=1e-7)
thermo_qhh = Thermochemistry(
filename=gaussian_dppeFeCl2_link_opt_outputfile,
temperature=298.15,
concentration=1.0,
h_freq_cutoff=100.0,
s_freq_cutoff=100.0,
use_weighted_mass=True,
entropy_method="grimme",
)
extracted = _extract_energy_based_on_energy_type(thermo_qhh, "qhH")
assert np.isclose(extracted, -3868.558465, rtol=1e-7)
thermo_qhg = Thermochemistry(
filename=gaussian_dppeFeCl2_link_opt_outputfile,
temperature=298.15,
concentration=1.0,
h_freq_cutoff=100.0,
s_freq_cutoff=100.0,
use_weighted_mass=True,
entropy_method="grimme",
)
extracted = _extract_energy_based_on_energy_type(thermo_qhg, "qhG")
assert np.isclose(extracted, -3868.645329, rtol=1e-7)
[docs]
def test_energy_extraction_orca(self, fe2_singlet_output):
from chemsmart.analysis.thermochemistry import Thermochemistry
from chemsmart.cli.grouper.grouper import (
_extract_energy_based_on_energy_type,
)
thermo = Thermochemistry(filename=fe2_singlet_output)
extracted = _extract_energy_based_on_energy_type(thermo, "E")
assert np.isclose(extracted, -1568.256171848101, rtol=1e-7)
thermo = Thermochemistry(filename=fe2_singlet_output)
extracted = _extract_energy_based_on_energy_type(thermo, "H")
assert np.isclose(extracted, -1568.14237774, rtol=1e-7)
thermo = Thermochemistry(filename=fe2_singlet_output)
extracted = _extract_energy_based_on_energy_type(thermo, "G")
assert np.isclose(extracted, -1568.18873516, rtol=1e-7)
thermo_qhh = Thermochemistry(
filename=fe2_singlet_output,
temperature=298.15,
concentration=1.0,
h_freq_cutoff=100.0,
s_freq_cutoff=100.0,
use_weighted_mass=True,
entropy_method="grimme",
)
extracted = _extract_energy_based_on_energy_type(thermo_qhh, "qhH")
assert np.isclose(extracted, -1568.143280, rtol=1e-7)
thermo_qhg = Thermochemistry(
filename=fe2_singlet_output,
temperature=298.15,
concentration=1.0,
h_freq_cutoff=100.0,
s_freq_cutoff=100.0,
use_weighted_mass=True,
entropy_method="grimme",
)
extracted = _extract_energy_based_on_energy_type(thermo_qhg, "qhG")
assert np.isclose(extracted, -1568.186619, rtol=1e-7)