Source code for tests.test_groupers

import os
import shutil
import tempfile

import numpy as np
import pytest

from chemsmart.io.xyz.xyzfile import XYZFile
from chemsmart.jobs.grouper.base import MoleculeGrouper
from chemsmart.jobs.grouper.connectivity import ConnectivityGrouper
from chemsmart.jobs.grouper.energy import EnergyGrouper
from chemsmart.jobs.grouper.formula import FormulaGrouper
from chemsmart.jobs.grouper.isomorphism import RDKitIsomorphismGrouper
from chemsmart.jobs.grouper.rmsd import (
    BasicRMSDGrouper,
    HungarianRMSDGrouper,
    IRMSDGrouper,
    PymolRMSDGrouper,
    RMSDGrouper,
    SpyRMSDGrouper,
)
from chemsmart.jobs.grouper.tanimoto import TanimotoSimilarityGrouper
from chemsmart.jobs.grouper.tfd import TorsionFingerprintGrouper
from chemsmart.utils.grouper import StructureGrouperFactory
from chemsmart.utils.utils import find_irmsd_command, kabsch_align



[docs]
@pytest.fixture
def temp_working_dir():
    """
    Pytest fixture to create a temporary directory and change to it for testing.
    This prevents group_result folders from being created in the project directory.
    """
    original_dir = os.getcwd()
    temp_dir = tempfile.mkdtemp()

    try:
        os.chdir(temp_dir)
        yield temp_dir
    finally:
        os.chdir(original_dir)
        # Clean up the temporary directory
        shutil.rmtree(temp_dir, ignore_errors=True)




[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_BasicRMSD_grouper_and_basic_functionality:
    NUM_PROCS = 4


[docs]
    def test_rmsd_grouper(
        self, methanol_molecules, methanol_and_ethanol, temp_working_dir
    ):
        methanol = methanol_molecules[0]
        methanol_rot1 = methanol_molecules[1]
        assert np.any(
            methanol.positions != methanol_rot1.positions
        ), "Rotated molecule should have different positions."
        grouper = BasicRMSDGrouper(methanol_molecules)
        groups, group_indices = grouper.group()
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(
            rmsd, 0.0, rtol=1e-3
        ), "RMSD should be close to zero."
        rmsd = grouper._calculate_rmsd((1, 2))
        assert np.isclose(
            rmsd, 0.0, rtol=1e-3
        ), "RMSD should be close to zero."
        _, _, _, _, rmsd_kabsch = kabsch_align(
            methanol.positions, methanol_rot1.positions
        )
        assert np.isclose(rmsd_kabsch, 0.0, rtol=1e-3)
        assert (
            len(groups) == 1
        ), "Molecules should form one group based on geometry."
        assert (
            len(group_indices) == 1
        ), "Molecules should form one group based on geometry."
        unique_structures = grouper.unique()
        assert (
            len(unique_structures) == 1
        ), "Molecules should form one group based on geometry."

        grouper2 = BasicRMSDGrouper(methanol_and_ethanol)
        groups, group_indices = grouper2.group()
        assert (
            len(groups) == 2
        ), "Molecules should form two groups based on geometry."
        assert (
            len(group_indices) == 2
        ), "Molecules should form two groups based on geometry."
        rmsd = grouper2._calculate_rmsd((0, 1))
        assert (
            rmsd is np.inf
        ), "RMSD is set to be infinity for different molecules."
        unique_structures = grouper2.unique()
        assert (
            len(unique_structures) == 2
        ), "Molecules should form two groups based on geometry."



[docs]
    def test_rmsd_grouper_for_crest_conformers(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)

        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = BasicRMSDGrouper(
            molecules, threshold=0.2, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 18
        assert len(group_indices) == 18
        unique_structures = grouper.unique()
        assert len(unique_structures) == 18

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.409, rtol=1e-3)

        _, _, _, _, rmsd = kabsch_align(
            molecules[0].positions, molecules[1].positions
        )
        assert np.isclose(rmsd, 0.409, rtol=1e-3)

        grouper2 = BasicRMSDGrouper(
            molecules, threshold=0.5, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper2.group()
        assert len(groups) == 12
        assert len(group_indices) == 12
        unique_structures = grouper2.unique()
        assert len(unique_structures) == 12

        grouper3 = BasicRMSDGrouper(
            molecules, threshold=1.0, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper3.group()
        assert len(groups) == 11
        assert len(group_indices) == 11
        unique_structures = grouper3.unique()
        assert len(unique_structures) == 11

        grouper4 = BasicRMSDGrouper(
            molecules, threshold=1.5, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper4.group()
        assert len(groups) == 9
        assert len(group_indices) == 9
        unique_structures = grouper4.unique()
        assert len(unique_structures) == 9

        grouper5 = BasicRMSDGrouper(
            molecules, threshold=2.0, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper5.group()
        assert len(groups) == 8
        assert len(group_indices) == 8
        unique_structures = grouper5.unique()
        assert len(unique_structures) == 8

        grouper6 = BasicRMSDGrouper(
            molecules, threshold=2.5, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper6.group()
        assert len(groups) == 4
        assert len(group_indices) == 4
        unique_structures = grouper6.unique()
        assert len(unique_structures) == 4



[docs]
    def test_num_groups_parameter(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)

        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = BasicRMSDGrouper(
            molecules, threshold=None, num_groups=17, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 17
        assert len(group_indices) == 17
        unique_structures = grouper.unique()
        assert len(unique_structures) == 17



[docs]
    def test_pick_the_lowestenergy_conformers(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)

        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = BasicRMSDGrouper(
            molecules, threshold=None, num_groups=3, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 3
        assert len(group_indices) == 3
        unique_structures = grouper.unique()
        assert len(unique_structures) == 3

        energies = [mol.energy for mol in unique_structures]
        assert -126.2575508 in energies
        assert -126.25153216 in energies
        assert -126.24909661 in energies



[docs]
    def test_rmsd_grouper_for_crest_conformers_ignore_Hs(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)

        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = BasicRMSDGrouper(
            molecules,
            threshold=0.2,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 17
        assert len(group_indices) == 17
        unique_structures = grouper.unique()
        assert len(unique_structures) == 17

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.301, rtol=1e-3)  # removed H atoms

        _, _, _, _, rmsd = kabsch_align(
            molecules[0].positions, molecules[1].positions
        )
        assert np.isclose(rmsd, 0.409, rtol=1e-3)  # did not remove H atoms

        grouper2 = BasicRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper2.group()
        assert len(groups) == 12
        assert len(group_indices) == 12
        unique_structures = grouper2.unique()
        assert len(unique_structures) == 12

        grouper3 = BasicRMSDGrouper(
            molecules,
            threshold=1.0,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper3.group()
        assert len(groups) == 10
        assert len(group_indices) == 10
        unique_structures = grouper3.unique()
        assert len(unique_structures) == 10

        grouper4 = BasicRMSDGrouper(
            molecules,
            threshold=1.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper4.group()
        assert len(groups) == 7
        assert len(group_indices) == 7
        unique_structures = grouper4.unique()
        assert len(unique_structures) == 7

        grouper5 = BasicRMSDGrouper(
            molecules,
            threshold=2.0,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper5.group()
        assert len(groups) == 5
        assert len(group_indices) == 5
        unique_structures = grouper5.unique()
        assert len(unique_structures) == 5

        grouper6 = BasicRMSDGrouper(
            molecules,
            threshold=2.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper6.group()
        assert len(groups) == 5
        assert len(group_indices) == 5
        unique_structures = grouper6.unique()
        assert len(unique_structures) == 5



[docs]
    def test_rmsd_grouper_for_rotated_molecules(
        self, two_rotated_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 2
        grouper = BasicRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 2
        assert len(group_indices) == 2
        unique_structures = grouper.unique()
        assert len(unique_structures) == 2

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.611, rtol=1e-3)





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_HungarianRMSD_grouper:
    NUM_PROCS = 4


[docs]
    def test_hrmsd_grouper_for_rotated_molecules(
        self, two_rotated_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 2
        grouper = HungarianRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 1
        assert len(group_indices) == 1
        unique_structures = grouper.unique()
        assert len(unique_structures) == 1

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.2294, rtol=1e-3)



[docs]
    def test_hrmsd_grouper_for_crest_molecules(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = HungarianRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 12
        assert len(group_indices) == 12
        unique_structures = grouper.unique()
        assert len(unique_structures) == 12

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.4091, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 2))
        assert np.isclose(rmsd, 0.5899, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 3))
        assert np.isclose(rmsd, 1.8891, rtol=1e-3)



[docs]
    def test_ignore_hydrogen(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = HungarianRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 12
        assert len(group_indices) == 12
        unique_structures = grouper.unique()
        assert len(unique_structures) == 12

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 4))
        assert np.isclose(rmsd, 1.1915, rtol=1e-3)





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_SpyRMSD_grouper:
    NUM_PROCS = 4


[docs]
    def test_spyrmsd_grouper_for_rotated_molecules(
        self, two_rotated_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 2
        grouper = SpyRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 1
        assert len(group_indices) == 1
        unique_structures = grouper.unique()
        assert len(unique_structures) == 1

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.2125, rtol=1e-3)



[docs]
    def test_spyrmsd_grouper_for_crest_molecules(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = SpyRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 12
        assert len(group_indices) == 12
        unique_structures = grouper.unique()
        assert len(unique_structures) == 12

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.4091, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 2))
        assert np.isclose(rmsd, 1.3925, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 3))
        assert np.isclose(rmsd, 2.1789, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 4))
        assert np.isclose(rmsd, 1.8202, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 5))
        assert np.isclose(rmsd, 2.0029, rtol=1e-3)



[docs]
    def test_ignore_hydrogen(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = SpyRMSDGrouper(
            molecules,
            threshold=2.5981,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 3
        assert len(group_indices) == 3
        unique_structures = grouper.unique()
        assert len(unique_structures) == 3

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 5))
        assert np.isclose(rmsd, 1.7034, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 6))
        assert np.isclose(rmsd, 2.6183, rtol=1e-3)





[docs]
@pytest.mark.skipif(
    not find_irmsd_command(), reason="irmsd command not available"
)
@pytest.mark.usefixtures("temp_working_dir")
class Test_IRMSD_grouper:
    NUM_PROCS = 4


[docs]
    def test_irmsd_grouper_for_rotated_molecules(
        self, two_rotated_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 2
        grouper = IRMSDGrouper(
            molecules,
            threshold=0.125,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 2
        assert len(group_indices) == 2
        unique_structures = grouper.unique()
        assert len(unique_structures) == 2

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.2294, rtol=1e-3)



[docs]
    def test_irmsd_grouper_for_crest_molecules(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = IRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 12
        assert len(group_indices) == 12
        unique_structures = grouper.unique()
        assert len(unique_structures) == 12

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.4091, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 2))
        assert np.isclose(rmsd, 1.3925, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((2, 10))
        assert np.isclose(rmsd, 2.2390, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((8, 16))
        assert np.isclose(rmsd, 3.4209, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 13))
        assert np.isclose(rmsd, 0.8411, rtol=1e-3)



[docs]
    def test_ignore_hydrogen(
        self, two_rotated_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 2
        grouper = IRMSDGrouper(
            molecules,
            threshold=0.125,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 2
        assert len(group_indices) == 2
        unique_structures = grouper.unique()
        assert len(unique_structures) == 2

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.2294, rtol=1e-3)





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_PymolRMSD_grouper:
    NUM_PROCS = 1


[docs]
    @classmethod
    def setup_class(cls):
        """Initialize PyMOL once for all tests in this class."""
        try:
            import pymol
            from pymol import cmd

            pymol.finish_launching(["pymol", "-qc"])
            cmd.reinitialize()
        except Exception:
            # PyMOL may not be installed; tests will be skipped via @pytest.mark.skipif
            pass



[docs]
    def teardown_method(self, method):
        """Clean up PyMOL objects after each test method to prevent slowdown."""
        try:
            from pymol import cmd

            # Delete all objects instead of reinitialize to keep session alive
            cmd.delete("all")
        except Exception:
            # Ignore cleanup errors if PyMOL is not available
            pass



[docs]
    def test_pymolrmsd_grouper_for_rotated_molecules(
        self, two_rotated_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 2
        grouper = PymolRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 1
        assert len(group_indices) == 1
        unique_structures = grouper.unique()
        assert len(unique_structures) == 1

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.0000, rtol=1e-3)

        # Explicitly cleanup to prevent __del__ from calling quit()
        if hasattr(grouper, "_temp_dir") and grouper._temp_dir:

            shutil.rmtree(grouper._temp_dir, ignore_errors=True)
            grouper._temp_dir = None
        grouper.cmd = None  # Prevent __del__ from calling quit()



[docs]
    def test_pymolrmsd_grouper_for_crest_molecules(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = PymolRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 7
        assert len(group_indices) == 7
        unique_structures = grouper.unique()
        assert len(unique_structures) == 7

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 1))
        assert np.isclose(rmsd, 0.074175, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((0, 2))
        assert np.isclose(rmsd, 0.023745, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((3, 4))
        assert np.isclose(rmsd, 1.725241, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((7, 8))
        assert np.isclose(rmsd, 2.309451, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((14, 16))
        assert np.isclose(rmsd, 1.837319, rtol=1e-3)

        # Explicitly cleanup to prevent __del__ from calling quit()
        if hasattr(grouper, "_temp_dir") and grouper._temp_dir:

            shutil.rmtree(grouper._temp_dir, ignore_errors=True)
            grouper._temp_dir = None
        grouper.cmd = None  # Prevent __del__ from calling quit()



[docs]
    def test_ignore_hydrogen(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = PymolRMSDGrouper(
            molecules,
            threshold=0.8203818,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=True,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 5
        assert len(group_indices) == 5
        unique_structures = grouper.unique()
        assert len(unique_structures) == 5

        # rmsd calculation from grouper
        rmsd = grouper._calculate_rmsd((0, 2))
        assert np.isclose(rmsd, 0.0211, rtol=1e-3)
        rmsd = grouper._calculate_rmsd((1, 4))
        assert np.isclose(rmsd, 0.7669, rtol=1e-3)

        # Explicitly cleanup to prevent __del__ from calling quit()
        if hasattr(grouper, "_temp_dir") and grouper._temp_dir:

            shutil.rmtree(grouper._temp_dir, ignore_errors=True)
            grouper._temp_dir = None
        grouper.cmd = None  # Prevent __del__ from calling quit()



[docs]
    def test_pymol_grouper_rejects_multiproc(
        self, methanol_molecules, temp_working_dir
    ):
        """Test that PyMOL grouper raises error when num_procs > 1."""
        with pytest.raises(ValueError) as excinfo:
            PymolRMSDGrouper(
                methanol_molecules,
                threshold=0.5,
                num_procs=4,  # Should raise error
            )
        assert (
            "single" in str(excinfo.value).lower()
            or "num_procs" in str(excinfo.value).lower()
        )



[docs]
    @classmethod
    def teardown_class(cls):
        """Clean up PyMOL after all tests in this class are done."""
        try:
            from pymol import cmd

            cmd.reinitialize()
        except Exception:
            # Ignore cleanup errors if PyMOL is not available
            pass





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_Tanimoto_similarity_grouper:
    NUM_PROCS = 4


[docs]
    def test_tanimoto_similarity_grouper(
        self, methanol_molecules, methanol_and_ethanol, temp_working_dir
    ):
        grouper = TanimotoSimilarityGrouper(methanol_molecules)
        groups, group_indices = grouper.group()
        assert (
            len(groups) == 1
        ), "Molecules should form one group based on RCM similarity."
        assert (
            len(group_indices) == 1
        ), "Molecules should form one group based on RCM similarity."
        unique_structures = grouper.unique()
        assert (
            len(unique_structures) == 1
        ), "Molecules should form one group based on RCM similarity."
        grouper2 = TanimotoSimilarityGrouper(methanol_and_ethanol)
        groups, group_indices = grouper2.group()
        assert (
            len(groups) == 2
        ), "Molecules should form two groups based on RCM similarity."
        assert (
            len(group_indices) == 2
        ), "Molecules should form two groups based on RCM similarity."
        unique_structures = grouper2.unique()
        assert (
            len(unique_structures) == 2
        ), "Molecules should form two groups based on RCM similarity."



[docs]
    def test_tanimoto_grouper_for_crest_conformers(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)

        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = TanimotoSimilarityGrouper(
            molecules,
            threshold=0.98,
            fingerprint_type="usrcat",
            num_procs=self.NUM_PROCS,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 4
        assert len(group_indices) == 4
        unique_structures = grouper.unique()
        assert len(unique_structures) == 4

        grouper = TanimotoSimilarityGrouper(
            molecules,
            threshold=0.999,
            fingerprint_type="usrcat",
            num_procs=self.NUM_PROCS,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 14
        assert len(group_indices) == 14
        unique_structures = grouper.unique()
        assert len(unique_structures) == 14





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_TorsionFingerprint_grouper:
    NUM_PROCS = 4


[docs]
    def test_torsionfingerprint_grouper_for_rotated_molecules(
        self, two_rotated_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=two_rotated_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 2
        grouper = TorsionFingerprintGrouper(
            molecules,
            threshold=0.1,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 1
        assert len(group_indices) == 1
        unique_structures = grouper.unique()
        assert len(unique_structures) == 1



[docs]
    def test_torsionfingerprint_grouper_for_crest_molecules(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = TorsionFingerprintGrouper(
            molecules,
            threshold=0.05,
            num_procs=self.NUM_PROCS,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 10
        assert len(group_indices) == 10
        unique_structures = grouper.unique()
        assert len(unique_structures) == 10



[docs]
    def test_use_weights_parameter(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = TorsionFingerprintGrouper(
            molecules,
            threshold=0.05,
            num_procs=self.NUM_PROCS,
            use_weights=False,
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 15
        assert len(group_indices) == 15
        unique_structures = grouper.unique()
        assert len(unique_structures) == 15

        tfd = grouper._calculate_tfd((0, 2))
        assert np.isclose(tfd, 0.08229, rtol=1e-3)
        tfd = grouper._calculate_tfd((0, 7))
        assert np.isclose(tfd, 0.07727, rtol=1e-3)



[docs]
    def test_count_groups_matches_complete_linkage_iteration_order(
        self, methanol_molecules
    ):
        grouper = TorsionFingerprintGrouper(
            methanol_molecules[:3],
            threshold=0.1,
            num_procs=1,
            ignore_hydrogens=False,
        )

        adj_matrix = np.zeros((3, 3), dtype=bool)
        adj_matrix[0, 1] = adj_matrix[1, 0] = True
        adj_matrix[1, 2] = adj_matrix[2, 1] = True

        _, index_groups = grouper._complete_linkage_grouping(adj_matrix, 3)
        num_groups = grouper._count_groups(adj_matrix, 3)

        assert len(index_groups) == 2
        assert num_groups == len(index_groups)



[docs]
    def test_use_maxdev_parameter(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = TorsionFingerprintGrouper(
            molecules,
            threshold=0.26965,
            num_procs=self.NUM_PROCS,
            use_weights=True,
            max_dev="spec",
            ignore_hydrogens=False,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 2
        assert len(group_indices) == 2
        unique_structures = grouper.unique()
        assert len(unique_structures) == 2

        tfd = grouper._calculate_tfd((0, 1))
        assert np.isclose(tfd, 0.02027, rtol=1e-3)
        tfd = grouper._calculate_tfd((3, 4))
        assert np.isclose(tfd, 0.24365, rtol=1e-3)





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_other_groupers:
    NUM_PROCS = 4


[docs]
    def test_base_record_template_method(self, methanol_molecules):
        class DummyGrouper(MoleculeGrouper):
            def group(self):
                return [], []

            def _record_results(self, **kwargs):
                return kwargs

        grouper = DummyGrouper(methanol_molecules)
        payload = grouper.record(example_key="example_value")
        assert payload["example_key"] == "example_value"



[docs]
    def test_record_writes_rmsd_matrix_with_headers(
        self, methanol_molecules, temp_working_dir
    ):
        """Detailed record test: verify matrix file and key header lines are written."""
        from openpyxl import load_workbook

        molecules = methanol_molecules[:2]
        grouper = BasicRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=1,
            label="record_detail",
            energy_type="E",
        )

        # Populate cache first so Groups sheet can be generated.
        groups, group_indices = grouper.group()
        assert len(groups) >= 1
        assert len(group_indices) >= 1

        # Call unified record() explicitly to validate template-method path.
        rmsd_matrix = np.zeros((len(molecules), len(molecules)))
        grouper.record(rmsd_matrix=rmsd_matrix, grouping_time=0.01)

        xlsx_file = os.path.join(
            temp_working_dir,
            "record_detail_group_result",
            "record_detail_BasicRMSDGrouper_T0.5.xlsx",
        )
        assert os.path.exists(xlsx_file)

        wb = load_workbook(xlsx_file, data_only=True)
        assert "RMSD_Matrix" in wb.sheetnames
        assert "Groups" in wb.sheetnames

        ws = wb["RMSD_Matrix"]
        header_lines = [str(ws[f"A{i}"].value) for i in range(1, 25)]

        assert any("Energy Type: E" in line for line in header_lines if line)
        assert any(
            "Used Molecules: 2" in line for line in header_lines if line
        )
        assert any(
            "Skipped Molecules: 0" in line for line in header_lines if line
        )
        assert any("Grouping Time:" in line for line in header_lines if line)



[docs]
    def test_record_writes_formula_outputs_with_headers(
        self, methanol_and_ethanol, temp_working_dir
    ):
        """Detailed record test: verify non-matrix output sheet and header lines."""
        from openpyxl import load_workbook

        grouper = FormulaGrouper(
            methanol_and_ethanol,
            num_procs=1,
            label="formula_record_detail",
            energy_type="E",
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 2
        assert len(group_indices) == 2

        xlsx_file = os.path.join(
            temp_working_dir,
            "formula_record_detail_group_result",
            "formula_record_detail_FormulaGrouper.xlsx",
        )
        assert os.path.exists(xlsx_file)

        wb = load_workbook(xlsx_file, data_only=True)
        assert "Formulas" in wb.sheetnames
        assert "Groups" in wb.sheetnames

        ws = wb["Formulas"]
        header_lines = [str(ws[f"A{i}"].value) for i in range(1, 25)]
        assert any(
            "Total Molecules: 2" in line for line in header_lines if line
        )
        assert any(
            "Unique Formulas: 2" in line for line in header_lines if line
        )
        assert any("Energy Type: E" in line for line in header_lines if line)



[docs]
    def test_formula_grouper(
        self,
        methanol_molecules,
        methanol_and_ethanol,
        conformers_from_rdkit,
        temp_working_dir,
    ):
        grouper = FormulaGrouper(methanol_molecules)
        groups, group_indices = grouper.group()
        unique_structures = grouper.unique()
        assert (
            len(groups) == 1
        ), "Molecules should form one group based on formula."

        assert (
            len(unique_structures) == 1
        ), "Molecules should form one group based on formula."

        grouper2 = FormulaGrouper(methanol_and_ethanol)
        groups, group_indices = grouper2.group()
        unique_structures = grouper2.unique()
        assert (
            len(groups) == 2
        ), "Molecules should form two groups based on formula."
        assert (
            len(unique_structures) == 2
        ), "Molecules should form two groups based on formula."

        grouper3 = FormulaGrouper(conformers_from_rdkit)
        # based on Formula, should all be the same even for 300 conformers
        groups, group_indices = grouper3.group()
        unique_structures = grouper3.unique()
        assert len(groups) == 1
        assert len(unique_structures) == 1



[docs]
    @pytest.mark.slow
    def test_connectivity_grouper(
        self, methanol_molecules, methanol_and_ethanol, temp_working_dir
    ):
        grouper = ConnectivityGrouper(methanol_molecules)
        groups, group_indices = grouper.group()
        assert (
            len(groups) == 1
        ), "Molecules should form one group based on connectivity."
        assert (
            len(group_indices) == 1
        ), "Molecules should form one group based on connectivity."
        unique_structures = grouper.unique()
        assert (
            len(unique_structures) == 1
        ), "Molecules should form one group based on connectivity."

        grouper2 = ConnectivityGrouper(methanol_and_ethanol)
        groups, group_indices = grouper2.group()
        assert (
            len(groups) == 2
        ), "Molecules should form two groups based on connectivity."
        assert (
            len(group_indices) == 2
        ), "Molecules should form two groups based on connectivity."
        unique_structures = grouper2.unique()
        assert (
            len(unique_structures) == 2
        ), "Molecules should form two groups based on connectivity."



[docs]
    def test_connectivity_grouper_for_crest_conformers(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)

        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18
        grouper = ConnectivityGrouper(
            molecules,
            num_procs=self.NUM_PROCS,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 4
        assert len(group_indices) == 4
        unique_structures = grouper.unique()
        assert len(unique_structures) == 4



[docs]
    def test_rdkit_isomorphism_grouper(
        self, methanol_molecules, methanol_and_ethanol, temp_working_dir
    ):
        grouper = RDKitIsomorphismGrouper(methanol_molecules)
        groups, group_indices = grouper.group()
        assert (
            len(groups) == 1
        ), "Molecules should form one group based on RCM similarity."
        assert (
            len(group_indices) == 1
        ), "Molecules should form one group based on RCM similarity."
        unique_structures = grouper.unique()
        assert (
            len(unique_structures) == 1
        ), "Molecules should form one group based on RCM similarity."
        grouper2 = RDKitIsomorphismGrouper(methanol_and_ethanol)
        groups, group_indices = grouper2.group()
        assert (
            len(groups) == 2
        ), "Molecules should form two groups based on RCM similarity."
        assert (
            len(group_indices) == 2
        ), "Molecules should form two groups based on RCM similarity."
        unique_structures = grouper2.unique()
        assert (
            len(unique_structures) == 2
        ), "Molecules should form two groups based on RCM similarity."





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_EnergyGrouper:
    NUM_PROCS = 4


[docs]
    def test_energy_grouper_raises_error_for_missing_energy(
        self, methanol_molecules, temp_working_dir
    ):
        # methanol_molecules from pubchem don't have energy information
        with pytest.raises(ValueError) as excinfo:
            EnergyGrouper(methanol_molecules)
        assert "missing energy information" in str(excinfo.value)



[docs]
    def test_energy_grouper_for_crest_conformers(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test EnergyGrouper with molecules that have energy information."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)
        assert len(molecules) == 18

        # All CREST conformers should have energy
        for mol in molecules:
            assert mol.energy is not None

        assert len(molecules) == 18
        grouper = EnergyGrouper(
            molecules,
            threshold=0.5,
            num_procs=self.NUM_PROCS,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 7
        assert len(group_indices) == 7
        unique_structures = grouper.unique()
        assert len(unique_structures) == 7

        expected1 = -1.7839 / 627.509474
        expected2 = 1.4580 / 627.509474

        relative_diff, abs_diff = grouper._calculate_energy_diff((1, 0))
        assert np.isclose(relative_diff, expected1, rtol=1e-3)
        relative_diff, abs_diff = grouper._calculate_energy_diff((1, 2))
        assert np.isclose(abs_diff, expected2, rtol=1e-3)

        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = EnergyGrouper(
            molecules, num_groups=5, num_procs=self.NUM_PROCS
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 5
        assert len(group_indices) == 5



[docs]
    def test_energy_grouper_for_log_conformers(
        self, ts_conformers_log_directory, temp_working_dir
    ):
        """Test EnergyGrouper with molecules loaded from log files using Gibbs energy."""
        import glob
        import re

        from chemsmart.io.gaussian.output import Gaussian16Output
        from chemsmart.io.molecules.structure import Molecule

        # Load molecules from log files and set Gibbs energy (like CLI does)
        log_files = sorted(
            glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
        )
        molecules = []
        conformer_ids = []

        for log_file in log_files:
            mol = Molecule.from_filepath(
                filepath=log_file, index="-1", return_list=False
            )
            if mol is not None:
                # Extract and set Gibbs energy using gibbs_free_energy property
                g16_output = Gaussian16Output(filename=log_file)
                gibbs_energy = g16_output.gibbs_free_energy
                if gibbs_energy is not None:
                    mol._energy = gibbs_energy

                molecules.append(mol)
                basename = os.path.basename(log_file)
                match = re.search(r"_c(\d+)\.log$", basename)
                if match:
                    conformer_ids.append(f"c{match.group(1)}")
                else:
                    conformer_ids.append(basename)

        assert len(molecules) == 5
        assert len(conformer_ids) == 5

        # All log conformers should have Gibbs energy
        for mol in molecules:
            assert mol.energy is not None

        grouper = EnergyGrouper(
            molecules,
            threshold=1.0,
            num_procs=self.NUM_PROCS,
            conformer_ids=conformer_ids,
        )
        groups, group_indices = grouper.group()
        assert len(groups) == 2
        assert len(group_indices) == 2
        unique_structures = grouper.unique()
        assert len(unique_structures) == 2

        expected1 = -4.1429 / 627.509474
        expected2 = 4.1429 / 627.509474

        relative_diff, abs_diff = grouper._calculate_energy_diff((1, 4))
        assert np.isclose(relative_diff, expected1, rtol=1e-2)
        assert np.isclose(abs_diff, expected2, rtol=1e-2)



[docs]
    def test_energy_extraction_from_xyz_file(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):

        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        # Test specific energy value for 3rd structure (index 2)
        expected_energy_mol3 = -126.25238449
        actual_energy_mol3 = molecules[2].energy

        assert actual_energy_mol3 is not None, "3rd molecule energy is None"
        assert np.isclose(
            actual_energy_mol3, expected_energy_mol3, rtol=1e-8
        ), (
            f"Energy mismatch for 3rd molecule: got {actual_energy_mol3}, "
            f"expected {expected_energy_mol3}"
        )



[docs]
    def test_gibbs_energy_extraction_function(
        self, ts_conformers_log_directory, temp_working_dir
    ):
        """Test that Gaussian16Output.gibbs_free_energy extracts correct value."""
        from chemsmart.io.gaussian.output import Gaussian16Output

        log_file = os.path.join(
            ts_conformers_log_directory, "ch_1c_para_c1.log"
        )
        g16_output = Gaussian16Output(filename=log_file)

        # Known values
        expected_scf = -1401.99431519
        expected_gibbs_correction = 0.311863
        expected_gibbs_energy = (
            expected_scf + expected_gibbs_correction
        )  # -1401.68245219

        gibbs_energy = g16_output.gibbs_free_energy
        assert np.isclose(gibbs_energy, expected_gibbs_energy, rtol=1e-6), (
            f"Gibbs energy mismatch: got {gibbs_energy}, "
            f"expected {expected_gibbs_energy}"
        )



[docs]
    def test_energy_extraction_from_ts_log_files(
        self, ts_conformers_log_directory, temp_working_dir
    ):
        """Test that energy is correctly extracted from TS log files as SCF Done energy."""
        import glob

        from chemsmart.io.gaussian.output import Gaussian16Output
        from chemsmart.io.molecules.structure import Molecule

        log_files = sorted(
            glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
        )

        molecules = []
        for log_file in log_files:
            mol = Molecule.from_filepath(
                filepath=log_file, index="-1", return_list=False
            )
            if mol is not None:
                molecules.append(mol)
                g16_output = Gaussian16Output(filename=log_file)
                scf_energy = (
                    g16_output.scf_energies[-1]
                    if g16_output.scf_energies
                    else None
                )
                assert np.isclose(mol.energy, scf_energy, rtol=1e-6), (
                    f"Energy mismatch for {log_file}: "
                    f"mol.energy={mol.energy}, expected SCF={scf_energy}"
                )





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Testfactory:


[docs]
    def test_structure_grouper_factory_energy(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test factory creation of energy grouper (requires molecules with energy)."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        factory = StructureGrouperFactory()
        energy_grouper = factory.create(molecules, strategy="energy")
        assert isinstance(energy_grouper, EnergyGrouper)



[docs]
    def test_structure_grouper_factory(
        self, methanol_molecules, temp_working_dir
    ):
        factory = StructureGrouperFactory()
        rmsd_grouper = factory.create(methanol_molecules, strategy="rmsd")
        assert isinstance(rmsd_grouper, RMSDGrouper)
        hrmsd_grouper = factory.create(methanol_molecules, strategy="hrmsd")
        assert isinstance(hrmsd_grouper, RMSDGrouper)
        spyrmsd_grouper = factory.create(
            methanol_molecules, strategy="spyrmsd"
        )
        assert isinstance(spyrmsd_grouper, RMSDGrouper)

        # irmsd requires external command, skip if not available
        if find_irmsd_command():
            irmsd_grouper = factory.create(
                methanol_molecules, strategy="irmsd"
            )
            assert isinstance(irmsd_grouper, RMSDGrouper)

        pymolrmsd_grouper = factory.create(
            methanol_molecules, strategy="pymolrmsd"
        )
        assert isinstance(pymolrmsd_grouper, RMSDGrouper)

        # Explicitly cleanup pymolrmsd_grouper to prevent __del__ from calling quit()
        if (
            hasattr(pymolrmsd_grouper, "_temp_dir")
            and pymolrmsd_grouper._temp_dir
        ):

            shutil.rmtree(pymolrmsd_grouper._temp_dir, ignore_errors=True)
            pymolrmsd_grouper._temp_dir = None
        pymolrmsd_grouper.cmd = None  # Prevent __del__ from calling quit()

        torsion_grouper = factory.create(
            methanol_molecules, strategy="torsion"
        )
        assert isinstance(torsion_grouper, TorsionFingerprintGrouper)
        formula_grouper = factory.create(
            methanol_molecules, strategy="formula"
        )
        assert isinstance(formula_grouper, FormulaGrouper)
        connectivity_grouper = factory.create(
            methanol_molecules, strategy="connectivity"
        )
        assert isinstance(connectivity_grouper, ConnectivityGrouper)
        tanimoto_grouper = factory.create(
            methanol_molecules, strategy="tanimoto"
        )
        assert isinstance(tanimoto_grouper, TanimotoSimilarityGrouper)
        rdkit_isomorphism_grouper = factory.create(
            methanol_molecules, strategy="isomorphism"
        )
        assert isinstance(rdkit_isomorphism_grouper, RDKitIsomorphismGrouper)



[docs]
    @classmethod
    def teardown_class(cls):
        """Clean up PyMOL after all tests in this class are done."""
        try:
            from pymol import cmd

            cmd.reinitialize()
        except Exception:
            # Ignore cleanup errors if PyMOL is not available
            pass





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_grouper_utility_functions:
    """Test utility functions and helper methods in groupers."""

    NUM_PROCS = 1


[docs]
    def test_rmsd_matrix_with_num_groups(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that RMSD matrix filename reflects num_groups when used."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = BasicRMSDGrouper(
            molecules,
            num_groups=3,
            num_procs=1,
            label="test_num_groups",
        )
        _, _ = grouper.group()

        # Check that Excel file was created in the temp_working_dir
        expected_file = os.path.join(
            temp_working_dir,
            "test_num_groups_group_result",
            "test_num_groups_BasicRMSDGrouper_N3.xlsx",
        )
        assert os.path.exists(
            expected_file
        ), f"Matrix file not found: {expected_file}"



[docs]
    def test_grouping_result_caching(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that grouping results are cached and reused."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = BasicRMSDGrouper(
            molecules,
            threshold=0.5,
            num_procs=1,
        )

        groups1, indices1 = grouper.group()

        assert grouper._cached_groups is not None
        assert grouper._cached_group_indices is not None

        # unique() should use cached results
        unique_mols = grouper.unique()
        assert len(unique_mols) == len(groups1)



[docs]
    def test_unique_returns_lowest_energy_representative(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that unique() returns lowest energy molecule from each group."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        # CREST conformers have energy information
        grouper = BasicRMSDGrouper(
            molecules,
            threshold=2.0,
            num_procs=1,
        )
        groups, group_indices = grouper.group()
        unique_mols = grouper.unique()

        # For each group, verify the representative has lowest energy
        for i, (group, indices) in enumerate(zip(groups, group_indices)):
            mols_with_energy = [
                (mol, idx)
                for mol, idx in zip(group, indices)
                if mol.energy is not None
            ]
            if mols_with_energy:
                min_energy_mol = min(
                    mols_with_energy, key=lambda x: x[0].energy
                )[0]
                assert unique_mols[i].energy == min_energy_mol.energy





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_grouper_complete_linkage:
    """Test complete linkage clustering behavior."""

    NUM_PROCS = 4


[docs]
    def test_complete_linkage_prevents_chaining(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that complete linkage prevents chaining effect in grouping."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        # With a moderate threshold, complete linkage should create
        # groups where ALL members are within threshold of each other
        grouper = BasicRMSDGrouper(
            molecules,
            threshold=1.0,
            num_procs=self.NUM_PROCS,
        )
        groups, group_indices = grouper.group()

        # Verify complete linkage property: within each group,
        # all pairs should have RMSD < threshold
        for group_idx, indices in enumerate(group_indices):
            if len(indices) > 1:
                for i, idx_i in enumerate(indices):
                    for j, idx_j in enumerate(indices):
                        if i < j:
                            rmsd = grouper._calculate_rmsd((idx_i, idx_j))
                            assert rmsd < 1.0, (
                                f"Group {group_idx}: RMSD between {idx_i} and {idx_j} "
                                f"is {rmsd}, exceeds threshold 1.0"
                            )





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_conformer_ids_functionality:
    """Test conformer_ids parameter functionality."""

    NUM_PROCS = 1


[docs]
    def test_conformer_ids_from_log_directory(
        self, ts_conformers_log_directory, temp_working_dir
    ):
        """Test loading conformer IDs from a directory of log files."""
        import glob
        import re

        from chemsmart.io.molecules.structure import Molecule

        # Load molecules from log files in the ts_conformers_log_directory
        log_files = sorted(
            glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
        )
        assert (
            len(log_files) == 5
        ), f"Expected 5 log files, found {len(log_files)}"

        # Extract conformer IDs from filenames (e.g., ch_1c_para_c1.log -> c1)
        molecules = []
        conformer_ids = []
        for log_file in log_files:
            mol = Molecule.from_filepath(
                filepath=log_file, index="-1", return_list=False
            )
            if mol is not None:
                molecules.append(mol)
                # Extract conformer ID from filename
                basename = os.path.basename(log_file)
                match = re.search(r"_c(\d+)\.log$", basename)
                if match:
                    conformer_ids.append(f"c{match.group(1)}")
                else:
                    conformer_ids.append(basename)

        assert len(molecules) >= 2, "Need at least 2 molecules for grouping"

        grouper = BasicRMSDGrouper(
            molecules,
            threshold=1.0,
            num_procs=1,
            label="ts_conformers_test",
            conformer_ids=conformer_ids,
        )
        _, _ = grouper.group()

        # Verify conformer_ids are stored
        assert grouper.conformer_ids == conformer_ids

        # Check Excel output
        import pandas as pd

        excel_file = os.path.join(
            "ts_conformers_test_group_result",
            "ts_conformers_test_BasicRMSDGrouper_T1.0.xlsx",
        )
        assert os.path.exists(
            excel_file
        ), f"Excel file not found: {excel_file}"

        # Matrix data starts at row 8 (0-indexed: skiprows=13), first column is index
        df = pd.read_excel(
            excel_file, sheet_name="RMSD_Matrix", skiprows=12, index_col=0
        )
        # Check that conformer IDs are used as labels
        assert "c1" in str(df.columns[0]) or "c1" in str(df.index[0])



[docs]
    def test_traj_conformer_ids_original_indices(self, temp_working_dir):
        """Test that traj job correctly sets original conformer indices."""
        from chemsmart.io.molecules.structure import Molecule

        # Create test molecules simulating a trajectory
        mol = Molecule.from_pubchem(identifier="CO")
        molecules = [mol.copy() for _ in range(18)]

        # Simulate traj behavior: select last 50% (indices 10-18 in original)
        proportion = 0.5
        total = len(molecules)
        last_num = int(round(total * proportion, 1))
        _ = molecules[-last_num:]  # selected_molecules - not used in this test

        # Calculate expected original indices (1-based)
        start_original_index = total - last_num + 1
        expected_ids = [str(start_original_index + i) for i in range(last_num)]

        # Verify expected IDs
        assert expected_ids[0] == "10"  # First selected is original index 10
        assert expected_ids[-1] == "18"  # Last selected is original index 18





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_output_file_generation:
    """Test that grouper generates correct output files."""

    NUM_PROCS = 1


[docs]
    def test_group_xyz_files_contain_energy_and_index_info(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that group XYZ files contain energy and original index information."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = BasicRMSDGrouper(
            molecules[:6],
            threshold=1.0,
            num_procs=1,
            label="info_test",
        )
        _, _ = grouper.group()
        grouper.unique()  # Generates xyz files

        output_dir = "info_test_group_result"
        first_group_file = os.path.join(output_dir, "info_test_group_1.xyz")

        with open(first_group_file, "r") as f:
            content = f.read()

        # Check for expected information in comments
        assert "Original_Index:" in content
        assert "E" in content



[docs]
    def test_group_xyz_files_sorted_by_energy(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that molecules in group XYZ files are sorted by energy (lowest first)."""
        import re

        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = BasicRMSDGrouper(
            molecules,
            threshold=2.0,  # Higher threshold to get more molecules per group
            num_procs=1,
            label="sort_test",
        )
        groups, group_indices = grouper.group()
        grouper.unique()  # Generates xyz files

        output_dir = "sort_test_group_result"

        # Check each group file
        for group_num in range(len(groups)):
            group_file = os.path.join(
                output_dir, f"sort_test_group_{group_num + 1}.xyz"
            )

            with open(group_file, "r") as f:
                content = f.read()

            # Extract energies from file
            lines = content.strip().split("\n")
            i = 0
            energies = []

            while i < len(lines):
                num_atoms = int(lines[i].strip())
                comment_line = lines[i + 1]
                energy_match = re.search(
                    r"E\(Hartree\):\s*([-\d.]+)", comment_line
                )
                if energy_match:
                    energies.append(float(energy_match.group(1)))
                i += num_atoms + 2

            # Verify sorted by energy (lowest first)
            for j in range(len(energies) - 1):
                assert energies[j] <= energies[j + 1], (
                    f"Group {group_num + 1} not sorted: "
                    f"energy[{j}]={energies[j]} > energy[{j+1}]={energies[j+1]}"
                )



[docs]
    def test_group_xyz_files_energy_from_log_files(
        self, ts_conformers_log_directory, temp_working_dir
    ):
        """Test that energy is correctly extracted from log files and written to group XYZ files."""
        import glob
        import re

        from chemsmart.io.molecules.structure import Molecule

        # Load molecules from log files
        log_files = sorted(
            glob.glob(os.path.join(ts_conformers_log_directory, "*.log"))
        )
        assert (
            len(log_files) >= 2
        ), f"Need at least 2 log files, found {len(log_files)}"

        molecules = []
        conformer_ids = []
        original_energies = {}

        for idx, log_file in enumerate(log_files):
            mol = Molecule.from_filepath(
                filepath=log_file, index="-1", return_list=False
            )
            if mol is not None:
                molecules.append(mol)
                # Store original energy for verification
                original_energies[idx] = mol.energy

                # Extract conformer ID from filename
                basename = os.path.basename(log_file)
                match = re.search(r"_c(\d+)\.log$", basename)
                if match:
                    conformer_ids.append(f"c{match.group(1)}")
                else:
                    conformer_ids.append(basename)

        assert len(molecules) >= 2, "Need at least 2 molecules for grouping"

        grouper = BasicRMSDGrouper(
            molecules,
            threshold=5.0,  # High threshold to group all together
            num_procs=1,
            label="log_energy_test",
            conformer_ids=conformer_ids,
        )
        _, _ = grouper.group()
        grouper.unique()  # Generates xyz files

        # Check group file
        output_dir = "log_energy_test_group_result"
        first_group_file = os.path.join(
            output_dir, "log_energy_test_group_1.xyz"
        )

        assert os.path.exists(
            first_group_file
        ), f"Group file not found: {first_group_file}"

        with open(first_group_file, "r") as f:
            content = f.read()

        # Verify energy values in the output
        assert "E(Hartree):" in content

        # Parse and verify each energy value
        lines = content.strip().split("\n")
        i = 0
        energies_found = 0
        while i < len(lines):
            try:
                num_atoms = int(lines[i].strip())
            except ValueError:
                break
            comment_line = lines[i + 1]

            # Extract energy from comment
            energy_match = re.search(
                r"E\(Hartree\):\s*([-\d.]+)", comment_line
            )
            if energy_match:
                extracted_energy = float(energy_match.group(1))
                # Verify the energy is a reasonable value (not zero or nan)
                assert (
                    extracted_energy < 0
                ), f"Energy should be negative: {extracted_energy}"
                energies_found += 1

            i += num_atoms + 2

        assert energies_found > 0, "No energies found in output file"





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_edge_cases:
    """Test edge cases and boundary conditions."""


[docs]
    def test_two_molecules_grouping(self, temp_working_dir):
        """Test grouping with minimum number of molecules (2)."""
        from chemsmart.io.molecules.structure import Molecule

        mol1 = Molecule.from_pubchem(identifier="CO")
        mol2 = mol1.copy()

        grouper = BasicRMSDGrouper([mol1, mol2], threshold=0.5)
        groups, group_indices = grouper.group()

        # Two identical molecules should form one group
        assert len(groups) == 1
        assert len(groups[0]) == 2



[docs]
    def test_num_groups_equals_num_molecules(
        self, methanol_molecules, temp_working_dir
    ):
        """Test requesting same number of groups as molecules."""
        grouper = BasicRMSDGrouper(
            methanol_molecules,
            num_groups=len(methanol_molecules),
        )
        groups, group_indices = grouper.group()

        # Should create one group per molecule (or fewer if some are identical)
        assert len(groups) <= len(methanol_molecules)



[docs]
    def test_num_groups_exceeds_num_molecules(
        self, methanol_molecules, temp_working_dir
    ):
        """Test requesting more groups than molecules."""
        n_mols = len(methanol_molecules)
        grouper = BasicRMSDGrouper(
            methanol_molecules,
            num_groups=n_mols + 5,
        )
        groups, group_indices = grouper.group()

        # Should create at most n_mols groups
        assert len(groups) <= n_mols



[docs]
    def test_very_low_threshold(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test with very low threshold (should create many groups)."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = BasicRMSDGrouper(
            molecules,
            threshold=0.001,  # Very low threshold
            num_procs=1,
        )
        groups, group_indices = grouper.group()

        # With very low threshold, most molecules should be in separate groups
        # (unless they're truly identical)
        assert len(groups) >= len(molecules) - 1



[docs]
    def test_very_high_threshold(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test with very high threshold (should create few groups)."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = BasicRMSDGrouper(
            molecules,
            threshold=100.0,  # Very high threshold
            num_procs=1,
        )
        groups, group_indices = grouper.group()

        # With very high threshold, all molecules with same formula should be in one group
        assert len(groups) <= 3  # Likely 1-2 groups



[docs]
    def test_different_formulas_always_separate(
        self, methanol_and_ethanol, temp_working_dir
    ):
        """Test that molecules with different formulas are always in separate groups."""
        grouper = BasicRMSDGrouper(
            methanol_and_ethanol,
            threshold=100.0,  # Even with high threshold
        )
        groups, group_indices = grouper.group()

        # Methanol and ethanol should always be separate
        assert len(groups) == 2



[docs]
    def test_rmsd_infinity_for_different_molecules(
        self, methanol_and_ethanol, temp_working_dir
    ):
        """Test that RMSD returns infinity for molecules with different atom counts."""
        grouper = BasicRMSDGrouper(methanol_and_ethanol, threshold=0.5)
        rmsd = grouper._calculate_rmsd((0, 1))

        assert rmsd == np.inf or rmsd == float("inf")





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_label_and_append_label:
    """Test -l (label) and -a (append_label) parameter functionality."""

    NUM_PROCS = 1


[docs]
    def test_get_label_function(self, temp_working_dir):
        """Test _get_label function logic."""
        from chemsmart.cli.grouper.grouper import _get_label

        # Test with label only
        result = _get_label(
            label="custom", append_label=None, base_label="base"
        )
        assert result == "custom"

        # Test with append_label only
        result = _get_label(
            label=None, append_label="suffix", base_label="base"
        )
        assert result == "base_suffix"

        # Test with neither (use base_label)
        result = _get_label(label=None, append_label=None, base_label="base")
        assert result == "base"

        # Test with both should raise error
        with pytest.raises(ValueError) as excinfo:
            _get_label(
                label="custom", append_label="suffix", base_label="base"
            )
        assert "Only give label or append_label" in str(excinfo.value)



[docs]
    def test_label_in_output_directory(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that label parameter affects output directory name."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)[:5]

        # Test with custom label
        grouper = BasicRMSDGrouper(
            molecules,
            threshold=1.0,
            num_procs=self.NUM_PROCS,
            label="my_custom_label",
        )
        _, _ = grouper.group()

        # Check output directory uses custom label
        output_dir = "my_custom_label_group_result"
        assert os.path.exists(
            output_dir
        ), f"Output dir not found: {output_dir}"

        # Check Excel file uses custom label
        excel_file = os.path.join(
            output_dir, "my_custom_label_BasicRMSDGrouper_T1.0.xlsx"
        )
        assert os.path.exists(
            excel_file
        ), f"Excel file not found: {excel_file}"



[docs]
    def test_label_in_group_xyz_files(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that label parameter affects group XYZ file names."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)[:5]

        grouper = BasicRMSDGrouper(
            molecules,
            threshold=1.0,
            num_procs=self.NUM_PROCS,
            label="test_label",
        )
        groups, group_indices = grouper.group()
        grouper.unique()  # Generates xyz files

        output_dir = "test_label_group_result"

        # Check group XYZ files use label
        for i in range(len(groups)):
            xyz_path = os.path.join(output_dir, f"test_label_group_{i+1}.xyz")
            assert os.path.exists(xyz_path), f"Group XYZ not found: {xyz_path}"



[docs]
    def test_different_labels_create_different_outputs(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that different labels create separate output directories."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)[:5]

        # First grouper with label "run1"
        grouper1 = BasicRMSDGrouper(
            molecules,
            threshold=1.0,
            num_procs=self.NUM_PROCS,
            label="run1",
        )
        grouper1.group()

        # Second grouper with label "run2"
        grouper2 = BasicRMSDGrouper(
            molecules,
            threshold=1.0,
            num_procs=self.NUM_PROCS,
            label="run2",
        )
        grouper2.group()

        # Both output directories should exist
        assert os.path.exists("run1_group_result")
        assert os.path.exists("run2_group_result")

        # Both should have their own Excel files
        assert os.path.exists(
            "run1_group_result/run1_BasicRMSDGrouper_T1.0.xlsx"
        )
        assert os.path.exists(
            "run2_group_result/run2_BasicRMSDGrouper_T1.0.xlsx"
        )



[docs]
    def test_label_with_num_groups(
        self, multiple_molecules_xyz_file, temp_working_dir
    ):
        """Test that label works correctly with num_groups parameter."""
        xyz_file = XYZFile(filename=multiple_molecules_xyz_file)
        molecules = xyz_file.get_molecules(index=":", return_list=True)

        grouper = BasicRMSDGrouper(
            molecules,
            num_groups=5,
            num_procs=self.NUM_PROCS,
            label="numgroups_test",
        )
        _, _ = grouper.group()

        output_dir = "numgroups_test_group_result"
        assert os.path.exists(output_dir)

        # Excel file should reflect num_groups (N5) instead of threshold
        excel_file = os.path.join(
            output_dir, "numgroups_test_BasicRMSDGrouper_N5.xlsx"
        )
        assert os.path.exists(
            excel_file
        ), f"Excel file not found: {excel_file}"





[docs]
class TestConformerIdExtraction:
    """Test conformer ID extraction from filenames."""

    @staticmethod
    def _sort_key(x):
        """Sort key for file_info: cXX files by number first, then others by filename."""
        if x[2] is not None:
            return (0, x[2], x[3])
        else:
            return (1, 0, x[3])


[docs]
    def test_extract_conformer_id_with_pattern(self):
        """Test extraction with _cXX_ pattern."""
        from chemsmart.cli.grouper.grouper import _extract_conformer_id

        assert _extract_conformer_id("structure_c1_opt.log") == "c1"
        assert _extract_conformer_id("mol_c12.log") == "c12"
        assert (
            _extract_conformer_id("/path/to/structure_c123_ts.log") == "c123"
        )



[docs]
    def test_extract_conformer_id_without_pattern(self):
        """Test extraction without _cXX_ pattern returns None."""
        from chemsmart.cli.grouper.grouper import _extract_conformer_id

        assert _extract_conformer_id("mol_opt.log") is None
        assert _extract_conformer_id("structure.log") is None
        assert _extract_conformer_id("/path/to/calc.log") is None



[docs]
    def test_conformer_id_fallback_to_filename(self):
        """Test that filename is used as conformer ID when no _cXX_ pattern."""
        from chemsmart.cli.grouper.grouper import _extract_conformer_id

        test_files = [
            "/path/to/mol_opt.log",
            "/path/to/structure_ts.out",
        ]

        conformer_ids = []
        for f in test_files:
            conf_id = _extract_conformer_id(f)
            if conf_id is None:
                basename = os.path.basename(f)
                conf_id = os.path.splitext(basename)[0]
            conformer_ids.append(conf_id)

        assert conformer_ids == ["mol_opt", "structure_ts"]



[docs]
    def test_conformer_ids_molecules_correspondence(
        self, ts_conformers_log_directory, temp_working_dir
    ):
        """Test that conformer_ids and molecules are strictly one-to-one corresponding.

        This verifies that after sorting, each molecule's energy matches the energy
        from its corresponding file (identified by conf_id).
        """
        import glob
        import re

        from chemsmart.cli.grouper.grouper import _extract_conformer_id
        from chemsmart.io.gaussian.output import Gaussian16Output
        from chemsmart.io.molecules.structure import Molecule

        # First, build a mapping from conf_id to expected gibbs_energy
        log_files = glob.glob(
            os.path.join(ts_conformers_log_directory, "*.log")
        )
        expected_energies = {}

        for f in log_files:
            conf_id = _extract_conformer_id(f)
            basename = os.path.basename(f)
            name_without_ext = os.path.splitext(basename)[0]

            if conf_id is None:
                conf_id = name_without_ext

            g16_output = Gaussian16Output(filename=f)
            gibbs_energy = g16_output.gibbs_free_energy
            if gibbs_energy is not None:
                expected_energies[conf_id] = gibbs_energy

        # Now simulate the actual loading logic with sorting
        file_info = []
        for f in log_files:
            conf_id = _extract_conformer_id(f)
            basename = os.path.basename(f)
            name_without_ext = os.path.splitext(basename)[0]

            if conf_id:
                num = int(re.search(r"\d+", conf_id).group())
                file_info.append((f, conf_id, num, name_without_ext))
            else:
                file_info.append((f, name_without_ext, None, name_without_ext))

        file_info.sort(key=self._sort_key)

        # Load molecules
        molecules = []
        conformer_ids = []

        for filepath, conf_id, _, _ in file_info:
            try:
                mol = Molecule.from_filepath(
                    filepath=filepath, index="-1", return_list=False
                )
                if mol is not None:
                    g16_output = Gaussian16Output(filename=filepath)
                    gibbs_energy = g16_output.gibbs_free_energy
                    if gibbs_energy is not None:
                        mol._energy = gibbs_energy

                    molecules.append(mol)
                    conformer_ids.append(conf_id)
            except Exception:
                pass

        # Verify one-to-one correspondence
        assert len(molecules) == len(conformer_ids)

        # Verify each molecule's energy matches the expected energy for its conf_id
        for i, (mol, conf_id) in enumerate(zip(molecules, conformer_ids)):
            if conf_id in expected_energies:
                expected_energy = expected_energies[conf_id]
                assert np.isclose(mol.energy, expected_energy, rtol=1e-10), (
                    f"Index {i}: energy mismatch for {conf_id}, "
                    f"got {mol.energy}, expected {expected_energy}"
                )



[docs]
    def test_mixed_pattern_files_correspondence(self):
        """Test correspondence when mixing files with and without _cXX_ pattern."""
        from chemsmart.cli.grouper.grouper import _extract_conformer_id

        test_files = [
            ("mol_c1_opt.log", "c1"),
            ("mol_c2_opt.log", "c2"),
            ("other_calc.log", None),
            ("mol_c3_opt.log", "c3"),
        ]

        file_info = []
        for filename, expected_pattern in test_files:
            conf_id = _extract_conformer_id(filename)
            basename = os.path.basename(filename)
            name_without_ext = os.path.splitext(basename)[0]

            if conf_id:
                assert conf_id == expected_pattern
                num = int(conf_id.replace("c", ""))
                file_info.append((filename, conf_id, num, name_without_ext))
            else:
                assert expected_pattern is None
                file_info.append(
                    (filename, name_without_ext, None, name_without_ext)
                )

        file_info.sort(key=self._sort_key)
        sorted_conf_ids = [info[1] for info in file_info]

        # Files with pattern first (sorted by number), then others by filename
        assert sorted_conf_ids == ["c1", "c2", "c3", "other_calc"]



[docs]
    def test_all_files_without_cxx_pattern(self):
        """Test that a folder with no _cXX_ pattern files works correctly."""
        from chemsmart.cli.grouper.grouper import _extract_conformer_id

        test_files = [
            "mol_opt.log",
            "structure_ts.log",
            "calc.log",
            "another_mol.log",
        ]

        file_info = []
        for filename in test_files:
            conf_id = _extract_conformer_id(filename)
            basename = os.path.basename(filename)
            name_without_ext = os.path.splitext(basename)[0]

            if conf_id:
                num = int(conf_id.replace("c", ""))
                file_info.append((filename, conf_id, num, name_without_ext))
            else:
                file_info.append(
                    (filename, name_without_ext, None, name_without_ext)
                )

        # All files should have None as num
        for info in file_info:
            assert info[2] is None

        file_info.sort(key=self._sort_key)
        sorted_conf_ids = [info[1] for info in file_info]

        # All files sorted alphabetically by filename
        assert sorted_conf_ids == [
            "another_mol",
            "calc",
            "mol_opt",
            "structure_ts",
        ]





[docs]
@pytest.mark.usefixtures("temp_working_dir")
class Test_energy_extraction_function:
    """Framework tests for energy extraction by file type."""


[docs]
    def test_energy_extraction_gaussian(
        self, gaussian_dppeFeCl2_link_opt_outputfile
    ):
        from chemsmart.analysis.thermochemistry import Thermochemistry
        from chemsmart.cli.grouper.grouper import (
            _extract_energy_based_on_energy_type,
        )

        thermo = Thermochemistry(
            filename=gaussian_dppeFeCl2_link_opt_outputfile
        )
        extracted = _extract_energy_based_on_energy_type(thermo, "E")
        assert np.isclose(extracted, -3869.01351827, rtol=1e-7)

        thermo = Thermochemistry(
            filename=gaussian_dppeFeCl2_link_opt_outputfile
        )
        extracted = _extract_energy_based_on_energy_type(thermo, "H")
        assert np.isclose(extracted, -3868.552084, rtol=1e-7)

        thermo = Thermochemistry(
            filename=gaussian_dppeFeCl2_link_opt_outputfile
        )
        extracted = _extract_energy_based_on_energy_type(thermo, "G")
        assert np.isclose(extracted, -3868.648944, rtol=1e-7)

        thermo_qhh = Thermochemistry(
            filename=gaussian_dppeFeCl2_link_opt_outputfile,
            temperature=298.15,
            concentration=1.0,
            h_freq_cutoff=100.0,
            s_freq_cutoff=100.0,
            use_weighted_mass=True,
            entropy_method="grimme",
        )
        extracted = _extract_energy_based_on_energy_type(thermo_qhh, "qhH")
        assert np.isclose(extracted, -3868.558465, rtol=1e-7)

        thermo_qhg = Thermochemistry(
            filename=gaussian_dppeFeCl2_link_opt_outputfile,
            temperature=298.15,
            concentration=1.0,
            h_freq_cutoff=100.0,
            s_freq_cutoff=100.0,
            use_weighted_mass=True,
            entropy_method="grimme",
        )
        extracted = _extract_energy_based_on_energy_type(thermo_qhg, "qhG")
        assert np.isclose(extracted, -3868.645329, rtol=1e-7)



[docs]
    def test_energy_extraction_orca(self, fe2_singlet_output):
        from chemsmart.analysis.thermochemistry import Thermochemistry
        from chemsmart.cli.grouper.grouper import (
            _extract_energy_based_on_energy_type,
        )

        thermo = Thermochemistry(filename=fe2_singlet_output)
        extracted = _extract_energy_based_on_energy_type(thermo, "E")
        assert np.isclose(extracted, -1568.256171848101, rtol=1e-7)

        thermo = Thermochemistry(filename=fe2_singlet_output)
        extracted = _extract_energy_based_on_energy_type(thermo, "H")
        assert np.isclose(extracted, -1568.14237774, rtol=1e-7)

        thermo = Thermochemistry(filename=fe2_singlet_output)
        extracted = _extract_energy_based_on_energy_type(thermo, "G")
        assert np.isclose(extracted, -1568.18873516, rtol=1e-7)

        thermo_qhh = Thermochemistry(
            filename=fe2_singlet_output,
            temperature=298.15,
            concentration=1.0,
            h_freq_cutoff=100.0,
            s_freq_cutoff=100.0,
            use_weighted_mass=True,
            entropy_method="grimme",
        )
        extracted = _extract_energy_based_on_energy_type(thermo_qhh, "qhH")
        assert np.isclose(extracted, -1568.143280, rtol=1e-7)

        thermo_qhg = Thermochemistry(
            filename=fe2_singlet_output,
            temperature=298.15,
            concentration=1.0,
            h_freq_cutoff=100.0,
            s_freq_cutoff=100.0,
            use_weighted_mass=True,
            entropy_method="grimme",
        )
        extracted = _extract_energy_based_on_energy_type(thermo_qhg, "qhG")
        assert np.isclose(extracted, -1568.186619, rtol=1e-7)