#!/usr/bin/env python
# coding: utf-8
# ## Benchmarking Hydrocarbon Isomerization Energies
# This benchmark aims to evaluate the accuracy of a range of machine learning potentials included in AMS, using the ISOC7 dataset from the paper: A. Karton and E. Semidalas, “Benchmarking Isomerization Energies for C5–C7 Hydrocarbons: The ISOC7 Database,” J. of Comp. Chem. 47(2) (2026): e70296, https://doi.org/10.1002/jcc.70296.
#
# The ISOC7 dataset contains a diverse range of hydrocarbons from simple alkanes to highly strained systems featuring tetrahedrane units, as well as alkanes, alkenes, alkynes, and cyclic compounds. It has reference data to CCSD(T)/CBS level of accuracy and therefore provides a good benchmark for evaluating the accuracy of ML potentials for small organic systems.
# ### Extract Reference ISOC7 Database
# We first extract the ISOC7 database:
import os
import tarfile
from pathlib import Path
# Location of the bundled ISOC7 archive within the AMS installation.
isoc7_tar = Path(os.environ["AMSHOME"]) / "atomicdata" / "Benchmarks" / "ISOC7.tar.gz"
isoc7 = Path("./ISOC7")
# Extract only once: skip when the target directory already exists.
if not isoc7.exists():
    with tarfile.open(isoc7_tar, "r:gz") as archive:
        archive.extractall(path=Path.cwd())
# ### Load XYZ Data
# Then we load all the xyz files as PLAMS molecules:
import scm.plams as plams

molecules = plams.read_molecules(isoc7, formats=["xyz"])
# We can see the number of loaded structures:
print(f"Number of loaded structures: {len(molecules)}")
# And view a selection of them, with their corresponding unique id:
imgs = {}
for mol_id, mol in list(molecules.items())[:4]:
    imgs[mol_id] = plams.view(mol, width=300, height=300, guess_bonds=True)
plams.plot_image_grid(imgs, rows=1, save_path="picture1.png")
# ### Load Reference Data
# Next we load the reference CCSD(T)/CBS energies from the benchmark:
import pandas as pd

# Only the columns needed to define each isomerization reaction are read.
ref_columns = [
    "reaction_name",
    "species1",
    "species2",
    "W1-F12_isomerization_energy (CCSD(T)/CBS, kcal/mol)",
]
reference_data = pd.read_csv(isoc7 / "ISOC7_W1-F12_Ref_Values.csv", usecols=ref_columns)
reference_data
# ### Set up Data Sets with ParAMS
# To run our benchmark, we are going to use ParAMS. First we set up the JobCollection. This contains the information on the input structures.
import scm.params as params

results_importer = params.ResultsImporter(settings={"units": {"energy": "kcal/mol"}})
# One single-point job entry per loaded structure, keyed by its unique id.
for mol_id, mol in molecules.items():
    entry = params.JCEntry()
    entry.settings.input.AMS.Task = "SinglePoint"
    entry.molecule = mol
    entry.metadata["Source"] = "https://doi.org/10.1002/jcc.70296"
    results_importer.job_collection.add_entry(mol_id, entry)
print(results_importer.job_collection.from_jobids(["C5H10_1"]))
# Next we set up the dataset containing the information on each isomerization reaction and the reference energy:
ha_to_kcal_mol = plams.Units.conversion_ratio("Ha", "kcal/mol")
reaction_columns = ["species1", "species2", "W1-F12_isomerization_energy (CCSD(T)/CBS, kcal/mol)"]
# Each dataset entry is the energy difference product - reactant, with the
# CCSD(T)/CBS value from the paper as the reference.
for _, (reactant, product, ref_energy) in reference_data[reaction_columns].iterrows():
    results_importer.data_set.add_entry(
        f"energy('{product}')-energy('{reactant}')",
        reference=ref_energy,
        unit=("kcal/mol", ha_to_kcal_mol),
        metadata={
            "Source": "https://doi.org/10.1002/jcc.70296",
            "reference_method": "CCSD(T)/CBS",
        },
    )
print(results_importer.data_set.from_jobids(["C5H6_1", "C5H6_20"]))
# We can also visualize one of these isomerization reactions, for example the first one with a high isomerization energy:
imgs = {}
for species in ("C5H6_20", "C5H6_1"):
    imgs[species] = plams.view(molecules[species], width=300, height=300, guess_bonds=True)
plams.plot_image_grid(imgs, rows=1, save_path="picture2.png")
# Now we store the yaml files needed to run the ParAMS calculations:
benchmark_dir = "./isoc7_benchmark"
results_importer.store(benchmark_dir)
# Note that the dataset can also be viewed using the ParAMS GUI:
# !params -gui ./isoc7_benchmark/job_collection.yaml
# ### Run Benchmarks with ParAMS
# Now we set up a benchmark job for each ML model of interest, and run them. We include all models suitable for small organic molecules, as well as some trained on materials/other domains for comparison.
models = [
    "AIMNet2-B973c",
    "AIMNet2-wB97MD3",
    "ANI-1ccx",
    "ANI-1x",
    "ANI-2x",
    "eSEN-S-Con-OMol",
    "M3GNet-UP-2022",
    "MACE-MPA-0",
    "UMA-S-1.2-ODAC",
    "UMA-S-1.2-OMat",
    "UMA-S-1.2-OMol",
]
jobs = {}
for model in models:
    benchmark_job = params.ParAMSJob.from_yaml("./isoc7_benchmark", name=f"benchmark_{model}")
    # One process, one OpenMP thread, one job at a time per benchmark.
    benchmark_job.settings.runscript.nproc = 1
    benchmark_job.settings.runscript.preamble_lines = ["export OMP_NUM_THREADS=1"]
    benchmark_job.settings.input.parallellevels.jobs = 1
    benchmark_job.settings.input.mlpotential.model = model
    benchmark_job.settings.input.task = "SinglePoint"
    jobs[model] = benchmark_job
for benchmark_job in jobs.values():
    benchmark_job.run()
    benchmark_job.ok()
# Each completed job now has results for the errors between the model calculations and the reference. For example, with AIMNet2-wB97MD3:
evaluator_text = str(jobs["AIMNet2-wB97MD3"].results.get_data_set_evaluator())
print("\n".join(evaluator_text.split("\n")[:20]))
# Results can alternatively be displayed in a correlation plot of the reference vs. predicted energies:
ax = jobs["AIMNet2-wB97MD3"].results.plot_simple_correlation("energy", validation_set=False)
ax
ax.figure.savefig("picture3.png")
# Note that all these can also again be viewed in the ParAMS GUI:
# !params -gui ./plams_workdir/benchmark_AIMNet2-wB97MD3/results
# ## Evaluating Benchmark Results
# To evaluate the results, we extract the Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE) from the job results:
# Evaluate each job once and reuse the result for both error metrics
# (the original called get_data_set_evaluator() twice per model).
metrics = {m: jobs[m].results.get_data_set_evaluator().results for m in models}
results = pd.DataFrame({"Model": models})
results["MAE"] = [metrics[m].mae for m in models]
results["RMSE"] = [metrics[m].rmse for m in models]
results.sort_values(by="MAE", inplace=True)
results.rename(
    columns={
        "MAE": "MAE*",
        "RMSE": "RMSE*",
    },
    inplace=True,
)
# NOTE(review): the caption string literals were broken across lines in the
# original (a SyntaxError); reconstructed as implicitly concatenated strings
# with an HTML <br> line break, which the pandas Styler caption renders.
results.style.hide(axis="index").background_gradient(
    subset=["MAE*", "RMSE*"], cmap="Blues", vmin=0, vmax=10
).format("{:.2f}", subset=["MAE*", "RMSE*"]).set_caption(
    "Benchmark Results for ISOC7 Dataset Isomerization Energies"
    "<br>* Reference method CCSD(T)/CBS, in kcal/mol"
).set_table_styles(
    [
        {"selector": "th", "props": [("border-bottom", "2px solid black")]},
        {"selector": "th, td", "props": [("min-width", "100px")]},
        {"selector": "td", "props": [("border-bottom", "1px solid lightgray")]},
        {"selector": "table", "props": [("border-collapse", "collapse")]},
    ]
)
# From these results, we see that `AIMNet2-wB97MD3` performs best, achieving the lowest errors (MAE = 1.29 kcal/mol, RMSE = 1.60 kcal/mol), clearly outperforming all other models. In a broader context, this is on a par with the best performing DFT functionals, at a fraction of the computational cost.
#
# Other models targeted at small molecules also perform well, including `eSEN-S-Con-OMol` and `ANI-1ccx`, both with MAE < 5 kcal/mol.
#
# We find that, as expected, the worst performers in this case are models trained primarily on materials data such as `M3GNet-UP-2022` and `MACE-MPA-0`, which are better suited to other chemical domains.
#
# One surprising result is for `UMA-S-1.2-OMol`, which would be expected to perform better for these systems than other UMA variants. We can compare and contrast the correlation plots for `UMA-S-1.2-OMol` with the previous plot for the best performing model, `AIMNet2-wB97MD3`:
ax = jobs["UMA-S-1.2-OMol"].results.plot_simple_correlation("energy", validation_set=False)
ax
ax.figure.savefig("picture4.png")
# This shows there are some structures with a very high energy difference with respect to the reference. We can inspect these structures:
uma_summary = str(jobs["UMA-S-1.2-OMol"].results.get_data_set_evaluator())
print("\n".join(uma_summary.split("\n")[:9]))
imgs = {}
for species in ("C6H6_2", "C7H8_3", "C7H8_5", "C6H6_21"):
    imgs[species] = plams.view(molecules[species], width=300, height=300, guess_bonds=True)
plams.plot_image_grid(imgs, rows=1, save_path="picture5.png")
# As noted in the original paper, we find many of the systems with high deviations compared to the reference values contain a tetrahedrane core, which is also a common issue with many of the tested DFT functionals.