#!/usr/bin/env amspython
# coding: utf-8

# ## Initial imports
from scm.plams import plot_molecule
import scm.plams as plams
from scm.params import ParAMSJob, ResultsImporter
import glob
import matplotlib.pyplot as plt

# ## Initialize PLAMS environment
plams.init()

# ## Run a quick reference MD job for liquid Ar
#
# This step only generates some simple reference data.
#
# Skip it if you
#
# * already have reference data in the ParAMS .yaml format, or
# * have reference data in the ASE .xyz or .db format — convert it to the ParAMS
#   .yaml format instead (see the section "Convert ASE format to ParAMS" below).

# A single Ar atom (Z = 18), packed into a periodic box of 32 atoms.
ar_atom = plams.Molecule(numbers=[18], positions=[(0, 0, 0)])
box = plams.packmol(ar_atom, n_atoms=32, density=1.4, tolerance=3.0)
plot_molecule(box)

# Cheap UFF force field serves as the "reference" engine for this demo.
ref_settings = plams.Settings()
ref_settings.runscript.nproc = 1
ref_settings.input.ForceField.Type = "UFF"

# NVT MD at 500 K; gradients are written so forces can be imported later.
md_job = plams.AMSNVTJob(
    settings=ref_settings,
    molecule=box,
    name="uff_md",
    nsteps=5000,
    temperature=500,
    writeenginegradients=True,
    samplingfreq=500,
)
md_job.run()

# ## Import reference results with ParAMS ResultsImporter
#
# The ``add_trajectory_singlepoints`` results importer turns every sampled MD
# frame into a single-point energy/forces entry. For more details about usage
# of the results importers, see the corresponding tutorials.
ri = ResultsImporter(settings={"units": {"energy": "eV", "forces": "eV/angstrom"}})
ri.add_trajectory_singlepoints(md_job.results.rkfpath(), properties=["energy", "forces"])
# feel free to add other trajectories as well:
# ri.add_trajectory_singlepoints(job2.results.rkfpath(), properties=["energy", "forces"])
# etc...

# ## Optional: split into training/validation sets
#
# Machine learning potentials in ParAMS can only be trained if there is both a
# training set and a validation set.
#
# If you do not specify a validation set, the training set will automatically be
# split into a training and validation set when the parametrization starts.
# Here, we will manually split the data set ourselves.
#
# Let's first print the information in the current ResultsImporter training set:


def print_data_set_summary(data_set, title):
    """Print a short summary of *data_set*: entry count plus the job IDs it covers."""
    ids = data_set.jobids
    print(f"{title}:")
    print(f"  number of entries: {len(data_set)}")
    print(f"  number of jobids: {len(ids)}")
    print(f"  jobids: {ids}")


print_data_set_summary(ri.data_sets["training_set"], "Original training set")

# Above, the number of entries is twice the number of jobids because the
# ``energy`` and ``forces`` extractors are separate entries.
#
# The energy and force extractors for a given structure (e.g. frame006) must
# belong to the same data set, so the split is done per job id with
# ``split_by_jobids`` (80% training, 20% validation, fixed seed for
# reproducibility).
train_part, valid_part = ri.data_sets["training_set"].split_by_jobids(0.8, 0.2, seed=314)
ri.data_sets["training_set"] = train_part
ri.data_sets["validation_set"] = valid_part

print_data_set_summary(ri.data_sets["training_set"], "New training set")
print_data_set_summary(ri.data_sets["validation_set"], "New validation set")

# ## Store the reference results in ParAMS yaml format
#
# ``ResultsImporter.store()`` writes all the data in the results importer to
# the ParAMS .yaml format:
yaml_dir = "yaml_ref_data"
ri.store(yaml_dir, backup=False)

# print the contents of the directory
for entry in glob.glob(f"{yaml_dir}/*"):
    print(entry)

# ## Set up and run a ParAMSJob for training ML Potentials
#
# See the ParAMS MachineLearning documentation for all available input options.
#
# Training the model may take a few minutes.
# Build the ParAMS training job from the stored reference data.
job = ParAMSJob.from_yaml(yaml_dir)
job.name = "params_training_ml_potential"
job.settings.input.Task = "MachineLearning"

# Alias the nested MachineLearning branch of the Settings tree; mutations on
# ``ml`` modify ``job.settings`` in place.
ml = job.settings.input.MachineLearning
ml.CommitteeSize = 1  # train only a single model
ml.MaxEpochs = 200
ml.LossCoeffs.Energy = 10
ml.Backend = "M3GNet"
ml.M3GNet.Model = "UniversalPotential"
ml.Target.Forces.Enabled = "No"
ml.RunAMSAtEnd = "Yes"

job.run()

# ## Results of the ML potential training
#
# ``job.results.get_running_loss()`` returns the loss value as a function of
# epoch; plot it for both data sets.
for set_name in ("training_set", "validation_set"):
    epoch, loss = job.results.get_running_loss(data_set=set_name)
    plt.plot(epoch, loss)
plt.legend(["training loss", "validation loss"])

# If you set ``MachineLearning%RunAMSAtEnd`` (it is on by default), this will
# run the ML potential through AMS at the end of the fitting procedure, similar
# to the ParAMS SinglePoint task.
#
# This will give you access to more results, for example the
# predicted-vs-reference energy and forces for all entries in the training and
# validation set.
# Plot them in a scatter plot like this:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
for i, data_set in enumerate(["training_set", "validation_set"]):
    # Hoisted out of the inner loop: the evaluator depends only on the data
    # set, not on the plotted property, so fetch it once per data set instead
    # of once per (data set, property) pair.
    dse = job.results.get_data_set_evaluator(data_set=data_set, source="best")
    for j, key in enumerate(["energy", "forces"]):
        data = dse.results[key]
        ax = axes[i][j]
        ax.plot(data.reference_values, data.predictions, ".")
        ax.set_xlabel(f"Reference {key} ({data.unit})")
        ax.set_ylabel(f"Predicted {key} ({data.unit})")
        ax.set_title(f"{data_set}\n{key} MAE: {data.mae:.3f} {data.unit}")
        # Freeze the autoscaled limits, then draw a wide y = x guide line
        # behind the data without letting it rescale the axes.
        ax.set_xlim(auto=True)
        ax.autoscale(False)
        ax.plot([-10, 10], [-10, 10], linewidth=5, zorder=-1, alpha=0.3, c="red")
plt.subplots_adjust(hspace=0.6, wspace=0.4)

# ## Get the engine settings for production jobs
#
# First, let's find the path to where the trained m3gnet model resides using
# ``get_deployed_model_paths()``. This function returns a list of paths to the
# trained models. In this case we only trained one model, so we access the
# first element of the list with ``[0]``:
print(job.results.get_deployed_model_paths()[0])

# The above is the path we need to give as the ``ParameterDir`` input option in
# the AMS MLPotential engine. For other backends it might instead be the
# ``ParameterFile`` option.
#
# To get the complete engine settings as a PLAMS Settings object, use the
# method ``get_production_engine_settings()``:
production_engine_settings = job.results.get_production_engine_settings()
print(plams.AMSJob(settings=production_engine_settings).get_input())

# ## Run a short MD simulation with the trained potential
production_engine_settings.runscript.nproc = 1  # run AMS Driver in serial
new_md_job = plams.AMSNVTJob(
    settings=production_engine_settings,
    molecule=box,
    nsteps=5000,
    temperature=300,
    samplingfreq=100,
    name="production_md",
    timestep=1.0,
)
new_md_job.run(watch=True)

# ## Open trajectory file in AMSmovie
# With the production trajectory you can run analysis tools in AMSmovie, or
# access them from Python.
# See the AMS manual for details.

# Path to the production trajectory file; open it in AMSmovie (or feed it to
# the Python analysis tools) for post-processing.
trajectory_file = new_md_job.results.rkfpath()

# ## Finish PLAMS
plams.finish()