from __future__ import annotations

from pathlib import Path
from typing import Iterable, Literal, Sequence

from scm.pisa.block import DriverBlock, EngineBlock, FixedBlock, FreeBlock, InputBlock
from scm.pisa.key import BoolKey, BoolType, FloatKey, FloatListKey, IntKey, IntListKey, MultipleChoiceKey, PathStringKey, StringKey


class ParAMSGenerateReference(DriverBlock):
r"""
:ivar EngineCollection: Path to (optional) JobCollection Engines YAML file.
:vartype EngineCollection: str | StringKey
:ivar JobCollection: Path to JobCollection YAML file.
:vartype JobCollection: str | StringKey
:ivar ParameterInterface: Path to parameter interface YAML file.
:vartype ParameterInterface: str | StringKey
:ivar RestartDirectory: Specify a directory to continue interrupted GenerateReference or SinglePoint calculations. The directory depends on the task:
GenerateReference: results/reference_jobs
SinglePoint: results/single_point/jobs
Note: If you use the GUI this directory will be COPIED into the results folder and the name will be prepended with 'dep-'. This can take up a lot of disk space, so you may want to remove the 'dep-' folder after the job has finished.
:vartype RestartDirectory: str | Path | StringKey
:ivar ResultsDirectory: Directory in which output files will be created.
:vartype ResultsDirectory: str | Path | StringKey
:ivar Task: Task to run.
Available options:
•MachineLearning: Optimization for machine learning models.
•Optimization: Global optimization powered by GloMPO
•Generate Reference: Run jobs with reference engine to get reference values
•Single Point: Evaluate the current configuration of jobs, training data, and parameters
•Sensitivity: Measure the sensitivity of the loss function to each of the active parameters
:vartype Task: Literal["Optimization", "GenerateReference", "SinglePoint", "Sensitivity", "MachineLearning"]
:ivar DataSet: Configuration settings for each data set in the optimization.
:vartype DataSet: ParAMSGenerateReference._DataSet
:ivar ParallelLevels: Distribution of threads/processes between the parallelization levels.
:vartype ParallelLevels: ParAMSGenerateReference._ParallelLevels
"""
    class _DataSet(FixedBlock):
r"""
Configuration settings for each data set in the optimization.
:ivar BatchSize: Number of data set entries to be evaluated per epoch. Default 0 means all entries.
:vartype BatchSize: int | IntKey
:ivar EvaluateEvery: This data set is evaluated every n evaluations of the training set.
This will always be set to 1 for the training set. For other data sets it will be adjusted to the closest multiple of LoggingInterval%General, i.e., you cannot evaluate an extra data set more frequently than you log it.
:vartype EvaluateEvery: int | IntKey
:ivar LossFunction: Loss function used to quantify the error between model and reference values. This becomes the minimization task.
Available options:
• mae: Mean absolute error
• rmse: Root mean squared error
• sse: Sum of squared errors
• sae: Sum of absolute errors
:vartype LossFunction: Literal["mae", "rmse", "sse", "sae"]
:ivar MaxJobs: Limit each evaluation to a subset of n jobs. Default 0 meaning all jobs are used.
:vartype MaxJobs: int | IntKey
:ivar MaxJobsShuffle: Use a different job subset every for every evaluation.
:vartype MaxJobsShuffle: BoolType | BoolKey
:ivar Name: Unique data set identifier.
The first occurrence of DataSet will always be called training_set.
The second will always be called validation_set.
These cannot be overwritten.
Later occurrences will default to data_set_xx where xx starts at 03 and increments from there. This field can be used to customize the latter names.
:vartype Name: str | StringKey
:ivar Path: Path to DataSet YAML file.
:vartype Path: str | StringKey
:ivar UsePipe: Use AMS Pipe for suitable jobs to speed-up evaluation.
:vartype UsePipe: BoolType | BoolKey
"""
        def __post_init__(self):
            self.BatchSize: int | IntKey = IntKey(name='BatchSize', comment='Number of data set entries to be evaluated per epoch. Default 0 means all entries.', default=0)
            self.EvaluateEvery: int | IntKey = IntKey(name='EvaluateEvery', comment='This data set is evaluated every n evaluations of the training set.\n\nThis will always be set to 1 for the training set. For other data sets it will be adjusted to the closest multiple of LoggingInterval%General, i.e., you cannot evaluate an extra data set more frequently than you log it.', default=1)
            self.LossFunction: Literal["mae", "rmse", "sse", "sae"] = MultipleChoiceKey(name='LossFunction', comment='Loss function used to quantify the error between model and reference values. This becomes the minimization task.\n\nAvailable options:\n• mae: Mean absolute error\n• rmse: Root mean squared error\n• sse: Sum of squared errors\n• sae: Sum of absolute errors', default='sse', choices=['mae', 'rmse', 'sse', 'sae'])
            self.MaxJobs: int | IntKey = IntKey(name='MaxJobs', comment='Limit each evaluation to a subset of n jobs. Default 0 means all jobs are used.', default=0)
            self.MaxJobsShuffle: BoolType | BoolKey = BoolKey(name='MaxJobsShuffle', comment='Use a different job subset for every evaluation.', default=False)
            self.Name: str | StringKey = StringKey(name='Name', comment='Unique data set identifier.\n\nThe first occurrence of DataSet will always be called training_set.\nThe second will always be called validation_set.\nThese cannot be overwritten.\n\nLater occurrences will default to data_set_xx, where xx starts at 03 and increments from there. This field can be used to customize the latter names.', default='')
            self.Path: str | StringKey = StringKey(name='Path', comment='Path to DataSet YAML file.')
            self.UsePipe: BoolType | BoolKey = BoolKey(name='UsePipe', comment='Use the AMS Pipe for suitable jobs to speed up evaluation.', default=True)
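
    # For orientation, a minimal sketch (an assumption based on the key names
    # above, not copied from the ParAMS documentation) of how a DataSet block
    # built from these keys would look in the ParAMS text input:
    #
    #   DataSet
    #     Name training_set
    #     Path training_set.yaml
    #     LossFunction sse
    #     BatchSize 0
    #   End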
    class _ParallelLevels(FixedBlock):
r"""
Distribution of threads/processes between the parallelization levels.
:ivar CommitteeMembers: Maximum number of committee member optimizations to run in parallel. If set to zero will take the minimum of MachineLearning%CommitteeSize and the number of available cores (NSCM)
:vartype CommitteeMembers: int | IntKey
:ivar Cores: Number of cores to use per committee member optimization. By default (0) the available cores (NSCM) divided equally among committee members. When using GPU offloading, consider setting this to 1.
:vartype Cores: int | IntKey
:ivar Jobs: Number of JobCollection jobs to run in parallel for each loss function evaluation.
:vartype Jobs: int | IntKey
:ivar Optimizations: Number of independent optimizers to run in parallel.
:vartype Optimizations: int | IntKey
:ivar ParameterVectors: Number of parameter vectors to try in parallel for each optimizer iteration. This level of parallelism can only be used with optimizers that support parallel optimization!
Default (0) will set this value to the number of cores on the system divided by the number of optimizers run in parallel, i.e., each optimizer will be given an equal share of the resources.
:vartype ParameterVectors: int | IntKey
:ivar Processes: Number of processes (MPI ranks) to spawn for each JobCollection job. This effectively sets the NSCM environment variable for each job.
A value of `-1` will disable explicit setting of related variables. We recommend a value of `1` in almost all cases. A value greater than 1 would only be useful if you parametrize DFTB with a serial optimizer and have very few jobs in the job collection.
:vartype Processes: int | IntKey
:ivar Threads: Number of threads to use for each of the processes. This effectively set the OMP_NUM_THREADS environment variable.
Note that the DFTB engine does not use threads, so the value of this variable would not have any effect. We recommend always leaving it at the default value of 1. Please consult the manual of the engine you are parameterizing.
A value of `-1` will disable explicit setting of related variables.
:vartype Threads: int | IntKey
"""
        def __post_init__(self):
            self.CommitteeMembers: int | IntKey = IntKey(name='CommitteeMembers', comment='Maximum number of committee member optimizations to run in parallel. If set to zero, this will take the minimum of MachineLearning%CommitteeSize and the number of available cores (NSCM).', gui_name='Number of parallel committee members:', default=1)
            self.Cores: int | IntKey = IntKey(name='Cores', comment='Number of cores to use per committee member optimization. By default (0), the available cores (NSCM) are divided equally among committee members. When using GPU offloading, consider setting this to 1.', gui_name='Processes (per Job):', default=0)
            self.Jobs: int | IntKey = IntKey(name='Jobs', comment='Number of JobCollection jobs to run in parallel for each loss function evaluation.', gui_name='Jobs (per loss function evaluation):', default=0)
            self.Optimizations: int | IntKey = IntKey(name='Optimizations', comment='Number of independent optimizers to run in parallel.', gui_name='Number of parallel optimizers:', default=1)
            self.ParameterVectors: int | IntKey = IntKey(name='ParameterVectors', comment='Number of parameter vectors to try in parallel for each optimizer iteration. This level of parallelism can only be used with optimizers that support parallel optimization!\n\nDefault (0) will set this value to the number of cores on the system divided by the number of optimizers run in parallel, i.e., each optimizer will be given an equal share of the resources.', gui_name='Loss function evaluations (per optimizer):', default=0)
            self.Processes: int | IntKey = IntKey(name='Processes', comment='Number of processes (MPI ranks) to spawn for each JobCollection job. This effectively sets the NSCM environment variable for each job.\n\nA value of `-1` will disable explicit setting of related variables. We recommend a value of `1` in almost all cases. A value greater than 1 would only be useful if you parametrize DFTB with a serial optimizer and have very few jobs in the job collection.', gui_name='Processes (per Job):', default=1)
            self.Threads: int | IntKey = IntKey(name='Threads', comment='Number of threads to use for each of the processes. This effectively sets the OMP_NUM_THREADS environment variable.\n\nNote that the DFTB engine does not use threads, so the value of this variable would not have any effect. We recommend always leaving it at the default value of 1. Please consult the manual of the engine you are parameterizing.\n\nA value of `-1` will disable explicit setting of related variables.', gui_name='Threads (per Process):', default=1)
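
    # Rough sizing guide (a sketch following the level hierarchy described
    # above): the levels multiply, so Jobs * Processes * Threads should not
    # exceed the available cores. E.g., on a 16-core machine a
    # GenerateReference run could use Jobs=16, Processes=1, Threads=1
    # (16 * 1 * 1 = 16 cores).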
    def __post_init__(self):
        self.EngineCollection: str | StringKey = StringKey(name='EngineCollection', comment='Path to (optional) JobCollection Engines YAML file.', default='job_collection_engines.yaml')
        self.JobCollection: str | StringKey = StringKey(name='JobCollection', comment='Path to JobCollection YAML file.', default='job_collection.yaml')
        self.ParameterInterface: str | StringKey = StringKey(name='ParameterInterface', comment='Path to parameter interface YAML file.', default='parameter_interface.yaml')
        self.RestartDirectory: str | Path | StringKey = PathStringKey(name='RestartDirectory', comment="Specify a directory to continue interrupted GenerateReference or SinglePoint calculations. The directory depends on the task:\n\nGenerateReference: results/reference_jobs\nSinglePoint: results/single_point/jobs\n\nNote: If you use the GUI, this directory will be COPIED into the results folder and its name will be prefixed with 'dep-'. This can take up a lot of disk space, so you may want to remove the 'dep-' folder after the job has finished.", gui_name='Load jobs from: ', default='', ispath=True, gui_type='directory')
        self.ResultsDirectory: str | Path | StringKey = PathStringKey(name='ResultsDirectory', comment='Directory in which output files will be created.', gui_name='Working directory: ', default='results', ispath=True)
        self.Task: Literal["Optimization", "GenerateReference", "SinglePoint", "Sensitivity", "MachineLearning"] = MultipleChoiceKey(name='Task', comment='Task to run.\n\nAvailable options:\n• MachineLearning: Optimization for machine learning models.\n• Optimization: Global optimization powered by GloMPO.\n• GenerateReference: Run jobs with the reference engine to obtain reference values.\n• SinglePoint: Evaluate the current configuration of jobs, training data, and parameters.\n• Sensitivity: Measure the sensitivity of the loss function to each of the active parameters.', default='Optimization', choices=['Optimization', 'GenerateReference', 'SinglePoint', 'Sensitivity', 'MachineLearning'])
        self.DataSet: ParAMSGenerateReference._DataSet = self._DataSet(name='DataSet', comment='Configuration settings for each data set in the optimization.', unique=False, gui_type='Repeat at least once')
        self.ParallelLevels: ParAMSGenerateReference._ParallelLevels = self._ParallelLevels(name='ParallelLevels', comment='Distribution of threads/processes between the parallelization levels.', gui_name='Parallelization distribution: ')
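

# A minimal usage sketch, assuming this module is importable and that the
# block can be rendered to a ParAMS text input (the str() call below is an
# assumption; check your AMS/PISA version for the exact rendering method).
if __name__ == '__main__':
    driver = ParAMSGenerateReference()
    driver.Task = 'GenerateReference'                       # run jobs with the reference engine
    driver.JobCollection = 'job_collection.yaml'            # matches the key defaults above
    driver.ParameterInterface = 'parameter_interface.yaml'
    driver.ResultsDirectory = 'results'

    # ParallelLevels is a unique sub-block, so its keys can be set directly.
    driver.ParallelLevels.Jobs = 4       # run 4 JobCollection jobs in parallel
    driver.ParallelLevels.Processes = 1  # 1 MPI rank per job, as recommended above

    print(str(driver))  # assumed: renders the block to the text input format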