Example: pKa prediction (PLAMS)

This example should be executed using PLAMS.

Download pKa.py

from scm.plams.interfaces.molecule.rdkit import from_smiles
import numpy as np
import multiprocessing

# In this example we compute pKa (acid dissociation constant) using MOPAC for a set of
# molecules. The molecules are defined using smiles strings, and are converted to xyz 
# structures using the plams-rdkit interface.

# Important note: the predicted pKa strongly depend on the molecule's conformer.
# Here we use the lowest conformer predicted by rdkit's UFF.
# The difference between the values computed here and the results on the 
# MOPAC website (ref_mopac_pKa) is due to different conformers 

# Data taken from the online MOPAC manual: http://openmopac.net/manual/ (only a sub set)
data_tmp = [
   # Molecule name                 smiles                            exp_pKa   ref_mopac_pKa (from mopac's website)
   ['1-Naphthoic_acid',           'C1=CC=C2C(=C1)C=CC=C2C(=O)O',      3.69,     4.35],
   ['2,2,2-Trichloroethanol',     'C(C(Cl)(Cl)Cl)O',                 12.02,    12.22],
   ['2,2,2-Trifluoroethanol',     'C(C(F)(F)F)O',                    12.40,    12.27], 
   ['2,2-Dimethylpropionic_acid', 'CC(C)(C)C(=O)O',                   5.03,     5.23],
   ['2,3,4,6-Tetrachlorophenol',  'C1=C(C(=C(C(=C1Cl)Cl)Cl)O)Cl',     7.10,     6.08],
   ['Acetic_acid',                'CC(=O)O',                          4.76,     5.00],
   ['Acrylic_acid',               'C=CC(=O)O',                        4.25,     4.65],
   ['Benzoid_acid',               'C1=CC=C(C=C1)C(=O)O',              4.20,     4.30],
   ['Citric_acid',                'C(C(=O)O)C(CC(=O)O)(C(=O)O)O',     3.13,     2.56],
   ['Ethanol',                    'CCO',                             16.00,    16.37],
   ['Formic_acid',                'C(=O)O',                           3.77,     3.77],
   ['Glycine',                    'C(C(=O)O)N',                       2.35,     2.53],
   ['Isoleucine',                 'CCC(C)C(C(=O)O)N',                 2.32,     2.48],
   ['Methanol',                   'CO',                              15.54,    15.23],
   ['o-Nitrophenol',              'C1=CC=C(C(=C1)[N+](=O)[O-])O',     7.17,     7.52],
   ['Pentachlorophenol',          'C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)O', 4.90,     5.55],
   ['Phenol',                     'C1=CC=C(C=C1)O',                  10.00,     9.71],
   ['Pyruvic_acid',               'CC(=O)C(=O)O',                     2.50,     2.85],
   ['T-Butanol',                  'CC(C)(C)O',                       17.00,    16.25],
   ['Terephthalic_acid',          'C1=CC(=CC=C1C(=O)O)C(=O)O',        3.51,     3.59],
   ['Valine',                     'CC(C)C(C(=O)O)N',                  2.29,     2.61],
   ['Water',                      'O',                               15.74,    15.75]]

# Turn data_tmp into a dictionary: 
systems = [{'name':d[0], 'smiles':d[1], 'exp_pKa':d[2], 'ref_mopac_pKa':d[3]} for d in data_tmp] 

# Create the molecules from the smiles using rdkit:
molecules = []
for system in systems:
   # Compute 30 conformers, optimize with UFF and pick the lowest in energy.
   mol = from_smiles(system['smiles'], nconfs=30, forcefield='uff')[0]

   mol.properties.name = system['name']
   mol.properties.exp_pKa = system['exp_pKa']
   mol.properties.ref_mopac_pKa = system['ref_mopac_pKa']
   
   molecules.append(mol)

# MOPAC input:
s = Settings()
s.runscript.nproc = 1 # serial calculation
s.input.ams.Task = 'GeometryOptimization'
s.input.ams.GeometryOptimization.Convergence.Step = 1.0e-3
s.input.ams.GeometryOptimization.Convergence.Gradients = 1.0e-5
s.input.mopac.model = 'PM6'
s.input.mopac.properties.pKa = 'Yes'

# Set up and run jobs:
jobs = MultiJob(children=[AMSJob(name=mol.properties.name, molecule=mol, settings=s) for mol in molecules])
jr = JobRunner(parallel=True, maxjobs=multiprocessing.cpu_count()) # run jobs in parallel
jobs.run(jobrunner=jr)

# Collect results:
for i, mol in enumerate(molecules):
   pKaValues = jobs.children[i].results.readrkf('Properties', 'pKaValues', file='mopac')
   mol.properties.calc_pKa = np.mean(pKaValues) # If there is more than one pKa, take the average value
 
# Print results in a table:
print("Results:\n")
print("| {:28} | {:8} | {:8} | {:8} | {:8} |".format("Molecule", "exp pKa", "calc pKa", "ref", 'calc-exp'))
for mol in molecules:
   print("| {:28} | {:>8.2f} | {:>8.4f} | {:>8.2f} | {:>8.2f} |".format(mol.properties.name, mol.properties.exp_pKa, mol.properties.calc_pKa, mol.properties.ref_mopac_pKa, mol.properties.calc_pKa-mol.properties.exp_pKa))
print("")

errors = [mol.properties.calc_pKa - mol.properties.exp_pKa for mol in molecules]

print("Mean signed error  : {:4.2f}".format(np.mean(errors)))
print("Mean unsigned error: {:4.2f}".format(np.mean([abs(e) for e in errors])))
print("Root mean square error: {:4.2f}".format(np.sqrt(np.mean([e**2 for e in errors]))))
print("Done")