1
|
|
"""
|
2
|
|
Optimization procedure/task
|
3
|
|
"""
|
4
|
|
|
5
|
14
|
from typing import List, Optional
|
6
|
|
|
7
|
14
|
import qcelemental as qcel
|
8
|
14
|
import qcengine as qcng
|
9
|
|
|
10
|
14
|
from .base import BaseTasks
|
11
|
14
|
from ..interface.models import Molecule, OptimizationRecord, QCSpecification, ResultRecord, TaskRecord, KeywordSet
|
12
|
14
|
from ..interface.models.task_models import PriorityEnum
|
13
|
14
|
from .procedures_util import parse_single_tasks, form_qcinputspec_schema
|
14
|
|
|
15
|
|
|
16
|
14
|
class OptimizationTasks(BaseTasks):
    """
    Optimization task manipulation
    """

    def verify_input(self, data):
        """Check that the requested procedure and QC program exist in QCEngine.

        Parameters
        ----------
        data
            Request object; ``data.meta.program`` names the optimization
            procedure and ``data.meta.qc_spec["program"]`` names the program
            used for the underlying QC computations.

        Returns
        -------
        bool or str
            ``True`` when both names are known to QCEngine, otherwise a
            human-readable error message.
        """
        procedure = data.meta.program.lower()
        if procedure not in qcng.list_all_procedures():
            return "Procedure '{}' not available in QCEngine.".format(procedure)

        program = data.meta.qc_spec["program"].lower()
        if program not in qcng.list_all_programs():
            return "Program '{}' not available in QCEngine.".format(program)

        return True

    def parse_input(self, data, duplicate_id="hash_index"):
        """Parse input json into internally appropriate format

        The method reads ``data.meta`` (``procedure``, ``program``, ``keywords``,
        ``qc_spec``, ``tag``, ``priority`` and, when present, ``protocols``) and
        ``data.data`` (the initial molecules). The caller's ``keywords`` and
        ``qc_spec`` mappings are copied before being modified.

        Illustrative request layout::

            json_data = {
                "meta": {
                    "procedure": "optimization",
                    "option": "default",
                    "program": "geometric",
                    "qc_spec": {
                        "driver": "energy",
                        "method": "HF",
                        "basis": "sto-3g",
                        "keywords": "default",
                        "program": "psi4"
                    },
                },
                "data": ["mol_id_1", "mol_id_2", ...],
            }

        Illustrative QC schema input built for each gradient evaluation::

            qc_schema_input = {
                "molecule": {
                    "geometry": [
                        0.0, 0.0, -0.6,
                        0.0, 0.0, 0.6,
                    ],
                    "symbols": ["H", "H"],
                    "connectivity": [[0, 1, 1]]
                },
                "driver": "gradient",
                "model": {
                    "method": "HF",
                    "basis": "sto-3g"
                },
                "keywords": {},
            }

        Illustrative optimization keywords after the QC program is moved in::

            json_data = {
                "keywords": {
                    "coordsys": "tric",
                    "maxiter": 100,
                    "program": "psi4"
                },
            }

        Parameters
        ----------
        data
            Request object with ``meta`` and ``data`` attributes as described above.
        duplicate_id
            Accepted for interface compatibility; not used by this implementation.

        Returns
        -------
        tuple
            ``(opt_ids, existing_ids)``. ``opt_ids`` has one entry per element of
            the molecule list returned by storage (``None`` where storage returned
            no molecule); ``existing_ids`` is the ``duplicates`` metadata reported
            by ``add_procedures``.

        Raises
        ------
        KeyError
            If the requested QC keyword set cannot be resolved by storage.
        """

        # Get the optimization specification from the input meta dictionary
        opt_spec = data.meta

        # We should only have gotten here if procedure is 'optimization'
        assert opt_spec.procedure.lower() == "optimization"

        # Grab the tag and priority if available
        tag = opt_spec.tag
        priority = opt_spec.priority

        # Handle (optimization) keywords, which may be None.
        # A shallow copy is taken so that the caller's dictionary is not modified.
        # TODO: These are not stored in the keywords table (yet)
        opt_keywords = {} if opt_spec.keywords is None else dict(opt_spec.keywords)

        # Set the program used for gradient evaluations. This is stored in the input qcspec
        # but the QCInputSpecification does not have a place for program. So instead
        # we move it to the optimization keywords
        opt_keywords["program"] = opt_spec.qc_spec["program"]

        # Pull out the QCSpecification from the input (copied: "keywords" may be rewritten below)
        qc_spec_dict = dict(opt_spec.qc_spec)

        # Handle qc specification keywords, which may be None
        qc_keywords = qc_spec_dict.get("keywords", None)
        if qc_keywords is not None:
            # The keywords passed in may contain the entire KeywordSet.
            # But the QCSpec will only hold the ID
            qc_keywords = self.storage.get_add_keywords_mixed([qc_keywords])["data"][0]
            if qc_keywords is None:
                raise KeyError("Could not find requested KeywordsSet from id key.")
            qc_spec_dict["keywords"] = qc_keywords.id

        # Now that keywords are fixed we can do this
        qc_spec = QCSpecification(**qc_spec_dict)

        # Add all the initial molecules to the database
        # TODO: WARNING WARNING if get_add_molecules_mixed is modified to handle duplicates
        #       correctly, you must change some pieces later in this function
        molecule_list = self.storage.get_add_molecules_mixed(data.data)["data"]

        # Keep molecules that are not None, remembering their position in the input.
        # Molecules may be None if they are duplicates (ie, the same molecule was listed twice
        # in data.data) or an id specified in data.data was invalid
        valid_entries = [(idx, mol) for idx, mol in enumerate(molecule_list) if mol is not None]
        valid_molecule_idx = [idx for idx, _ in valid_entries]
        valid_molecules = [mol for _, mol in valid_entries]

        # Create all OptimizationRecords
        all_opt_records = []
        for mol in valid_molecules:
            # TODO fix handling of protocols (perhaps after hardening rest models)
            opt_data = {
                "initial_molecule": mol.id,
                "qc_spec": qc_spec,
                "keywords": opt_keywords,
                "program": opt_spec.program,
            }
            if hasattr(opt_spec, "protocols"):
                opt_data["protocols"] = opt_spec.protocols

            all_opt_records.append(OptimizationRecord(**opt_data))

        # Add all the procedures in a single function call
        # NOTE: Because get_add_molecules_mixed returns None for duplicate
        #       molecules (or when specifying incorrect ids),
        #       all_opt_records should never contain duplicates
        ret = self.storage.add_procedures(all_opt_records)

        # Get all procedure IDs (may be new or existing)
        # These will be in the order we sent to add_procedures
        all_opt_ids = ret["data"]
        existing_ids = ret["meta"]["duplicates"]

        # Assign ids to the optimization records.
        # Indexing (rather than zip) keeps a short id list a hard error.
        all_opt_records = [rec.copy(update={"id": all_opt_ids[idx]}) for idx, rec in enumerate(all_opt_records)]

        # Now generate all the tasks, but only for results that don't exist already
        self.create_tasks(
            all_opt_records, valid_molecules, [qc_keywords] * len(all_opt_records), tag=tag, priority=priority
        )

        # Keep the returned result id list in the same order as the input molecule list
        # If a molecule was None, then the corresponding result ID will be None
        # (since the entry in valid_molecule_idx will be missing). Ditto for molecules specified
        # more than once in the argument to this function
        opt_ids = [None] * len(molecule_list)
        for idx, result_id in zip(valid_molecule_idx, all_opt_ids):
            opt_ids[idx] = result_id

        return opt_ids, existing_ids

    def create_tasks(
        self,
        records: List[OptimizationRecord],
        molecules: Optional[List[Molecule]] = None,
        qc_keywords: Optional[List[KeywordSet]] = None,
        tag: Optional[str] = None,
        priority: Optional[PriorityEnum] = None,
    ):
        """Build one TaskRecord per optimization record and submit them to the queue.

        Parameters
        ----------
        records
            Optimization records to create tasks for.
        molecules
            Initial molecules, in the same order as ``records``. Loaded from
            storage one at a time when omitted.
        qc_keywords
            QC keyword sets (or ``None`` entries), in the same order as
            ``records``. Loaded from storage when omitted.
        tag
            Tag stored on every created task.
        priority
            Priority stored on every created task.

        Returns
        -------
        The return value of ``self.storage.queue_submit`` for the new tasks.

        Raises
        ------
        ValueError
            If the molecule ids or keyword ids do not match those in ``records``.
        """

        # Find the molecule ids specified in the records
        rec_mol_ids = [x.initial_molecule for x in records]

        # If not specified when calling this function, load them from the database
        # TODO: there can be issues with duplicate molecules. So we have to go one by one
        if molecules is None:
            molecules = [self.storage.get_molecules(x)["data"][0] for x in rec_mol_ids]

        # Check id to make sure the molecules match the ids in the records
        mol_ids = [x.id for x in molecules]
        if rec_mol_ids != mol_ids:
            raise ValueError(f"Given molecule ids {mol_ids} do not match those in records: {rec_mol_ids}")

        # Do the same as above but with qc specification keywords
        rec_qc_kw_ids = [x.qc_spec.keywords for x in records]
        if qc_keywords is None:
            qc_keywords = [self.storage.get_keywords(x)["data"][0] if x is not None else None for x in rec_qc_kw_ids]

        qc_kw_ids = [x.id if x is not None else None for x in qc_keywords]
        if rec_qc_kw_ids != qc_kw_ids:
            raise ValueError(f"Given keyword ids {qc_kw_ids} do not match those in records: {rec_qc_kw_ids}")

        new_tasks = []
        for rec, mol, kw in zip(records, molecules, qc_keywords):
            inp = self._build_schema_input(rec, mol, kw)
            inp.input_specification.extras["_qcfractal_tags"] = {
                "program": rec.qc_spec.program,
                "keywords": rec.qc_spec.keywords,  # Just the id?
            }

            # Build task object
            task = TaskRecord(
                spec={
                    "function": "qcengine.compute_procedure",
                    "args": [inp.dict(), rec.program],
                    "kwargs": {},
                },
                parser="optimization",
                # TODO This is pretty whacked. Fix column names at some point
                program=rec.qc_spec.program,
                procedure=rec.program,
                tag=tag,
                priority=priority,
                base_result=rec.id,
            )

            new_tasks.append(task)

        return self.storage.queue_submit(new_tasks)

    def handle_completed_output(self, opt_outputs):
        """Save the results of the procedure.
        It must make sure to save the results in the results table
        including the task_id in the TaskQueue table

        Parameters
        ----------
        opt_outputs
            Iterable of mappings, each providing ``"base_result"`` (procedure id),
            ``"result"`` (the optimization output dictionary) and ``"task_id"``.

        Returns
        -------
        list
            The task ids that were marked complete.
        """

        completed_tasks = []
        updates = []
        for output in opt_outputs:
            stored = self.storage.get_procedures(id=output["base_result"])["data"][0]
            rec = OptimizationRecord(**stored)

            procedure = output["result"]

            # Adds the results to the database and sets the ids inside the dictionary.
            # Must run before stdout/stderr/error are read back below.
            self.retrieve_outputs(procedure)

            update_dict = {
                "stdout": procedure.get("stdout", None),
                "stderr": procedure.get("stderr", None),
                "error": procedure.get("error", None),
            }

            # Add initial and final molecules
            initial_mol, final_mol = self.storage.add_molecules(
                [Molecule(**procedure["initial_molecule"]), Molecule(**procedure["final_molecule"])]
            )["data"]
            assert initial_mol == rec.initial_molecule
            update_dict["final_molecule"] = final_mol

            # Parse trajectory computations and add task_id
            traj_dict = dict(enumerate(procedure["trajectory"]))

            # Add results for the trajectory to the database
            for traj_step in traj_dict.values():
                self.retrieve_outputs(traj_step)

            results = parse_single_tasks(self.storage, traj_dict)
            result_records = [ResultRecord(**v) for v in results.values()]

            ret = self.storage.add_results(result_records)
            update_dict["trajectory"] = ret["data"]
            update_dict["energies"] = procedure["energies"]
            update_dict["provenance"] = procedure["provenance"]

            updates.append(OptimizationRecord(**{**rec.dict(), **update_dict}))
            completed_tasks.append(output["task_id"])

        self.storage.update_procedures(updates)
        self.storage.queue_mark_complete(completed_tasks)

        return completed_tasks

    @staticmethod
    def _build_schema_input(
        record: OptimizationRecord, initial_molecule: "Molecule", qc_keywords: Optional["KeywordSet"] = None
    ) -> "OptimizationInput":
        """
        Creates a OptimizationInput schema.

        Parameters
        ----------
        record
            Optimization record supplying the id, keywords, extras, hash index,
            protocols and QC specification.
        initial_molecule
            Molecule whose id must equal ``record.initial_molecule``.
        qc_keywords
            Keyword set whose id must equal ``record.qc_spec.keywords`` whenever
            the record references keywords.
        """

        assert record.initial_molecule == initial_molecule.id
        if record.qc_spec.keywords:
            # Guard against a missing keyword set so the failure is an assertion,
            # not an AttributeError on None.
            assert (
                qc_keywords is not None and record.qc_spec.keywords == qc_keywords.id
            ), "qc_keywords does not match the keyword id stored in the record"

        qcinput_spec = form_qcinputspec_schema(record.qc_spec, keywords=qc_keywords)

        return qcel.models.OptimizationInput(
            id=record.id,
            initial_molecule=initial_molecule,
            keywords=record.keywords,
            extras=record.extras,
            hash_index=record.hash_index,
            input_specification=qcinput_spec,
            protocols=record.protocols,
        )
|