#!/usr/bin/env python
"""
survey_conformers.py

Purpose: Compare energies and calculation times of the same molecule set
    across different QM methods.

Version: Apr 24 2019
By: Victoria T. Lim

"""

import os
import numpy as np
import operator as o
from scipy import stats
import matplotlib.pyplot as plt

import quanformer.proc_tags as pt
import quanformer.reader as reader


def avg_mol_time(titles, infile, method, basis, tag, mol_slice=[]):
    """
    For an SDF file with all confs of all mols, get the average runtime
    of all conformers for each molecule.
    The input dictionary may or may not be empty. If a molecule already has
    an entry, the avg/stdev time from this infile is appended to that
    molecule's list of values.

    Parameters
    ----------
    titles : dictionary
        Keys are molecule names.
        Values are [[qm1_avg, qm1_std], [qm2_avg, qm2_std], ... ]
        where the index refers to a particular level of theory.
        Dictionary may or may not be empty.
    infile : string
        name of the SDF file from which to extract time data from SD tag
    method : string
        QM method
    basis : string
        QM basis set
    tag : string
        datum of interest, e.g., "QM opt energy"
        See keys in the define_tag function of proc_tags module.
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]

    Returns
    -------
    titles : dictionary
        dictionary with extracted data from SDF file; keys are molnames,
        values are lists of lists of [avg_time, stdev_time] for many QM methods

    """
    # Open molecule file.
    if len(mol_slice) == 3:
        mols = reader.read_mols(infile, mol_slice)
    else:
        mols = reader.read_mols(infile)

    # Prepare text file to write extracted data.
    timeF = open("timeAvgs.txt", 'a')
    timeF.write("\nFile: {}\n".format(infile))
    timeF.write("Average [{}/{}] [{}s] over all confs for each molecule\n".format(method, basis, tag))

    for mol_i in mols:

        # get array of all conformer data of this mol
        try:
            time_array = np.fromiter(pt.get_sd_list(mol_i, tag, 'Psi4', method, basis), dtype=np.float64)
        except ValueError:
            # no valid data; use one nan placeholder per conformer
            time_array = np.asarray([np.nan] * mol_i.NumConfs())

        # exclude conformers for which job did not finish (nan)
        nanIndices = np.argwhere(np.isnan(time_array))
        for i in reversed(nanIndices):  # loop in reverse to delete correctly
            time_array = np.delete(time_array, i)
        meantime = np.mean(time_array)
        stdtime = np.std(time_array)

        # write out data to file and store in dictionary
        timeF.write(" %s\t%d confs\t\t%.3f +- %.3f\n" % (mol_i.GetTitle(), time_array.size, meantime, stdtime))
        name = mol_i.GetTitle()
        if name not in titles:
            titles[name] = []
        titles[name].append([meantime, stdtime])

    timeF.close()
    return titles

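# Illustrative usage sketch of avg_mol_time. The SDF file names, method, and
# basis strings below are placeholders, not files shipped with quanformer;
# see proc_tags.define_tag for the actual SD tag keys.
#
#     times = {}
#     times = avg_mol_time(times, 'set_method1.sdf', 'mp2', 'def2-SV(P)', 'QM opt energy', [])
#     times = avg_mol_time(times, 'set_method2.sdf', 'b3lyp-d3mbj', 'def2-TZVP', 'QM opt energy', [])
#     # times['mol_name'] -> [[avg_file1, std_file1], [avg_file2, std_file2]]
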
def plot_groupedbar(ax, labels_data_std, errbar=False):
    """
    Function modified from Peter Kerpedjiev.
    http://emptypipes.org/2013/11/09/matplotlib-multicategory-barchart/

    Create a barchart for data across different x_labels with
    multiple color_labels for each x_label.

    Parameters
    ----------
    ax : the plotting axes from matplotlib
    labels_data_std : numpy array
        the data set as an (n, 3) array of [color_label, x_label, value],
        or an (n, 4) array with a stdev column when errbar is True
    errbar : Boolean
        whether to draw error bars from the stdev column

    """
    # Aggregate the color_labels and the x_labels according to their mean values
    color_labels = [(c, np.mean(labels_data_std[labels_data_std[:, 0] == c][:, 2].astype(float))) for c in np.unique(labels_data_std[:, 0])]
    x_labels = [(c, np.mean(labels_data_std[labels_data_std[:, 1] == c][:, 2].astype(float))) for c in np.unique(labels_data_std[:, 1])]

    # sort the color_labels, x_labels and data so that the bars in
    # the plot will be ordered by x_category and color_label
    color_labels = [c[0] for c in sorted(color_labels, key=o.itemgetter(1))]
    x_labels = [c[0] for c in sorted(x_labels, key=o.itemgetter(1))]
    labels_data_std = np.array(sorted(labels_data_std, key=lambda x: x_labels.index(x[1])))

    # the space between each set of bars
    space = 0.3
    n = len(color_labels)
    width = (1 - space) / (len(color_labels))
    indices = range(len(x_labels))

    # set color cycle
    if n < 9:
        ax.set_prop_cycle(plt.cycler('color', plt.cm.Accent(np.linspace(0, 1, n))))
    elif n < 11:
        ax.set_prop_cycle(plt.cycler('color', plt.cm.tab10(np.linspace(0, 1, n))))
    elif n < 13:
        ax.set_prop_cycle(plt.cycler('color', plt.cm.Paired(np.linspace(0, 1, n))))
    else:
        ax.set_prop_cycle(plt.cycler('color', plt.cm.tab20(np.linspace(0, 1, n))))

    # Create a set of bars at each position
    for i, cond in enumerate(color_labels):
        vals = labels_data_std[labels_data_std[:, 0] == cond][:, 2].astype(float)
        pos = [j - (1 - space) / 2. + i * width for j in indices]
        if errbar:
            stds = labels_data_std[labels_data_std[:, 0] == cond][:, 3].astype(float)
            ax.bar(pos, vals, yerr=stds, width=width, label=cond)
        else:
            ax.bar(pos, vals, width=width, label=cond)

    # Set the x-axis tick labels to be equal to the x_labels
    if len(indices) > 7:  # adjust int as needed for better placement of many labels
        indices = [x - 1 for x in indices]
    ax.set_xticks(indices)
    ax.set_xticklabels(x_labels)
    plt.setp(plt.xticks()[1], rotation=50)

    # Add a legend
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], loc='upper left')

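# Sketch of the tabular layout plot_groupedbar expects; every value below is
# made up for illustration. Column 0 is the color label (molecule), column 1
# is the x label (method), column 2 is the bar height, and an optional
# column 3 holds stdevs when errbar=True.
#
#     demo = np.array([
#         ['mol_a', 'method_1 opt', 120.0],
#         ['mol_b', 'method_1 opt', 150.0],
#         ['mol_a', 'method_2 opt', 310.0],
#         ['mol_b', 'method_2 opt', 365.0]], dtype=object)
#     fig, ax = plt.subplots()
#     plot_groupedbar(ax, demo)
#     plt.show()
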
def arrange_and_plot(wholedict, ptitle):
    """
    Organize and plot data.

    Parameters
    ----------
    wholedict : OrderedDict
        dictionary of information to be plotted, ordered by input file
        required keys are 'label' 'titleMols' 'rmsds'
    ptitle : string
        Title on plot

    """
    # fill in dictionary for ref file for list comprehension later
    wholedict[0]['titleMols'] = np.full(len(wholedict[1]['titleMols']), np.nan)
    wholedict[0]['rmsds'] = np.full(len(wholedict[1]['rmsds']), np.nan)

    # extract part of dictionary using list comprehension
    subset = np.array(
        [[wholedict[fd][key] for key in ('label', 'titleMols', 'rmsds')]
         for fd in list(wholedict.keys())[1:]], dtype=object).T

    # build plot list
    plotlist = []
    for m in range(len(wholedict[1]['titleMols'])):
        for f in range(len(wholedict) - 1):
            temp = []
            temp.append(subset[0][f])
            temp.append(subset[1][f][m])
            temp.append(subset[2][f][m])
            plotlist.append(temp)
    plotlist = np.array(plotlist)

    # generate plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plot_groupedbar(ax, plotlist)
    plt.title(ptitle, fontsize=14)
    plt.savefig('barchart.png', bbox_inches='tight')
    plt.show()

def extract_enes(dict1, mol_slice=[]):
    """
    From the file in the input dictionary, read in molecules and extract
    conformer energies and indices from SD tags.

    Parameters
    ----------
    dict1 : dict
        dictionary of input files and information to extract from SD tags
        keys are: 'theory' 'fname' 'tagkey' 'label'
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]

    Returns
    -------
    titleMols : list of strings
        names of all molecules in the SDF file
    confNums : list of ints
        conformer index numbers
    enes : list of numpy arrays
        conformer energies of the compared file (kcal/mol)
    confNans : list of numpy arrays
        indices of enes where the values are nans

    """
    # Open molecule file.
    if len(mol_slice) == 3:
        mols = reader.read_mols(dict1['fname'], mol_slice)
    else:
        mols = reader.read_mols(dict1['fname'])

    short_tag = dict1['tagkey']
    qmethod, qbasis = reader.separated_theory(dict1['theory'])

    titleMols = []
    confNums = []
    enes = []
    confNans = []

    for imol in mols:

        # Get absolute energies from the SD tags
        iabs = np.array(list(map(float, pt.get_sd_list(imol, short_tag, 'Psi4', qmethod, qbasis))))

        # Get the original (omega) conformer indices for reference;
        # the whole list can be used for matching purposes
        indices_orig = pt.get_sd_list(imol, "original index")

        # find conformers for which job did not finish (nan)
        nanIndices = np.argwhere(np.isnan(iabs))

        # convert energies from Hartrees to kcal/mol
        iabs = 627.5095 * iabs

        titleMols.append(imol.GetTitle())
        confNans.append(nanIndices)
        confNums.append(indices_orig)
        enes.append(iabs)

    return titleMols, confNums, enes, confNans

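# Illustrative call of extract_enes; the dictionary values below are
# placeholders, and in normal use the dictionary comes from
# reader.read_text_input:
#
#     d = {'theory': 'mp2/def2-SV(P)', 'fname': 'some_confs.sdf',
#          'tagkey': 'opt', 'label': 'MP2'}
#     names, conf_ids, enes_kcal, nan_ids = extract_enes(d)
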
def remove_dead_conformers(enelist, idxlist, nanlist):
    """
    Remove conformers flagged as nan (unfinished jobs) in nanlist from the
    energy and index arrays of every file, for each molecule, so that all
    files keep a consistent set of conformers.
    On input, enelist[i][j] refers to file i, mol j.
    """
    # reshape lists so that i goes to mol and j goes to file
    enelist = np.array(enelist).T
    idxlist = np.array(idxlist).T
    nanlist = np.array(nanlist).T

    for i, mols_nan in enumerate(nanlist):
        for file_mols_nan in mols_nan:
            mols_ene = enelist[i]
            mols_idx = idxlist[i]
            for j, (file_mols_ene, file_mols_idx) in enumerate(zip(mols_ene, mols_idx)):
                enelist[i][j] = np.delete(file_mols_ene, file_mols_nan)
                idxlist[i][j] = np.delete(file_mols_idx, file_mols_nan)

    # transpose back before returning
    return idxlist.T, enelist.T

def relative_energies(enelist, method, ref_conf_int=0):
    """
    Compute relative energies by either subtracting or dividing
    by a reference conformer's energy value. This reference value
    is different for every mol in every file.

    Parameters
    ----------
    enelist : list of lists
        enelist[i][j] is the energy for file i, mol j
    method : string
        either 'subtract' or 'divide' to take all values of a mol
        and subtract or divide by the reference conformer's value
    ref_conf_int : integer
        0-based index of the conformer to use as the reference
        for computing relative energies

    Returns
    -------
    rel_enes : list of lists
        same structure as input enelist but with relative energies

    """
    rel_enes = []
    for i, file_ene in enumerate(enelist):
        rel_enes.append([])
        for mol_ene in file_ene:
            if method == 'subtract':
                rel = mol_ene - mol_ene[ref_conf_int]
            elif method == 'divide':
                rel = mol_ene / mol_ene[ref_conf_int]
            rel_enes[i].append(rel)
    return rel_enes

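# Worked example of the two scaling modes on made-up numbers (one file, one
# molecule, conformer energies [5.0, 7.0, 6.0], ref_conf_int=0):
#
#     relative_energies([[np.array([5.0, 7.0, 6.0])]], 'subtract')
#     # -> [[array([0., 2., 1.])]]
#     relative_energies([[np.array([5.0, 7.0, 6.0])]], 'divide')
#     # -> [[array([1. , 1.4, 1.2])]]
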
def avg_coeffvar(enelist, ref_conf_int=0):
    """
    For the ABSOLUTE energies of a single conformer, compute the
    coefficient of variation (CV) across methods, then
    average the CVs across all conformers for each molecule.

    Note: Input energies can be raw energies, or scaled energies (such that
        energies are still on a ratio scale).
    Note: It is incorrect to compute the coefficient of variation on relative
        values. See Wikipedia for an example on temperature.
    Note: The units of the input shouldn't matter since the CV is a ratio,
        meaning that units should cancel.

    Parameters
    ----------
    enelist : list of lists
        enelist[i][j] is absolute energy of file i, mol j
    ref_conf_int : integer
        0-based index of the reference conformer, which is excluded from the
        average since energies are scaled by this conformer

    Returns
    -------
    spread_by_mol : list
        len(spread_by_mol) == number of molecules
        spread_by_mol[i] == average of the CVs (over methods) of the
        conformer energies of mol i
    cvlist_all_mols : list of lists
        len(cvlist_all_mols) == number of molecules
        len(cvlist_all_mols[i]) == number of conformers for molecule i minus 1

    Notes
    -----
    * This takes population stdevs, as opposed to sample stdevs.
    * Validated for single conformer, and for three conformers.

    """
    # 1D list for average of CVs of each mol
    spread_by_mol = []
    cvlist_all_mols = []

    # reshape list so that enelist[i][j] is mol i, file j
    enelist = np.array(enelist).T

    # here, mols_ene[x][y] is ith molecule, xth file, and yth conformer
    for i, mols_ene in enumerate(enelist):
        cvlist = []

        # check that at least two conformers exist
        num_confs = mols_ene[0].shape[0]
        if num_confs <= 1:
            print(f"skipping molecule {i} since only 0 or 1 conformer")
            spread_by_mol.append(np.nan)
            cvlist_all_mols.append(num_confs * [np.nan])
            continue

        # number of files is mols_ene.shape[0]; should be same for all mols
        for j in range(num_confs):

            # for single conformer j, get its energies from all k files
            confs_ene = [mols_ene[k][j] for k in range(mols_ene.shape[0])]

            # compute coeff of variation
            with np.errstate(divide='ignore', invalid='ignore'):
                cv = np.std(confs_ene) / np.average(confs_ene)
            cvlist.append(cv)

        # remove ref conf from avg calc bc energies scaled by this conf
        cvlist.pop(ref_conf_int)

        # average the coeffs of variation
        spread_by_mol.append(np.average(cvlist))
        cvlist_all_mols.append(cvlist)

    return spread_by_mol, cvlist_all_mols

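# Small numeric check of the coefficient of variation used above, on made-up
# numbers: if one conformer has absolute energies 100.0, 102.0, and 98.0
# across three methods, then
#
#     np.std([100.0, 102.0, 98.0]) / np.average([100.0, 102.0, 98.0])  # ~0.0163
#
# i.e. a CV of roughly 16300 ppm; avg_coeffvar averages such per-conformer
# CVs over the non-reference conformers of each molecule.
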
def normalized_deviation(enelist):
    """
    For the relative energies of each conformer of some molecule,
    subtract the average energy over the different methods, then
    take the standard deviation of all conformers of all methods.

    TODO -- should the stdev be computed for all the conformers
       of the mol (currently coded) or across the methods
       (so add the line under np.average)? Would the stdevs of
       each conformer then be averaged?
    AKA should you be taking the stdev of the normalized data (current)
       or take the avg of the stdevs?

    Parameters
    ----------
    TODO

    Returns
    -------
    TODO

    """
    # enelist[i][j] is for file i, mol j
    # spread_by_mol is 1D list of len(num_mols)
    spread_by_mol = []

    # reshape list so that i goes to mol and j goes to file
    enelist = np.array(enelist).T
    for i, mols_ene in enumerate(enelist):
        mollist = []

        # number of files is mols_ene.shape[0]
        for j in range(1, mols_ene[0].shape[0]):
            confs_ene = [mols_ene[k][j] for k in range(mols_ene.shape[0])]

            # subtract the average over methods for the conformer
            # note: not subtracting average conformer energy for each method
            # bc that would give an estimate of the spread of conformer energies
            # as opposed to estimating the spread of method energies
            confs_ene = confs_ene - np.average(confs_ene)
            mollist.append(confs_ene)

        spread_by_mol.append(np.std(mollist))

    return spread_by_mol

def rmsd_two_files(dict1, dict2, mol_slice=[]):
    """
    From files in input dictionaries, read in molecules, extract information
    from SD tags, compute relative conformer energies wrt the first conformer
    of each molecule, and calculate the RMSD of energies wrt reference data.

    Parameters
    ----------
    dict1 : dict
        REFERENCE dictionary from which to calculate energy RMSDs
        dictionary of input files and information to extract from SD tags
        keys are: 'theory' 'fname' 'tagkey' 'label'
    dict2 : dict
        dictionary of input files and information to extract from SD tags
        keys are: 'theory' 'fname' 'tagkey' 'label'
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]

    Returns
    -------
    titleMols : list of strings
        names of all molecules in the SDF file
    rmsds : list of floats
        RMSD of relative conformer energies for each molecule (kcal/mol)
    confNums : list of ints
        conformer index numbers, excluding nans of both dict1 and dict2
        e.g., if dict1 mol1 has confs of 0 1 2 3 5
        and dict2 mol1 has confs of 0 2 3 4 5
        then output would be 0 2 3 5
    reference_enes : list of numpy arrays
        relative conformer energies of the reference file (kcal/mol)
    compared_enes : list of numpy arrays
        relative conformer energies of the compared file (kcal/mol)

    """
    titleMols = []
    rmsds = []
    confNums = []
    reference_enes = []
    compared_enes = []

    # load molecules and extract data
    mols1_titles, mols1_indices, mols1_enes, mols1_nans = extract_enes(dict1, mol_slice)
    mols2_titles, mols2_indices, mols2_enes, mols2_nans = extract_enes(dict2, mol_slice)

    # check that the same number of molecules exists in both files
    if len(mols1_titles) != len(mols2_titles):
        raise AssertionError("The number of molecules does not match for files:"
            "\n{}\n{}".format(dict1['fname'], dict2['fname']))

    for i in range(len(mols1_titles)):
        title1 = mols1_titles[i]
        title2 = mols2_titles[i]
        indices1_orig = mols1_indices[i]
        indices2_orig = mols2_indices[i]
        enes1 = mols1_enes[i]
        enes2 = mols2_enes[i]
        nans1 = mols1_nans[i]
        nans2 = mols2_nans[i]

        # check that the names and conformers match for both mols
        if title1 != title2 or (indices1_orig != indices2_orig):
            raise AssertionError("ERROR: Either names or number of conformers "
                "differ for mol {}/{} in:\n{}\n{}".format(
                title1, title2, dict1['fname'], dict2['fname']))

        # exclude conformers for which job did not finish (nan)
        indices1_prune1, enes1_prune1 = remove_dead_conformers(enes1, indices1_orig, nans1)
        indices2_prune1, enes2_prune1 = remove_dead_conformers(enes2, indices2_orig, nans1)
        indices1_prune2, enes1_prune2 = remove_dead_conformers(enes1_prune1, indices1_prune1, nans2)
        indices2_prune2, enes2_prune2 = remove_dead_conformers(enes2_prune1, indices2_prune1, nans2)

        # make sure conformer indices match after pruning
        if not (indices1_prune2 == indices2_prune2).all():
            raise AssertionError("Lists of conformer indices do not match "
                "after pruning conformers for {} in files:\n{}\n{}\n".format(
                title1, dict1['fname'], dict2['fname']))

        # check that there exist at least two conformers
        num_confs = indices1_prune2.shape[0]
        if num_confs <= 1:
            print("skipping molecule {} since only 0 or 1 conformer".format(title1))
            continue

        # take relative energy to first conf
        irel = enes1 - enes1[0]
        jrel = enes2 - enes2[0]

        # take RMSD of conformer energies for this particular mol
        dev = irel - jrel
        sqd = np.square(dev)
        mn = np.sum(sqd) / (np.shape(sqd)[0] - 1)
        rt = np.sqrt(mn)

        # store data
        titleMols.append(title1)
        rmsds.append(rt)
        confNums.append(indices1_prune2)
        reference_enes.append(irel)
        compared_enes.append(jrel)

    return titleMols, rmsds, confNums, reference_enes, compared_enes

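# Worked example of the per-molecule RMSD above, on toy numbers: with
# reference relative energies [0.0, 1.2, 2.0] and compared relative energies
# [0.0, 1.0, 2.5] kcal/mol, the deviations are [0.0, 0.2, -0.5], so
# RMSD = sqrt((0.0**2 + 0.2**2 + 0.5**2) / (3 - 1)) ~= 0.38 kcal/mol
# (note the N-1 denominator used in the code above).
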
### ------------------- Script -------------------


def survey_energies_ref(wholedict, ref_index, mol_slice=[], outfn='relene-rmsd.dat'):
    """
    Compute RMSD of relative conformer energies with respect to the data
    in the ref_index spot. The relative energies by conformer are computed first,
    then the RMSD is calculated with respect to reference.

    Parameters
    ----------
    wholedict : OrderedDict
        ordered dictionary of input files and information to extract from SD tags
        keys are: 'theory' 'fname' 'tagkey' 'label'
    ref_index : int
        integer to specify reference file of wholedict[ref_index]
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]
    outfn : str
        name of the output file with RMSDs of energies

    Returns
    -------
    wholedict : OrderedDict
        same as input wholedict with additional keys:
        'titleMols' 'rmsds' 'confNums' 'reference_enes' 'compared_enes'

    """
    description = 'relative energies (kcal/mol)'
    infile = wholedict[ref_index]['fname']
    print("Using reference file: %s " % infile)

    # Write description in output file.
    compF = open(outfn, 'w')
    compF.write(f"# RMSD of {description} wrt file {ref_index}\n")

    for i, d in enumerate(wholedict.values()):
        compF.write("# File %d: %s\n" % (i, d['fname']))
        compF.write("# calc=%s, %s\n" % (d['tagkey'], d['theory']))

    for i in range(1, len(wholedict)):
        print("\nStarting comparison on file: %s" % wholedict[i]['fname'])

        # each returned var is a list with one entry per molecule
        titleMols, rmsds, confNums, reference_enes, compared_enes = \
            rmsd_two_files(wholedict[ref_index], wholedict[i], mol_slice)
        wholedict[i]['titleMols'] = titleMols
        wholedict[i]['rmsds'] = rmsds
        wholedict[i]['confNums'] = confNums
        wholedict[i]['reference_enes'] = reference_enes
        wholedict[i]['compared_enes'] = compared_enes

    # loop over each mol and write energies from wholedict by column
    for m in range(len(wholedict[1]['titleMols'])):
        compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m])

        # for this mol, write the rmsd from each file side by side
        line = ' RMSDs: '
        for i in range(1, len(wholedict)):
            line += (str(wholedict[i]['rmsds'][m]) + '\t')
        compF.write(line)
        compF.write('\n# ==================================================================')
        compF.write(f'\n# conf\t{description} in column order by file (listed at top)')

        # for this mol, write the compared_enes from each file by columns
        for c in range(len(wholedict[1]['confNums'][m])):
            line = '\n' + str(wholedict[1]['confNums'][m][c]) + '\t'
            line += str(wholedict[1]['reference_enes'][m][c]) + '\t'
            for i in range(1, len(wholedict)):
                line += (str(wholedict[i]['compared_enes'][m][c]) + '\t')
            compF.write(line)

    compF.close()

    return wholedict

def survey_energies(wholedict, mol_slice=[], outfn='relene.dat', ref_conf=0):
    """
    Compute the spread of the conformer energies for each molecule in each file.
    The spread can be computed as the coefficient of variation over all methods,
    computed for each conformer and then averaged over all conformers. Another
    approach to computing spread in this function is simply normalizing the
    energies by removing the average, then taking the stdev of the normalized data.
    ^ TODO reconsider this, and add variable in def if wanting to include
    TODO also add tests for these spread functions

    Conformers that are missing for one calculation/file are taken out of
    the analysis for all other calculations/files.

    Parameters
    ----------
    wholedict : OrderedDict
        ordered dictionary of input files and information to extract from SD tags
        keys are: 'theory' 'fname' 'tagkey' 'label'
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]
    outfn : str
        name of the output file with the spread of energies
    ref_conf : int
        0-based index of the reference conformer used to scale energies

    Returns
    -------
    wholedict : OrderedDict
        same as input wholedict with additional keys:
        'titleMols' 'confNums' 'compared_enes'

    """
    num_files = len(wholedict)

    # Write description in output file.
    compF = open(outfn, 'w')
    compF.write("# Variability in energies from diff QM methods averaged over conformers\n")
    for i, d in enumerate(wholedict.values()):
        compF.write("# File %d: %s\n" % (i+1, d['fname']))
        compF.write("# calc=%s, %s\n" % (d['tagkey'], d['theory']))

    enelist = []
    idxlist = []
    nanlist = []
    for i in range(num_files):
        print("Extracting data for file: %s" % wholedict[i]['fname'])
        titleMols, confNums, compared_enes, confNans = extract_enes(wholedict[i], mol_slice)
        enelist.append(compared_enes)
        idxlist.append(confNums)
        nanlist.append(confNans)
        wholedict[i]['titleMols'] = titleMols

    print("Removing mols missing from 1+ files...")
    # find molecules common to all files
    molname_sets = [set(wholedict[x]['titleMols']) for x in range(num_files)]
    molname_common = set.intersection(*molname_sets)

    for i in range(num_files):
        # find the extraneous mols in this file
        mols_in_file = wholedict[i]['titleMols']
        mols_extraneous = list(set(mols_in_file) - molname_common)

        # get indices of mols to be removed for this file
        removal_idx = []
        for molname in mols_extraneous:
            removal_idx.append(mols_in_file.index(molname))
        for index in sorted(removal_idx, reverse=True):
            print(f" Removing {wholedict[i]['titleMols'][index]} from {wholedict[i]['fname']}")
            del wholedict[i]['titleMols'][index]
            del enelist[i][index]
            del idxlist[i][index]
            del nanlist[i][index]

    print("Removing un-finished conformers and computing relative energies...")
    idxlist, enelist = remove_dead_conformers(enelist, idxlist, nanlist)

    # scale energies: scale energies by subtracting or dividing conf_j
    # by conf_i for all j confs in all mols for all files
    # (1) subtract first conf, or (2) divide first conf
    if isinstance(ref_conf, int):
        ref_conf_int = ref_conf
    rel_enelist = relative_energies(enelist, 'subtract', ref_conf_int)
    rat_enelist = relative_energies(enelist, 'divide', ref_conf_int)

    for i in range(num_files):
        wholedict[i]['compared_enes'] = rel_enelist[i]
        wholedict[i]['confNums'] = idxlist[i]

    # check that entire array is not zero, if ALL mols have one conf
    all_zeros = np.all(np.asarray(rel_enelist) == 0)
    if all_zeros:
        print("WARNING: All relative energy values are zero. "
              "Cannot calculate energy spread.")
        spreadlist = [-1]*len(rel_enelist[0])
    else:
        # estimate spread of data on energies
        spreadlist, cvlist_all_mols = avg_coeffvar(rat_enelist, ref_conf_int)
        #spreadlist = normalized_deviation(rel_enelist)

    # loop over each mol and write energies from wholedict by column
    for m in range(len(wholedict[1]['titleMols'])):
        compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m])

        # for this mol, write the averaged CV
        line = ' {:.2f} ppm averaged CV'.format(spreadlist[m]*1000000)
        compF.write(line)
        compF.write('\n# ==================================================================')
        compF.write('\n# rel enes (kcal/mol), column i = file i, row j = conformer j')

        # for this mol, write the compared_enes from each file by columns
        this_mol_conf_inds = wholedict[1]['confNums'][m]
        for c in range(len(this_mol_conf_inds)):
            line = '\n' + str(this_mol_conf_inds[c]) + '\t'
            for i in range(num_files):
                line += '{:.4f}\t'.format(wholedict[i]['compared_enes'][m][c])

            # print overall contribution to spread, skip ref conf since scaled
            # include second condition in case ref_conf_int is negative
            if c == ref_conf_int or c == len(this_mol_conf_inds) + ref_conf_int:
                line += '# nan ppm CV'
            else:
                line += '# {:.2f} ppm CV'.format(cvlist_all_mols[m][c-1]*1000000)
            compF.write(line)

    compF.close()

    return wholedict

def survey_times(wholedict, mol_slice=[]):
    """
    Average all conformers' calculation times for each molecule in each file.
    Generate a grouped bar plot where x=QM method, y=time(s). Bars are
    grouped together by method and color coded by molecule identity.

    Parameters
    ----------
    wholedict : OrderedDict
        ordered dictionary of input files and information to extract from SD tags
        keys are: 'theory' 'fname' 'tagkey' 'label'
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]

    """
    # create storage space
    titles = {}
    timeplot = []  # timeplot[i][j] == avg of conformer times for mol i, file j
    stdplot = []   # stdplot[i][j] == stdev of conformer times for mol i, file j

    # loop over each file and compute avg conformer times for all mols
    for i in wholedict:
        qmethod, qbasis = reader.separated_theory(wholedict[i]['theory'])
        qtag = wholedict[i]['tagkey']
        titles = avg_mol_time(titles, wholedict[i]['fname'], qmethod, qbasis, qtag, mol_slice)

    # extract data in dictionary to arrays
    for mol in titles.values():
        timeplot.append([k[0] for k in mol])
        stdplot.append([k[1] for k in mol])

    # calculate number of datafiles for each mol
    lens = [len(x) for x in timeplot]

    # obtain mode representing most common number of files
    # first [0] to get modes not counts, second [0] to get highest mode
    m = stats.mode(lens)[0][0]

    # delete sublists (mols) with missing data; asarray takes uniform length sublists
    # should only happen if 1+ method fails job for ALL confs of some mol
    # TODO: this might be addressed in a better way
    tracker = []
    for i, t in enumerate(timeplot):
        if len(t) != m:
            tracker.append(i)
    for index in sorted(tracker, reverse=True):
        mol_name = list(titles.keys())[index]
        print(f"removing {mol_name} from time analysis")
        del timeplot[index]
        del stdplot[index]
        del titles[mol_name]

    # prepare plot data
    timeplot = np.asarray(timeplot)
    stdplot = np.asarray(stdplot)
    timearray = timeplot.flatten('F')
    stdarray = stdplot.flatten('F')

    # generate mol labels (colors) from dict keys, repeated by number of files
    # e.g., ['a', 'b', 'c', 'a', 'b', 'c']
    mol_names = list(titles.keys()) * len(wholedict)

    # generate method labels (x-axis) from dict value of 'theory', repeated
    # by number of mols. e.g., ['a', 'a', 'b', 'b', 'c', 'c']
    # - remove duplicated spaces from method listing in input file
    # - get the first word in tagkey which should be 'opt' or 'spe'
    # https://stackoverflow.com/questions/2449077/duplicate-each-member-in-a-list-python
    method_labels_short = [
        " ".join(wholedict[item]['theory'].split()) + ' ' +
        wholedict[item]['tagkey'].split(" ")[0] for item in wholedict]
    method_labels = [val for val in method_labels_short for _ in range(len(titles))]

    # combine color labels, x labels, and plot data
    plotlist = np.column_stack((mol_names, method_labels,
        timearray.astype(object), stdarray.astype(object)))
    print(plotlist)

    # generate plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plot_groupedbar(ax, plotlist)

    plt.title("conformer-averaged wall-clock times", fontsize=16)
    plt.ylabel("time (s)", fontsize=14)
    plt.yticks(fontsize=12)

    plt.savefig('rename_me.png', bbox_inches='tight')
    plt.show()

def survey_confs(infile, analyze_energies=False, analyze_times=False, ref_index=None, plot_enes=True, mol_slice=[]):
    """
    Main interface to survey_times, survey_energies, or survey_energies_ref.

    Parameters
    ----------
    infile : string
        Name of text file with information on file(s) and SD tag info on
        the level of theory to process.
    analyze_energies : Boolean
        analyze energies for the provided input files
    analyze_times : Boolean
        analyze calculation times for the provided input files
    ref_index : integer
        provide the integer of the line number in the input file to treat
        as reference file (such as to compute RMSD of energies);
        line numbers start at ZERO and do not count commented lines
    plot_enes : Boolean
        generate plot for energies analysis
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]

    """
    if not os.path.exists(infile):
        raise FileNotFoundError(f"Input file {infile} does not exist.")

    wholedict = reader.read_text_input(infile)

    if analyze_energies == analyze_times:
        raise ValueError("Please specify ONE of analyze_energies or analyze_times")

    if analyze_times:
        survey_times(wholedict, mol_slice)

    if analyze_energies:
        if ref_index is None:
            survey_energies(wholedict, mol_slice)
        else:
            wholedict = survey_energies_ref(wholedict, ref_index, mol_slice)
            # TODO move this part inside the survey_energies fx
            if plot_enes:
                arrange_and_plot(wholedict, "RMSDs of relative conformer energies")
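

# Example driver usage (sketch only; 'methods.in' is a hypothetical input
# text file in the format expected by reader.read_text_input, listing the
# SDF file, level of theory, SD tag key, and label for each calculation):
#
#     from quanformer.survey_conformers import survey_confs
#     survey_confs('methods.in', analyze_times=True)                   # timing bar plot
#     survey_confs('methods.in', analyze_energies=True, ref_index=0)   # energy RMSDs vs file 0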