1
#!/usr/bin/env python
2 60
"""
3
survey_conformers.py
4

5
Purpose:    Compare energies and calculation times of the same molecule set
6
            across different QM methods.
7

8
Version:    Apr 24 2019
9
By:         Victoria T. Lim
10

11
"""
12

13 60
import os
14 60
import numpy as np
15 60
import operator as o
16 60
from scipy import stats
17 60
import matplotlib.pyplot as plt
18

19 60
import quanformer.proc_tags as pt
20 60
import quanformer.reader as reader
21

22

23

24 60
def avg_mol_time(titles, infile, method, basis, tag, mol_slice=[]):
25
    """
26
    For an SDF file with all confs of all mols, get the average runtime
27
        of all conformers for each molecule.
28
    The input dictionary may or not be empty. If it is, append the avg/stdev
29
        time of the calculation from this infile to existing molecule's value.
30

31
    Parameters
32
    ----------
33
    titles : dictionary
34
        Keys are molecule names.
35
        Values are [[qm1_avg, qm1_std], [qm2_avg, qm2_std], ... ]
36
        where the index refers to a particular level of theory.
37
        Dictionary may or may not be empty.
38
    infile : string
39
        name of the SDF file from which to extract time data from SD tag
40
    method : string
41
        QM method
42
    basis : string
43
        QM basis set
44
    tag : string
45
        datum of interest, e.g., "QM opt energy"
46
        See keys in the define_tag function of proc_tags module.
47
    mol_slice : list
48
        list of indices from which to slice mols generator for read_mols
49
        [start, stop, step]
50

51
    Returns
52
    -------
53
    titles : dictionary
54
        dictionary with extracted data from SDF file; keys are molnames,
55
        values are lists of list of [avg_time, stdev_time] for many QM methods
56

57
    """
58

59
    # Open molecule file.
60 60
    if len(mol_slice) == 3:
61 0
        mols = reader.read_mols(infile, mol_slice)
62
    else:
63 60
        mols = reader.read_mols(infile)
64

65

66
    # Prepare text file to write extracted data.
67 60
    timeF = open("timeAvgs.txt", 'a')
68 60
    timeF.write("\nFile: {}\n".format(infile))
69 60
    timeF.write("Average [{}/{}] [{}s] over all confs for each molecule\n".format(method, basis, tag))
70

71 60
    for mol_i in mols:
72

73
        # get array of all conformer data of this mol
74 60
        try:
75 60
            time_array = np.fromiter(pt.get_sd_list(mol_i, tag, 'Psi4', method, basis), dtype=np.float64)
76 60
        except ValueError:
77 0
            time_array = np.asarray([np.nan])*mol_i.NumConfs()
78

79
        # exclude conformers for which job did not finish (nan)
80 60
        nanIndices = np.argwhere(np.isnan(time_array))
81 60
        for i in reversed(nanIndices):  # loop in reverse to delete correctly
82 0
            time_array = np.delete(time_array, i)
83 60
        meantime = np.mean(time_array)
84 60
        stdtime = np.std(time_array)
85

86
        # write out data to file and store in dictionary
87 60
        timeF.write("  %s\t%d confs\t\t%.3f +- %.3f\n" % (mol_i.GetTitle(), time_array.size, meantime, stdtime))
88 60
        name = mol_i.GetTitle()
89 60
        if name not in titles:
90 60
            titles[name] = []
91 60
        titles[name].append([meantime, stdtime])
92

93 60
    timeF.close()
94 60
    return titles
95

96

97 60
def plot_groupedbar(ax, labels_data_std, errbar=False):
98
    """
99
    Function modified from Peter Kerpedjiev.
100
    http://emptypipes.org/2013/11/09/matplotlib-multicategory-barchart/
101

102
    Create a barchart for data across different x_labels with
103
    multiple color_labels for each x_label.
104

105
    Parameters
106
    ----------
107
    ax: The plotting axes from matplotlib.
108
    labels_data_std: The data set as an (n, 3) numpy array
109

110
    """
111
    # Aggregate the color_labels and the x_labels according to their mean values
112 60
    color_labels = [(c, np.mean(labels_data_std[labels_data_std[:, 0] == c][:, 2].astype(float))) for c in np.unique(labels_data_std[:, 0])]
113 60
    x_labels = [(c, np.mean(labels_data_std[labels_data_std[:, 1] == c][:, 2].astype(float))) for c in np.unique(labels_data_std[:, 1])]
114

115
    # sort the color_labels, x_labels and data so that the bars in
116
    # the plot will be ordered by x_category and color_label
117 60
    color_labels = [c[0] for c in sorted(color_labels, key=o.itemgetter(1))]
118 60
    x_labels = [c[0] for c in sorted(x_labels, key=o.itemgetter(1))]
119 60
    labels_data_std = np.array(sorted(labels_data_std, key=lambda x: x_labels.index(x[1])))
120

121
    # the space between each set of bars
122 60
    space = 0.3
123 60
    n = len(color_labels)
124 60
    width = (1 - space) / (len(color_labels))
125 60
    indices = range(len(x_labels))
126

127
    # set color cycle
128 60
    if n < 9:
129 60
        ax.set_prop_cycle(plt.cycler('color', plt.cm.Accent(np.linspace(0, 1, n))))
130 0
    elif n < 11:
131 0
        ax.set_prop_cycle(plt.cycler('color', plt.cm.tab10(np.linspace(0, 1, n))))
132 0
    elif n < 13:
133 0
        ax.set_prop_cycle(plt.cycler('color', plt.cm.Paired(np.linspace(0, 1, n))))
134
    else:
135 0
        ax.set_prop_cycle(plt.cycler('color', plt.cm.tab20(np.linspace(0, 1, n))))
136

137
    # Create a set of bars at each position
138 60
    for i, cond in enumerate(color_labels):
139 60
        vals = labels_data_std[labels_data_std[:, 0] == cond][:, 2].astype(np.float)
140 60
        pos = [j - (1 - space) / 2. + i * width for j in indices]
141 60
        if errbar:
142 0
            stds = labels_data_std[labels_data_std[:, 0] == cond][:, 3].astype(np.float)
143 0
            ax.bar(pos, vals, yerr=stds, width=width, label=cond)
144
        else:
145 60
            ax.bar(pos, vals, width=width, label=cond)
146

147
    # Set the x-axis tick labels to be equal to the x_labels
148 60
    if len(indices) > 7: # adjust int as needed for better placement of many labels
149 0
        indices = [x - 1 for x in indices]
150 60
    ax.set_xticks(indices)
151 60
    ax.set_xticklabels(x_labels)
152 60
    plt.setp(plt.xticks()[1], rotation=50)
153

154
    # Add a legend
155 60
    handles, labels = ax.get_legend_handles_labels()
156 60
    ax.legend(handles[::-1], labels[::-1], loc='upper left')
157

158

159 60
def arrange_and_plot(wholedict, ptitle):
160
    """
161
    Organize and plot data.
162

163
    Parameters
164
    ----------
165
    wholedict : OrderedDict
166
        dictionary of information to be plotted, ordered by input file
167
        required keys are 'label' 'titleMols' 'rmsds'
168
    ptitle : string
169
        Title on plot
170

171
    """
172
    # fill in dictionary for ref file for list comprehension later
173 60
    wholedict[0]['titleMols'] = np.full(len(wholedict[1]['titleMols']), np.nan)
174 60
    wholedict[0]['rmsds'] = np.full(len(wholedict[1]['rmsds']), np.nan)
175

176
    # extract part of dictionary using list comprehension
177 60
    subset = np.array([[(wholedict[fd][key]) for key in\
178
('label','titleMols','rmsds')] for fd in list(wholedict.keys())[1:]], dtype=object).T
179

180
    # build plot list
181 60
    plotlist = []
182 60
    for m in range(len(wholedict[1]['titleMols'])):
183 60
        for f in range(len(wholedict) - 1):
184 60
            temp = []
185 60
            temp.append(subset[0][f])
186 60
            temp.append(subset[1][f][m])
187 60
            temp.append(subset[2][f][m])
188 60
            plotlist.append(temp)
189 60
    plotlist = np.array(plotlist)
190

191
    # generate plot
192 60
    fig = plt.figure()
193 60
    ax = fig.add_subplot(111)
194 60
    plot_groupedbar(ax, plotlist)
195 60
    plt.title(ptitle, fontsize=14)
196 60
    plt.savefig('barchart.png', bbox_inches='tight')
197 60
    plt.show()
198

199

200 60
def extract_enes(dict1, mol_slice=[]):
201
    """
202
    From files in input dictionaries, read in molecules, extract information
203
    from SD tags for conformer energies and indices.
204

205
    Parameters
206
    ----------
207
    dict1 : dict
208
        dictionary of input files and information to extract from SD tags
209
        keys are: 'theory' 'fname' 'tagkey' 'label'
210
    mol_slice : list
211
        list of indices from which to slice mols generator for read_mols
212
        [start, stop, step]
213

214
    Returns
215
    -------
216
    titleMols : list of strings
217
        names of all molecules in the SDF file
218
    confNums : list of ints
219
        conformer index numbers
220
    enes : list of numpy arrays
221
        conformer energies of the compared file (kcal/mol)
222
    confNans : list of numpy arrays
223
        indices of enes where the values are nans
224

225
    """
226

227
    # Open molecule file.
228 60
    if len(mol_slice) == 3:
229 0
        mols = reader.read_mols(dict1['fname'], mol_slice)
230
    else:
231 60
        mols = reader.read_mols(dict1['fname'])
232

233 60
    short_tag = dict1['tagkey']
234 60
    qmethod, qbasis = reader.separated_theory(dict1['theory'])
235

236 60
    titleMols = []
237 60
    confNums = []
238 60
    enes = []
239 60
    confNans = []
240

241 60
    for imol in mols:
242

243
        # Get absolute energies from the SD tags
244 60
        iabs = np.array(list(map(float, pt.get_sd_list(imol, short_tag, 'Psi4', qmethod, qbasis))))
245

246
        # Get omega conformer number of first, for reference info
247
        # whole list can be used for matching purposes
248 60
        indices_orig = pt.get_sd_list(imol, "original index")
249

250
        # find conformers for which job did not finish (nan)
251 60
        nanIndices = np.argwhere(np.isnan(iabs))
252

253
        # convert energies from Hartrees to kcal/mol
254 60
        iabs = 627.5095 * iabs
255

256 60
        titleMols.append(imol.GetTitle())
257 60
        confNans.append(nanIndices)
258 60
        confNums.append(indices_orig)
259 60
        enes.append(iabs)
260

261 60
    return titleMols, confNums, enes, confNans
262

263

264 60
def remove_dead_conformers(enelist, idxlist, nanlist):
265
    # enelist[i][j] is for file i, mol j
266

267
    # reshape list for that i goes to mol and j goes to file
268 60
    enelist = np.array(enelist).T
269 60
    idxlist = np.array(idxlist).T
270 60
    nanlist = np.array(nanlist).T
271

272 60
    for i, mols_nan in enumerate(nanlist):
273 60
        for file_mols_nan in mols_nan:
274 0
            mols_ene = enelist[i]
275 0
            mols_idx = idxlist[i]
276 0
            for j, (file_mols_ene, file_mols_idx) in enumerate(zip(mols_ene, mols_idx)):
277 0
                enelist[i][j] = np.delete(file_mols_ene, file_mols_nan)
278 0
                idxlist[i][j] = np.delete(file_mols_idx, file_mols_nan)
279

280
    # transpose it back to return
281 60
    return idxlist.T, enelist.T
282

283

284 60
def relative_energies(enelist, method, ref_conf_int=0):
285
    """
286
    Compute relative energies by either subtracting or dividing
287
        by a reference conformer's energy value. This energy value
288
        is different for every mol in every file.
289

290
    Parameters
291
    ----------
292
    enelist : list of lists
293
        enelist[i][j] is the energy for file i, mol j
294
    method : string
295
        either 'subtract' or 'divide' to take all values of mol
296
        and subtract by first conf or divide by first conf
297
    ref_conf_int : integer
298
        0-based index of the integer for which to use as reference
299
        for taking relative energies
300

301
    Returns
302
    --------
303
    rel_enes : list of lists
304
        same structure as input enelist but with relative energies
305

306
    """
307

308 60
    rel_enes = []
309 60
    for i, file_ene in enumerate(enelist):
310 60
        rel_enes.append([])
311 60
        for mol_ene in file_ene:
312 60
            if method == 'subtract':
313 60
                rel = mol_ene - mol_ene[ref_conf_int]
314 60
            elif method == 'divide':
315 60
                rel = mol_ene/mol_ene[ref_conf_int]
316 60
            rel_enes[i].append(rel)
317 60
    return rel_enes
318

319

320 60
def avg_coeffvar(enelist, ref_conf_int=0):
321
    """
322
    For the ABSOLUTE energies of a single conformer, compute
323
        coefficient of variation (CV) across methods, then
324
        average the CVs across all conformers for each molecule.
325

326
    Note: Input energies can be raw energies, or scaled energies (such that
327
        energies are still on ratio scale).
328
    Note: It is incorrect to compute the coefficient of variation on relative
329
        values. See wikipedia for an example on temperature.
330
    Note: The units of the input shouldn't matter since the CV is a ratio,
331
        meaning that units should cancel.
332

333
    Parameters
334
    ----------
335
    enelist : list of lists
336
        enelist[i][j] is absolute energy of file i, mol j
337

338
    Returns
339
    -------
340
    spread_by_mol : list
341
        len(spread_by_mol) == number of molecules
342
        spread_by_mol[i] == avg(CVs(conformer enes from diff methods) of mol i
343
    cvlist_all_mols : list of lists
344
        len(cvlist_all_mols) == number of molecules
345
        len(cvlist_all_mols[i]) == number of conformers for molecule i minus 1
346

347
    Notes
348
    -----
349
    * This takes population stdevs, as opposed to sample stdevs.
350
    * Validated for single conformer, and for three conformers.
351

352
    """
353

354
    # 1D list for average of CVs of each mol
355 60
    spread_by_mol = []
356 60
    cvlist_all_mols = []
357

358
    # reshape list so that enelist[i][j] is mol i, file j
359 60
    enelist = np.array(enelist).T
360

361
    # here, mols_ene[x][y] is ith molecule, xth file, and yth conformer
362 60
    for i, mols_ene in enumerate(enelist):
363 60
        cvlist = []
364

365
        # check that at least two conformers exist
366 60
        num_confs = mols_ene[0].shape[0]
367 60
        if num_confs <= 1:
368 60
            print(f"skipping molecule {i} since only 0 or 1 conformer")
369 60
            spread_by_mol.append(np.nan)
370 60
            cvlist_all_mols.append(num_confs*[np.nan])
371 60
            continue
372

373
        # number of files is mols_ene.shape[0]; should be same for all mols
374 60
        for j in range(num_confs):
375

376
            # for single conformer j, get its energies from all k files
377 60
            confs_ene = [mols_ene[k][j] for k in range(mols_ene.shape[0])]
378

379
            # compute coeff of variation
380 60
            with np.errstate(divide='ignore', invalid='ignore'):
381 60
                cv = np.std(confs_ene) / np.average(confs_ene)
382 60
            cvlist.append(cv)
383

384
        # remove ref conf from avg calc bc energies scaled by this conf
385 60
        cvlist.pop(ref_conf_int)
386

387
        # average the coeffs of variation
388 60
        spread_by_mol.append(np.average(cvlist))
389 60
        cvlist_all_mols.append(cvlist)
390

391 60
    return spread_by_mol, cvlist_all_mols
392

393

394 60
def normalized_deviation(enelist):
395
    """
396
    For the relative energies of each conformer of some molecule,
397
    subtract the average energy for the different methods, then
398
    take the standard deviation of all conformers of all methods.
399

400
    TODO -- should the stdev be computed for all the conformers
401
        of the mol (currently coded) or across the methods
402
        (so add the line under np.average)? Would the stdevs of
403
        each conformer then be averaged?
404
    AKA should you be taking the stdev of the normalized data (current)
405
        or take avg of the stdev?
406

407
    Parameters
408
    ----------
409
    TODO
410

411
    Returns
412
    -------
413
    TODO
414

415
    """
416

417
    # enelist[i][j] is for file i, mol j
418
    # spread_by_mol is 1D list of len(num_mols)
419 0
    spread_by_mol = []
420

421
    # reshape list for that i goes to mol and j goes to file
422 0
    enelist = np.array(enelist).T
423 0
    for i, mols_ene in enumerate(enelist):
424 0
        mollist = []
425

426
        # number of files is mols_ene.shape[0]
427 0
        for j in range(1, mols_ene[0].shape[0]):
428 0
            confs_ene = [mols_ene[k][j] for k in range(mols_ene.shape[0])]
429

430
            # subtract the average of methods for the conformer
431
            # note: not subtracting average conformer energy for each method
432
            # bc that would give us estimate of spread conformer energies as
433
            # opposed to estimating spread of method energies
434 0
            confs_ene = confs_ene - np.average(confs_ene)
435 0
            mollist.append(confs_ene)
436

437 0
        spread_by_mol.append(np.std(mollist))
438

439 0
    return spread_by_mol
440

441

442 60
def rmsd_two_files(dict1, dict2, mol_slice=[]):
443
    """
444
    From files in input dictionaries, read in molecules, extract information
445
    from SD tags, compute relative conformer energies wrt to first conformer
446
    of each molecule, and calculate RMSD of energies wrt reference data.
447

448
    Parameters
449
    ----------
450
    dict1 : dict
451
        REFERENCE dictionary from which to calculate energy RMSDs
452
        dictionary of input files and information to extract from SD tags
453
        keys are: 'theory' 'fname' 'tagkey' 'label'
454
    dict2 : dict
455
        dictionary of input files and information to extract from SD tags
456
        keys are: 'theory' 'fname' 'tagkey' 'label'
457
    mol_slice : list
458
        list of indices from which to slice mols generator for read_mols
459
        [start, stop, step]
460

461
    Returns
462
    -------
463
    titleMols : list of strings
464
        names of all molecules in the SDF file
465
    rmsds : list of floats
466
    confNums : list of ints
467
        conformer index numbers, excluding nans of both dict1 and dict2
468
        e.g.,  if dict1 mol1 has confs of 0 1 2 3 5
469
              and dict2 mol1 has confs of 0 2 3 4 5
470
                     then output would be 0 2 3 5
471
    reference_enes : list of numpy arrays
472
        relative conformer energies of the reference file (kcal/mol)
473
    compared_enes : list of numpy arrays
474
        relative conformer energies of the compared file (kcal/mol)
475

476
    """
477

478 60
    titleMols = []
479 60
    rmsds = []
480 60
    confNums = []
481 60
    reference_enes = []
482 60
    compared_enes = []
483

484
    # load molecules and extract data
485 60
    mols1_titles, mols1_indices, mols1_enes, mols1_nans = extract_enes(dict1, mol_slice)
486 60
    mols2_titles, mols2_indices, mols2_enes, mols2_nans = extract_enes(dict2, mol_slice)
487

488
    # check that number of molecules exist in both files
489 60
    if len(mols1_titles) != len(mols2_titles):
490 0
        raise AssertionError("The number of molecules do not match for files:"
491
                             "\n{}\n{}".format(dict1['fname'], dict2['fname']))
492

493 60
    for i in range(len(mols1_titles)):
494 60
        title1 = mols1_titles[i]
495 60
        title2 = mols2_titles[i]
496 60
        indices1_orig = mols1_indices[i]
497 60
        indices2_orig = mols2_indices[i]
498 60
        enes1 = mols1_enes[i]
499 60
        enes2 = mols2_enes[i]
500 60
        nans1 = mols1_nans[i]
501 60
        nans2 = mols2_nans[i]
502

503
        # check that the names and conformers match for both mols
504 60
        if title1 != title2 or (indices1_orig != indices2_orig):
505 0
            raise AssertionError("ERROR: Either names or number of conformers "
506
                                 "differ for mol {}/{} in:\n{}\n{}".format(
507
                                 title1, title2, dict1['fname'], dict2['fname']))
508

509
        # exclude conformers for which job did not finish (nan)
510 60
        indices1_prune1, enes1_prune1 = remove_dead_conformers(enes1, indices1_orig, nans1)
511 60
        indices2_prune1, enes2_prune1 = remove_dead_conformers(enes2, indices2_orig, nans1)
512 60
        indices1_prune2, enes1_prune2 = remove_dead_conformers(enes1_prune1, indices1_prune1, nans2)
513 60
        indices2_prune2, enes2_prune2 = remove_dead_conformers(enes2_prune1, indices2_prune1, nans2)
514

515
        # make sure conformer indices match after pruning
516 60
        if not (indices1_prune2 == indices2_prune2).all():
517 0
            raise AssertionError("List of conformer indices do not match "
518
                                 "after pruning conformers for {}: in files:\n{}\n{}\n".format(
519
                                     title1, dict1['fname'], dict2['fname']))
520

521
        # check that there exists at least two conformers
522 60
        num_confs = indices1_prune2.shape[0]
523 60
        if num_confs <= 1:
524 60
            print("skipping molecule {} since only 0 or 1 conformer".format(title1))
525 60
            continue
526

527
        # take relative energy to first conf
528 60
        irel = enes1 - enes1[0]
529 60
        jrel = enes2 - enes2[0]
530

531
        # take RMSD of conformer energies for this particular mol
532 60
        dev = irel - jrel
533 60
        sqd = np.square(dev)
534 60
        mn = np.sum(sqd) / (np.shape(sqd)[0] - 1)
535 60
        rt = np.sqrt(mn)
536

537
        # store data
538 60
        titleMols.append(title1)
539 60
        rmsds.append(rt)
540 60
        confNums.append(indices1_prune2)
541 60
        reference_enes.append(irel)
542 60
        compared_enes.append(jrel)
543

544 60
    return titleMols, rmsds, confNums, reference_enes, compared_enes
545

546

547
### ------------------- Script -------------------
548

549

550 60
def survey_energies_ref(wholedict, ref_index, mol_slice=[], outfn='relene-rmsd.dat'):
551
    """
552
    Compute RMSD of relative conformer energies with respect to the data
553
    in ref_index spot. The relative energies by conformer are computed first,
554
    then the RMSD is calculated with respect to reference.
555

556
    Parameters
557
    ----------
558
    wholedict : OrderedDict
559
        ordered dictionary of input files and information to extract from SD tags
560
        keys are: 'theory' 'fname' 'tagkey' 'label'
561
    ref_index : int
562
        integer to specify reference file of wholedict[ref_index]
563
    mol_slice : list
564
        list of indices from which to slice mols generator for read_mols
565
        [start, stop, step]
566
    outfn : str
567
        name of the output file with RMSDs of energies
568

569
    Returns
570
    -------
571
    wholedict : OrderedDict
572
        same as input wholedict with additional keys:
573
        'titleMols' 'rmsds' 'confNums' 'reference_enes' 'compared_enes'
574

575
    """
576

577 60
    description = 'relative energies (kcal/mol)'
578 60
    infile = wholedict[0]['fname']
579 60
    print("Using reference file: %s " % infile)
580

581
    # Write description in output file.
582 60
    compF = open(outfn, 'w')
583 60
    compF.write(f"# RMSD of {description} wrt file 0\n")
584

585 60
    for i, d in enumerate(wholedict.values()):
586 60
        compF.write("# File %d: %s\n" % (i, d['fname']))
587 60
        compF.write("#   calc=%s, %s\n" % (d['tagkey'], d['theory']))
588

589 60
    for i in range(1, len(wholedict)):
590 60
        print("\nStarting comparison on file: %s" % wholedict[i]['fname'])
591

592
        # each of the four returned vars is (file) list of (mols) lists
593 60
        titleMols, rmsds, confNums, reference_enes, compared_enes =\
594
            rmsd_two_files(wholedict[ref_index], wholedict[i], mol_slice)
595 60
        wholedict[i]['titleMols'] = titleMols
596 60
        wholedict[i]['rmsds'] = rmsds
597 60
        wholedict[i]['confNums'] = confNums
598 60
        wholedict[i]['reference_enes'] = reference_enes
599 60
        wholedict[i]['compared_enes'] = compared_enes
600

601
    # loop over each mol and write energies from wholedict by column
602 60
    for m in range(len(wholedict[1]['titleMols'])):
603 60
        compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m])
604

605
        # for this mol, write the rmsd from each file side by side
606 60
        line = ' RMSDs: '
607 60
        for i in range(1, len(wholedict)):
608 60
            line += (str(wholedict[i]['rmsds'][m]) + '\t')
609 60
        compF.write(line)
610 60
        compF.write('\n# ==================================================================')
611 60
        compF.write(f'\n# conf\t{description} in column order by file (listed at top)')
612

613
        # for this mol, write the compared_enes from each file by columns
614 60
        for c in range(len(wholedict[1]['confNums'][m])):
615 60
            line = '\n' + str(wholedict[1]['confNums'][m][c]) + '\t'
616 60
            line += str(wholedict[1]['reference_enes'][m][c]) + '\t'
617 60
            for i in range(1, len(wholedict)):
618 60
                line += (str(wholedict[i]['compared_enes'][m][c]) + '\t')
619 60
            compF.write(line)
620

621 60
    compF.close()
622

623 60
    return wholedict
624

625

626 60
def survey_energies(wholedict, mol_slice=[], outfn='relene.dat', ref_conf=0):
627
    """
628
    Compute the spread of the conformer energies for each molecule in each file.
629
    The spread can be computed as the coefficient of variation of all methods,
630
    computed for each conformer, which is averaged over all conformers. Another
631
    approach to computing spread in this function is simply normalizing the
632
    energies by remove the average, then taking the stdev of the normalized data.
633
     ^ TODO reconsider this, and add variable in def if wanting to include
634
       TODO also add tests for these spread functions
635

636
    Conformers that are missing for one calculation/file are taken out of
637
    the analysis for all other calculations/files.
638

639
    Parameters
640
    ----------
641
    wholedict : OrderedDict
642
        ordered dictionary of input files and information to extract from SD tags
643
        keys are: 'theory' 'fname' 'tagkey' 'label'
644
    mol_slice : list
645
        list of indices from which to slice mols generator for read_mols
646
        [start, stop, step]
647
    outfn : str
648
        name of the output file with RMSDs of energies
649

650
    Returns
651
    -------
652
    wholedict : OrderedDict
653
        same as input wholedict with additional keys:
654
        'titleMols' 'confNums' 'compared_enes'
655

656
    """
657

658 60
    num_files = len(wholedict)
659

660
    # Write description in output file.
661 60
    compF = open(outfn, 'w')
662 60
    compF.write(f"# Variability in energies from diff QM methods averaged over conformers\n")
663 60
    for i, d in enumerate(wholedict.values()):
664 60
        compF.write("# File %d: %s\n" % (i+1, d['fname']))
665 60
        compF.write("#   calc=%s, %s\n" % (d['tagkey'], d['theory']))
666

667 60
    enelist = []
668 60
    idxlist = []
669 60
    nanlist = []
670 60
    for i in range(num_files):
671 60
        print("Extracting data for file: %s" % wholedict[i]['fname'])
672 60
        titleMols, confNums, compared_enes, confNans = extract_enes(wholedict[i], mol_slice)
673 60
        enelist.append(compared_enes)
674 60
        idxlist.append(confNums)
675 60
        nanlist.append(confNans)
676 60
        wholedict[i]['titleMols'] = titleMols
677

678 60
    print("Removing mols missing from 1+ files...")
679
    # find molecules common in all files
680 60
    molname_sets = [set(wholedict[x]['titleMols']) for x in range(num_files)]
681 60
    molname_common = set.intersection(*molname_sets)
682

683 60
    for i in range(num_files):
684
        # find the extraneous mols in this file
685 60
        mols_in_file = wholedict[i]['titleMols']
686 60
        mols_extraneous = list(set(mols_in_file) - molname_common)
687

688
        # get indices of mols to be removed for this file
689 60
        removal_idx = []
690 60
        for molname in mols_extraneous:
691 0
            removal_idx.append(mols_in_file.index(molname))
692 60
        for index in sorted(removal_idx, reverse=True):
693 0
            print(f"  Removing {wholedict[i]['titleMols'][index]} from {wholedict[i]['fname']}")
694 0
            del wholedict[i]['titleMols'][index]
695 0
            del enelist[i][index]
696 0
            del idxlist[i][index]
697 0
            del nanlist[i][index]
698

699 60
    print("Removing un-finished conformers and computing relative energies...")
700 60
    idxlist, enelist = remove_dead_conformers(enelist, idxlist, nanlist)
701

702
    # scale energies: scale energies by subtracting or dividing conf_j
703
    # by conf_i for all j confs in all mols for all files
704
    # (1) subtract first conf, or (2) divide first conf
705 60
    if isinstance(ref_conf, int):
706 60
        ref_conf_int = ref_conf
707 60
    rel_enelist = relative_energies(enelist, 'subtract', ref_conf_int)
708 60
    rat_enelist = relative_energies(enelist, 'divide', ref_conf_int)
709

710 60
    for i in range(num_files):
711 60
        wholedict[i]['compared_enes'] = rel_enelist[i]
712 60
        wholedict[i]['confNums'] = idxlist[i]
713

714
    # check that entire array is not zero, if ALL mols have one conf
715 60
    all_zeros = np.all(np.asarray(rel_enelist)==0)
716 60
    if all_zeros:
717 0
        print("WARNING: All relative energy values are zero. "
718
            "Cannot calculate energy spread.")
719 0
        spreadlist = [-1]*len(rel_enelist[0])
720
    else:
721
        # estimate spread of data on energies
722 60
        spreadlist, cvlist_all_mols = avg_coeffvar(rat_enelist, ref_conf_int)
723
        #spreadlist = normalized_deviation(rel_enelist)
724

725
    # loop over each mol and write energies from wholedict by column
726 60
    for m in range(len(wholedict[1]['titleMols'])):
727 60
        compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m])
728

729
        # for this mol, write the rmsd from each file side by side
730 60
        line = ' {:.2f} ppm averaged CV'.format(spreadlist[m]*1000000)
731 60
        compF.write(line)
732 60
        compF.write('\n# ==================================================================')
733 60
        compF.write(f'\n# rel enes (kcal/mol), column i = file i, row j = conformer j')
734

735
        # for this mol, write the compared_enes from each file by columns
736 60
        this_mol_conf_inds = wholedict[1]['confNums'][m]
737 60
        for c in range(len(this_mol_conf_inds)):
738 60
            line = '\n' + str(this_mol_conf_inds[c]) + '\t'
739 60
            for i in range(num_files):
740 60
                line += '{:.4f}\t'.format(wholedict[i]['compared_enes'][m][c])
741

742
            # print overall contribution to spread, skip ref conf since scaled
743
            # include second condition in case ref_conf_int is negative
744 60
            if c == ref_conf_int or c == len(this_mol_conf_inds)+ref_conf_int:
745 60
                line += '#  nan ppm CV'
746
            else:
747 60
                line += '# {:.2f} ppm CV'.format(cvlist_all_mols[m][c-1]*1000000)
748 60
            compF.write(line)
749

750 60
    compF.close()
751

752 60
    return wholedict
753

754

755 60
def survey_times(wholedict, mol_slice=[]):
756
    """
757
    Average all conformers' calculations times for each molecule in each file.
758
    Generate grouped bar plot where x=QM method, y=time(s). Molecules are
759
    grouped together by method and color coded by molecule identity.
760

761
    Parameters
762
    ----------
763
    wholedict : OrderedDict
764
        ordered dictionary of input files and information to extract from SD tags
765
        keys are: 'theory' 'fname' 'tagkey' 'label'
766
    mol_slice : list
767
        list of indices from which to slice mols generator for read_mols
768
        [start, stop, step]
769

770
    """
771

772
    # create storage space
773 60
    titles = {}
774 60
    timeplot = []  # timeplot[i][j] == avg of conformer times for mol i, file j
775 60
    stdplot = []  # stdplot[i][j] == stdev of conformer times for mol i, file j
776

777
    # loop over each files and compute avg conformer times for all mols
778 60
    for i in wholedict:
779 60
        qmethod, qbasis = reader.separated_theory(wholedict[i]['theory'])
780 60
        qtag = wholedict[i]['tagkey']
781 60
        titles = avg_mol_time(titles, wholedict[i]['fname'], qmethod, qbasis, qtag, mol_slice)
782

783
    # extract data in dictionary to arrays
784 60
    for mol in titles.values():
785 60
        timeplot.append([k[0] for k in mol])
786 60
        stdplot.append([k[1] for k in mol])
787

788
    # calculate number of datafiles for each mol
789 60
    lens = [len(x) for x in timeplot]
790

791
    # obtain mode representing most common number of files
792
    # first [0] to get modes not counts, second [0] to get highest mode
793 60
    m = stats.mode(lens)[0][0]
794

795
    # delete sublists (mols) with missing data; asarray takes uniform length sublists
796
    # should only happen if 1+ method fails job for ALL confs of some mol
797
    # TODO: this might be addressed in a better way
798 60
    tracker = []
799 60
    for i, t in enumerate(timeplot):
800 60
        if len(t) != m:
801 0
            tracker.append(i)
802 60
    for index in sorted(tracker, reverse=True):
803 0
        mol_name = list(titles.keys())[index]
804 0
        print(f"removing {mol_name} from time analysis")
805 0
        del timeplot[index]
806 0
        del stdplot[index]
807 0
        del titles[mol_name]
808

809
    # prepare plot data
810 60
    timeplot = np.asarray(timeplot)
811 60
    stdplot = np.asarray(stdplot)
812 60
    timearray = timeplot.flatten('F')
813 60
    stdarray = stdplot.flatten('F')
814

815
    # generate mol labels (colors) from dict keys, repeated by number of files
816
    # e.g., ['a', 'b', 'c', 'a', 'b', 'c']
817 60
    mol_names = list(titles.keys()) * len(wholedict)
818

819
    # generate method labels (x-axis) from dict value of 'theory', repeated
820
    # by number of mols. e.g., ['a', 'a', 'b', 'b', 'c', 'c']
821
    # - remove duplicated spaces from method listing in input file
822
    # - get the first word in tagkey which should be 'opt' or 'spe'
823
    # https://stackoverflow.com/questions/2449077/duplicate-each-member-in-a-list-python
824 60
    method_labels_short = [
825
        " ".join(wholedict[item]['theory'].split()) + ' ' +
826
        wholedict[item]['tagkey'].split(" ")[0] for item in wholedict]
827 60
    method_labels = [val for val in method_labels_short for _ in range(len(titles))]
828

829
    # combine color labels, x labels, and plot data
830 60
    plotlist = np.column_stack((mol_names, method_labels,
831
        timearray.astype(np.object), stdarray.astype(np.object)))
832 60
    print(plotlist)
833

834
    # generate plot
835 60
    fig = plt.figure()
836 60
    ax = fig.add_subplot(111)
837 60
    plot_groupedbar(ax, plotlist)
838

839 60
    plt.title("conformer-averaged wall-clock times", fontsize=16)
840 60
    plt.ylabel("time (s)", fontsize=14)
841 60
    plt.yticks(fontsize=12)
842

843 60
    plt.savefig('rename_me.png', bbox_inches='tight')
844 60
    plt.show()
845

846

847 60
def survey_confs(infile, analyze_energies=False, analyze_times=False, ref_index=None, plot_enes=True, mol_slice=[]):
848
    """
849
    Main interface to survey_times, survey_energies, or survey_energies_ref.
850

851
    Parameters
852
    ----------
853
    infile : string
854
        Name of text file with information on file(s) and SD tag info on
855
        the level of theory to process.
856
    analyze_energies : Boolean
857
        analyze energies for the provided input files
858
    analyze_times : Boolean
859
        analyze calculation times for the provided input files
860
    ref_index : integer
861
        provide the integer of the line number in the input file to treat
862
        as reference file (such as to compute RMSD of energies);
863
        line numbers start at ZERO and do not count commented lines
864
    plot_enes : Boolean
865
        generate plot for energies analysis
866
    mol_slice : list
867
        list of indices from which to slice mols generator for read_mols
868
        [start, stop, step]
869

870
    """
871

872 60
    if not os.path.exists(infile):
873 0
        raise FileNotFoundError(f"Input file {infile} does not exist.")
874

875 60
    wholedict = reader.read_text_input(infile)
876

877 60
    if (analyze_energies == analyze_times):
878 0
        raise ValueError("Please specify ONE of analyze_energies or analyze_times")
879

880 60
    if analyze_times:
881 60
        survey_times(wholedict, mol_slice)
882

883 60
    if analyze_energies:
884 60
        if ref_index is None:
885 60
            survey_energies(wholedict, mol_slice)
886
        else:
887 60
            wholedict = survey_energies_ref(wholedict, ref_index, mol_slice)
888
            # TODO move this part inside the survey_energies fx
889 60
            if plot_enes:
890 60
                arrange_and_plot(wholedict, "RMSDs of relative conformer energies")

Read our documentation on viewing source code .

Loading