Showing 1 of 1 files from the diff.

@@ -31,7 +31,7 @@
Loading
31 31
    return qmethod, qbasis
32 32
33 33
34 -
def avg_mol_time(titles, infile, method, basis, tag, mol_slice):
34 +
def avg_mol_time(titles, infile, method, basis, tag, mol_slice=[]):
35 35
    """
36 36
    For an SDF file with all confs of all mols, get the average runtime
37 37
        of all conformers for each molecule.
@@ -155,7 +155,8 @@
Loading
155 155
            ax.bar(pos, vals, width=width, label=cond)
156 156
157 157
    # Set the x-axis tick labels to be equal to the x_labels
158 -
    #indices = [x - 1 for x in indices]  # OPTIONAL, better placement if many labels
158 +
    if len(indices) > 7: # adjust int as needed for better placement of many labels
159 +
        indices = [x - 1 for x in indices]
159 160
    ax.set_xticks(indices)
160 161
    ax.set_xticklabels(x_labels)
161 162
    plt.setp(plt.xticks()[1], rotation=50)
@@ -206,7 +207,7 @@
Loading
206 207
    plt.show()
207 208
208 209
209 -
def extract_enes(dict1):
210 +
def extract_enes(dict1, mol_slice=[]):
210 211
    """
211 212
    From files in input dictionaries, read in molecules, extract information
212 213
    from SD tags for conformer energies and indices.
@@ -216,6 +217,9 @@
Loading
216 217
    dict1 : dict
217 218
        dictionary of input files and information to extract from SD tags
218 219
        keys are: 'theory' 'fname' 'tagkey' 'label'
220 +
    mol_slice : list
221 +
        list of indices from which to slice mols generator for read_mols
222 +
        [start, stop, step]
219 223
220 224
    Returns
221 225
    -------
@@ -230,7 +234,12 @@
Loading
230 234
231 235
    """
232 236
233 -
    mols = reader.read_mols(dict1['fname'])
237 +
    # Open molecule file.
238 +
    if len(mol_slice) == 3:
239 +
        mols = reader.read_mols(dict1['fname'], mol_slice)
240 +
    else:
241 +
        mols = reader.read_mols(dict1['fname'])
242 +
234 243
    short_tag = dict1['tagkey']
235 244
    qmethod, qbasis = separated_theory(dict1['theory'])
236 245
@@ -296,20 +305,28 @@
Loading
296 305
297 306
def avg_coeffvar(enelist):
298 307
    """
299 -
    For the relative energies of a single conformer, compute coefficient of
308 +
    For the ABSOLUTE energies of a single conformer, compute coefficient of
300 309
    variation (CV) across methods. Then average the CVs across all conformers
301 310
    for each molecule.
302 311
312 +
    Note: It is incorrect to compute the coefficient of variation on relative
313 +
        values. See wikipedia for an example on temperature.
314 +
    Note: The units of the input shouldn't matter since the CV is a ratio,
315 +
        meaning that units should cancel.
316 +
303 317
    Parameters
304 318
    ----------
305 319
    enelist : list of lists
306 -
        enelist[i][j] is relative energy of file i, mol j
320 +
        enelist[i][j] is absolute energy of file i, mol j
307 321
308 322
    Returns
309 323
    -------
310 324
    spread_by_mol : list
311 325
        len(spread_by_mol) == number of molecules
312 326
        spread_by_mol[i] == avg(CVs(conformer enes from diff methods) of mol i
327 +
    cvlist_all_mols : list of lists
328 +
        len(cvlist_all_mols) == number of molecules
329 +
        len(cvlist_all_mols[i]) == number of conformers for molecule i minus 1
313 330
314 331
    Notes
315 332
    -----
@@ -320,6 +337,7 @@
Loading
320 337
321 338
    # 1D list for average of CVs of each mol
322 339
    spread_by_mol = []
340 +
    cvlist_all_mols = []
323 341
324 342
    # reshape list so that enelist[i][j] is mol i, file j
325 343
    enelist = np.array(enelist).T
@@ -332,7 +350,8 @@
Loading
332 350
        num_confs = mols_ene[0].shape[0]
333 351
        if num_confs <= 1:
334 352
            print(f"skipping molecule {i} since only 0 or 1 conformer")
335 -
            spread_by_mol.append('n/a')
353 +
            spread_by_mol.append(np.nan)
354 +
            cvlist_all_mols.append(num_confs*[np.nan])
336 355
            continue
337 356
338 357
        # number of files is mols_ene.shape[0]; should be same for all mols
@@ -346,18 +365,11 @@
Loading
346 365
                cv = np.std(confs_ene) / np.average(confs_ene)
347 366
            cvlist.append(cv)
348 367
349 -
        # check that first element is 0. bc energies relative to first conf
350 -
        if not np.isnan(cvlist[0]):
351 -
            print("\nWARNING: Check input of avg_coeffvar function."
352 -
                  " First conformers of each mol should have zero relative "
353 -
                  "energies and thus zero standard deviation across files.")
354 -
            return
355 -
        cvlist = np.delete(cvlist, 0)
356 -
357 368
        # average the coeffs of variation
358 369
        spread_by_mol.append(np.average(cvlist))
370 +
        cvlist_all_mols.append(cvlist)
359 371
360 -
    return spread_by_mol
372 +
    return spread_by_mol, cvlist_all_mols
361 373
362 374
363 375
def normalized_deviation(enelist):
@@ -408,7 +420,7 @@
Loading
408 420
    return spread_by_mol
409 421
410 422
411 -
def rmsd_two_files(dict1, dict2):
423 +
def rmsd_two_files(dict1, dict2, mol_slice=[]):
412 424
    """
413 425
    From files in input dictionaries, read in molecules, extract information
414 426
    from SD tags, compute relative conformer energies wrt to first conformer
@@ -423,6 +435,9 @@
Loading
423 435
    dict2 : dict
424 436
        dictionary of input files and information to extract from SD tags
425 437
        keys are: 'theory' 'fname' 'tagkey' 'label'
438 +
    mol_slice : list
439 +
        list of indices from which to slice mols generator for read_mols
440 +
        [start, stop, step]
426 441
427 442
    Returns
428 443
    -------
@@ -448,8 +463,8 @@
Loading
448 463
    compared_enes = []
449 464
450 465
    # load molecules and extract data
451 -
    mols1_titles, mols1_indices, mols1_enes, mols1_nans = extract_enes(dict1)
452 -
    mols2_titles, mols2_indices, mols2_enes, mols2_nans = extract_enes(dict2)
466 +
    mols1_titles, mols1_indices, mols1_enes, mols1_nans = extract_enes(dict1, mol_slice)
467 +
    mols2_titles, mols2_indices, mols2_enes, mols2_nans = extract_enes(dict2, mol_slice)
453 468
454 469
    # check that number of molecules exist in both files
455 470
    if len(mols1_titles) != len(mols2_titles):
@@ -513,7 +528,7 @@
Loading
513 528
### ------------------- Script -------------------
514 529
515 530
516 -
def survey_energies_ref(wholedict, ref_index, outfn='relene-rmsd.dat'):
531 +
def survey_energies_ref(wholedict, ref_index, mol_slice=[], outfn='relene-rmsd.dat'):
517 532
    """
518 533
    Compute RMSD of relative conformer energies with respect to the data
519 534
    in ref_index spot. The relative energies by conformer are computed first,
@@ -526,6 +541,9 @@
Loading
526 541
        keys are: 'theory' 'fname' 'tagkey' 'label'
527 542
    ref_index : int
528 543
        integer to specify reference file of wholedict[ref_index]
544 +
    mol_slice : list
545 +
        list of indices from which to slice mols generator for read_mols
546 +
        [start, stop, step]
529 547
    outfn : str
530 548
        name of the output file with RMSDs of energies
531 549
@@ -554,7 +572,7 @@
Loading
554 572
555 573
        # each of the four returned vars is (file) list of (mols) lists
556 574
        titleMols, rmsds, confNums, reference_enes, compared_enes =\
557 -
            rmsd_two_files(wholedict[ref_index], wholedict[i])
575 +
            rmsd_two_files(wholedict[ref_index], wholedict[i], mol_slice)
558 576
        wholedict[i]['titleMols'] = titleMols
559 577
        wholedict[i]['rmsds'] = rmsds
560 578
        wholedict[i]['confNums'] = confNums
@@ -586,7 +604,7 @@
Loading
586 604
    return wholedict
587 605
588 606
589 -
def survey_energies(wholedict, outfn='relene.dat'):
607 +
def survey_energies(wholedict, mol_slice=[], outfn='relene.dat'):
590 608
    """
591 609
    Compute the spread of the conformer energies for each molecule in each file.
592 610
    The spread can be computed as the coefficient of variation of all methods,
@@ -604,6 +622,9 @@
Loading
604 622
    wholedict : OrderedDict
605 623
        ordered dictionary of input files and information to extract from SD tags
606 624
        keys are: 'theory' 'fname' 'tagkey' 'label'
625 +
    mol_slice : list
626 +
        list of indices from which to slice mols generator for read_mols
627 +
        [start, stop, step]
607 628
    outfn : str
608 629
        name of the output file with RMSDs of energies
609 630
@@ -615,14 +636,13 @@
Loading
615 636
616 637
    """
617 638
618 -
    description = 'relative energies (kcal/mol)'
619 639
    num_files = len(wholedict)
620 640
621 641
    # Write description in output file.
622 642
    compF = open(outfn, 'w')
623 -
    compF.write(f"# Spread of {description} over diff QM methods averaged over conformers\n")
643 +
    compF.write(f"# Variability in energies from diff QM methods averaged over conformers\n")
624 644
    for i, d in enumerate(wholedict.values()):
625 -
        compF.write("# File %d: %s\n" % (i, d['fname']))
645 +
        compF.write("# File %d: %s\n" % (i+1, d['fname']))
626 646
        compF.write("#   calc=%s, %s\n" % (d['tagkey'], d['theory']))
627 647
628 648
    enelist = []
@@ -630,7 +650,7 @@
Loading
630 650
    nanlist = []
631 651
    for i in range(num_files):
632 652
        print("Extracting data for file: %s" % wholedict[i]['fname'])
633 -
        titleMols, confNums, compared_enes, confNans = extract_enes(wholedict[i])
653 +
        titleMols, confNums, compared_enes, confNans = extract_enes(wholedict[i], mol_slice)
634 654
        enelist.append(compared_enes)
635 655
        idxlist.append(confNums)
636 656
        nanlist.append(confNans)
@@ -671,8 +691,8 @@
Loading
671 691
            "Cannot calculate energy spread.")
672 692
        spreadlist = [-1]*len(relenelist[0])
673 693
    else:
674 -
        # estimate spread of data
675 -
        spreadlist = avg_coeffvar(relenelist)
694 +
        # estimate spread of data on ABSOLUTE energies
695 +
        spreadlist, cvlist_all_mols = avg_coeffvar(enelist)
676 696
        #spreadlist = normalized_deviation(relenelist)
677 697
678 698
@@ -681,16 +701,20 @@
Loading
681 701
        compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m])
682 702
683 703
        # for this mol, write the rmsd from each file side by side
684 -
        line = ' Spread: {}'.format(spreadlist[m])
704 +
        line = ' {:.2f}% averaged CV'.format(spreadlist[m]*100)
685 705
        compF.write(line)
686 706
        compF.write('\n# ==================================================================')
687 -
        compF.write(f'\n# conf\t{description} in column order by file (listed at top)')
707 +
        compF.write(f'\n# rel enes (kcal/mol), column i = file i, row j = conformer j')
688 708
689 709
        # for this mol, write the compared_enes from each file by columns
690 710
        for c in range(len(wholedict[1]['confNums'][m])):
691 711
            line = '\n' + str(wholedict[1]['confNums'][m][c]) + '\t'
692 712
            for i in range(num_files):
693 -
                line += (str(wholedict[i]['compared_enes'][m][c]) + '\t')
713 +
                line += '{:.4f}\t'.format(wholedict[i]['compared_enes'][m][c])
714 +
715 +
            # print overall contribution to spread
716 +
            line += '# {:.2f}% CV'.format(cvlist_all_mols[m][c]*100)
717 +
694 718
            compF.write(line)
695 719
696 720
    compF.close()
@@ -698,7 +722,7 @@
Loading
698 722
    return wholedict
699 723
700 724
701 -
def survey_times(wholedict, mol_slice):
725 +
def survey_times(wholedict, mol_slice=[]):
702 726
    """
703 727
    Average all conformers' calculations times for each molecule in each file.
704 728
    Generate grouped bar plot where x=QM method, y=time(s). Molecules are
@@ -828,10 +852,9 @@
Loading
828 852
829 853
    if analyze_energies:
830 854
        if ref_index is None:
831 -
            wholedict = survey_energies(wholedict)
855 +
            wholedict = survey_energies(wholedict, mol_slice)
832 856
        else:
833 -
            wholedict = survey_energies_ref(wholedict, ref_index)
857 +
            wholedict = survey_energies_ref(wholedict, ref_index, mol_slice)
834 858
            # TODO move this part inside the survey_energies fx
835 -
            # TODO add mol_slice parameter to survey_energies* functions
836 859
            if plot_enes:
837 860
                arrange_and_plot(wholedict, "RMSDs of relative conformer energies")
Files Coverage
quanformer 87.91%
Project Totals (11 files) 87.91%
163.1
TRAVIS_OS_NAME=osx
163.2
TRAVIS_PYTHON_VERSION=3.6
TRAVIS_OS_NAME=linux
164.1
TRAVIS_OS_NAME=osx
164.2
TRAVIS_PYTHON_VERSION=3.6
TRAVIS_OS_NAME=linux
165.1
TRAVIS_OS_NAME=osx
165.2
TRAVIS_PYTHON_VERSION=3.6
TRAVIS_OS_NAME=linux
166.1
TRAVIS_OS_NAME=osx
166.2
TRAVIS_PYTHON_VERSION=3.6
TRAVIS_OS_NAME=linux
167.2
TRAVIS_PYTHON_VERSION=3.6
TRAVIS_OS_NAME=linux
167.1
TRAVIS_OS_NAME=osx
1
# Codecov configuration to make it a bit less noisy
2
coverage:
3
  status:
4
    patch: false
5
    project:
6
      default:
7
        threshold: 50%
8
comment:
9
  layout: "header"
10
  require_changes: false
11
  branches: null
12
  behavior: default
13
  flags: null
14
  paths: null
15
ignore:
16
  - "quanformer/match_plot.py"
17

18
  - "quanformer/quan2modsem.py"
19
  - "quanformer/match_minima.py"
20
  - "quanformer/opt_vs_spe.py"
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading