quanformer/survey_confs.py
changed.
Showing 1 of 1 files from the diff.
@@ -31,7 +31,7 @@
Loading
31 | 31 | return qmethod, qbasis |
|
32 | 32 | ||
33 | 33 | ||
34 | - | def avg_mol_time(titles, infile, method, basis, tag, mol_slice): |
|
34 | + | def avg_mol_time(titles, infile, method, basis, tag, mol_slice=[]): |
|
35 | 35 | """ |
|
36 | 36 | For an SDF file with all confs of all mols, get the average runtime |
|
37 | 37 | of all conformers for each molecule. |
@@ -155,7 +155,8 @@
Loading
155 | 155 | ax.bar(pos, vals, width=width, label=cond) |
|
156 | 156 | ||
157 | 157 | # Set the x-axis tick labels to be equal to the x_labels |
|
158 | - | #indices = [x - 1 for x in indices] # OPTIONAL, better placement if many labels |
|
158 | + | if len(indices) > 7: # adjust int as needed for better placement of many labels |
|
159 | + | indices = [x - 1 for x in indices] |
|
159 | 160 | ax.set_xticks(indices) |
|
160 | 161 | ax.set_xticklabels(x_labels) |
|
161 | 162 | plt.setp(plt.xticks()[1], rotation=50) |
@@ -206,7 +207,7 @@
Loading
206 | 207 | plt.show() |
|
207 | 208 | ||
208 | 209 | ||
209 | - | def extract_enes(dict1): |
|
210 | + | def extract_enes(dict1, mol_slice=[]): |
|
210 | 211 | """ |
|
211 | 212 | From files in input dictionaries, read in molecules, extract information |
|
212 | 213 | from SD tags for conformer energies and indices. |
@@ -216,6 +217,9 @@
Loading
216 | 217 | dict1 : dict |
|
217 | 218 | dictionary of input files and information to extract from SD tags |
|
218 | 219 | keys are: 'theory' 'fname' 'tagkey' 'label' |
|
220 | + | mol_slice : list |
|
221 | + | list of indices from which to slice mols generator for read_mols |
|
222 | + | [start, stop, step] |
|
219 | 223 | ||
220 | 224 | Returns |
|
221 | 225 | ------- |
@@ -230,7 +234,12 @@
Loading
230 | 234 | ||
231 | 235 | """ |
|
232 | 236 | ||
233 | - | mols = reader.read_mols(dict1['fname']) |
|
237 | + | # Open molecule file. |
|
238 | + | if len(mol_slice) == 3: |
|
239 | + | mols = reader.read_mols(dict1['fname'], mol_slice) |
|
240 | + | else: |
|
241 | + | mols = reader.read_mols(dict1['fname']) |
|
242 | + | ||
234 | 243 | short_tag = dict1['tagkey'] |
|
235 | 244 | qmethod, qbasis = separated_theory(dict1['theory']) |
|
236 | 245 |
@@ -296,20 +305,28 @@
Loading
296 | 305 | ||
297 | 306 | def avg_coeffvar(enelist): |
|
298 | 307 | """ |
|
299 | - | For the relative energies of a single conformer, compute coefficient of |
|
308 | + | For the ABSOLUTE energies of a single conformer, compute coefficient of |
|
300 | 309 | variation (CV) across methods. Then average the CVs across all conformers |
|
301 | 310 | for each molecule. |
|
302 | 311 | ||
312 | + | Note: It is incorrect to compute the coefficient of variation on relative |
|
313 | + | values. See wikipedia for an example on temperature. |
|
314 | + | Note: The units of the input shouldn't matter since the CV is a ratio, |
|
315 | + | meaning that units should cancel. |
|
316 | + | ||
303 | 317 | Parameters |
|
304 | 318 | ---------- |
|
305 | 319 | enelist : list of lists |
|
306 | - | enelist[i][j] is relative energy of file i, mol j |
|
320 | + | enelist[i][j] is absolute energy of file i, mol j |
|
307 | 321 | ||
308 | 322 | Returns |
|
309 | 323 | ------- |
|
310 | 324 | spread_by_mol : list |
|
311 | 325 | len(spread_by_mol) == number of molecules |
|
312 | 326 | spread_by_mol[i] == avg(CVs(conformer enes from diff methods) of mol i |
|
327 | + | cvlist_all_mols : list of lists |
|
328 | + | len(cvlist_all_mols) == number of molecules |
|
329 | + | len(cvlist_all_mols[i]) == number of conformers for molecule i minus 1 |
|
313 | 330 | ||
314 | 331 | Notes |
|
315 | 332 | ----- |
@@ -320,6 +337,7 @@
Loading
320 | 337 | ||
321 | 338 | # 1D list for average of CVs of each mol |
|
322 | 339 | spread_by_mol = [] |
|
340 | + | cvlist_all_mols = [] |
|
323 | 341 | ||
324 | 342 | # reshape list so that enelist[i][j] is mol i, file j |
|
325 | 343 | enelist = np.array(enelist).T |
@@ -332,7 +350,8 @@
Loading
332 | 350 | num_confs = mols_ene[0].shape[0] |
|
333 | 351 | if num_confs <= 1: |
|
334 | 352 | print(f"skipping molecule {i} since only 0 or 1 conformer") |
|
335 | - | spread_by_mol.append('n/a') |
|
353 | + | spread_by_mol.append(np.nan) |
|
354 | + | cvlist_all_mols.append(num_confs*[np.nan]) |
|
336 | 355 | continue |
|
337 | 356 | ||
338 | 357 | # number of files is mols_ene.shape[0]; should be same for all mols |
@@ -346,18 +365,11 @@
Loading
346 | 365 | cv = np.std(confs_ene) / np.average(confs_ene) |
|
347 | 366 | cvlist.append(cv) |
|
348 | 367 | ||
349 | - | # check that first element is 0. bc energies relative to first conf |
|
350 | - | if not np.isnan(cvlist[0]): |
|
351 | - | print("\nWARNING: Check input of avg_coeffvar function." |
|
352 | - | " First conformers of each mol should have zero relative " |
|
353 | - | "energies and thus zero standard deviation across files.") |
|
354 | - | return |
|
355 | - | cvlist = np.delete(cvlist, 0) |
|
356 | - | ||
357 | 368 | # average the coeffs of variation |
|
358 | 369 | spread_by_mol.append(np.average(cvlist)) |
|
370 | + | cvlist_all_mols.append(cvlist) |
|
359 | 371 | ||
360 | - | return spread_by_mol |
|
372 | + | return spread_by_mol, cvlist_all_mols |
|
361 | 373 | ||
362 | 374 | ||
363 | 375 | def normalized_deviation(enelist): |
@@ -408,7 +420,7 @@
Loading
408 | 420 | return spread_by_mol |
|
409 | 421 | ||
410 | 422 | ||
411 | - | def rmsd_two_files(dict1, dict2): |
|
423 | + | def rmsd_two_files(dict1, dict2, mol_slice=[]): |
|
412 | 424 | """ |
|
413 | 425 | From files in input dictionaries, read in molecules, extract information |
|
414 | 426 | from SD tags, compute relative conformer energies wrt to first conformer |
@@ -423,6 +435,9 @@
Loading
423 | 435 | dict2 : dict |
|
424 | 436 | dictionary of input files and information to extract from SD tags |
|
425 | 437 | keys are: 'theory' 'fname' 'tagkey' 'label' |
|
438 | + | mol_slice : list |
|
439 | + | list of indices from which to slice mols generator for read_mols |
|
440 | + | [start, stop, step] |
|
426 | 441 | ||
427 | 442 | Returns |
|
428 | 443 | ------- |
@@ -448,8 +463,8 @@
Loading
448 | 463 | compared_enes = [] |
|
449 | 464 | ||
450 | 465 | # load molecules and extract data |
|
451 | - | mols1_titles, mols1_indices, mols1_enes, mols1_nans = extract_enes(dict1) |
|
452 | - | mols2_titles, mols2_indices, mols2_enes, mols2_nans = extract_enes(dict2) |
|
466 | + | mols1_titles, mols1_indices, mols1_enes, mols1_nans = extract_enes(dict1, mol_slice) |
|
467 | + | mols2_titles, mols2_indices, mols2_enes, mols2_nans = extract_enes(dict2, mol_slice) |
|
453 | 468 | ||
454 | 469 | # check that number of molecules exist in both files |
|
455 | 470 | if len(mols1_titles) != len(mols2_titles): |
@@ -513,7 +528,7 @@
Loading
513 | 528 | ### ------------------- Script ------------------- |
|
514 | 529 | ||
515 | 530 | ||
516 | - | def survey_energies_ref(wholedict, ref_index, outfn='relene-rmsd.dat'): |
|
531 | + | def survey_energies_ref(wholedict, ref_index, mol_slice=[], outfn='relene-rmsd.dat'): |
|
517 | 532 | """ |
|
518 | 533 | Compute RMSD of relative conformer energies with respect to the data |
|
519 | 534 | in ref_index spot. The relative energies by conformer are computed first, |
@@ -526,6 +541,9 @@
Loading
526 | 541 | keys are: 'theory' 'fname' 'tagkey' 'label' |
|
527 | 542 | ref_index : int |
|
528 | 543 | integer to specify reference file of wholedict[ref_index] |
|
544 | + | mol_slice : list |
|
545 | + | list of indices from which to slice mols generator for read_mols |
|
546 | + | [start, stop, step] |
|
529 | 547 | outfn : str |
|
530 | 548 | name of the output file with RMSDs of energies |
|
531 | 549 |
@@ -554,7 +572,7 @@
Loading
554 | 572 | ||
555 | 573 | # each of the four returned vars is (file) list of (mols) lists |
|
556 | 574 | titleMols, rmsds, confNums, reference_enes, compared_enes =\ |
|
557 | - | rmsd_two_files(wholedict[ref_index], wholedict[i]) |
|
575 | + | rmsd_two_files(wholedict[ref_index], wholedict[i], mol_slice) |
|
558 | 576 | wholedict[i]['titleMols'] = titleMols |
|
559 | 577 | wholedict[i]['rmsds'] = rmsds |
|
560 | 578 | wholedict[i]['confNums'] = confNums |
@@ -586,7 +604,7 @@
Loading
586 | 604 | return wholedict |
|
587 | 605 | ||
588 | 606 | ||
589 | - | def survey_energies(wholedict, outfn='relene.dat'): |
|
607 | + | def survey_energies(wholedict, mol_slice=[], outfn='relene.dat'): |
|
590 | 608 | """ |
|
591 | 609 | Compute the spread of the conformer energies for each molecule in each file. |
|
592 | 610 | The spread can be computed as the coefficient of variation of all methods, |
@@ -604,6 +622,9 @@
Loading
604 | 622 | wholedict : OrderedDict |
|
605 | 623 | ordered dictionary of input files and information to extract from SD tags |
|
606 | 624 | keys are: 'theory' 'fname' 'tagkey' 'label' |
|
625 | + | mol_slice : list |
|
626 | + | list of indices from which to slice mols generator for read_mols |
|
627 | + | [start, stop, step] |
|
607 | 628 | outfn : str |
|
608 | 629 | name of the output file with RMSDs of energies |
|
609 | 630 |
@@ -615,14 +636,13 @@
Loading
615 | 636 | ||
616 | 637 | """ |
|
617 | 638 | ||
618 | - | description = 'relative energies (kcal/mol)' |
|
619 | 639 | num_files = len(wholedict) |
|
620 | 640 | ||
621 | 641 | # Write description in output file. |
|
622 | 642 | compF = open(outfn, 'w') |
|
623 | - | compF.write(f"# Spread of {description} over diff QM methods averaged over conformers\n") |
|
643 | + | compF.write(f"# Variability in energies from diff QM methods averaged over conformers\n") |
|
624 | 644 | for i, d in enumerate(wholedict.values()): |
|
625 | - | compF.write("# File %d: %s\n" % (i, d['fname'])) |
|
645 | + | compF.write("# File %d: %s\n" % (i+1, d['fname'])) |
|
626 | 646 | compF.write("# calc=%s, %s\n" % (d['tagkey'], d['theory'])) |
|
627 | 647 | ||
628 | 648 | enelist = [] |
@@ -630,7 +650,7 @@
Loading
630 | 650 | nanlist = [] |
|
631 | 651 | for i in range(num_files): |
|
632 | 652 | print("Extracting data for file: %s" % wholedict[i]['fname']) |
|
633 | - | titleMols, confNums, compared_enes, confNans = extract_enes(wholedict[i]) |
|
653 | + | titleMols, confNums, compared_enes, confNans = extract_enes(wholedict[i], mol_slice) |
|
634 | 654 | enelist.append(compared_enes) |
|
635 | 655 | idxlist.append(confNums) |
|
636 | 656 | nanlist.append(confNans) |
@@ -671,8 +691,8 @@
Loading
671 | 691 | "Cannot calculate energy spread.") |
|
672 | 692 | spreadlist = [-1]*len(relenelist[0]) |
|
673 | 693 | else: |
|
674 | - | # estimate spread of data |
|
675 | - | spreadlist = avg_coeffvar(relenelist) |
|
694 | + | # estimate spread of data on ABSOLUTE energies |
|
695 | + | spreadlist, cvlist_all_mols = avg_coeffvar(enelist) |
|
676 | 696 | #spreadlist = normalized_deviation(relenelist) |
|
677 | 697 | ||
678 | 698 |
@@ -681,16 +701,20 @@
Loading
681 | 701 | compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m]) |
|
682 | 702 | ||
683 | 703 | # for this mol, write the rmsd from each file side by side |
|
684 | - | line = ' Spread: {}'.format(spreadlist[m]) |
|
704 | + | line = ' {:.2f}% averaged CV'.format(spreadlist[m]*100) |
|
685 | 705 | compF.write(line) |
|
686 | 706 | compF.write('\n# ==================================================================') |
|
687 | - | compF.write(f'\n# conf\t{description} in column order by file (listed at top)') |
|
707 | + | compF.write(f'\n# rel enes (kcal/mol), column i = file i, row j = conformer j') |
|
688 | 708 | ||
689 | 709 | # for this mol, write the compared_enes from each file by columns |
|
690 | 710 | for c in range(len(wholedict[1]['confNums'][m])): |
|
691 | 711 | line = '\n' + str(wholedict[1]['confNums'][m][c]) + '\t' |
|
692 | 712 | for i in range(num_files): |
|
693 | - | line += (str(wholedict[i]['compared_enes'][m][c]) + '\t') |
|
713 | + | line += '{:.4f}\t'.format(wholedict[i]['compared_enes'][m][c]) |
|
714 | + | ||
715 | + | # print overall contribution to spread |
|
716 | + | line += '# {:.2f}% CV'.format(cvlist_all_mols[m][c]*100) |
|
717 | + | ||
694 | 718 | compF.write(line) |
|
695 | 719 | ||
696 | 720 | compF.close() |
@@ -698,7 +722,7 @@
Loading
698 | 722 | return wholedict |
|
699 | 723 | ||
700 | 724 | ||
701 | - | def survey_times(wholedict, mol_slice): |
|
725 | + | def survey_times(wholedict, mol_slice=[]): |
|
702 | 726 | """ |
|
703 | 727 | Average all conformers' calculations times for each molecule in each file. |
|
704 | 728 | Generate grouped bar plot where x=QM method, y=time(s). Molecules are |
@@ -828,10 +852,9 @@
Loading
828 | 852 | ||
829 | 853 | if analyze_energies: |
|
830 | 854 | if ref_index is None: |
|
831 | - | wholedict = survey_energies(wholedict) |
|
855 | + | wholedict = survey_energies(wholedict, mol_slice) |
|
832 | 856 | else: |
|
833 | - | wholedict = survey_energies_ref(wholedict, ref_index) |
|
857 | + | wholedict = survey_energies_ref(wholedict, ref_index, mol_slice) |
|
834 | 858 | # TODO move this part inside the survey_energies fx |
|
835 | - | # TODO add mol_slice parameter to survey_energies* functions |
|
836 | 859 | if plot_enes: |
|
837 | 860 | arrange_and_plot(wholedict, "RMSDs of relative conformer energies") |
Files | Coverage |
---|---|
quanformer | 87.91% |
Project Totals (11 files) | 87.91% |
163.1
TRAVIS_OS_NAME=osx
163.2
TRAVIS_PYTHON_VERSION=3.6 TRAVIS_OS_NAME=linux
164.1
TRAVIS_OS_NAME=osx
164.2
TRAVIS_PYTHON_VERSION=3.6 TRAVIS_OS_NAME=linux
165.1
TRAVIS_OS_NAME=osx
165.2
TRAVIS_PYTHON_VERSION=3.6 TRAVIS_OS_NAME=linux
166.1
TRAVIS_OS_NAME=osx
166.2
TRAVIS_PYTHON_VERSION=3.6 TRAVIS_OS_NAME=linux
167.2
TRAVIS_PYTHON_VERSION=3.6 TRAVIS_OS_NAME=linux
167.1
TRAVIS_OS_NAME=osx
1 |
# Codecov configuration to make it a bit less noisy
|
2 |
coverage: |
3 |
status: |
4 |
patch: false |
5 |
project: |
6 |
default: |
7 |
threshold: 50% |
8 |
comment: |
9 |
layout: "header" |
10 |
require_changes: false |
11 |
branches: null |
12 |
behavior: default |
13 |
flags: null |
14 |
paths: null |
15 |
ignore: |
16 |
- "quanformer/match_plot.py" |
17 |
|
18 |
- "quanformer/quan2modsem.py" |
19 |
- "quanformer/match_minima.py" |
20 |
- "quanformer/opt_vs_spe.py" |
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file.
The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files.
The size and color of each slice is representing the number of statements and the coverage, respectively.