Compare d2e0030 ... +4 ... 21d98e8

Showing 2 of 2 files from the diff.

@@ -54,7 +54,7 @@
Loading
54 54
        ref_index=None,
55 55
        plot_enes=False
56 56
    )
57 -
    os.remove('relene.dat')
57 +
    #os.remove('relene.dat')
58 58
59 59
def test_survey_energies_reference():
60 60
    survey_confs(
@@ -82,5 +82,10 @@
Loading
82 82
# test manually without pytest
83 83
if 0:
84 84
    sys.path.insert(0, '/home/limvt/Documents/quanformer/quanformer')
85 +
    sys.path.insert(0, '/home/limvt/Documents/quanformer/quan_mobley/quanformer')
85 86
    from survey_confs import *
86 87
    test_survey_times()
88 +
    test_survey_times_wrongtag()
89 +
    test_survey_energies()
90 +
    test_survey_energies_reference()
91 +
    test_survey_energies_plot()

@@ -317,126 +317,90 @@
Loading
317 317
    return rel_enes
318 318
319 319
320 -
def avg_coeffvar(enelist, ref_conf_int=0):
321 -
    """
322 -
    For the ABSOLUTE energies of a single conformer, compute
323 -
        coefficient of variation (CV) across methods, then
324 -
        average the CVs across all conformers for each molecule.
320 +
def kde(x):
321 +
    x = np.asarray(x)
322 +
    kde = stats.gaussian_kde(x)
323 +
    return kde
325 324
326 -
    Note: Input energies can be raw energies, or scaled energies (such that
327 -
        energies are still on ratio scale).
328 -
    Note: It is incorrect to compute the coefficient of variation on relative
329 -
        values. See wikipedia for an example on temperature.
330 -
    Note: The units of the input shouldn't matter since the CV is a ratio,
331 -
        meaning that units should cancel.
332 325
333 -
    Parameters
334 -
    ----------
335 -
    enelist : list of lists
336 -
        enelist[i][j] is absolute energy of file i, mol j
326 +
def normalize(x):
327 +
    x = np.asarray(x)
328 +
    return (x - x.min()) / np.ptp(x)
337 329
338 -
    Returns
339 -
    -------
340 -
    spread_by_mol : list
341 -
        len(spread_by_mol) == number of molecules
342 -
        spread_by_mol[i] == avg(CVs(conformer enes from diff methods) of mol i
343 -
    cvlist_all_mols : list of lists
344 -
        len(cvlist_all_mols) == number of molecules
345 -
        len(cvlist_all_mols[i]) == number of conformers for molecule i minus 1
346 -
347 -
    Notes
348 -
    -----
349 -
    * This takes population stdevs, as opposed to sample stdevs.
350 -
    * Validated for single conformer, and for three conformers.
351 330
352 -
    """
331 +
def something(enelist, moltitles):
353 332
354 -
    # 1D list for average of CVs of each mol
355 -
    spread_by_mol = []
356 -
    cvlist_all_mols = []
357 333
358 -
    # reshape list so that enelist[i][j] is mol i, file j
359 -
    enelist = np.array(enelist).T
334 +
    print(enelist)
335 +
    print(enelist.shape)
336 +
337 +
    # enelist[x][y][z] is xth file, yth molecule, zth conformer
338 +
    # number of files/mols should be consistent
339 +
    num_files = enelist.shape[0]
340 +
    num_mols = enelist.shape[1]
360 341
361 -
    # here, mols_ene[x][y] is ith molecule, xth file, and yth conformer
362 -
    for i, mols_ene in enumerate(enelist):
363 -
        cvlist = []
342 +
    for i in range(num_mols):
343 +
        molname = moltitles[i]
344 +
345 +
        enes_mol_i = [enelist[j][i] for j in range(num_files)]
364 346
365 347
        # check that at least two conformers exist
366 -
        num_confs = mols_ene[0].shape[0]
367 -
        if num_confs <= 1:
348 +
        all_num_confs = [enes_mol_i[k].shape[0] for k in range(num_files)]
349 +
        min_num_confs = min(all_num_confs)
350 +
        max_num_confs = max(all_num_confs)
351 +
        print(f"{molname} number of conformers: {all_num_confs}")
352 +
        if min_num_confs <= 1:
368 353
            print(f"skipping molecule {i} since only 0 or 1 conformer")
369 -
            spread_by_mol.append(np.nan)
370 -
            cvlist_all_mols.append(num_confs*[np.nan])
354 +
#            spread_by_mol.append(np.nan)
355 +
#            cvlist_all_mols.append(num_confs*[np.nan])
371 356
            continue
372 357
373 -
        # number of files is mols_ene.shape[0]; should be same for all mols
374 -
        for j in range(num_confs):
375 -
376 -
            # for single conformer j, get its energies from all k files
377 -
            confs_ene = [mols_ene[k][j] for k in range(mols_ene.shape[0])]
378 -
379 -
            # compute coeff of variation
380 -
            with np.errstate(divide='ignore', invalid='ignore'):
381 -
                cv = np.std(confs_ene) / np.average(confs_ene)
382 -
            cvlist.append(cv)
383 -
384 -
        # remove ref conf from avg calc bc energies scaled by this conf
385 -
        cvlist.pop(ref_conf_int)
386 -
387 -
        # average the coeffs of variation
388 -
        spread_by_mol.append(np.average(cvlist))
389 -
        cvlist_all_mols.append(cvlist)
390 -
391 -
    return spread_by_mol, cvlist_all_mols
392 -
393 -
394 -
def normalized_deviation(enelist):
395 -
    """
396 -
    For the relative energies of each conformer of some molecule,
397 -
    subtract the average energy for the different methods, then
398 -
    take the standard deviation of all conformers of all methods.
399 -
400 -
    TODO -- should the stdev be computed for all the conformers
401 -
        of the mol (currently coded) or across the methods
402 -
        (so add the line under np.average)? Would the stdevs of
403 -
        each conformer then be averaged?
404 -
    AKA should you be taking the stdev of the normalized data (current)
405 -
        or take avg of the stdev?
406 -
407 -
    Parameters
408 -
    ----------
409 -
    TODO
410 -
411 -
    Returns
412 -
    -------
413 -
    TODO
414 -
415 -
    """
416 -
417 -
    # enelist[i][j] is for file i, mol j
418 -
    # spread_by_mol is 1D list of len(num_mols)
419 -
    spread_by_mol = []
420 -
421 -
    # reshape list for that i goes to mol and j goes to file
422 -
    enelist = np.array(enelist).T
423 -
    for i, mols_ene in enumerate(enelist):
424 -
        mollist = []
425 -
426 -
        # number of files is mols_ene.shape[0]
427 -
        for j in range(1, mols_ene[0].shape[0]):
428 -
            confs_ene = [mols_ene[k][j] for k in range(mols_ene.shape[0])]
429 -
430 -
            # subtract the average of methods for the conformer
431 -
            # note: not subtracting average conformer energy for each method
432 -
            # bc that would give us estimate of spread conformer energies as
433 -
            # opposed to estimating spread of method energies
434 -
            confs_ene = confs_ene - np.average(confs_ene)
435 -
            mollist.append(confs_ene)
436 -
437 -
        spread_by_mol.append(np.std(mollist))
438 -
439 -
    return spread_by_mol
358 +
        # for reference plot histograms of each file for this mol
359 +
        fig = plt.figure()
360 +
        for k, v in enumerate(range(num_files)):
361 +
            v = v+1
362 +
            ax1 = fig.add_subplot(num_files, 1, v)
363 +
            ax1.hist(enes_mol_i[k])
364 +
        plt.show()
365 +
366 +
        # estimate density of conformer energies of mol_i for each file/method
367 +
        kdes_mol_i = [kde(normalize(enes_mol_i[j])) for j in range(num_files)]
368 +
        # define x-values for which to get value at kde's
369 +
        x = np.linspace(-2, 3, 1000)
370 +
371 +
        # compute area under the curve as sanity check
372 +
        areas = [kdes_mol_i[j].integrate_box_1d(-2, 3) for j in range(num_files)]
373 +
        print(f"{molname} area under kdes: {areas}")
374 +
375 +
        # plot normalized densities of conformer energies for diff methods
376 +
        plt.figure()
377 +
        plt.subplot(2, 1, 1)
378 +
        for j in range(num_files):
379 +
            plt.plot(x, kdes_mol_i[j](x), label=f'file {j}')
380 +
        plt.title(f'{molname} kernel density estimates')
381 +
        plt.legend(loc=2)
382 +
383 +
        plt.subplot(2, 1, 2)
384 +
        x2 = np.linspace(0, 1, 1000)
385 +
        for j in range(num_files):
386 +
            plt.plot(x2, kdes_mol_i[j](x2), label=f'file {j}')
387 +
        plt.show()
388 +
389 +
#        # compute all-by-all relative entropy calculation
390 +
#        # higher value means the two densities are more different
391 +
#        kl_matrix = []
392 +
#        for a in range(len(kdes_mol_i)):
393 +
#          kl_list = []
394 +
#          for b in range(len(kdes_mol_i)):
395 +
#            kl = stats.entropy(kdes_mol_i[a](x), kdes_mol_i[b](x))
396 +
#            kl_list.append(kl)
397 +
#          kl_matrix.append(kl_list)
398 +
#
399 +
#        # plot all-by-all relative entropies
400 +
#        plt.figure()
401 +
#        plt.imshow(kl_matrix, cmap='hot_r', interpolation='nearest', origin='lower')
402 +
#        plt.colorbar()
403 +
#        plt.savefig(f'kullback_mol{i}.png')
440 404
441 405
442 406
def rmsd_two_files(dict1, dict2, mol_slice=[]):
@@ -657,13 +621,14 @@
Loading
657 621
658 622
    num_files = len(wholedict)
659 623
660 -
    # Write description in output file.
624 +
    # write description in output file
661 625
    compF = open(outfn, 'w')
662 626
    compF.write(f"# Variability in energies from diff QM methods averaged over conformers\n")
663 627
    for i, d in enumerate(wholedict.values()):
664 628
        compF.write("# File %d: %s\n" % (i+1, d['fname']))
665 629
        compF.write("#   calc=%s, %s\n" % (d['tagkey'], d['theory']))
666 630
631 +
    # extract data from SDF files
667 632
    enelist = []
668 633
    idxlist = []
669 634
    nanlist = []
@@ -675,12 +640,13 @@
Loading
675 640
        nanlist.append(confNans)
676 641
        wholedict[i]['titleMols'] = titleMols
677 642
643 +
    # only retain molecules common in all files
678 644
    print("Removing mols missing from 1+ files...")
679 -
    # find molecules common in all files
680 645
    molname_sets = [set(wholedict[x]['titleMols']) for x in range(num_files)]
681 646
    molname_common = set.intersection(*molname_sets)
682 647
683 648
    for i in range(num_files):
649 +
684 650
        # find the extraneous mols in this file
685 651
        mols_in_file = wholedict[i]['titleMols']
686 652
        mols_extraneous = list(set(mols_in_file) - molname_common)
@@ -689,65 +655,47 @@
Loading
689 655
        removal_idx = []
690 656
        for molname in mols_extraneous:
691 657
            removal_idx.append(mols_in_file.index(molname))
692 -
        for index in sorted(removal_idx, reverse=True):
693 -
            print(f"  Removing {wholedict[i]['titleMols'][index]} from {wholedict[i]['fname']}")
694 -
            del wholedict[i]['titleMols'][index]
695 -
            del enelist[i][index]
696 -
            del idxlist[i][index]
697 -
            del nanlist[i][index]
698 -
699 -
    print("Removing un-finished conformers and computing relative energies...")
658 +
        for mol_idx in sorted(removal_idx, reverse=True):
659 +
            print(f"  Removing {wholedict[i]['titleMols'][mol_idx]} from {wholedict[i]['fname']}")
660 +
            del wholedict[i]['titleMols'][mol_idx]
661 +
            del enelist[i][mol_idx]
662 +
            del idxlist[i][mol_idx]
663 +
            del nanlist[i][mol_idx]
664 +
665 +
    print("Removing un-finished conformers...")
700 666
    idxlist, enelist = remove_dead_conformers(enelist, idxlist, nanlist)
701 -
702 -
    # scale energies: scale energies by subtracting or dividing conf_j
703 -
    # by conf_i for all j confs in all mols for all files
704 -
    # (1) subtract first conf, or (2) divide first conf
705 -
    if isinstance(ref_conf, int):
706 -
        ref_conf_int = ref_conf
707 -
    rel_enelist = relative_energies(enelist, 'subtract', ref_conf_int)
708 -
    rat_enelist = relative_energies(enelist, 'divide', ref_conf_int)
709 -
710 667
    for i in range(num_files):
711 -
        wholedict[i]['compared_enes'] = rel_enelist[i]
668 +
        wholedict[i]['compared_enes'] = enelist[i]
712 669
        wholedict[i]['confNums'] = idxlist[i]
713 670
714 -
    # check that entire array is not zero, if ALL mols have one conf
715 -
    all_zeros = np.all(np.asarray(rel_enelist)==0)
716 -
    if all_zeros:
717 -
        print("WARNING: All relative energy values are zero. "
718 -
            "Cannot calculate energy spread.")
719 -
        spreadlist = [-1]*len(rel_enelist[0])
720 -
    else:
721 -
        # estimate spread of data on energies
722 -
        spreadlist, cvlist_all_mols = avg_coeffvar(rat_enelist, ref_conf_int)
723 -
        #spreadlist = normalized_deviation(rel_enelist)
724 -
725 -
    # loop over each mol and write energies from wholedict by column
726 -
    for m in range(len(wholedict[1]['titleMols'])):
727 -
        compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m])
728 -
729 -
        # for this mol, write the rmsd from each file side by side
730 -
        line = ' {:.2f} ppm averaged CV'.format(spreadlist[m]*1000000)
731 -
        compF.write(line)
732 -
        compF.write('\n# ==================================================================')
733 -
        compF.write(f'\n# rel enes (kcal/mol), column i = file i, row j = conformer j')
734 -
735 -
        # for this mol, write the compared_enes from each file by columns
736 -
        this_mol_conf_inds = wholedict[1]['confNums'][m]
737 -
        for c in range(len(this_mol_conf_inds)):
738 -
            line = '\n' + str(this_mol_conf_inds[c]) + '\t'
739 -
            for i in range(num_files):
740 -
                line += '{:.4f}\t'.format(wholedict[i]['compared_enes'][m][c])
741 -
742 -
            # print overall contribution to spread, skip ref conf since scaled
743 -
            # include second condition in case ref_conf_int is negative
744 -
            if c == ref_conf_int or c == len(this_mol_conf_inds)+ref_conf_int:
745 -
                line += '#  nan ppm CV'
746 -
            else:
747 -
                line += '# {:.2f} ppm CV'.format(cvlist_all_mols[m][c-1]*1000000)
748 -
            compF.write(line)
749 -
750 -
    compF.close()
671 +
    _ = something(enelist, wholedict[0]['titleMols'])
672 +
673 +
#    # loop over each mol and write energies from wholedict by column
674 +
#    for m in range(len(wholedict[1]['titleMols'])):
675 +
#        compF.write('\n\n# Mol ' + wholedict[1]['titleMols'][m])
676 +
#
677 +
#        # for this mol, write the rmsd from each file side by side
678 +
#        line = ' {:.2f} ppm averaged CV'.format(spreadlist[m]*1000000)
679 +
#        compF.write(line)
680 +
#        compF.write('\n# ==================================================================')
681 +
#        compF.write(f'\n# rel enes (kcal/mol), column i = file i, row j = conformer j')
682 +
#
683 +
#        # for this mol, write the compared_enes from each file by columns
684 +
#        this_mol_conf_inds = wholedict[1]['confNums'][m]
685 +
#        for c in range(len(this_mol_conf_inds)):
686 +
#            line = '\n' + str(this_mol_conf_inds[c]) + '\t'
687 +
#            for i in range(num_files):
688 +
#                line += '{:.4f}\t'.format(wholedict[i]['compared_enes'][m][c])
689 +
#
690 +
#            # print overall contribution to spread, skip ref conf since scaled
691 +
#            # include second condition in case ref_conf_int is negative
692 +
#            if c == ref_conf_int or c == len(this_mol_conf_inds)+ref_conf_int:
693 +
#                line += '#  nan ppm CV'
694 +
#            else:
695 +
#                line += '# {:.2f} ppm CV'.format(cvlist_all_mols[m][c-1]*1000000)
696 +
#            compF.write(line)
697 +
#
698 +
#    compF.close()
751 699
752 700
    return wholedict
753 701

Learn more Showing 14 files with coverage changes found.

Changes in quanformer/survey_confs.py
-43
+2
Loading file...
New file quanformer/tests/test_basic_plot.py
New
Loading file...
New file quanformer/tests/test_confs_to_psi.py
New
Loading file...
New file quanformer/tests/test_reader.py
New
Loading file...
New file quanformer/tests/test_proc_tags.py
New
Loading file...
New file quanformer/tests/test_pipeline.py
New
Loading file...
New file quanformer/tests/test_filter_confs.py
New
Loading file...
New file quanformer/tests/test_qcarchive.py
New
Loading file...
New file quanformer/tests/test_survey_confs.py
New
Loading file...
New file quanformer/tests/test_get_psi_results.py
New
Loading file...
New file quanformer/tests/test_quanformer.py
New
Loading file...
New file quanformer/tests/test_initialize_confs.py
New
Loading file...
New file quanformer/tests/test_utils.py
New
Loading file...
New file quanformer/_version.py
New
Loading file...
Files Coverage
quanformer -34.73% 51.61%
Project Totals (24 files) 51.61%
Loading