1
#!/usr/bin/env python
2 60
"""
3
filter_confs.py
4

5
This script takes an SDF file and, for each molecule:
6
 - conformers are compared by energy
7
 - conformers are compared by RMSD
8
 - conformers are removed if considered duplicate by energy/RMSD.
9

10
Import and call filter_confs.filter_confs(infile, tag, outfile)
11

12
By: Victoria T. Lim
13

14
"""
15

16

17 60
import os
18

19 60
import openeye.oechem as oechem
20 60
import quanformer.reader as reader
21

22
### ------------------- Functions -------------------
23

24

25 60
def identify_minima(mol, tag, ThresholdE, ThresholdRMSD):
26
    """
27
    For a molecule's set of conformers computed with some level of theory,
28
        whittle down unique conformers based on energy and RMSD.
29

30
    Parameters
31
    ----------
32
    mol           OEChem molecule with all of its conformers
33
    tag           string name of the SD tag in this molecule
34
    ThresholdE    float value for abs(E1-E2), below which 2 confs are "same"
35
        Units are hartrees (default output units of Psi4)
36
    ThresholdR    float value for RMSD, below which 2 confs are "same"
37
        Units are in Angstrom (Psi4 default)
38

39
    Returns
40
    -------
41
    boolean True if successful filter + delete. False if there's only
42
        one conf and it didn't optimize, or something else funky.
43

44
    """
45
    # suppress OERMSD warning
46 60
    level = oechem.OEThrow.GetLevel()
47 60
    oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Error)
48

49
    # Parameters for OpenEye RMSD calculation
50 60
    automorph = True
51 60
    heavyOnly = False
52 60
    overlay = True
53

54
    # declare variables for conformers to delete
55 60
    confsToDel = set()
56 60
    delCount = 0
57

58
    # check if SD tag exists for the case of single conformer
59 60
    if mol.NumConfs() == 1:
60 60
        mol.GetConfs().next()
61 60
        for x in oechem.OEGetSDDataPairs(mol):
62 0
            if tag.lower() in x.GetTag().lower():
63 0
                return True
64
            else:
65 0
                return False
66

67
    # Loop over conformers twice (NxN diagonal comparison of RMSDs)
68 60
    for confRef in mol.GetConfs():
69 60
        print(" ~ Matching conformers to reference: "
70
              f"{mol.GetTitle()} conf {confRef.GetIdx() + 1}")
71

72
        # get real tag (correct for capitalization)
73 60
        for x in oechem.OEGetSDDataPairs(confRef):
74 60
            if tag.lower() in x.GetTag().lower():
75 60
                taglabel = x.GetTag()
76

77
        # check that there was a successfully found tag
78 60
        try:
79 60
            taglabel
80 60
        except NameError:
81 60
            print("Unable to filter bc missing SD data based on tag: {}".
82
                  format(tag))
83 60
            return False
84

85
        # delete cases that don't have energy (opt not converged; or other)
86 60
        if not oechem.OEHasSDData(confRef, taglabel):
87 60
            confsToDel.add(confRef.GetIdx())
88 60
            delCount += 1
89 60
            continue
90 60
        refE = float(oechem.OEGetSDData(confRef, taglabel))
91

92 60
        for confTest in mol.GetConfs():
93
            # upper right triangle comparison
94 60
            if confTest.GetIdx() <= confRef.GetIdx():
95 60
                continue
96
            # skip cases already set for removal
97 60
            if confTest.GetIdx() in confsToDel:
98 60
                continue
99
            # delete cases that don't have energy
100 60
            if not oechem.OEHasSDData(confTest, taglabel):
101 60
                confsToDel.add(confTest.GetIdx())
102 60
                continue
103

104 60
            testE = float(oechem.OEGetSDData(confTest, taglabel))
105
            # if MM (not Psi4) energies, convert absERel to Hartrees
106 60
            if 'mm' in taglabel.lower():
107 60
                absERel = abs(refE - testE) / 627.5095
108
            else:
109 60
                absERel = abs(refE - testE)
110
            # if energies are diff enough --> confs are diff --> keep & skip ahead
111 60
            if absERel > ThresholdE:
112 60
                continue
113
            # if energies are similar, see if they are diff by RMSD
114 60
            rmsd = oechem.OERMSD(confRef, confTest, automorph, heavyOnly,
115
                                 overlay)
116
            # if measured_RMSD < threshold_RMSD --> confs are same --> delete
117 60
            if rmsd < ThresholdRMSD:
118 60
                confsToDel.add(confTest.GetIdx())
119

120
    # reset oechem warning level
121 60
    oechem.OEThrow.SetLevel(level)
122

123
    # for the same molecule, delete tagged conformers
124 60
    print("%s original number of conformers: %d" % (mol.GetTitle(),
125
                                                    mol.NumConfs()))
126 60
    if delCount == mol.NumConfs():
127
        # all conformers in this mol has been tagged for deletion
128 0
        return False
129 60
    for conf in mol.GetConfs():
130 60
        if conf.GetIdx() in confsToDel:
131 60
            print('Removing %s conformer index %d' % (mol.GetTitle(),
132
                                                      conf.GetIdx()))
133 60
            if not mol.DeleteConf(conf):
134 0
                oechem.OEThrow.Fatal("Unable to delete %s GetIdx() %d" \
135
                                  % (mol.GetTitle(), conf.GetIdx()))
136 60
    return True
137

138

139
### ------------------- Script -------------------
140

141

142 60
def filter_confs(infile, tag, outfile):
143
    """
144
    Read in OEMols (and each of their conformers) in 'infile'.
145
    For each molecule:
146
        rough filter conformers based on energy differences specified by 'tag',
147
        fine filter conformers based on RMSD values.
148

149
    Parameters
150
    ----------
151
    infile : str
152
        Name of SDF file with conformers to be filtered
153
    tag : str
154
        SD tag name with the energy value to roughly screen conformers before RMSD
155
        Screening works by removing conformers of very similar energies, where
156
        "similar" is defined by thresE parameter. Examples:
157
        - "QM Psi4 Final Opt. Energy (Har) mp2/def-sv(p)"
158
        - "QM Psi4 Final Single Pt. Energy (Har) mp2/def-sv(p)"
159
    outfile : str
160
        Name of the output file with filtered conformers
161

162
    """
163
    # Parameters for distinguishing cutoff of conformer similarity
164 60
    thresE = 5.E-4  # declare confs diff & skip RMSD comparison above this threshold
165 60
    thresRMSD = 0.2  # above this threshold (Angstrom), confs are "diff" minima
166

167 60
    wdir, fname = os.path.split(infile)
168 60
    numConfsF = open(os.path.join(os.getcwd(), "numConfs.txt"), 'a')
169 60
    numConfsF.write("\n{}\n".format(tag))
170

171
    # open molecule file
172 60
    rmsd_molecules = reader.read_mols(infile)
173

174
    # Open outstream file.
175 60
    rmsd_ofs = oechem.oemolostream()
176 60
    if os.path.exists(outfile):
177 60
        raise FileExistsError("Output file {} already exists in {}".format(
178
            outfile, os.getcwd()))
179 60
    if not rmsd_ofs.open(outfile):
180 0
        oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)
181

182
    # Identify minima and write output file.
183 60
    for mol in rmsd_molecules:
184 60
        if identify_minima(mol, tag, thresE, thresRMSD):
185 60
            numConfsF.write("%s\t%s\n" % (mol.GetTitle(), mol.NumConfs()))
186 60
            oechem.OEWriteConstMolecule(rmsd_ofs, mol)
187
        else:
188 0
            numConfsF.write("%s\t0\n" % (mol.GetTitle()))
189 60
    numConfsF.close()
190 60
    rmsd_ofs.close()
191

192 60
    print("Done filtering %s to %s.\n" % (fname, outfile))

Read our documentation on viewing source code .

Loading