1
#!/usr/bin/env python
2 60
"""
3
get_psi_results.py
4

5
Purpose:    Process results from a Psi4 QM calculation.
6
By:         Victoria T. Lim
7
Version:    Oct 12 2018
8

9
NOTE:       Psi4 results are obtained by parsing output text file.
10
            Not sure if this is most efficient.
11
            JSON wrapper has some limitations though (as of Dec 2018).
12
            Also, not sure best way of parsing: via iterator (current
13
            approach) or by using some sort of find or regex command?
14

15
"""
16

17 60
import os, sys
18 60
import pickle
19 60
import warnings
20 60
import openeye.oechem as oechem
21 60
import quanformer.proc_tags as pt
22 60
import quanformer.reader as reader
23

24
### ------------------- Functions -------------------
25

26

27 60
def initiate_dict():
28
    """
29
    Initiate conformer's data dict and set initial details
30

31
    Returns
32
    -------
33
    dictionary with keys for 'package' and 'missing'
34
        'package' is the name of the QM software package
35
        'missing' is whether the output file can be found
36

37
    """
38

39 60
    props = {}
40 60
    props['package'] = "Psi4"
41 60
    props['missing'] = False
42

43 60
    return props
44

45

46 60
def get_conf_data(props, calctype, timeout, psiout):
47
    """
48
    Call the relevant functions to process the output
49
    files for one calculation.
50

51
    Parameters
52
    ----------
53
    """
54

55
    # get wall clock time of the job
56 60
    props['time'] = get_psi_time(timeout)
57

58
    # get calculation details
59 60
    props = process_psi_out(psiout, props, calctype)
60

61 60
    return props
62

63

64 60
def set_conf_data(mol, props, calctype):
65

66
    # Set last coordinates from optimization. skip if missing.
67 60
    if 'coords' in props and len(props['coords']) != 0:
68 60
        mol.SetCoords(oechem.OEFloatArray(props['coords']))
69

70
    # Set SD tags for this molecule
71 60
    pt.set_sd_tags(mol, props, calctype)
72

73 60
    return mol
74

75

76 60
def check_title(mol, infile):
77
    """
78
    Check if mol has title which is required for setting up new subdirectories.
79
    If not title listed, rename molecule based on filename. Process filename
80
    to remove dashes and dots due to file name limitations of pipeline.
81

82
    Parameters
83
    ----------
84
    mol : OpenEye OEMol
85
    infile : string, name of the file from which the mol came
86

87
    Returns
88
    -------
89
    OpenEye OEMol
90

91
    """
92 60
    extract_fname = os.path.splitext(os.path.basename(infile))[0]
93 60
    extract_fname = extract_fname.replace("-", "")
94 60
    extract_fname = extract_fname.replace(".", "")
95 60
    if mol.GetTitle() == "":
96 60
        mol.SetTitle(extract_fname)
97

98 60
    return mol
99

100

101 60
def get_psi_time(filename):
102
    """
103
    Get wall-clock time from Psi4 file. If multiple times are present,
104
        the average will be taken. Used in CompareTimes(...) function.
105

106
    Parameters
107
    ----------
108
    filename: string name of the timefile. E.g. "timer.dat"
109

110
    Returns
111
    -------
112
    time: float of the average wall-clock time of a single timefile
113
        OR string of error message if file not found
114

115
    """
116

117
    # check whether file exists
118 60
    if not os.path.isfile(filename):
119 0
        print("*** ERROR: timer file not found: {} ***".format(filename))
120 0
        return ("Timer output file not found")
121

122
    # read file and extract time
123 60
    with open(filename) as fname:
124 60
        times = []
125 60
        for line in fname:
126 60
            if "Wall Time:" in line:
127 60
                times.append(float(line.split()[2]))
128 60
    if len(times) == 0:
129
        # happens if timer.dat is empty
130 0
        raise IOError(f"Error reading time from {filename}")
131 60
    time = sum(times) / float(len(times))
132

133 60
    return time
134

135

136 60
def get_scs_mp2(lines):
137
    """
138
    Get final SCS MP2 energy from output file. This is handled a little
139
    differently, outside of iterator because (1) have to do for both SPE
140
    and OPT, (2) SCS-MP2 may be listed multiple times, not just at end of file,
141
    so must be able to get the last one directly ideally without storing them
142
    all in a list first, (3) checking the qm method for each line seems silly,
143
    since SCS-MP2 energies would only be sought when method is mp2.
144

145
    This function called by the process_psi_out function.
146

147
    Note: MP2 Hessian output also has this but not sure if should be extracted
148
    since am not extracting optimized energies.
149

150
    Parameters
151
    ----------
152
    lines : list
153
        list of lines of Psi4 output file from Python readlines function
154

155
    Returns
156
    -------
157
    scs_ene : float
158
        value of the final SCS-MP2 total energy in original units of Psi4 (Har)
159

160
    """
161 60
    matching = [s for s in lines if "SCS Total Energy" in s]
162 60
    ene = float(matching[-1].split()[4])
163 60
    return ene
164

165

166 60
def process_psi_out(filename, properties, calctype='opt'):
167
    """
168
    Go through output file and get level of theory (method and basis set),
169
        number of optimization steps, initial and final energies, and
170
        optimized coordinates. Returns all this information in a dictionary
171
        that was passed to this function.
172

173
    Parameters
174
    ----------
175
    filename: string name of the output file. E.g. "output.dat"
176
    properties: dictionary where all the data will go. Can be empty or not.
177
    calctype: string; one of 'opt','spe','hess' for geometry optimization,
178
        single point energy calculation, or Hessian calculation
179

180
    Returns
181
    -------
182
    properties: dictionary with summarized data from output file.
183
        spe keys:  basis, method, finalEnergy
184
        opt keys:  basis, method, numSteps, initEnergy, finalEnergy, coords
185
        hess keys: basis, method, hessian
186

187
    """
188

189
    # check whether output file exists
190 60
    if not os.path.isfile(filename):
191 0
        print("*** ERROR: Output file not found: {} ***".format(filename))
192 0
        properties['missing'] = True
193 0
        return properties
194

195
    # open and read file
196 60
    f = open(filename, "r")
197 60
    lines = f.readlines()
198 60
    it = iter(lines)
199

200
    # process results for single point energy calculation
201 60
    if calctype == 'spe':
202 60
        for line in it:
203 60
            if "set basis" in line:
204 60
                properties['basis'] = line.split()[2]
205 60
            if "energy(" in line:
206 60
                properties['method'] = line.split('\'')[1]
207
            # this line should only show up once for spe
208 60
            if "Total Energy =" in line:
209 60
                properties['finalEnergy'] = float(line.split()[3])
210

211
        # check for scs-mp2 energy if applicable
212 60
        if properties['method'].lower() == 'mp2':
213 60
            scs_ene = get_scs_mp2(lines)
214 60
            properties['finalSCSEnergy'] = scs_ene
215

216 60
        return properties
217

218
    # process results for Hessian calculation
219 60
    elif calctype == 'hess':
220 60
        import math
221 60
        import numpy as np
222 60
        hess = []
223 60
        for line in it:
224 60
            if "set basis" in line:
225 60
                properties['basis'] = line.split()[2]
226 60
            if "=hessian(" in ''.join(line.split()):  # rm spaces around eql
227 60
                properties['method'] = line.split('\'')[1]
228 60
            if "## Hessian" in line:
229 60
                for i in range(4):
230 60
                    next(it)
231
                # (1) "Irrep:"
232
                # (2) blank line
233
                # (3) column index labels
234
                # (4) blank line
235

236
                # convert the rest of file/iterator to list
237 60
                rough = list(it)
238
                # remove last 5 lines: four blank, one exit message
239 60
                rough = rough[:-5]
240
                # convert strings to floats
241 60
                rough = [[float(i) for i in line.split()] for line in rough]
242
                # find blank sublists
243 60
                empty_indices = [i for i, x in enumerate(rough) if not x]
244
                # blank sublists are organized in groups of three lines:
245
                # (1) blank (2) column labels (3) blank. empty_ind has (1) and (3)
246
                # delete in reverse to maintain index consistency
247 60
                num_chunks = 1  # how many chunks psi4 printed the Hessian into
248 60
                for i in reversed(range(0, len(empty_indices), 2)):
249 60
                    del rough[empty_indices[i + 1]]  # (3)
250 60
                    del rough[empty_indices[i] + 1]  # (2)
251 60
                    del rough[empty_indices[i]]  # (1)
252 60
                    num_chunks += 1
253
                # now remove first element of every list of row label
254 60
                _ = [l.pop(0) for l in rough]
255
                # get dimension of Hessian (3N for 3Nx3N matrix)
256 60
                three_n = int(len(rough) / num_chunks)
257
                # concatenate the chunks
258 60
                hess = np.array([]).reshape(three_n, 0)
259 60
                for i in range(num_chunks):
260 60
                    beg_ind = i * three_n
261 60
                    end_ind = (i + 1) * three_n
262 60
                    chunk_i = np.array(rough[beg_ind:end_ind])
263 60
                    hess = np.concatenate((hess, chunk_i), axis=1)
264
                # check that final matrix is symmetric
265 60
                if not np.allclose(hess, hess.T, atol=0):
266 0
                    print("ERROR: Quanformer did not read symmetric Hessian "
267
                          "from Psi4 output file")
268 0
                    properties['hessian'] = "Hessian not found in output file"
269 0
                    return properties
270 60
                properties['hessian'] = hess
271
            # the Hessian pattern was never found
272
            else:
273 60
                properties['hessian'] = "Hessian not found in output file"
274

275 60
        return properties
276

277
    # process results for geometry optimization
278
    # loop through file to get method, basis set, numSteps, energies, coords
279 60
    rough = []
280 60
    coords = []
281 60
    for line in it:
282 60
        if "set basis" in line:
283 60
            properties['basis'] = line.split()[2]
284 60
        if "optimize(" in line:
285 60
            properties['method'] = line.split('\'')[1]
286 60
        if "Optimization is complete" in line:
287 60
            properties['numSteps'] = line.strip().split(' ')[5]
288 60
            for _ in range(8):
289 60
                line = next(it)
290 60
            properties['initEnergy'] = float(line.split()[1])
291 60
        if "Final energy" in line:
292 60
            properties['finalEnergy'] = float(line.split()[3])
293 60
            for i in range(3):
294 60
                line = next(it)
295
                # (1) "Final (previous) structure:"
296
                # (2) "Cartesian Geometry (in Angstrom)"
297
                # (3) Start of optimized geometry
298 60
            while "Saving final" not in line:
299 60
                rough.append(line.split()[1:4])
300 60
                line = next(it)
301

302
    # check for scs-mp2 energy if applicable
303 60
    if properties['method'].lower() == 'mp2':
304 60
        scs_ene = get_scs_mp2(lines)
305 60
        properties['finalSCSEnergy'] = scs_ene
306

307
    # convert the 2D (3xN) coordinates to 1D list of length 3N (N atoms)
308 60
    for atomi in rough:
309 60
        coords += [float(i) for i in atomi]
310 60
    properties['coords'] = coords
311 60
    f.close()
312 60
    return properties
313

314

315
### ------------------- Script -------------------
316

317

318 60
def get_psi_results(origsdf,
319
                    finsdf,
320
                    calctype='opt',
321
                    psiout="output.dat",
322
                    timeout="timer.dat"):
323
    """
324
    Read in OEMols (and each of their conformers) in origsdf file,
325
        get results from Psi4 calculations in the same directory as origsdf,
326
        and write out results into finsdf file.
327
    Directory layout is .../maindir/molName/confNumber/outputfiles .
328
    Both origsdf and finsdf are located in maindir.
329

330
    Parameters
331
    ----------
332
    origsdf:  string - original SDF file of input structures of QM calculation
333
    finsdf:   string - full name of final SDF file with optimized results.
334
    calctype: string; one of 'opt','spe','hess' for geometry optimization,
335
        single point energy calculation, or Hessian calculation
336
    psiout:   string - name of the Psi4 output files. Default is "output.dat"
337
    timeout: string - name of the Psi4 timer files. Default is "timer.dat"
338

339
    Returns
340
    -------
341
    method: string - QM method from Psi4 calculations
342
    basisset: string - QM basis set from Psi4 calculations
343

344
    None is returned if the function returns early (e.g., if output file
345
       already exists)
346

347
    """
348

349 60
    hdir, fname = os.path.split(origsdf)
350 60
    wdir = os.getcwd()
351

352
    # check that specified calctype is valid
353 60
    if calctype not in {'opt', 'spe', 'hess'}:
354 60
        raise ValueError("Specify a valid calculation type.")
355

356
    # read in molecules
357 60
    molecules = reader.read_mols(origsdf)
358

359
    # open outstream file
360 60
    writeout = os.path.join(wdir, finsdf)
361 60
    write_ofs = oechem.oemolostream()
362 60
    if os.path.exists(writeout):
363 60
        raise FileExistsError(f"File already exists: {finsdf}\n")
364 60
    if not write_ofs.open(writeout):
365 0
        oechem.OEThrow.Fatal("Unable to open %s for writing" % writeout)
366

367
    # Hessian dictionary, where hdict['molTitle']['confIndex'] has np array
368 60
    if calctype == 'hess':
369 60
        hdict = {}
370

371
    # for each conformer, process output file and write new data to SDF file
372 60
    for mol in molecules:
373 60
        print("===== %s =====" % (mol.GetTitle()))
374 60
        if calctype == 'hess':
375 60
            hdict[mol.GetTitle()] = {}
376

377 60
        for j, conf in enumerate(mol.GetConfs()):
378

379 60
            props = initiate_dict()
380

381
            # set file locations
382 60
            timef = os.path.join(hdir,
383
                                 "%s/%s/%s" % (mol.GetTitle(), j + 1, timeout))
384 60
            outf = os.path.join(hdir,
385
                                "%s/%s/%s" % (mol.GetTitle(), j + 1, psiout))
386

387
            # process output and get dictionary results
388 60
            props = get_conf_data(props, calctype, timef, outf)
389

390
            # if output was missing or are missing calculation details
391
            # move on to next conformer
392 60
            if props['missing'] or (calctype == 'opt' and not all(
393
                    key in props
394
                    for key in ['numSteps', 'finalEnergy', 'coords'])):
395 60
                print(f"ERROR reading {outf}\nEither Psi4 job was incomplete "
396
                        "or wrong calctype specified\n")
397 60
                method = None
398 60
                basisset = None
399 60
                continue
400

401
            # add data to oemol
402 60
            conf = set_conf_data(conf, props, calctype)
403 60
            method = props['method']
404 60
            basisset = props['basis']
405

406
            # if hessian, append to dict bc does not go to SD tag
407 60
            if calctype == 'hess':
408 60
                hdict[mol.GetTitle()][j + 1] = props['hessian']
409

410
            # check mol title
411 60
            conf = check_title(conf, origsdf)
412

413
            # write output file
414 60
            oechem.OEWriteConstMolecule(write_ofs, conf)
415

416
    # if hessian, write hdict out to separate file
417 60
    if calctype == 'hess':
418 60
        hfile = os.path.join(wdir,
419
                             os.path.splitext(finsdf)[0] + '.hess.pickle')
420 60
        pickle.dump(hdict, open(hfile, 'wb'))
421

422
    # close file streams
423 60
    write_ofs.close()
424 60
    return method, basisset
425

426

427 60
def getPsiOne(infile,
428
              outfile,
429
              calctype='opt',
430
              psiout="output.dat",
431
              timeout="timer.dat"):
432
    """
433
    Write out Psi4 optimized geometry details into a new OEMol.
434

435
    Parameters
436
    ----------
437
    infile : string
438
        Name of input geometry file THAT WAS USED TO WRITE THE PSI INPUT.
439
        To ensure that the atom orderings remain constant.
440
    outfile : string
441
        Name of output geometry file with optimized results.
442
    calctype: string; one of 'opt','spe','hess' for geometry optimization,
443
        single point energy calculation, or Hessian calculation
444
    psiout : string
445
        Name of the Psi4 output files. Default is "output.dat"
446
    timeout : string
447
        Name of the Psi4 timer files. Default is "timer.dat"
448

449
    Returns
450
    -------
451
    mol: OpenEye OEMol with data in SD tags
452
    props: dictionary with summarized data from output file.
453
        spe keys:  basis, method, finalEnergy
454
        opt keys:  basis, method, numSteps, initEnergy, finalEnergy, coords
455
        hess keys: basis, method, hessian
456

457
    """
458
    # check that specified calctype is valid
459 60
    if calctype not in {'opt', 'spe', 'hess'}:
460 0
        raise ValueError("Specify a valid calculation type.")
461

462
    # Read in SINGLE MOLECULE .sdf file
463 60
    ifs = oechem.oemolistream()
464 60
    mol = oechem.OEGraphMol()
465 60
    if not ifs.open(infile):
466 0
        raise FileNotFoundError("Unable to open %s for reading" % origsdf)
467 60
    oechem.OEReadMolecule(ifs, mol)
468

469
    # Open outstream file.
470 60
    write_ofs = oechem.oemolostream()
471 60
    if os.path.exists(outfile):
472 0
        print("File already exists: %s. Skip getting results.\n" % (outfile))
473 0
        return (None, None)
474 60
    if not write_ofs.open(outfile):
475 0
        oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)
476

477 60
    props = initiate_dict()
478

479
    # process output and get dictionary results
480 60
    props = get_conf_data(props, calctype, timeout, psiout)
481

482
    # if output was missing or are missing calculation details
483
    # move on to next conformer
484 60
    if props['missing'] or (calctype == 'opt' and not all(
485
            key in props for key in ['numSteps', 'finalEnergy', 'coords'])):
486 0
        warnings.warn(
487
            "ERROR reading {}\nEither Psi4 job was incomplete OR wrong calctype specified\n"
488
            .format(psiout))
489

490
    # add data to oemol
491 60
    mol = set_conf_data(mol, props, calctype)
492

493
    # if hessian, write hdict out to separate file
494 60
    if calctype == 'hess':
495 0
        hfile = os.path.join(os.path.splitext(outfile)[0] + '.hess.pickle')
496 0
        pickle.dump(props['hessian'], open(hfile, 'wb'))
497

498
    # check mol title
499 60
    mol = check_title(mol, infile)
500

501
    # write output file
502 60
    oechem.OEWriteConstMolecule(write_ofs, mol)
503 60
    write_ofs.close()
504

505 60
    return mol, props
506

507

508 60
if __name__ == "__main__":
509 0
    get_psi_results(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4],
510
                    sys.argv[5])

Read our documentation on viewing source code .

Loading