MobleyLab / chemper
1
"""
2
utils.py
3

4
This file provides simple functions that might be of use to people using the ChemPer package
5

6
"""
7

8 2
import os
9 2
import collections
10

11 2
def get_data_path(relative_path, package='chemper'):
12
    """
13
    Returns the absolute path to a specified relative path inside data directory of a given package
14
    For example,
15
        get_data_path('molecules', package='chemper')
16
    will find this path:
17
        [absolute path to Python packages]/chemper/data/molecules/
18

19
    If the final path does not exist it will raise an IOError
20

21
    Parameters
22
    ----------
23
    relative_path : str
24
                    filename or relative path to a file stored in the data/ directory
25
                    of a given package
26
    package : str
27
              name of the Python package that should contain the specified file
28
              default="chemper"
29

30
    Returns
31
    -------
32
    path : str
33
           absolute path to filename in specified package
34
    """
35 2
    from pkg_resources import resource_filename
36 2
    fn = resource_filename(package, os.path.join('data', relative_path))
37

38 2
    if not os.path.exists(fn):
39 2
        raise IOError("Sorry! The absolute path %s was not found, that is %s is not in the data directory for %s" \
40
                          % (fn, relative_path, package))
41

42 2
    return fn
43

44 2
def get_full_path(relative_path, package="chemper"):
45
    """
46
    This function checks to see if this relative path is available locally
47
    if it is it returns the absolute path to that file.
48
    Otherwise it checks if that relative path is located at [package location]/data/relative path
49
    it will raise an IOError if neither location exists.
50

51
    Parameters
52
    ----------
53
    relative_path : str
54
                    relative path to a file available locally or in package/data
55
    package : str
56
              package name with a data directory the file might be located in
57

58
    Returns
59
    -------
60
    abs_path : str
61
               The absolute path to a local file if available, otherwise the abolsute path
62
               to [package]/data/[relative_path]
63
    """
64 2
    if os.path.exists(relative_path):
65 0
        return os.path.abspath(relative_path)
66 2
    return get_data_path(relative_path, package)
67

68
# =======================================
69
# Check SMIRKS validity
70
# =======================================
71

72 2
def is_valid_smirks(smirks):
73
    """
74
    This function checks if a given smirks is valid.
75

76
    Parameters
77
    ----------
78
    smirks : SMIRKS (or SMARTS) string
79

80
    Returns
81
    -------
82
    is_valid : boolean
83
    """
84 2
    from chemper.mol_toolkits.mol_toolkit import Mol
85 2
    mol = Mol.from_smiles('C')
86 2
    try:
87 2
        mol.smirks_search(smirks)
88 2
        return True
89 2
    except ValueError:
90 2
        return False
91

92
# ===================================================================
93
# custom classes and functions for matching sets of SMIRKS to molecules
94
# ===================================================================
95
"""
96
custom dictionaries
97

98
These custom dictionaries are taken from
99
openforcefield.typing.engines.smirnoff.forcefield.
100
They allow tracking of symmetrically equivalent parameters.
101
For example the bonds (1,2) and (2,1), and  
102
the impropers (1,2,3,4) and (1,2,4,3) are equivalent 
103
"""
104

105

106 2
class TransformedDict(collections.MutableMapping):
107
    """A dictionary that applies an arbitrary key-altering
108
       function before accessing the keys"""
109

110 2
    def __init__(self, *args, **kwargs):
111 2
        self.store = dict()
112 2
        self.update(dict(*args, **kwargs))  # use the free update to set keys
113

114 2
    def __getitem__(self, key):
115 2
        return self.store[self.__keytransform__(key)]
116

117 2
    def __setitem__(self, key, value):
118 2
        self.store[self.__keytransform__(key)] = value
119

120 2
    def __delitem__(self, key):
121 0
        del self.store[self.__keytransform__(key)]
122

123 2
    def __iter__(self):
124 2
        return iter(self.store)
125

126 2
    def __len__(self):
127 0
        return len(self.store)
128

129 2
    def __keytransform__(self, key):
130 0
        return key
131

132 2
    def __str__(self): return str(self.store)
133

134

135 2
class ValenceDict(TransformedDict):
136
    """Enforce uniqueness in atom indices"""
137 2
    def __keytransform__(self, key):
138
        """Reverse tuple if first element is larger than last element."""
139
        # Ensure key is a tuple.
140 2
        key = tuple(key)
141
        # Reverse the key if the first element is bigger than the last.
142 2
        if key[0] > key[-1]:
143 2
            key = tuple(reversed(key))
144 2
        return key
145

146

147 2
class ImproperDict(TransformedDict):
148
    """Symmetrize improper torsions"""
149 2
    def __keytransform__(self,key):
150
        """Reorder tuple in numerical order except for element[1] which is the central atom; it retains its position."""
151
        # Ensure key is a tuple
152 0
        key = tuple(key)
153
        # Retrieve connected atoms
154 0
        connectedatoms = [key[0], key[2], key[3]]
155
        # Sort connected atoms
156 0
        connectedatoms.sort()
157
        # Re-store connected atoms
158 0
        key = tuple( [connectedatoms[0], key[1], connectedatoms[1], connectedatoms[2]])
159 0
        return key
160

161

162 2
def get_typed_molecules(smirks_list, input_molecules):
163
    """
164
    Creates a dictionary assigning a typename
165
    for each set of atom indices in each molecule
166
    based on the provided smirks patterns.
167

168
    Parameters
169
    ----------
170
    smirks_list : list[smirks tuples]
171
                  SMIRKS tuples come in the form (label, smirks)
172
    input_molecules : list[Mol]
173
                      Molecules from any supported toolkit (ChemPer, OpenEye, RDKit)
174

175
    Returns
176
    -------
177
    typeDict : embedded dictionary
178
               { index of input_molecule:
179
                    {(tuple of indices): assigned parameter type } }
180
    """
181 2
    from chemper.mol_toolkits.mol_toolkit import Mol
182 2
    molecules = [Mol(m) for m in input_molecules]
183 2
    type_dict = dict()
184 2
    for mol_idx, mol in enumerate(molecules):
185 2
        type_dict[mol_idx] = {}
186 2
        for [label, smirks] in smirks_list:
187 2
            matches = get_smirks_matches(mol, smirks)
188 2
            for match in matches:
189 2
                type_dict[mol_idx][match] = label
190

191 2
    return type_dict
192

193 2
def create_tuples_for_clusters(smirks_list, molecules):
194
    """
195
    This function is used to get clusters of molecular fragments based
196
    on input SMIRKS.
197

198
    For example, let's assume you wanted to find all of the
199
    atoms that match this SMIRKS list
200
    'any', '[*:1]~[*:2]'
201
    'single', '[*:1]-[*:2]'
202

203
    In this case, the "any" bond would match all bonds, but then
204
    the "single" would match all single bonds.
205
    If you were typing Ethene (C=C) then you expect the double bond
206
    between atoms 0 and 1 to match any bond and all C-H bonds to match single.
207

208
    The output in this case would be:
209
    [ ('any', [[ (0, 1) ]] ),
210
      ('single', [[ (0, 2), (0, 3), (1,4), (1,5) ]] )
211
    ]
212

213
    Parameters
214
    ----------
215
    smirks_list : list[smirks tuples]
216
                  This is a list of tuples with the form (label, SMIRKS)
217
    molecules : list[Mols]
218
                This is a list of molecules you want to type with this list of SMIRKS
219
                They can be from any supported toolkit (ChemPer, OpenEye, RDKit)
220

221
    Returns
222
    -------
223
    type_list : list[typing tuples]
224
                These are a list of typing tuples that could be used as input for
225
                the SMIRKSifier with the form shown above:
226
                (label, [
227
                        [ (atom indices), ...], # atoms in mol0 with label
228
                        [ (atom indices), ... ], # atoms in mol1 with label
229
                        ... ] )
230

231
    """
232 2
    ordered_labels = [l for l, s in smirks_list]
233 2
    type_dict = get_typed_molecules(smirks_list, molecules)
234
    # start by getting things in the form
235
    # {label: {mol_idx: list of dictionary of indices} }
236 2
    label_dict = dict()
237 2
    for mol_idx, mol_dict in type_dict.items():
238 2
        for match, label in mol_dict.items():
239 2
            if label not in label_dict:
240 2
                label_dict[label] = [list() for i in range(len(molecules))]
241

242 2
            label_dict[label][mol_idx].append(match)
243

244
    # now we need to resort the mol_lists
245 2
    final_list = [ (l, label_dict[l]) for l in ordered_labels if l in label_dict]
246 2
    return final_list
247

248

249 2
def get_smirks_matches(mol, smirks):
250
    """
251
    Gets atom indices for a smirks string in a given molecule
252

253
    Parameters
254
    ----------
255
    mol : ChemPer Mol
256
    smirks : str
257
             SMIRKS pattern being matched to the molecule
258

259
    Returns
260
    --------
261
    matches : list[atom tuples]
262
              Atom indices where this smirks matches in this atom
263
    """
264 2
    from chemper.graphs.environment import ChemicalEnvironment
265

266 2
    env = ChemicalEnvironment(smirks)
267 2
    if env.get_type().lower() == 'impropertorsion':
268 0
        matches = ImproperDict()
269
    else:
270 2
        matches = ValenceDict()
271

272 2
    for match in mol.smirks_search(smirks):
273 2
        smirks_indices = sorted(list(match.keys()))
274 2
        atom_indices = tuple([match[s].get_index() for s in smirks_indices])
275 2
        matches[atom_indices] = ''
276

277 2
    return matches.keys()
278

279

280
# ===================================================================
281
# classes for evaluating lists of SMIRKS patterns
282
# ===================================================================
283

284 2
def score_match_reference(current_assignments, ref_assignments):
285
    """
286
    This function is intended to score two sets of SMIRKS patterns or other
287
    typing rules to see how similar they are.
288

289
    Typing assignments are provided in the form of embedded dictionaries:
290
        { molecule index:
291
            { (atom indices tuple): assigned label }
292
            }
293

294
    The assigned types do not (and should not) have the same name, rather
295
    this checks that the assignments types atoms into the same "groups."
296

297
    Parameters
298
    ----------
299
    current_assignments : dict
300
                          typing rules being assessed
301
    ref_assignments : dict
302
                      typing that is considered the reference or ideal
303

304
    Returns
305
    -------
306
    type_matches : list[match tuples]
307
                   match tuples have the form
308
                   (current_label, ref_label, count matched, count in reference label)
309
    score : float
310
            Score as a fraction of how well the current typing matches the reference.
311
            A score of 1.0 is a perfect match
312

313
    We would like to acknowledge Josh Fass <josh.fass@choderalab.org>
314
    who created this scoring algorithm using the bipartite graph for
315
    smarty (github.com/openforcefield/smarty and doi: 10.1021/acs.jctc.8b00821)
316
    """
317 0
    import networkx as nx
318 0
    total_counts = dict()
319 0
    total = 0
320 0
    for mol_idx, ref_dict in ref_assignments.items():
321 0
        for indices, label in ref_dict.items():
322 0
            if label not in total_counts:
323 0
                total_counts[label] = 0
324 0
            total += 1
325 0
            total_counts[label] +=1
326

327 0
    cur_labels = set()
328 0
    ref_labels = set()
329
    # check for missing tuples in dictionaries
330 0
    for mol_idx, current_dict in current_assignments.items():
331 0
        cur_keys = set(current_dict.keys())
332 0
        ref_keys = set(ref_assignments[mol_idx].keys())
333
        # check if there are atom index sets in references not in current
334 0
        if ref_keys - cur_keys:
335
            # TODO:
336 0
            return None, -1
337

338
        # store a set of labels
339 0
        c_labs = [e for k,e in current_dict.items()]
340 0
        r_labs = [e for k,e in ref_assignments[mol_idx].items()]
341 0
        cur_labels = cur_labels.union(set(c_labs))
342 0
        ref_labels = ref_labels.union(set(r_labs))
343

344
    # Create bipartite graph (U,V,E) matching current types U with
345
    # reference types V via edges E with weights equal to number of types in common.
346
    # create a dictionary for each possible pair of current and reference labels
347 0
    types_in_common = dict()
348 0
    for c_lab in cur_labels:
349 0
        for r_lab in ref_labels:
350 0
            types_in_common[(c_lab, r_lab)] = 0
351

352
    # up the count by +1 for each time a current and reference type matches the same set of atoms
353 0
    for mol_idx, index_dict in ref_assignments.items():
354 0
        for indices, r_lab in index_dict.items():
355 0
            c_lab = current_assignments[mol_idx][indices]
356 0
            types_in_common[(c_lab, r_lab)] += 1
357

358
    # Actually make the graph
359 0
    graph = nx.Graph()
360
    # Add current types
361 0
    for c_lab in cur_labels:
362 0
        graph.add_node(c_lab, bipartite=0)
363
    # add reference types
364 0
    for r_lab in ref_labels:
365 0
        graph.add_node(r_lab, bipartite=1)
366
    # add edges
367 0
    for c_lab in cur_labels:
368 0
        for r_lab in ref_labels:
369 0
            weight = types_in_common[(c_lab, r_lab)]
370 0
            graph.add_edge(c_lab, r_lab, weight=weight)
371

372 0
    mate = nx.algorithms.max_weight_matching(graph, maxcardinality=False)
373

374
    # Compute match dictionary and total number of matches.
375 0
    type_matches = list()
376 0
    total_type_matches = 0
377 0
    for lab1, lab2 in mate:
378 0
        counts = graph[lab1][lab2]['weight']
379 0
        total_type_matches += counts
380
        # labels come back in an arbitrary order, so determine if which is current/reference
381 0
        if lab1 in cur_labels:
382 0
            type_matches.append(( lab1, lab2, counts, total_counts[lab2]))
383
        else:
384 0
            type_matches.append( (lab2, lab1, counts, total_counts[lab1]))
385

386
    # compute fractional score:
387 0
    score = total_type_matches / total
388

389 0
    return type_matches, score
390

391 2
def match_reference(current_assignments, ref_assignments):
392
    """
393
    Determine if two sets of labels agree.
394
    This could be used to test if two sets of SMIRKS type a set of molecules
395
    in exactly the same way.
396

397
    This starts with two sets of "assignments" which are dictionaries
398
    like those created with get_typed_molecules.
399

400
    Let's imagine you wanted to type the molecule CF and CCl
401
    where most bonds are assigned X, but one bond in each molecule is
402
    assigned 'Y'. Then the reference_assignments would be:
403
    { 0: (0,1): 'X', (0,2): 'X', (0,3): 'X', (0,4): 'Y'},
404
      1: (0,1): 'X', (0,2): 'X', (0,3): 'X', (0,4): 'Y'},
405
    }
406

407
    Then you discover a set of SMIRKS patterns which match
408
    C-H bonds with one type and C-halogen have another.
409
    Then your current_assignments would be:
410
    { 0: (0,1): 'C-H', (0,2): 'C-H', (0,3): 'C-H', (0,4): 'C-hal.'},
411
      1: (0,1): 'C-H', (0,2): 'C-H', (0,3): 'C-H', (0,4): 'C-hal.'},
412
    }
413
    This would return a set tuples for the corresponding labels
414
    [('C-H', 'X'), ('C-hal.', 'Y')] and True for the two assigments matching.
415

416
    However, if your current set assigned different types to the two
417
    different carbon halogen bonds (C-F and C-Cl) then these two
418
    sets of assignments would not match and the result would be an empty set and False
419

420
    Parameters
421
    ----------
422
    current_assignments : dictionary of indices with current types
423
    ref_assignments : dictionary of indices with reference types
424

425
    Returns
426
    -------
427
    type_matches : set of tuples (current_label, reference_label)
428
                   pair of current and reference labels
429
                   None if the sets don't match
430
    is_match : boolean
431
               Does the current assignment exactly match the reference?
432
    """
433 2
    import networkx as nx
434

435 2
    graph = nx.Graph()
436 2
    matches = set()
437 2
    r_labs = set()
438 2
    c_labs = set()
439

440 2
    for mol_idx, index_dict in ref_assignments.items():
441
        # check that mol_idx is in current_assignments
442 2
        if mol_idx not in current_assignments:
443 0
            return set(), False
444

445 2
        for indices, r_label in index_dict.items():
446
            # check this set of indices is in the current set
447 2
            if indices not in current_assignments[mol_idx]:
448 2
                return set(), False
449

450 2
            c_label = current_assignments[mol_idx][indices]
451
            # check if nodes exist for the two labels (and make them)
452 2
            if c_label not in graph:
453 2
                graph.add_node(c_label)
454 2
                c_labs.add('current_'+c_label)
455

456 2
            if r_label not in graph:
457 2
                graph.add_node(r_label)
458 2
                r_labs.add(r_label)
459

460
            # create an edge connecting the reference and current type
461 2
            graph.add_edge('current_'+c_label, r_label)
462 2
            matches.add((c_label, r_label))
463

464 2
    for c_label in c_labs:
465 2
        if len(graph.edges(c_label)) != 1:
466 2
            return set(), False
467

468 2
    for r_label in r_labs:
469 2
        if len(graph.edges(r_label)) != 1:
470 0
            return set(), False
471

472 2
    return matches, True
473

474

475 2
def check_smirks_to_reference(current_types, reference_assignments, molecules):
476
    """
477
    Checks that the current SMIRKS assign types to this set of molecules
478
    to match the reference assignments.
479

480
    Parameters
481
    ----------
482
    current_types : list[smirks tuples]
483
                    SMIRKS types in the form (label, smirks)
484
    reference_assignments : embedded dictionary
485
                            This could be the output from get_typed_molecules
486
                            the dictionary has the form:
487
                            {mol_idx: {(atom indices tuple): label}, ... }
488
    molecules : list[Mol]
489
                These can be ChemPer Mols or
490
                Mols form a supported toolkit (currently OpenEye or RDKit)
491

492
    Returns
493
    -------
494
    agree : boolean
495
            Returns True if the SMIRKS type the molecules in the same way
496
            as the reference assignments
497
    """
498 0
    r_labs = [l for m,d in reference_assignments.items() for a,l in d.items()]
499 0
    if len(current_types) != len(set(r_labs)):
500 0
        return False
501

502 0
    current_assignments = get_typed_molecules(current_types, molecules)
503 0
    type_matches, matched = match_reference(current_assignments, reference_assignments)
504 0
    return matched
505

506

507 2
def check_smirks_agree(current_types, reference_types, molecules):
508
    """
509
    Checks if two lists of SMIRKS patterns type a list of molecules in the same way.
510

511
    Parameters
512
    ----------
513
    current_types : list[smirks tuples]
514
                    SMIRKS types in the form (label, smirks)
515
    reference_types : list[smirks tuples]
516
                      SMIRKS types in the form (label, smirks)
517
    molecules : list[Mol]
518
                ChemPer Mols or Mols form a supported toolkit (currently OpenEye or RDKit)
519

520
    Returns
521
    -------
522
    match : boolean
523
            True if the two sets of SMIRKS type
524
            the set of molecules in exactly the same way
525
    """
526 2
    if len(current_types) != len(reference_types):
527 2
        return False
528

529 2
    current_assignments = get_typed_molecules(current_types, molecules)
530 2
    reference_assignments = get_typed_molecules(reference_types, molecules)
531

532
    # check if they agree
533 2
    type_matches, matched = match_reference(current_assignments, reference_assignments)
534 2
    return matched
535

Read our documentation on viewing source code .

Loading