1
"""
2
utils.py
3

4
This file provides simple functions that might be of use to people using the ChemPer package
5

6
"""
7

8 151
import os
9 151
import collections
10

11 151
def get_data_path(relative_path, package='chemper'):
12
    """
13
    Returns the absolute path to a specified relative path inside data directory of a given package
14
    For example,
15
        get_data_path('molecules', package='chemper')
16
    will find this path:
17
        [absolute path to Python packages]/chemper/data/molecules/
18

19
    If the final path does not exist it will raise an IOError
20

21
    Parameters
22
    ----------
23
    relative_path : str
24
                    filename or relative path to a file stored in the data/ directory
25
                    of a given package
26
    package : str
27
              name of the Python package that should contain the specified file
28
              default="chemper"
29

30
    Returns
31
    -------
32
    path : str
33
           absolute path to filename in specified package
34
    """
35 151
    from pkg_resources import resource_filename
36 151
    fn = resource_filename(package, os.path.join('data', relative_path))
37

38 151
    if not os.path.exists(fn):
39 151
        raise IOError("Sorry! The absolute path %s was not found, that is %s is not in the data directory for %s" \
40
                          % (fn, relative_path, package))
41

42 151
    return fn
43

44 151
def get_full_path(relative_path, package="chemper"):
45
    """
46
    This function checks to see if this relative path is available locally
47
    if it is it returns the absolute path to that file.
48
    Otherwise it checks if that relative path is located at [package location]/data/relative path
49
    it will raise an IOError if neither location exists.
50

51
    Parameters
52
    ----------
53
    relative_path : str
54
                    relative path to a file available locally or in package/data
55
    package : str
56
              package name with a data directory the file might be located in
57

58
    Returns
59
    -------
60
    abs_path : str
61
               The absolute path to a local file if available, otherwise the abolsute path
62
               to [package]/data/[relative_path]
63
    """
64 151
    if os.path.exists(relative_path):
65 0
        return os.path.abspath(relative_path)
66 151
    return get_data_path(relative_path, package)
67

68

69
# =======================================
70
# Check SMIRKS validity
71
# =======================================
72

73 151
def is_valid_smirks(smirks):
74
    """
75
    This function checks if a given smirks is valid.
76

77
    Parameters
78
    ----------
79
    smirks : SMIRKS (or SMARTS) string
80

81
    Returns
82
    -------
83
    is_valid : boolean
84
    """
85
    # ChemPer only works with SMIRKS that match the
86
    # SMIRNOFF parameter format that is it has to be a
87
    # valid substructure search pattern and meet the
88
    # following three restrictions:
89
    # 1) only for one molecule (no '.' character)
90 151
    if '.' in smirks:
91 151
        return False
92
    # 2) not for a reaction (no '>' character)
93 151
    if '>' in smirks:
94 151
        return False
95
    # 3) it must have at least one indexed (:n) atom
96 151
    import re
97 151
    if not re.findall(r'([:]\d+)', smirks):
98 151
        return False
99

100
    # now check this is a valid substructure search pattern
101 151
    from chemper.mol_toolkits.mol_toolkit import Mol
102 151
    mol = Mol.from_smiles('C')
103 151
    try:
104 151
        mol.smirks_search(smirks)
105 151
        return True
106 151
    except ValueError:
107 151
        return False
108

109
# ===================================================================
110
# custom classes and functions for matching sets of SMIRKS to molecules
111
# ===================================================================
112
"""
113
custom dictionaries
114

115
These custom dictionaries are taken from
116
openforcefield.typing.engines.smirnoff.forcefield.
117
They allow tracking of symmetrically equivalent parameters.
118
For example the bonds (1,2) and (2,1), and  
119
the impropers (1,2,3,4) and (1,2,4,3) are equivalent 
120
"""
121

122

123 151
class TransformedDict(collections.MutableMapping):
124
    """A dictionary that applies an arbitrary key-altering
125
       function before accessing the keys"""
126

127 151
    def __init__(self, *args, **kwargs):
128 151
        self.store = dict()
129 151
        self.update(dict(*args, **kwargs))  # use the free update to set keys
130

131 151
    def __getitem__(self, key):
132 151
        return self.store[self.__keytransform__(key)]
133

134 151
    def __setitem__(self, key, value):
135 151
        self.store[self.__keytransform__(key)] = value
136

137 151
    def __delitem__(self, key):
138 0
        del self.store[self.__keytransform__(key)]
139

140 151
    def __iter__(self):
141 151
        return iter(self.store)
142

143 151
    def __len__(self):
144 0
        return len(self.store)
145

146 151
    def __keytransform__(self, key):
147 0
        return key
148

149 151
    def __str__(self): return str(self.store)
150

151

152 151
class ValenceDict(TransformedDict):
153
    """Enforce uniqueness in atom indices"""
154 151
    def __keytransform__(self, key):
155
        """Reverse tuple if first element is larger than last element."""
156
        # Ensure key is a tuple.
157 151
        key = tuple(key)
158
        # Reverse the key if the first element is bigger than the last.
159 151
        if key[0] > key[-1]:
160 151
            key = tuple(reversed(key))
161 151
        return key
162

163

164 151
class ImproperDict(TransformedDict):
165
    """Symmetrize improper torsions"""
166 151
    def __keytransform__(self,key):
167
        """Reorder tuple in numerical order except for element[1] which is the central atom; it retains its position."""
168
        # Ensure key is a tuple
169 0
        key = tuple(key)
170
        # Retrieve connected atoms
171 0
        connectedatoms = [key[0], key[2], key[3]]
172
        # Sort connected atoms
173 0
        connectedatoms.sort()
174
        # Re-store connected atoms
175 0
        key = tuple( [connectedatoms[0], key[1], connectedatoms[1], connectedatoms[2]])
176 0
        return key
177

178

179 151
def get_typed_molecules(smirks_list, input_molecules):
180
    """
181
    Creates a dictionary assigning a typename
182
    for each set of atom indices in each molecule
183
    based on the provided smirks patterns.
184

185
    Parameters
186
    ----------
187
    smirks_list : list[smirks tuples]
188
                  SMIRKS tuples come in the form (label, smirks)
189
    input_molecules : list[Mol]
190
                      Molecules from any supported toolkit (ChemPer, OpenEye, RDKit)
191

192
    Returns
193
    -------
194
    typeDict : embedded dictionary
195
               { index of input_molecule:
196
                    {(tuple of indices): assigned parameter type } }
197
    """
198 151
    from chemper.mol_toolkits.mol_toolkit import Mol
199 151
    molecules = [Mol(m) for m in input_molecules]
200 151
    type_dict = dict()
201 151
    for mol_idx, mol in enumerate(molecules):
202 151
        type_dict[mol_idx] = {}
203 151
        for [label, smirks] in smirks_list:
204 151
            matches = get_smirks_matches(mol, smirks)
205 151
            for match in matches:
206 151
                type_dict[mol_idx][match] = label
207

208 151
    return type_dict
209

210 151
def create_tuples_for_clusters(smirks_list, molecules):
211
    """
212
    This function is used to get clusters of molecular fragments based
213
    on input SMIRKS.
214

215
    For example, let's assume you wanted to find all of the
216
    atoms that match this SMIRKS list
217
    'any', '[*:1]~[*:2]'
218
    'single', '[*:1]-[*:2]'
219

220
    In this case, the "any" bond would match all bonds, but then
221
    the "single" would match all single bonds.
222
    If you were typing Ethene (C=C) then you expect the double bond
223
    between atoms 0 and 1 to match any bond and all C-H bonds to match single.
224

225
    The output in this case would be:
226
    [ ('any', [[ (0, 1) ]] ),
227
      ('single', [[ (0, 2), (0, 3), (1,4), (1,5) ]] )
228
    ]
229

230
    Parameters
231
    ----------
232
    smirks_list : list[smirks tuples]
233
                  This is a list of tuples with the form (label, SMIRKS)
234
    molecules : list[Mols]
235
                This is a list of molecules you want to type with this list of SMIRKS
236
                They can be from any supported toolkit (ChemPer, OpenEye, RDKit)
237

238
    Returns
239
    -------
240
    type_list : list[typing tuples]
241
                These are a list of typing tuples that could be used as input for
242
                the SMIRKSifier with the form shown above:
243
                (label, [
244
                        [ (atom indices), ...], # atoms in mol0 with label
245
                        [ (atom indices), ... ], # atoms in mol1 with label
246
                        ... ] )
247

248
    """
249 151
    ordered_labels = [l for l, s in smirks_list]
250 151
    type_dict = get_typed_molecules(smirks_list, molecules)
251
    # start by getting things in the form
252
    # {label: {mol_idx: list of dictionary of indices} }
253 151
    label_dict = dict()
254 151
    for mol_idx, mol_dict in type_dict.items():
255 151
        for match, label in mol_dict.items():
256 151
            if label not in label_dict:
257 151
                label_dict[label] = [list() for i in range(len(molecules))]
258

259 151
            label_dict[label][mol_idx].append(match)
260

261
    # now we need to resort the mol_lists
262 151
    final_list = [ (l, label_dict[l]) for l in ordered_labels if l in label_dict]
263 151
    return final_list
264

265

266 151
def get_smirks_matches(mol, smirks):
267
    """
268
    Gets atom indices for a smirks string in a given molecule
269

270
    Parameters
271
    ----------
272
    mol : ChemPer Mol
273
    smirks : str
274
             SMIRKS pattern being matched to the molecule
275

276
    Returns
277
    --------
278
    matches : list[atom tuples]
279
              Atom indices where this smirks matches in this atom
280
    """
281 151
    from chemper.graphs.environment import ChemicalEnvironment
282

283 151
    env = ChemicalEnvironment(smirks)
284 151
    if env.get_type().lower() == 'impropertorsion':
285 0
        matches = ImproperDict()
286
    else:
287 151
        matches = ValenceDict()
288

289 151
    for match in mol.smirks_search(smirks):
290 151
        smirks_indices = sorted(list(match.keys()))
291 151
        atom_indices = tuple([match[s].get_index() for s in smirks_indices])
292 151
        matches[atom_indices] = ''
293

294 151
    return matches.keys()
295

296

297
# ===================================================================
298
# classes for evaluating lists of SMIRKS patterns
299
# ===================================================================
300

301 151
def score_match_reference(current_assignments, ref_assignments):
302
    """
303
    This function is intended to score two sets of SMIRKS patterns or other
304
    typing rules to see how similar they are.
305

306
    Typing assignments are provided in the form of embedded dictionaries:
307
        { molecule index:
308
            { (atom indices tuple): assigned label }
309
            }
310

311
    The assigned types do not (and should not) have the same name, rather
312
    this checks that the assignments types atoms into the same "groups."
313

314
    Parameters
315
    ----------
316
    current_assignments : dict
317
                          typing rules being assessed
318
    ref_assignments : dict
319
                      typing that is considered the reference or ideal
320

321
    Returns
322
    -------
323
    type_matches : list[match tuples]
324
                   match tuples have the form
325
                   (current_label, ref_label, count matched, count in reference label)
326
    score : float
327
            Score as a fraction of how well the current typing matches the reference.
328
            A score of 1.0 is a perfect match
329

330
    We would like to acknowledge Josh Fass <josh.fass@choderalab.org>
331
    who created this scoring algorithm using the bipartite graph for
332
    smarty (github.com/openforcefield/smarty and doi: 10.1021/acs.jctc.8b00821)
333
    """
334 0
    import networkx as nx
335 0
    total_counts = dict()
336 0
    total = 0
337 0
    for mol_idx, ref_dict in ref_assignments.items():
338 0
        for indices, label in ref_dict.items():
339 0
            if label not in total_counts:
340 0
                total_counts[label] = 0
341 0
            total += 1
342 0
            total_counts[label] +=1
343

344 0
    cur_labels = set()
345 0
    ref_labels = set()
346
    # check for missing tuples in dictionaries
347 0
    for mol_idx, current_dict in current_assignments.items():
348 0
        cur_keys = set(current_dict.keys())
349 0
        ref_keys = set(ref_assignments[mol_idx].keys())
350
        # check if there are atom index sets in references not in current
351 0
        if ref_keys - cur_keys:
352
            # TODO:
353 0
            return None, -1
354

355
        # store a set of labels
356 0
        c_labs = [e for k,e in current_dict.items()]
357 0
        r_labs = [e for k,e in ref_assignments[mol_idx].items()]
358 0
        cur_labels = cur_labels.union(set(c_labs))
359 0
        ref_labels = ref_labels.union(set(r_labs))
360

361
    # Create bipartite graph (U,V,E) matching current types U with
362
    # reference types V via edges E with weights equal to number of types in common.
363
    # create a dictionary for each possible pair of current and reference labels
364 0
    types_in_common = dict()
365 0
    for c_lab in cur_labels:
366 0
        for r_lab in ref_labels:
367 0
            types_in_common[(c_lab, r_lab)] = 0
368

369
    # up the count by +1 for each time a current and reference type matches the same set of atoms
370 0
    for mol_idx, index_dict in ref_assignments.items():
371 0
        for indices, r_lab in index_dict.items():
372 0
            c_lab = current_assignments[mol_idx][indices]
373 0
            types_in_common[(c_lab, r_lab)] += 1
374

375
    # Actually make the graph
376 0
    graph = nx.Graph()
377
    # Add current types
378 0
    for c_lab in cur_labels:
379 0
        graph.add_node(c_lab, bipartite=0)
380
    # add reference types
381 0
    for r_lab in ref_labels:
382 0
        graph.add_node(r_lab, bipartite=1)
383
    # add edges
384 0
    for c_lab in cur_labels:
385 0
        for r_lab in ref_labels:
386 0
            weight = types_in_common[(c_lab, r_lab)]
387 0
            graph.add_edge(c_lab, r_lab, weight=weight)
388

389 0
    mate = nx.algorithms.max_weight_matching(graph, maxcardinality=False)
390

391
    # Compute match dictionary and total number of matches.
392 0
    type_matches = list()
393 0
    total_type_matches = 0
394 0
    for lab1, lab2 in mate:
395 0
        counts = graph[lab1][lab2]['weight']
396 0
        total_type_matches += counts
397
        # labels come back in an arbitrary order, so determine if which is current/reference
398 0
        if lab1 in cur_labels:
399 0
            type_matches.append(( lab1, lab2, counts, total_counts[lab2]))
400
        else:
401 0
            type_matches.append( (lab2, lab1, counts, total_counts[lab1]))
402

403
    # compute fractional score:
404 0
    score = total_type_matches / total
405

406 0
    return type_matches, score
407

408 151
def match_reference(current_assignments, ref_assignments):
409
    """
410
    Determine if two sets of labels agree.
411
    This could be used to test if two sets of SMIRKS type a set of molecules
412
    in exactly the same way.
413

414
    This starts with two sets of "assignments" which are dictionaries
415
    like those created with get_typed_molecules.
416

417
    Let's imagine you wanted to type the molecule CF and CCl
418
    where most bonds are assigned X, but one bond in each molecule is
419
    assigned 'Y'. Then the reference_assignments would be:
420
    { 0: (0,1): 'X', (0,2): 'X', (0,3): 'X', (0,4): 'Y'},
421
      1: (0,1): 'X', (0,2): 'X', (0,3): 'X', (0,4): 'Y'},
422
    }
423

424
    Then you discover a set of SMIRKS patterns which match
425
    C-H bonds with one type and C-halogen have another.
426
    Then your current_assignments would be:
427
    { 0: (0,1): 'C-H', (0,2): 'C-H', (0,3): 'C-H', (0,4): 'C-hal.'},
428
      1: (0,1): 'C-H', (0,2): 'C-H', (0,3): 'C-H', (0,4): 'C-hal.'},
429
    }
430
    This would return a set tuples for the corresponding labels
431
    [('C-H', 'X'), ('C-hal.', 'Y')] and True for the two assigments matching.
432

433
    However, if your current set assigned different types to the two
434
    different carbon halogen bonds (C-F and C-Cl) then these two
435
    sets of assignments would not match and the result would be an empty set and False
436

437
    Parameters
438
    ----------
439
    current_assignments : dictionary of indices with current types
440
    ref_assignments : dictionary of indices with reference types
441

442
    Returns
443
    -------
444
    type_matches : set of tuples (current_label, reference_label)
445
                   pair of current and reference labels
446
                   None if the sets don't match
447
    is_match : boolean
448
               Does the current assignment exactly match the reference?
449
    """
450 151
    import networkx as nx
451

452 151
    graph = nx.Graph()
453 151
    matches = set()
454 151
    r_labs = set()
455 151
    c_labs = set()
456

457 151
    for mol_idx, index_dict in ref_assignments.items():
458
        # check that mol_idx is in current_assignments
459 151
        if mol_idx not in current_assignments:
460 0
            return set(), False
461

462 151
        for indices, r_label in index_dict.items():
463
            # check this set of indices is in the current set
464 151
            if indices not in current_assignments[mol_idx]:
465 151
                return set(), False
466

467 151
            c_label = current_assignments[mol_idx][indices]
468
            # check if nodes exist for the two labels (and make them)
469 151
            if c_label not in graph:
470 151
                graph.add_node(c_label)
471 151
                c_labs.add('current_'+c_label)
472

473 151
            if r_label not in graph:
474 151
                graph.add_node(r_label)
475 151
                r_labs.add(r_label)
476

477
            # create an edge connecting the reference and current type
478 151
            graph.add_edge('current_'+c_label, r_label)
479 151
            matches.add((c_label, r_label))
480

481 151
    for c_label in c_labs:
482 151
        if len(graph.edges(c_label)) != 1:
483 151
            return set(), False
484

485 151
    for r_label in r_labs:
486 151
        if len(graph.edges(r_label)) != 1:
487 0
            return set(), False
488

489 151
    return matches, True
490

491

492 151
def check_smirks_to_reference(current_types, reference_assignments, molecules):
493
    """
494
    Checks that the current SMIRKS assign types to this set of molecules
495
    to match the reference assignments.
496

497
    Parameters
498
    ----------
499
    current_types : list[smirks tuples]
500
                    SMIRKS types in the form (label, smirks)
501
    reference_assignments : embedded dictionary
502
                            This could be the output from get_typed_molecules
503
                            the dictionary has the form:
504
                            {mol_idx: {(atom indices tuple): label}, ... }
505
    molecules : list[Mol]
506
                These can be ChemPer Mols or
507
                Mols form a supported toolkit (currently OpenEye or RDKit)
508

509
    Returns
510
    -------
511
    agree : boolean
512
            Returns True if the SMIRKS type the molecules in the same way
513
            as the reference assignments
514
    """
515 0
    r_labs = [l for m,d in reference_assignments.items() for a,l in d.items()]
516 0
    if len(current_types) != len(set(r_labs)):
517 0
        return False
518

519 0
    current_assignments = get_typed_molecules(current_types, molecules)
520 0
    type_matches, matched = match_reference(current_assignments, reference_assignments)
521 0
    return matched
522

523

524 151
def check_smirks_agree(current_types, reference_types, molecules):
525
    """
526
    Checks if two lists of SMIRKS patterns type a list of molecules in the same way.
527

528
    Parameters
529
    ----------
530
    current_types : list[smirks tuples]
531
                    SMIRKS types in the form (label, smirks)
532
    reference_types : list[smirks tuples]
533
                      SMIRKS types in the form (label, smirks)
534
    molecules : list[Mol]
535
                ChemPer Mols or Mols form a supported toolkit (currently OpenEye or RDKit)
536

537
    Returns
538
    -------
539
    match : boolean
540
            True if the two sets of SMIRKS type
541
            the set of molecules in exactly the same way
542
    """
543 151
    if len(current_types) != len(reference_types):
544 151
        return False
545

546 151
    current_assignments = get_typed_molecules(current_types, molecules)
547 151
    reference_assignments = get_typed_molecules(reference_types, molecules)
548

549
    # check if they agree
550 151
    type_matches, matched = match_reference(current_assignments, reference_assignments)
551 151
    return matched
552

Read our documentation on viewing source code .

Loading