1
#!/usr/bin/env python
2

3
#=============================================================================================
4
# MODULE DOCSTRING
5
#=============================================================================================
6

7 2
"""
8
smirksify.py
9

10
In this script, we start with a set of clustered molecular fragments with specified
11
indexed atoms as those you would use to build a ClusterGraph.
12
We then build cluster Graphs to create the initial SMIRKS patterns and check
13
that the generated SMIRKS patterns retain the typing from the input cluster.
14
Next we run a series of iterations removing SMIRKS decorators.
15
If this "move" doesn't change the way the molecules are typed then the change is accepted.
16

17
This class takes inspiration from the tool SMIRKY previously published by the
18
Open Force Field Initiative:
19
github.com/openforcefield/smarty
20

21
In theory, it is possible this process of removing decorators could be more
22
systematic/deterministic, however this is a first approach to see
23
if extracted SMIRKS patterns can do better than SMIRKY.
24
Also, this approach will be more general since the input clusters do not
25
rely on a reference force field.
26

27
AUTHORS
28

29
Caitlin Bannan <bannanc@uci.edu>, UC Irvine, Mobley Group
30
"""
31
#=============================================================================================
32
# GLOBAL IMPORTS
33
#=============================================================================================
34

35 2
import copy
36

37 2
from chemper.graphs.environment import ChemicalEnvironment as CE
38 2
from chemper.mol_toolkits import mol_toolkit
39 2
from chemper.chemper_utils import ImproperDict, ValenceDict, \
40
    get_typed_molecules, match_reference
41 2
from chemper.graphs.cluster_graph import ClusterGraph
42

43 2
import numpy as np
44

45

46
# =============================================================================================
47
# private subroutines
48
# =============================================================================================
49

50 2
def print_smirks(smirks_list):
51
    """
52
    Prints out the current or provided smirks list
53
    in a table like format
54
    """
55 2
    str_form = " {0:<20} | {1:} "
56

57 2
    print()
58 2
    print(str_form.format("Label", "SMIRKS"))
59 2
    print('='*80)
60 2
    for label, smirks in smirks_list:
61 2
        print(str_form.format(label, smirks))
62 2
        print('-'*80)
63 2
    print()
64

65

66 2
class ClusteringError(Exception):
67
    """
68
    Exception for cases where smirks are inappropriate
69
    for the environment type they are being parsed into
70
    """
71 2
    def __init__(self, msg):
72 2
        Exception.__init__(self, msg)
73 2
        self.msg = msg
74

75
# =============================================================================================
76
# SMIRKSifier
77
# =============================================================================================
78

79 2
class SMIRKSifier(object):
80
    """
81
    Generates complex SMIRKS for a given cluster of substructures
82
    and then reduces the decorators in those smirks
83
    """
84 2
    def __init__(self, molecules, cluster_list,
85
                 max_layers=5, verbose=True, strict_smirks=True):
86
        """
87
        Parameters
88
        ----------
89
        molecules: list of Mols
90
            These can be chemper Mols or molecules from any supported toolkit
91
            (currently OpenEye or RDKit)
92

93
        cluster_list: list of labels and smirks_atom_lists
94
            For each label the user should provide a list tuples for atom indices
95
            in each molecule you want included in that cluster.
96

97
            For example, if you wanted all atoms with indices (0,1) and (1,2) to be in cluster 'c1'
98
            and atoms (2,3) in cluster 'c2' for each of two molecules then cluster_list would be
99

100
            [ ('c1', [ (0,1), (1,2) ], [ (0,1), (1,2) ]),
101
              ('c2', [ (2,3)        ], [ (2,3)        ]) ]
102

103
            To see an example of this in action checkout
104
            https://github.com/MobleyLab/chemper/tree/master/examples
105

106
        max_layers: int (optional)
107
            how many atoms away from the indexed atoms should we consider at the maximum
108
            default = 5
109

110
        verbose: boolean (optional)
111
            If true information is printed to the command line during reducing
112
            default = True
113

114
        strict_smirks: boolean (optional)
115
            If False it will not raise an error when incapable of making SMIRKS letting
116
            a master user trouble shoot
117
        """
118 2
        self.molecules = [mol_toolkit.Mol(m) for m in molecules]
119 2
        self.intermediate_smirks = dict()
120 2
        self.cluster_list = cluster_list
121 2
        self.verbose = verbose
122 2
        self.strict_smirks = strict_smirks
123

124
        # determine the type of SMIRKS for symmetry in indices purposes
125
        # This is done by making a test SMIRKS
126 2
        graph = ClusterGraph(self.molecules, cluster_list[0][1], 0)
127 2
        test_smirks = graph.as_smirks(compress=True)
128 2
        env = CE(test_smirks)
129 2
        if env.get_type() is None:
130
            # corresponds to an unknown chemical pattern
131 0
            self.dict_type = dict
132 2
        elif env.get_type().lower() == 'impropertorsion':
133 0
            self.dict_type = ImproperDict
134
        else:
135 2
            self.dict_type = ValenceDict
136

137
        # Convert input "smirks_atom_list" into a dictionary with the form:
138
        # {mol_idx: {(atom indices): label, ...}, ... }
139 2
        self.cluster_dict = dict()
140 2
        self.ref_labels = set()
141 2
        self.total = 0
142
        # form of cluster_list is [(label, [for each mol [ (tuples of atom indices)] ) ]
143 2
        for label, mol_list in self.cluster_list:
144 2
            self.ref_labels.add(label)
145
            # [for each mol [ (tuples of atom indices)]
146 2
            for mol_idx, atom_indice_tuples in enumerate(mol_list):
147 2
                if mol_idx not in self.cluster_dict:
148 2
                    self.cluster_dict[mol_idx] = self.dict_type()
149 2
                for atom_tuple in atom_indice_tuples:
150 2
                    self.total += 1
151 2
                    self.cluster_dict[mol_idx][atom_tuple] = label
152

153
        # make SMIRKS patterns for input clusters
154 2
        self.current_smirks, self.layers = self.make_smirks(max_layers)
155 2
        if self.verbose: print_smirks(self.current_smirks)
156
        # check SMIRKS and save the matches to input clusters
157 2
        self.type_matches, self.checks = self.types_match_reference()
158

159 2
        if not self.checks:
160 2
            msg = """
161
                      SMIRKSifier was not able to create SMIRKS for the provided
162
                      clusters with %i layers. Try increasing the number of layers
163
                      or changing your clusters
164
                      """ % max_layers
165 2
            if self.strict_smirks:
166 2
                raise ClusteringError(msg)
167
            else:
168 0
                print("WARNING!", msg)
169

170 2
    def make_smirks(self, max_layers):
171
        """
172
        Create a list of SMIRKS patterns for the input clusters.
173
        This includes a determining how far away from the indexed atom should
174
        be included in the SMIRKS (or the number of layers
175

176
        Parameters
177
        ----------
178
        max_layers: int
179
            maximum number of bonds away from the indexed atoms should be included
180

181
        Returns
182
        -------
183
        smirks_list: list of tuples
184
            list of tuples in the form (label, smirks)
185
        layers: int
186
            number of layers actually used to specify the set clusters
187
        """
188 2
        layers = 0
189
        # try generating smirks with no layers
190 2
        smirks_list = self._make_cluster_graphs(layers)
191
        # store intermediate smirks and check them
192 2
        self.intermediate_smirks[layers] = smirks_list
193 2
        _, checks = self.types_match_reference(current_types=smirks_list)
194

195 2
        while not checks and (layers < max_layers):
196 2
            layers += 1
197 2
            smirks_list = self._make_cluster_graphs(layers)
198
            # store intermediate smirks
199 2
            self.intermediate_smirks[layers] = smirks_list
200
            # check current smirks patterns
201 2
            _, checks = self.types_match_reference(current_types=smirks_list)
202

203 2
        return smirks_list, layers
204

205 2
    def _make_cluster_graphs(self, layers):
206
        """
207
        Creates a list of SMIRKS with the form
208
        [ (label: SMIRKS), ]
209
        using the stored molecules and cluster_list
210
        """
211 2
        smirks_list = list()
212

213
        # loop through the list of fragment clusters
214 2
        for label, smirks_atom_list in self.cluster_list:
215
            # make a ClusterGraph for that label
216 2
            graph = ClusterGraph(self.molecules, smirks_atom_list, layers)
217

218
            # extract and save the SMIRKS for the cluster
219 2
            smirks = graph.as_smirks(compress=True)
220 2
            smirks_list.append(('zz_'+str(label), smirks))
221

222 2
        return smirks_list
223

224 2
    def types_match_reference(self, current_types=None):
225
        """
226
        Determine best match for each parameter with reference types
227

228
        Parameters
229
        ----------
230
        current_types : list of tuples with form [ (label, smirks), ]
231

232
        Returns
233
        -------
234
        type_matches : list of tuples (current_label, reference_label, counts)
235
            pair of current and reference labels with the number of fragments that match it
236
        """
237 2
        if current_types is None:
238 2
            current_types = self.current_smirks
239

240 2
        if len(current_types) != len(self.ref_labels):
241 0
            return set(), False
242

243 2
        current_assignments = get_typed_molecules(current_types, self.molecules)
244

245 2
        type_matches, checks_pass = match_reference(current_assignments, self.cluster_dict)
246 2
        return type_matches, checks_pass
247

248 2
    def reduce(self, max_its=1000, verbose=None):
249
        """
250
        Reduce the SMIRKS decorators for a number of iterations
251

252
        Parameters
253
        ----------
254
        max_its : int
255
            The specified number of iterations
256
        verbose: boolean
257
            will set the verboseness while running
258
            (if None, the current verbose variable will be used)
259

260
        Returns
261
        ----------
262
        final_smirks: list of tuples
263
            list of final smirks patterns after reducing in the form
264
            [(label, smirks)]
265
        """
266 2
        if not self.checks:
267 0
            print("Cannot reduce since unable to create reliable SMIRKS")
268 0
            return self.current_smirks
269

270 2
        red = Reducer(self.current_smirks, self.molecules, verbose)
271 2
        self.current_smirks = red.run(max_its)
272 2
        return self.current_smirks
273

274

275
# ==================================================================================
276
# Reducer class
277
# ==================================================================================
278

279 2
class Reducer():
280
    """
281
    Reducer starts with any list of SMIRKS and removes unnecessary decorators
282
    while maintaining typing on input molecules.
283

284
    Attributes
285
    ----------
286
    current_smirks : list of tuples
287
                    current SMIRKS patterns in the form (label, smirks)
288
    mols : list of chemper molecules
289
          molecules being used to reduce the input SMIRKS
290
    cluster_dict : dictionary
291
                  Dictionary specifying typing using current SMIRKS in the form:
292
                  {mol_idx:
293
                        { (tuple of atom indices): label } }
294
    """
295 2
    def __init__(self, smirks_list, mols, verbose=False):
296
        """
297
        Parameters
298
        ----------
299
        smirks_list: list of tuples
300
            set of SMIRKS patterns in the form (label, smirks)
301
        mols: list of molecules
302
            Any chemper compatible molecules accepted (ChemPer, OpenEye, or RDKit)
303
        """
304 2
        self.verbose = verbose
305 2
        self.current_smirks = copy.deepcopy(smirks_list)
306

307
        # make reference clusters
308 2
        ref_smirks = [('ref_%s' % l, smirks) for l, smirks in smirks_list]
309 2
        self.molecules = [mol_toolkit.Mol(m) for m in mols]
310 2
        self.cluster_dict = get_typed_molecules(ref_smirks, self.molecules)
311

312 2
    def remove_one_sub_dec(self, input_ors, ref_idx):
313
        """
314
        Remove one ORdecorator from the specified index
315
        # i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#6, [+0]), (#7, [X3]) ]
316
        """
317 2
        ors = copy.deepcopy(input_ors)
318
        # remove one or decorator from a single type
319 2
        ref_or = ors[ref_idx]
320 2
        decs = ref_or[1]
321 2
        decs.remove(np.random.choice(decs))
322 2
        ors[ref_idx] = (ref_or[0], decs)
323 2
        return ors
324

325 2
    def remove_ref_sub_decs(self, input_ors, ref_idx):
326
        """
327
        Remove all of the ORdecorators at the specified index
328
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#6, []), (#7, [X3]) ]
329
        """
330 2
        ors = copy.deepcopy(input_ors)
331
        # remove all decorators on one OR type
332 2
        ref_or = ors[ref_idx]
333 2
        ors[ref_idx] = (ref_or[0], list())
334 2
        return ors
335

336 2
    def remove_ref(self, input_decs, ref_idx):
337
        """
338
        Remove the decorator at the referenced index
339
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#7, [X3])]
340
        """
341 2
        decs = copy.deepcopy(input_decs)
342
        # remove the single OR type at or_idx
343 2
        ref = decs[ref_idx]
344 2
        decs.remove(ref)
345 2
        return decs
346

347 2
    def remove_all_bases(self, input_ors):
348
        """
349
        convert all bases to [*]
350
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(*, [X4, +0]), (*, [X3]) ]
351
        """
352 2
        new_all_ors = [('*', d) for b, d in input_ors]
353 2
        return new_all_ors
354

355 2
    def remove_all_dec_type(self, input_ors):
356
        """
357
        remove all decorators of the same type, like all 'X' decorators
358
        i.e. [(#6, [X4, +0]), (#7, [X3]) ] --> [(#6, [+0]), (#7, []) ]
359
        """
360 2
        all_decs = set([d for b,decs in input_ors for d in decs])
361 2
        remove_dec = np.random.choice(list(all_decs))
362

363
        # we start by defining a criteria for when the decorator
364
        # should not be removed.
365 2
        def criteria(x): return remove_dec[0] not in x
366

367 2
        if '+' in remove_dec or '-' in remove_dec:
368 0
            def criteria(x): return not ('+' in x or '-' in x)
369 2
        elif remove_dec in ['a', 'A']:
370 0
            def criteria(x): return x.lower() != 'a'
371 2
        elif 'r' in remove_dec:
372 1
            def criteria(x): return 'r' not in x
373

374 2
        new_ors = list()
375 2
        for b, decs in input_ors:
376 2
            new_decs = [d for d in decs if criteria(d)]
377 2
            new_ors.append((b, new_decs))
378 2
        return new_ors
379

380 2
    def remove_or_atom(self, input_all_ors, or_idx):
381
        """
382
        makes specific types of changes based on atom OR decorators
383

384
        Parameters
385
        ----------
386
        input_all_ors: list of OR decorators
387
            [ (base, [decs]), ...]
388
        or_idx: index that should be used to guide changes
389
        """
390 2
        ref_or = input_all_ors[or_idx]
391
        # one choice is to remove all decorators (0)
392
        # it is always an option
393 2
        choices = ['all']
394
        # are there non-generic bases? If so removing all bases is an option
395 2
        if len(set([b for b,ds in input_all_ors])) > 1:
396 2
            choices.append('gen_base')
397
        # there are two options if the ref type has OR decorators
398 2
        if len(ref_or[1]) > 0:
399
            # remove all OR decorators (#6, [X4, +0]) --> (#6, [])
400 2
            choices.append('all_ref_decs')
401 2
            if len(ref_or[1]) > 1:
402
                # remove one OR decorator (#6, [X4, +0]) --> (#6, [+0])
403 2
                choices.append('one_dec')
404
        # if there are more than one or type
405 2
        if len(input_all_ors) > 1:
406
            # you can remove just one ORtype (the reference)
407 2
            choices.append('remove_ref')
408
            # check that there are decorators
409 2
            all_decs = set([d for b, decs in input_all_ors for d in decs])
410 2
            if len(all_decs) > 0:
411
                # you can remove 1 type of decorator, i.e. all 'Xn' decorators
412 2
                choices.append('all_one_dec')
413

414 2
        change = np.random.choice(choices)
415

416 2
        if change == 'all':
417
            # remove ALL OR decorators
418
            # i.e.[(  # 6, [X4, +0]), (#7, [X3]) ] --> []
419 2
            return list()
420

421 2
        if change == 'remove_ref':
422 2
            return self.remove_ref(input_all_ors, or_idx)
423

424 2
        if change == 'all_ref_decs':
425 2
            return self.remove_ref_sub_decs(input_all_ors, or_idx)
426

427 2
        if change == 'one_dec':
428 2
            return self.remove_one_sub_dec(input_all_ors, or_idx)
429

430 2
        if change == 'gen_base':
431 2
            return self.remove_all_bases(input_all_ors)
432

433
        # change = 'all_one_dec'
434 2
        return self.remove_all_dec_type(input_all_ors)
435

436 2
    def remove_or(self, input_all_ors, is_bond=False):
437
        """
438
        Changes the OR decorators by removing some of them
439

440
        Parameters
441
        -----------
442
        input_all_ors: list of tuples
443
            these are the ORdecorators from a ChemicalEnvironment
444
        is_bond: boolean
445
            are these decorators from from a bond (False for atom)
446
        """
447 2
        if len(input_all_ors) == 0:
448 2
            return input_all_ors, False
449

450
        # chose a set of or decorators to change
451
        # these come in the form [base, (decorators)]
452 2
        all_ors = copy.deepcopy(input_all_ors)
453 2
        or_idx = np.random.randint(len(all_ors))
454

455
        # atoms have more ORdecorators and therefore more options for
456
        # how they can be removed
457 2
        if not is_bond:
458 2
            return self.remove_or_atom(all_ors, or_idx), True
459

460
        # For a bond, either one ORtype is removed
461
        # or they are all removed.
462 2
        if np.random.rand() > 0.5:
463
            # remove just one or type
464 2
            return self.remove_ref(all_ors, or_idx), True
465
        # remove all ors
466 2
        return list(), True
467

468 2
    def remove_and(self, input_all_ands):
469
        """
470
        removes a decorated that is AND'd in the original SMIRKS
471
        """
472
        # if there are no ands return with no changes
473 2
        if len(input_all_ands) == 0:
474 2
            return input_all_ands, False
475

476 2
        if np.random.rand() > 0.5:
477
            # otherwise randomly remove an AND decorator
478 2
            all_ands = copy.deepcopy(input_all_ands)
479 2
            all_ands.remove(np.random.choice(all_ands))
480
        else:
481 2
            all_ands = list()
482 2
        return all_ands, True
483

484 2
    def _get_item_and_remove_options(self, env):
485
        """
486
        Parameters
487
        ----------
488
        env: ChemicalEnvironment
489

490
        Returns
491
        -------
492
        item: an Atom or Bond from the Chemical Environment
493
        dec_opts: list
494
            0 = has OR decorators to remove
495
            1 = has AND decorators to remove
496
            2 = is an atom that could be removed
497
        """
498 2
        items = list()
499 2
        odds = list()
500 2
        for a_b in env.get_atoms() + env.get_bonds():
501 2
            count = len(a_b.and_types)
502 2
            for o in a_b.or_types:
503 2
                if o[0] != '*':
504 2
                    count += 1
505 2
                count += len(o[1])
506

507
            # a wild card, atom with no index should be considered ([*])
508
            # should also be on this list so it can be removed
509 2
            if isinstance(a_b, CE.Atom) and not isinstance(a_b, CE.Bond) and env.is_unindexed(a_b):
510 2
                count += 1
511

512 2
            items.append(a_b)
513 2
            odds.append(count)
514

515
        # normalize odds to weights
516 2
        odds = np.array(odds)
517

518
        # if the odds for changing all atoms and bonds is zero return None
519 2
        if odds.sum() == 0:
520 2
            return None, list()
521

522 2
        weights = odds / odds.sum()
523

524
        # choose an atom or bond with the probabilities:
525 2
        item = np.random.choice(items, p=weights)
526 2
        dec_opts = list()
527 2
        if len(item.or_types) > 0:
528 2
            dec_opts.append('remove_ors')
529 2
        if len(item.and_types) > 0:
530 2
            dec_opts.append('remove_ands')
531

532 2
        if not isinstance(item, CE.Bond): # then it is an atom
533 2
            if env.get_valence(item) == 1 and env.is_unindexed(item):
534 2
                dec_opts.append('remove_atom')
535

536 2
        return item, dec_opts
537

538 2
    def remove_decorator(self, smirks):
539
        """
540
        Chose an atom or bond in the input smirks pattern
541
        and then remove one decorator from it.
542
        """
543 2
        env = CE(smirks)
544 2
        sub, dec_opts = self._get_item_and_remove_options(env)
545

546
        # note this should be impossible
547 2
        if sub is None or len(dec_opts) == 0:
548 2
            return smirks, False
549

550 2
        change = np.random.choice(dec_opts)
551 2
        if change == 'remove_ors':
552 2
            new_or_types, changed = self.remove_or(sub.or_types, isinstance(sub, CE.Bond))
553 2
            if not changed:
554 0
                return smirks, False
555 2
            sub.or_types = new_or_types
556 2
        elif change == 'remove_ands':
557 2
            new_and_types, changed = self.remove_and(sub.and_types)
558 2
            if not changed:
559 0
                return smirks, False
560 2
            sub.and_types = new_and_types
561
        else: # change == 'remove_atom'
562 2
            remove = env.remove_atom(sub)
563 2
            if not remove:
564 0
                return smirks, False
565

566 2
        return env.as_smirks(), True
567

568 2
    def run(self, max_its=1000, verbose=None):
569
        """
570
        Reduce the SMIRKS decorators for a number of iterations
571

572
        Parameters
573
        ----------
574
        max_its : int
575
            The specified number of iterations
576
        verbose: boolean
577
            will set the verboseness while running
578
            (if None, the current verbose variable will be used)
579

580
        Returns
581
        ----------
582
        final_smirks: list of tuples
583
            list of final smirks patterns after reducing in the form
584
            [(label, smirks)]
585
        """
586 2
        temp_verbose = self.verbose
587 2
        if verbose is not None:
588 0
            self.verbose = verbose
589

590 2
        for it in range(max_its):
591 2
            if self.verbose: print("Iteration: ", it)
592

593 2
            proposed_smirks = copy.deepcopy(self.current_smirks)
594
            # chose a SMIRKS to change
595 2
            change_idx = np.random.randint(len(proposed_smirks))
596 2
            change_entry = proposed_smirks[change_idx]
597

598
            # generate new smirks
599 2
            new_smirks, changed = self.remove_decorator(change_entry[1])
600 2
            if self.verbose: print("Attempting to change SMIRKS #%i\n%s  -->  %s"\
601
                                   % (change_idx, change_entry[1], new_smirks))
602 2
            if not changed:
603 2
                if self.verbose:
604 0
                    print("Rejected!\nNo change made to SMIRKS\n%s" % change_entry[1])
605 0
                    print('-'*90)
606 0
                continue
607

608
            # update proposed list and check if it still matches the reference
609 2
            proposed_smirks[change_idx] = (change_entry[0], new_smirks)
610

611 2
            current_assignments = get_typed_molecules(proposed_smirks, self.molecules)
612 2
            _, proposed_checks = match_reference(current_assignments, self.cluster_dict)
613

614 2
            if proposed_checks:
615 2
                if self.verbose: print("Accepted! ")
616 2
                self.current_smirks = copy.deepcopy(proposed_smirks)
617
            else:
618 2
                if self.verbose: print("Rejected!\n proposed SMIRKS changed the way fragments are clustered")
619 2
            if self.verbose: print('-'*90)
620

621 2
        if self.verbose: print_smirks(self.current_smirks)
622 2
        self.verbose = temp_verbose
623

624 2
        return self.current_smirks
625

626

627

628

Read our documentation on viewing source code .

Loading