MobleyLab / chemper
1
#!/usr/bin/env python
2

3
#==============================================================================
4
# MODULE DOCSTRING
5
#==============================================================================
6

7 2
"""
8
environment.py
9
this is adapted from the openforcefield ChemicalEnvironments class.
10

11
There have been some on going debates over where chemper should live,
12
here or in the base openforcefield toolkit. Due to the want to update
13
this module for use in chemper amidst the openforcefield 0.2.0 overall,
14
environment.py has been updated independently in this repository.
15

16
The only function of real use in openforcefield.py is the `get_type` function,
17
which has also been updated here.
18

19
AUTHORS
20

21
Caitlin Bannan <bannanc@uci.edu>, Mobley Lab, University of California Irvine,
22
with contributions from John Chodera, Memorial Sloan Kettering Cancer Center
23
and David Mobley, UC Irvine.
24

25
"""
26

27
#==============================================================================
28
# GLOBAL IMPORTS
29
#==============================================================================
30

31 2
import networkx as nx
32 2
import re
33 2
import copy
34

35 2
from numpy import random
36

37

38
#==============================================================================
39
# Functions
40
#==============================================================================
41

42 2
def _find_embedded_brackets(string, in_char, out_char):
43
    """
44
    Finds the substring surrounded by the in_char and out_char
45
    intended use is to identify embedded bracketed sequences
46

47
    string - a string you want separated
48
    in_char - regular expression for the character you're looking for '\(' for '('
49
    out_char - regular expression for the closing character such as '\)' for ')'
50

51
    string = "[#1$(*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br]):1]"
52
    sub_string, in_idx, out_idx = _find_embedded_brackets(string, '\(','\)')
53
    # sub_string = (*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br])  in_idx = 4, out_idx = 50
54
    """
55 2
    in_list = [m.start() for m in re.finditer(in_char, string)]
56 2
    out_list = [m.start() for m in re.finditer(out_char, string)]
57
    # If no occurance of the in_char return an empty string
58 2
    if len(in_list) == 0:
59 2
        return "", -1, -1
60
    # If no out_char returns the first in_char to the end
61 2
    if len(out_list) == 0:
62 0
        return string[in_list[0]:], in_list[0], -1
63

64
    # Otherwise find closure from the first in_char
65 2
    list_idx = 0
66 2
    while list_idx < len(in_list) - 1:
67 2
        if in_list[list_idx+1] > out_list[list_idx]:
68 2
            break
69 2
        list_idx+=1
70 2
    in_idx = in_list[0]
71 2
    out_idx = out_list[list_idx]
72 2
    return string[in_idx:out_idx+1], in_idx, out_idx
73

74 2
def _convert_embedded_smirks(smirks):
75
    """
76
    Converts a SMIRKS string with the $(...) in an atom to the
77
    form expected by the environment parser
78

79
    smirks = any smirks string, if no $(...) then the original smirks is returned
80

81
    initial_smirks = "[#1$(*~[#6]):1]"
82
    new_smirks = _convert_embedded_smirks(initial_smirks)
83
    # new_smirks = [#1:1]~[#6]
84
    """
85 2
    a_out = 0
86 2
    while smirks.find('$(') != -1:
87
        # Find first atom
88 2
        atom, a_in, a_out = _find_embedded_brackets(smirks, r'\[', r'\]')
89 2
        d = atom.find('$(')
90
        # Find atom with the $ string embedded
91 2
        while d == -1:
92 2
            atom, temp_in, temp_out = _find_embedded_brackets(smirks[a_out+1:], r'\[', r'\]')
93 2
            a_in = a_out + temp_in + 1
94 2
            a_out += temp_out + 1
95 2
            d = atom.find('$(')
96

97
        # Store the smirks pattern before and after the relevant atom
98 2
        pre_smirks = smirks[:a_in]
99 2
        post_smirks = smirks[a_out+1:]
100

101
        # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
102 2
        match = re.match(r'(\d+)',post_smirks)
103 2
        if match is not None: # leftover starts with int
104 2
            ring_out = re.findall(r'(\d+)',post_smirks)[0]
105
            # update post_smirks
106 2
            post_smirks = post_smirks[match.end():]
107
        else:
108 2
            ring_out = ''
109

110 2
        embedded, p_in, p_out = _find_embedded_brackets(atom, r'\(', r'\)')
111
        # two forms of embedded strings $(*~stuff) or $([..]~stuff)
112
        # in the latter case the first atom refers the current atom
113 2
        if embedded[1] == '[':
114 2
            first, f_in, f_out = _find_embedded_brackets(embedded, r'\[',r'\]')
115 2
            first = _convert_embedded_smirks(first)
116 2
            new_atom = atom[:d]+first[1:-1]+atom[p_out+1:]
117 2
            embedded = embedded[f_out+1:]
118
            # if embedded is empty between brackets, remove it
119 2
            if embedded.replace('(','').replace(')','') == '':
120 0
                embedded = ''
121

122 2
        elif embedded[1] == '*': # embedded[1] = *
123 2
            new_atom = atom[:d]+atom[p_out+1:]
124 2
            embedded = embedded[2:]
125

126
        else: # embedded starts with a "no bracket" atom such as 'C'
127 2
            embedded = embedded[1:] # remove leading '('
128
            # atoms by symbol don't need brackets, this covers atomic symbols and aromatic atoms
129 2
            no_bracket = r'(!?[A-Z][a-z]?|!?[cnops])'
130 2
            match = re.match(no_bracket, embedded)
131 2
            if match is not None:
132 2
                new_atom = atom[:d]+embedded[:match.end()]+atom[p_out+1:]
133 2
                embedded = embedded[match.end():]
134
            else:
135 0
                new_atom = atom[:d]+atom[p_out+1]
136

137
        # Look for ring inside embedded SMIRKS "[#6$(*1CCC1)]"
138 2
        match = re.match(r'(\d+)', embedded)
139 2
        if match is not None: # embedded starts with an int
140 2
            ring_in = re.findall(r'(\d+)', embedded)[0]
141 2
            embedded = '(' + embedded[match.end():]
142
        else:
143 2
            ring_in = ''
144 2
            if embedded != '':
145 2
                embedded = '(' + embedded
146

147
        # Make new smirks
148 2
        smirks = pre_smirks+new_atom+ring_out+ring_in+embedded+post_smirks
149

150 2
    return smirks
151

152 2
def _remove_blanks_repeats(init_list, remove_list = ['']):
153
    """
154
    Returns the input list 'init_list'
155
    without any repeating entries or blank strings ''
156
    """
157 2
    final_list = [item for item in init_list if item not in remove_list]
158 2
    return list( set(final_list) )
159

160

161 2
class SMIRKSMismatchError(Exception):
162
    """
163
    Exception for cases where smirks are inappropriate
164
    for the environment type they are being parsed into
165
    """
166 2
    def __init__(self, msg):
167 0
        super(SMIRKSMismatchError, self).__init__(self,msg)
168 0
        self.msg = msg
169

170 2
class SMIRKSParsingError(Exception):
171
    """
172
    Exception for when SMIRKS are not parseable for any environment
173
    """
174 2
    def __init__(self, msg):
175 2
        super(SMIRKSParsingError, self).__init__(self, msg)
176 2
        self.msg = msg
177

178 2
class ChemicalEnvironment(object):
179
    """Chemical environment abstract base class that matches an atom, bond, angle, etc.
180
    """
181 2
    class Atom(object):
182
        """Atom representation, which may have some or_types and ANDtypes properties.
183

184
        Attributes
185
        ----------
186
        or_types : list of tuples in the form (base, [list of decorators])
187
            where bases and decorators are both strings
188
            The descriptor types that will be combined with logical OR
189
        and_types : list of string
190
            The descriptor types  that will be combined with logical AND
191
        """
192 2
        def __init__(self, or_types = None, and_types = None, index = 0, ring = None):
193
            """Initialize an Atom object with optional descriptors.
194

195
            Parameters
196
            -----------
197
            or_types: list of tuples for ORbases and ORdecorators,
198
                in the form (base, [list of decorators])
199
                optional, default = []
200
            and_types: list of str
201
                strings that will be AND'd together in a SMARTS
202
                optional, default = None
203
            index : int
204
                If greater than zero,
205
                the specified index will be attached as a SMIRKS index (e.g. '[#6:1]')
206
                otherwise, it is only used for accessing atom information
207
            ring : int, optional, default = None
208
                If not None, the specified ring index will be attached at the end of the atom i.e. '[#6:1]1'
209
            """
210
            # List of 2 tuples in the form [ (ORbase, ORdecorator), ...]
211 2
            if or_types is not None:
212 2
                self._or_types = copy.deepcopy(or_types)
213
            else:
214 2
                self._or_types = list()
215

216
            # Set of strings that will be AND'd to the the end
217 2
            if and_types is not None:
218 2
                self._and_types = list(copy.deepcopy(and_types))
219
            else:
220 2
                self._and_types = list()
221

222 2
            self.index = index
223 2
            self.ring = ring
224 2
            self.is_atom = True
225

226 2
        def is_generic(self):
227
            """
228
            returns True if there are no decorators on this atom
229
            (IMPORTANT: this is newly added and in chemper only as of 8/9/18)
230
            """
231 0
            if not self._or_types:
232 0
                if not self._and_types:
233 0
                    return True
234 0
            return False
235

236 2
        def as_smarts(self):
237
            """Return the atom representation as SMARTS.
238

239
            Returns
240
            --------
241
            smarts : str
242
            The SMARTS string for this atom
243
            """
244

245 2
            smarts = '['
246

247
            # Add the OR'd features
248 2
            if self._or_types:
249 2
                or_list = list()
250 2
                for (base, or_decorators) in self._or_types:
251 2
                    if len(base) > 0 and base[0] == '$':
252
                        # after a $base an explicit '&' is necessary
253 0
                        if or_decorators:
254 0
                            or_bit = base + '&' + ''.join(or_decorators)
255
                        else:
256 0
                            or_bit = base
257
                    else: # base doesn't start with $
258 2
                        or_bit = base + ''.join(or_decorators)
259 2
                    or_list.append(or_bit)
260 2
                smarts += ','.join(or_list)
261
            else:
262 2
                smarts += '*'
263

264 2
            if len(self._and_types) > 0:
265 2
                smarts += ';' + ';'.join(self._and_types)
266

267 2
            if self.ring is not None:
268 2
                return smarts + ']' + str(self.ring)
269
            else:
270 2
                return smarts + ']'
271

272 2
        def as_smirks(self):
273
            """Return the atom representation as SMIRKS.
274

275
            Returns
276
            --------
277
            smirks : str
278
            The SMIRKS string for this atom
279
            """
280 2
            smirks = self.as_smarts()
281

282
            # No index specified so SMIRKS = SMARTS
283 2
            if self.index <= 0:
284 2
                return smirks
285

286
            # Add label to the end of SMARTS
287 2
            sub_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]')
288 2
            end_string = smirks[end:]
289 2
            return sub_string[:-1] + ':' + str(self.index) + end_string
290

291 2
        def add_or_type(self, or_base, or_decorators):
292
            """
293
            Adds ORtype to the set for this atom.
294

295
            Parameters
296
            --------
297
            or_base: string, such as '#6'
298
            or_decorators: list of strings, such as ['X4','+0']
299
            """
300 2
            or_decorators = _remove_blanks_repeats(or_decorators, ['', or_base])
301 2
            self._or_types.append((or_base, or_decorators))
302

303 2
        def add_and_type(self, and_type):
304
            """
305
            Adds ANDtype to the set for this atom.
306

307
            Parameters
308
            --------
309
            and_type: string
310
                added to the list of and_types for this atom
311
            """
312 2
            self._and_types.append(and_type)
313 2
            self._and_types = _remove_blanks_repeats(self._and_types)
314

315 2
        @property
316
        def or_types(self):
317
            """Provides the or_types in this atom"""
318 2
            return self._or_types
319

320 2
        @or_types.setter
321
        def or_types(self, new_or_types):
322
            """
323
            sets new or_types for this atom
324

325
            Parameters
326
            ----------
327
            new_or_types: list of tuples in the form (base, [ORdecorators])
328
                for example: ('#6', ['X4','H0','+0']) --> '#6X4H0+0'
329
            """
330 2
            self._or_types = list()
331 2
            if new_or_types is not None:
332 2
                for (base, decs) in new_or_types:
333 2
                    adjusted_decs = _remove_blanks_repeats(decs, ['', base])
334 2
                    self._or_types.append((base, adjusted_decs))
335

336 2
        @property
337
        def and_types(self):
338
            """
339
            returns a copy of the list of and_types for this atom
340
            """
341 2
            return list(copy.deepcopy(self._and_types))
342

343 2
        @and_types.setter
344
        def and_types(self, new_and_types):
345
            """
346
            sets new and_types for this atom
347

348
            Parameters
349
            ----------
350
            new_and_types: list of strings
351
                strings that will be AND'd together in a SMARTS
352
            """
353 2
            if new_and_types is None:
354 0
                self._and_types = list()
355
            else:
356 2
                self._and_types = _remove_blanks_repeats(new_and_types)
357

358 2
    class Bond(Atom):
359
        """Bond representation, which may have ORtype and ANDtype descriptors.
360

361
        Attributes
362
        ----------
363
        or_types : list of tuples of ORbases and ORdecorators
364
            in form (base: [list of decorators])
365
            The ORtype types that will be combined with logical OR
366
        and_types : list of string
367
            The and_types that will be combined with logical AND
368

369
        """
370
        # Implementation identical to atoms apart from what is put in the asSMARTS/asSMIRKS strings
371

372 2
        def __init__(self, or_types = None, and_types = None, index = 0):
373
            """
374
            Parameters
375
            -----------
376
            or_types: list of tuples, optional, default = None
377
                tuples have form (base, [ORdecorators])
378
                bond descriptors that will be OR'd together in a SMARTS
379
            and_types: list of str, optional, default = None
380
                strings that will be AND'd together in a SMARTS
381
            index: integer, default = 0
382
                This is for book keeping inside environments and will not be shown in SMARTS or SMIRKS
383
                example: bond1 in a Bond is the bond between atom1 and atom2
384
            """
385 2
            super(ChemicalEnvironment.Bond,self).__init__(or_types, and_types, index)
386 2
            self.is_atom = False
387 2
            return
388

389 2
        def as_smarts(self):
390
            """Return the atom representation as SMARTS.
391

392
            Returns
393
            --------
394
            smarts : str
395
                The SMARTS string for just this atom
396
            """
397 2
            if self._or_types:
398 2
                or_combos = list()
399 2
                for (ORbase, ORdecorators) in self._or_types:
400 2
                    or_combos.append(ORbase + ''.join(ORdecorators))
401 2
                smarts = ','.join(or_combos)
402
            else:
403 2
                smarts = '~'
404

405 2
            if len(self._and_types) > 0:
406 2
                smarts += ';' + ';'.join(self._and_types)
407

408 2
            return smarts
409

410 2
        def as_smirks(self):
411
            """
412
            Returns
413
            --------
414
            smarts : str
415
                The SMIRKS string for just this bond
416
            """
417
            #the same as asSMARTS()
418
            #    for consistency asSMARTS() or asSMIRKS() can be called
419
            #    for all environment objects
420 2
            return self.as_smarts()
421

422 2
        def get_order(self):
423
            """
424
            Returns a float for the order of this bond
425
            for multiple or_types or ~ it returns the minimum possible order
426
            the intended application is for checking valence around a given atom
427
            """
428
            # Minimum order for empty or_types is 1:
429 2
            if not self._or_types:
430 0
                return 1
431

432 2
            order_dict = {'~':1.,
433
                    '-':1., ':': 1.5, '=':2., '#':3.,
434
                    '!-':1.5, '!:':1., '!=':1., '!#':1.}
435 2
            order_list = [order_dict.get(base,1) for (base, decor) in self._or_types]
436 2
            return min(order_list)
437

438 2
    def __init__(self, smirks = None, label = None, replacements = None):
439
        """Initialize a chemical environment abstract base class.
440

441
        smirks = string, optional
442
            if smirks is not None, a chemical environment is built
443
            from the provided SMIRKS string
444
        label = anything, optional
445
            intended to be used to label this chemical environment
446
            could be a string, int, or float, or anything
447
        replacements = list of lists, optional,
448
            [substitution, smarts] form for parsing SMIRKS
449
        """
450
        # Define the regular expressions used for all SMIRKS decorators
451
        # There are a limited number of descriptors for smirks string they are:
452
        # That is a # followed by one or more ints w/or w/o at ! in front '!#16'
453 2
        element_num = "!?[#]\d+"
454
        # covers element symbols, i.e. N,C,O,Br not followed by a number
455 2
        element_sym = "!?[A-Z][a-z]?"
456
        # covers element symbols that are aromatic:
457 2
        aro_sym = "!?[cnops]"
458
        # replacement strings
459 2
        replace_str = "\$\w+"
460
        # a or A w/ or w/o a ! in front 'A'
461 2
        aro_ali = "!?[aA]"
462
        # the decorators (D,H,j,r,V,X,^) followed by one or more integers
463 2
        needs_int = "!?[DjVX^]\d+"
464
        # R(x), +, - do not need to be followed by a integer w/ or w/o a ! 'R2'
465 2
        optional_int = "!?[RHhrx+-]\d*"
466
        # chirality options, "@", "@@", "@int" w/ or w/o a ! in front
467 2
        chirality = "!?[@]\d+|!?[@]@?"
468

469
        # Generate RegEx string for decorators:
470 2
        self.no_bracket_atom_reg = r'('+'|'.join([element_sym, aro_sym, replace_str])+')'
471 2
        self.atom_reg = '|'.join([element_num, aro_ali, needs_int,
472
                                  optional_int, chirality, replace_str,
473
                                  element_sym, aro_sym])
474 2
        self.atom_reg = r'('+self.atom_reg+')'
475

476
        # Define bond regular expression options below in order:
477
        # single, double, triple, aromatic, directional up bond, directional down bond
478
        # Each can have ! in from and directional can have ? after
479
        # up and down bonds have lots of \ to fit the python requirements
480 2
        self.bond_regs = ['!?[-]', '!?[=]', '!?[#]', '!?[:]', '!?[@]', '!?[\\\\]\\??', '!?[\\/]\\??']
481 2
        self.bond_regs = r'('+'|'.join(self.bond_regs)+')'
482
        # Note, not looking for ~ because that is used for empty bonds
483

484
        # Create an empty graph which will store Atom objects.
485 2
        self._graph = nx.Graph()
486 2
        self.label = label
487 2
        self.replacements = replacements
488

489 2
        if smirks is not None:
490
            # Check that it is a valid SMIRKS
491 2
            if not self.is_valid(smirks):
492 2
                raise SMIRKSParsingError("Error Provided SMIRKS ('%s') was \
493
not parseable with current toolkit" % smirks)
494

495
            # Check for SMIRKS not supported by Chemical Environments
496 2
            if smirks.find('.') != -1:
497 0
                raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \
498
contains a '.' indicating multiple molecules in the same pattern. This type \
499
of pattern is not parseable into ChemicalEnvironments" % smirks)
500 2
            if smirks.find('>') != -1:
501 0
                raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \
502
contains a '>' indicating a reaction. This type of pattern is not parseable \
503
into ChemicalEnvironments." % smirks)
504

505
            # try parsing into environment object
506 2
            try:
507 2
                self._parse_smirks(smirks)
508 0
            except:
509 0
                raise SMIRKSParsingError("Error SMIRKS (%s) was not parseable\
510
                        into a ChemicalEnvironment" % smirks)
511

512
        # Check that the created Environment is valid
513 2
        if not self.is_valid():
514 0
            raise SMIRKSParsingError("Input SMIRKS (%s), converted to %s \
515
                    is now invalid" % (smirks, self.as_smirks()))
516

517 2
        return
518

519 2
    def _graph_remove_node(self, node):
520 2
        self._graph.remove_node(node)
521 2
        return True
522

523 2
    def _graph_nodes(self, data=False):
524
        """
525
        When data is False returns a list of nodes in graph
526
        otherwise returns a dictionary in the form {node: data}
527
        """
528 2
        if data:
529 0
            return dict(self._graph.nodes(data=True))
530 2
        return list(self._graph.nodes())
531

532 2
    def _graph_edges(self, data=False, node=None):
533
        """
534
        returns a list of tuples,
535
        If data is False it has the form [(node1, node2)]
536
        Otherwise it includes the data [(node1, node2, data_dictionary)]
537
        If node is not None then the list includes only edges connected to that node
538
        """
539 2
        if node is None:
540 2
            return list(self._graph.edges(data=data))
541 2
        return list(self._graph.edges(node, data=data))
542

543 2
    def _graph_neighbors(self, node):
544
        """
545
        returns a list of neighbors for the given node
546
        """
547 2
        return list(self._graph.neighbors(node))
548

549 2
    def _graph_get_edge_data(self, node1, node2):
550
        """
551
        returns a dictionary for the data at the edged connecting
552
        node1 and node2 in graph
553
        """
554 2
        return self._graph.get_edge_data(node1, node2)
555

556 2
    def is_valid(self, smirks = None):
557
        """
558
        Returns if the environment is valid, that is if it
559
        creates a parseable SMIRKS string.
560
        """
561 2
        if smirks is None:
562 2
            smirks = self._as_smirks()
563 2
        from chemper.chemper_utils import is_valid_smirks
564 2
        return is_valid_smirks(smirks)
565

566 2
    def _parse_smirks(self,input_smirks):
567
        """
568
        This function converts a smirks string to a Chemical Environment
569
        """
570 2
        smirks = _convert_embedded_smirks(input_smirks)
571 2
        atoms = dict() # store created atom
572 2
        idx = 1 # current atom being created
573 2
        store = list() # to store indices while branching
574 2
        bonding_to = idx # which atom are we going to bond to
575

576 2
        atom_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]')
577

578 2
        if start != 0: # first atom is not in square brackets
579 2
            if start != -1:
580 2
                start_string = smirks[:start]
581
            else:
582 2
                start_string = smirks
583

584
            # Check for atoms not between square brackets
585 2
            split = re.split(self.no_bracket_atom_reg, start_string)
586 2
            atom_string = split[1]
587

588
            # update leftover for this condition
589 2
            if start != -1: # there is at least 1 more bracketed atom
590 2
                leftover = ''.join(split[2:])+smirks[start:]
591
            else:
592 2
                leftover = ''.join(split[2:])
593

594
        else: # First atom is in square brackets
595 2
            leftover = smirks[end+1:]
596
            # remove square brackets for parsing
597 2
            atom_string = atom_string[1:-1]
598

599
        # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
600 2
        match = re.match(r'(\d+)',leftover)
601 2
        if match is not None:  # leftover starts with int
602 2
            ring = re.findall(r'(\d+)',leftover)[0]
603 2
            leftover = leftover[match.end():]
604
        else:
605 2
            ring = None
606

607
        # Get atom information and create first atom
608 2
        ors, ands, index = self._get_atom_info(atom_string)
609 2
        new_atom = self.add_atom(None, new_or_types= ors, new_and_types= ands,
610
                                 new_atom_index= index, new_atom_ring= ring, beyond_beta= True)
611 2
        atoms[idx] = new_atom
612

613 2
        while len(leftover) > 0:
614 2
            idx += 1
615
            # Check for branching
616 2
            if leftover[0] == ')':
617 2
                bonding_to = store.pop()
618 2
                leftover = leftover[1:]
619 2
                continue
620

621 2
            if leftover[0] == '(':
622 2
                store.append(bonding_to)
623 2
                leftover = leftover[1:]
624 2
                continue
625

626
            # find beginning and end of next [atom]
627 2
            atom_string, start, end = _find_embedded_brackets(leftover, r'\[', r'\]')
628

629 2
            if start != -1: # no more square brackets
630 2
                bond_string = leftover[:start]
631
            else:
632 2
                bond_string = leftover
633

634
            # Check for atoms not between square brackets
635 2
            bond_split = re.split(self.no_bracket_atom_reg, bond_string)
636
            # Next atom is not in brackets for example C in "[#7:1]-C"
637 2
            if len(bond_split) > 1:
638 2
                bond_string = bond_split[0]
639 2
                atom_string = '['+bond_split[1]+']'
640
                # update leftover for this condition
641 2
                if start != -1: # ther is at least 1 more bracketed atom
642 2
                    leftover = ''.join(bond_split[2:])+leftover[start:]
643
                else:
644 2
                    leftover = ''.join(bond_split[2:])
645

646
            else: # next atom is in the brackets [atom]
647
                # bond and atom string stay the same, update leftover
648 2
                leftover = leftover[end+1:]
649

650
            # Get bond and atom info
651 2
            b_or, b_and = self._get_bond_info(bond_string)
652 2
            a_or, a_and, index = self._get_atom_info(atom_string[1:-1])
653

654
            # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
655 2
            match = re.match(r'(\d+)',leftover)
656 2
            if match is not None: # leftover starts with int
657 2
                ring = re.findall(r'(\d+)',leftover)[0]
658 2
                leftover = leftover[match.end():]
659
            else:
660 2
                ring = None
661

662
            # create new atom
663 2
            new_atom = self.add_atom(atoms[bonding_to], bond_or_types=b_or,
664
                                     bond_and_types=b_and, new_or_types=a_or, new_and_types=a_and,
665
                                     new_atom_index=index, new_atom_ring=ring, beyond_beta=True)
666

667
            # update state
668 2
            atoms[idx] = new_atom
669 2
            bonding_to = idx
670 2
        return
671

672 2
    def _get_atom_info(self, atom):
673
        """
674
        given atom string, returns or_types, and_types, and index
675
        """
676
        # Find atom index
677 2
        colon = atom.find(':')
678 2
        if colon == -1:
679 2
            index = None
680
        else:
681 2
            index = int(atom[colon+1:])
682 2
            atom = atom[:colon]
683

684 2
        split = atom.split(';')
685

686
        # Get and_types (and split them if they don't use ;)
687 2
        and_types = list()
688 2
        for a in split[1:]:
689 2
            and_types += re.findall(self.atom_reg, a)
690

691
        # Get or_types
692 2
        or_list = split[0].split(',')
693 2
        or_types = list()
694
        # Separate or_types into bases and decorators
695 2
        for OR in or_list:
696 2
            or_base, or_decors = self._separate_or_types(OR)
697 2
            if or_base is not None:
698 2
                or_types.append((or_base, or_decors))
699

700 2
        return or_types, and_types, index
701

702 2
    def _separate_or_types(self, or_type):
703
        """
704
        Separates ORtype (i.e. "#6X4R+0") into
705
        a base and decorators (i.e. '#6', ['X4','R','+0'] )
706
        """
707
        # special case 1: wild card
708 2
        if or_type == '*':
709 2
            return None, []
710

711
        # if ORbase is a wildcard
712 2
        if or_type[0] == '*':
713 2
            return '*', re.findall(self.atom_reg, or_type[1:])
714

715
        # Split up decorators by RegEx strings for atoms
716 2
        split = re.findall(self.atom_reg, or_type)
717 2
        if len(split) == 0:
718 2
            return None, []
719

720 2
        base = split[0]
721 2
        decs = _remove_blanks_repeats(split[1:], ['',base])
722 2
        return base, decs
723

724 2
    def _get_bond_info(self, bond):
725
        """
726
        given bond strings returns or_types and and_types
727
        """
728
        # blank bond string is single or aromatic
729
        # empty or_types in Chemical Environments are treated as ~ bonds
730 2
        if bond == "":
731 2
            and_types = list()
732 2
            or_types = [('-', []), (':', [])]
733 2
            return or_types, and_types
734

735
        # AND types indicated by ; at the end
736 2
        split = bond.split(';')
737 2
        and_types = list()
738 2
        for a in split[1:]:
739 2
            and_types += re.findall(self.bond_regs, a)
740

741
        # or_types are divided by ,
742 2
        or_list = split[0].split(',')
743 2
        or_types = list()
744 2
        for OR in or_list:
745 2
            if OR == '~':
746 2
                continue
747 2
            or_divide = re.findall(self.bond_regs, OR)
748 2
            if len(or_divide) > 0:
749 2
                or_types.append((or_divide[0], or_divide[1:]))
750

751 2
        return or_types, and_types
752

753 2
    def as_smirks(self, smarts = False):
754
        """
755
        Returns a SMIRKS representation of the chemical environment
756

757
        Parameters
758
        -----------
759
        smarts: optional, boolean
760
            if True, returns a SMARTS instead of SMIRKS without index labels
761
        """
762 2
        init_atom = self.select_atom(1)
763 2
        return self._as_smirks(init_atom, None, smarts)
764

765 2
    def _as_smirks(self, initial_atom = None, neighbors = None, smarts = False):
766
        """Return a SMIRKS representation of the chemical environment.
767

768
        Parameters
769
        -----------
770
        initalAtom = optional, atom object
771
            This is randomly selected if not chosen.
772
        neighbors = optional, list of atom objects
773
            This is all of the initalAtom neighbors if not specified
774
            generally this is used only for the recursive calls
775
            so initial atoms are not reprinted
776
        smarts = optional, boolean
777
            if True, returns a SMARTS string instead of SMIRKS
778
        """
779
        # If empty chemical environment
780 2
        if len(self._graph_nodes()) == 0:
781 0
            return ""
782

783 2
        if initial_atom is None:
784 2
            initial_atom = self.get_atoms()[0]
785

786 2
        if neighbors is None:
787 2
            neighbors = self._graph_neighbors(initial_atom)
788

789
        # sort neighbors to guarantee order is constant
790 2
        neighbors = sorted(neighbors, key=lambda atom: atom.as_smirks())
791

792
        # initialize smirks for starting atom
793 2
        if smarts:
794 2
            smirks = initial_atom.as_smarts()
795
        else:
796 2
            smirks = initial_atom.as_smirks()
797

798
        # loop through neighbors
799 2
        for idx, neighbor in enumerate(neighbors):
800
            # get the SMIRKS for the bond between these atoms
801
            # bonds are the same if smarts or smirks
802 2
            bond_edge = self._graph_get_edge_data(initial_atom, neighbor)
803 2
            bond_smirks = bond_edge['bond'].as_smirks()
804

805
            # Get the neighbors for this neighbor
806 2
            new_neighbors = self._graph_neighbors(neighbor)
807
            # Remove initialAtom so it doesn't get reprinted
808 2
            new_neighbors.remove(initial_atom)
809

810
            # Call asSMIRKS again to get the details for that atom
811 2
            atom_smirks = self._as_smirks(neighbor, new_neighbors, smarts)
812

813
            # Use ( ) for branch atoms (all but last)
814 2
            if idx < len(neighbors) - 1:
815 2
                smirks += '(' + bond_smirks + atom_smirks + ')'
816
            # This is for the atoms that are a part of the main chain
817
            else:
818 2
                smirks += bond_smirks + atom_smirks
819

820 2
        return smirks
821

822 2
    def select_atom(self, descriptor = None):
823
        """Select a random atom fitting the descriptor.
824

825
        Parameters
826
        ----------
827
        descriptor: optional, None
828
            None - returns any atom with equal probability
829
            int - will return an atom with that index
830
            'Indexed' - returns a random indexed atom
831
            'Unindexed' - returns a random unindexed atom
832
            'Alpha' - returns a random alpha atom
833
            'Beta' - returns a random beta atom
834

835
        Returns
836
        --------
837
        a single Atom object fitting the description
838
        or None if no such atom exists
839
        """
840 2
        if descriptor is None or isinstance(descriptor,str):
841 2
            atoms = self.get_component_list('atom', descriptor)
842 2
            if len(atoms) == 0:
843 2
                return None
844 2
            return random.choice(atoms)
845

846 2
        if not isinstance(descriptor, int):
847 0
            return None
848

849 2
        for atom in self.get_atoms():
850 2
            if atom.index == descriptor:
851 2
                return atom
852 2
        return None
853

854 2
    def get_component_list(self, component_type, descriptor = None):
855
        """
856
        Returns a list of atoms or bonds matching the descriptor
857

858
        Parameters
859
        -----------
860
        component_type: string: 'atom' or 'bond'
861
        descriptor: string, optional
862
            'all', 'Indexed', 'Unindexed', 'Alpha', 'Beta'
863

864
        Returns
865
        -------
866
        """
867 2
        des_list = ['indexed', 'unindexed', 'alpha', 'beta', 'all']
868 2
        if descriptor is not None:
869 2
            d = descriptor.lower()
870 2
            if d not in des_list:
871 0
                raise LookupError("Error: descriptor must be in the list [%s]" %
872
                                ', '.join(des_list))
873
        else:
874 2
            d = None
875

876 2
        if not component_type.lower() in ['atom', 'bond']:
877 0
            raise LookupError("Error: component_type must be 'atom' or 'bond'")
878

879 2
        if component_type.lower() == 'atom':
880 2
            if d == 'indexed':
881 2
                return self.get_indexed_atoms()
882 2
            elif d == 'unindexed':
883 2
                return self.get_unindexed_atoms()
884 2
            elif d == 'alpha':
885 2
                return self.get_alpha_atoms()
886 2
            elif d == 'beta':
887 2
                return self.get_beta_atoms()
888
            else:
889 2
                return self.get_atoms()
890

891 2
        elif component_type.lower() == 'bond':
892 2
            if d == 'indexed':
893 2
                return self.get_indexed_bonds()
894 2
            elif d == 'unindexed':
895 2
                return self.get_unindexed_bonds()
896 2
            elif d == 'alpha':
897 2
                return self.get_alpha_bonds()
898 2
            elif d == 'beta':
899 2
                return self.get_beta_bonds()
900

901 2
            return self.get_bonds()
902

903 0
        return None
904

905 2
    def select_bond(self, descriptor = None):
906
        """Select a random bond fitting the descriptor.
907

908
        Parameters
909
        ----------
910
        descriptor: optional, None
911
            None - returns any bond with equal probability
912
            int - will return an bond with that index
913
            'Indexed' - returns a random indexed bond
914
            'Unindexed' - returns a random unindexed bond
915
            'Alpha' - returns a random alpha bond
916
            'Beta' - returns a random beta bond
917

918
        Returns
919
        --------
920
        a single Bond object fitting the description
921
        or None if no such atom exists
922
        """
923 2
        if descriptor is None or isinstance(descriptor,str):
924 2
            bonds = self.get_component_list('bond', descriptor)
925 2
            if len(bonds) == 0:
926 2
                return None
927 2
            return random.choice(bonds)
928

929 2
        if not isinstance(descriptor, int):
930 0
            return None
931 2
        for bond in self.get_bonds():
932 2
            if bond.index == descriptor:
933 2
                return bond
934

935 2
        return None
936

937 2
    def add_atom(self, bond_to_atom, bond_or_types = None, bond_and_types = None,
938
                 new_or_types = None, new_and_types = None, new_atom_index = None,
939
                 new_atom_ring = None, beyond_beta = False):
940
        """Add an atom to the specified target atom.
941

942
        Parameters
943
        -----------
944
        bond_to_atom: atom object, required
945
            atom the new atom will be bound to
946
        bond_or_types: list of tuples, optional
947
            strings that will be used for the or_types for the new bond
948
        bond_and_types: list of strings, optional
949
            strings that will be used for the and_types for the new bond
950
        new_or_types: list of strings, optional
951
            strings that will be used for the or_types for the new atom
952
        new_and_types: list of strings, optional
953
            strings that will be used for the and_types for the new atom
954
        new_atom_index: int, optional
955
            integer label that could be used to index the atom in a SMIRKS string
956
        new_atom_ring: int, optional
957
            integer used to track the openning and closing of rings in SMARTS/SMIRKS patterns
958
        beyond_beta: boolean, optional
959
            if True, allows bonding beyond beta position
960

961
        Returns
962
        --------
963
        new_atom: atom object for the newly created atom
964
        """
965 2
        if bond_to_atom is None:
966 2
            if len(self._graph_nodes()) > 0:
967 0
                return None
968

969 2
            if new_atom_index is None:
970 2
                new_atom_index = 0
971

972 2
            new_atom = self.Atom(new_or_types, new_and_types, new_atom_index, new_atom_ring)
973 2
            self._graph.add_node(new_atom)
974 2
            return new_atom
975

976
        # Check if we can get past beta position
977 2
        bond_to_index = bond_to_atom.index
978 2
        if bond_to_index < 0 and not beyond_beta:
979 0
            return None
980

981
        # determine the type integer for the new atom and bond
982 2
        if new_atom_index is None:
983 2
            if bond_to_index > 0:
984 2
                new_atom_index = 0
985
            else:
986 2
                new_atom_index = bond_to_index - 1
987

988 2
        if new_atom_index > 0 and bond_to_index > 0:
989 2
            bond_index = max(new_atom_index, bond_to_index) - 1
990
        else:
991 2
            bond_index = new_atom_index
992

993
        # create new bond
994 2
        new_bond = self.Bond(bond_or_types, bond_and_types, bond_index)
995

996
        # create new atom
997 2
        new_atom = self.Atom(new_or_types, new_and_types, new_atom_index, new_atom_ring)
998

999
        # Add node for new_atom
1000 2
        self._graph.add_node(new_atom)
1001

1002
        # Connect original atom and new atom
1003 2
        self._graph.add_edge(bond_to_atom, new_atom, bond = new_bond)
1004

1005 2
        return new_atom
1006

1007 2
    def remove_atom(self, atom, only_empty=False):
1008
        """Remove the specified atom from the chemical environment.
1009
        if the atom is not indexed for the SMIRKS string or
1010
        used to connect two other atoms.
1011

1012
        Parameters
1013
        ----------
1014
        atom: atom object, required
1015
            atom to be removed if it meets the conditions.
1016
        only_empty: boolean, optional
1017
            True only an atom with no and_types and 1 ORtype can be removed
1018

1019
        Returns
1020
        --------
1021
        Boolean True: atom was removed, False: atom was not removed
1022
        """
1023
        # labeled atoms can't be removed
1024 2
        if atom.index > 0:
1025 2
            return False
1026

1027
        # Atom connected to more than one other atom cannot be removed
1028 2
        if len(self._graph_neighbors(atom)) > 1:
1029 2
            return False
1030

1031
        # if you can remove "decorated atoms" remove it
1032 2
        if not only_empty:
1033 2
            self._graph_remove_node(atom)
1034 2
            return True
1035

1036 0
        if len(atom.ANDtypes) > 0:
1037 0
            return False
1038 0
        elif len(atom.ORtypes) > 1:
1039 0
            return False
1040

1041 0
        self._graph_remove_node(atom)
1042 0
        return True
1043

1044 2
    def get_atoms(self):
1045
        """
1046
        Returns
1047
        -------
1048
        list of atoms in the environment
1049
        """
1050 2
        return self._graph_nodes()
1051

1052 2
    def get_bonds(self, atom=None):
1053
        """
1054
        Parameters
1055
        ----------
1056
        atom: Atom object, optional, returns bonds connected to atom
1057
        returns all bonds in fragment if atom is None
1058

1059
        Returns
1060
        --------
1061
        a complete list of bonds in the fragment
1062
        """
1063 2
        if atom is None:
1064 2
            edge_list = self._graph_edges(data=True)
1065 2
            bonds = [data['bond'] for a1, a2, data in edge_list]
1066
        else:
1067 0
            bonds = []
1068 0
            for (a1, a2, info) in self._graph_edges(data=True, node=atom):
1069 0
                bonds.append(info['bond'])
1070

1071 2
        return bonds
1072

1073 2
    def get_bond(self, atom1, atom2):
1074
        """
1075
        Get bond betwen two atoms
1076

1077
        Parameters
1078
        -----------
1079
        atom1 and atom2: atom objects
1080

1081
        Returns
1082
        --------
1083
        bond object between the atoms or None if no bond there
1084
        """
1085 2
        if atom2 in self._graph_neighbors(atom1):
1086 2
            return self._graph_get_edge_data(atom1, atom2)['bond']
1087
        else:
1088 2
            return None
1089

1090 2
    def get_indexed_atoms(self):
1091
        """
1092
        returns the list of Atom objects with an index
1093
        """
1094 2
        index_atoms = []
1095 2
        for atom in self.get_atoms():
1096 2
            if atom.index > 0:
1097 2
                index_atoms.append(atom)
1098 2
        return index_atoms
1099

1100 2
    def get_unindexed_atoms(self):
1101
        """
1102
        returns a list of Atom objects that are not indexed
1103
        """
1104 2
        unindexed_atoms = []
1105 2
        for atom in self.get_atoms():
1106 2
            if atom.index < 1:
1107 2
                unindexed_atoms.append(atom)
1108 2
        return unindexed_atoms
1109

1110 2
    def get_alpha_atoms(self):
1111
        """
1112
        Returns a list of atoms alpha to any indexed atom
1113
            that are not also indexed
1114
        """
1115 2
        alpha_atoms = []
1116 2
        for atom in self.get_atoms():
1117 2
            if atom.index == 0:
1118 2
                alpha_atoms.append(atom)
1119

1120 2
        return alpha_atoms
1121

1122 2
    def get_beta_atoms(self):
1123
        """
1124
        Returns a list of atoms beta to any indexed atom
1125
            that are not alpha or indexed atoms
1126
        """
1127 2
        beta_atoms = []
1128 2
        for atom in self.get_atoms():
1129 2
            if atom.index == -1:
1130 0
                beta_atoms.append(atom)
1131 2
        return beta_atoms
1132

1133 2
    def get_indexed_bonds(self):
1134
        """
1135
        Returns a list of Bond objects that connect two indexed atoms
1136
        """
1137 2
        indexed_bonds = []
1138 2
        for bond in self.get_bonds():
1139 2
            if bond.index > 0:
1140 2
                indexed_bonds.append(bond)
1141 2
        return indexed_bonds
1142

1143 2
    def get_unindexed_bonds(self):
1144
        """
1145
        Returns a list of Bond objects that connect
1146
            an indexed atom to an unindexed atom
1147
            two unindexed atoms
1148
        """
1149 2
        unindexed_bonds = []
1150 2
        for bond in self.get_bonds():
1151 2
            if bond.index < 1:
1152 2
                unindexed_bonds.append(bond)
1153 2
        return unindexed_bonds
1154

1155 2
    def get_alpha_bonds(self):
1156
        """
1157
        Returns a list of Bond objects that connect
1158
            an indexed atom to alpha atoms
1159
        """
1160 2
        alpha_bonds = []
1161 2
        for bond in self.get_bonds():
1162 2
            if bond.index == 0:
1163 2
                alpha_bonds.append(bond)
1164 2
        return alpha_bonds
1165

1166 2
    def get_beta_bonds(self):
1167
        """
1168
        Returns a list of Bond objects that connect
1169
            alpha atoms to beta atoms
1170
        """
1171 2
        betaBonds = []
1172 2
        for bond in self.get_bonds():
1173 2
            if bond.index == -1:
1174 0
                betaBonds.append(bond)
1175 2
        return betaBonds
1176

1177 2
    def is_alpha(self, component):
1178
        """
1179
        Takes an atom or bond are returns True if it is alpha to an indexed atom
1180
        """
1181 2
        return component.index == 0
1182

1183 2
    def is_unindexed(self, component):
1184
        """
1185
        returns True if the atom or bond is not indexed
1186
        """
1187 2
        return component.index < 1
1188

1189 2
    def is_indexed(self, component):
1190
        """
1191
        returns True if the atom or bond is indexed
1192
        """
1193 2
        return component.index > 0
1194

1195 2
    def is_beta(self, component):
1196
        """
1197
        Takes an atom or bond are returns True if it is beta to an indexed atom
1198
        """
1199 2
        return component.index == -1
1200

1201 2
    def get_type(self):
1202
        """
1203
        Uses number of indexed atoms and bond connectivity
1204
        to determine the type of chemical environment
1205

1206
        Returns
1207
        -------
1208
        chemical environemnt type:
1209
            'Atom', 'Bond', 'Angle', 'ProperTorsion', 'ImproperTorsion'
1210
            None if number of indexed atoms is 0 or > 4
1211
        """
1212 2
        index_atoms = self.get_indexed_atoms()
1213 2
        natoms = len(index_atoms)
1214

1215 2
        if natoms == 0 or natoms > 4:
1216 2
            return None
1217

1218
        # one indexed atom can only be atom
1219 2
        if natoms == 1:
1220 2
            return "Atom"
1221

1222
        # if 2 indexed atoms check they are connected
1223 2
        bond12 = self.get_bond(self.select_atom(1), self.select_atom(2))
1224 2
        if bond12 is None:
1225 0
            return None
1226 2
        if natoms == 2:
1227 2
            return "Bond"
1228

1229
        # if 3 indexed atoms check connected in an angle
1230 2
        bond23 = self.get_bond(self.select_atom(2), self.select_atom(3))
1231 2
        if bond23 is None:
1232 0
            return None
1233 2
        if natoms == 3:
1234 2
            return "Angle"
1235

1236
        # if 4 indexed atoms check for proper or improper torsion connections
1237 2
        bond34 = self.get_bond(self.select_atom(3), self.select_atom(4))
1238 2
        bond24 = self.get_bond(self.select_atom(2), self.select_atom(4))
1239 2
        if bond24 is not None:
1240 2
            return "ImproperTorsion"
1241 2
        if bond34 is not None:
1242 2
            return "ProperTorsion"
1243

1244
        # must have 4 indexed atoms with incorrect connectivity
1245 0
        return None
1246

1247 2
    def get_neighbors(self, atom):
1248
        """
1249
        Returns atoms that are bound to the given atom
1250
        in the form of a list of Atom objects
1251
        """
1252 0
        return self._graph_neighbors(atom)
1253

1254 2
    def get_valence(self, atom):
1255
        """
1256
        Returns the valence (number of neighboring atoms)
1257
        around the given atom
1258
        """
1259 2
        return len(self._graph_neighbors(atom))
1260

1261 2
    def get_bond_order(self, atom):
1262
        """
1263
        Returns minimum bond order around a given atom
1264
        0 if atom has no neighbors
1265
        aromatic bonds count as 1.5
1266
        any bond counts as 1.0
1267
        """
1268 2
        order = 0.
1269 2
        for a1, a2, info in self._graph_edges(data=True, node=atom):
1270 2
            order += info['bond'].get_order()
1271 2
        return order
1272

Read our documentation on viewing source code .

Loading