MobleyLab / chemper
1
#!/usr/bin/env python
2

3
#==============================================================================
4
# MODULE DOCSTRING
5
#==============================================================================
6

7 151
"""
8
environment.py
9
this is adapted from the openforcefield ChemicalEnvironments class.
10

11
There have been some on going debates over where environment.py should live,
12
here or in the base openforcefield toolkit. Due to the want to update
13
this module for use in chemper amidst the openforcefield API overall,
14
this environment.py has been updated independently in this repository.
15
These updates have been fairly substantial, specifically the getter
16
and setter functions for decorators were removed and replaced with more
17
pythonic @property and @[property].setter functions instead.
18
All methods have also been renamed to use snake case.
19

20
The only function of real use in openforcefield.py is the `get_type` function,
21
which has also been updated here.
22
"""
23

24
#==============================================================================
25
# GLOBAL IMPORTS
26
#==============================================================================
27

28 151
import networkx as nx
29 151
import re
30 151
import copy
31

32 151
from numpy import random
33

34

35
#==============================================================================
36
# Functions
37
#==============================================================================
38

39 151
def _find_embedded_brackets(string, in_char, out_char):
40
    """
41
    Finds the substring surrounded by the in_char and out_char
42
    intended use is to identify embedded bracketed sequences
43

44
    For example, if you have the input
45
    string = "[#1$(*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br]):1]"
46
    sub_string, in_idx, out_idx = _find_embedded_brackets(string, '\(','\)')
47
    # sub_string = (*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br])  in_idx = 4, out_idx = 50
48

49
    Parameters
50
    -----------
51
    string : str
52
        a string you want separated
53
    in_char : str
54
        regular expression for the character you're looking for '\(' for '('
55
    out_char : str
56
        regular expression for the closing character such as '\)' for ')'
57

58
    Returns
59
    --------
60
    substring : str
61
        string between the first occurances of the in_char and out_char
62
    in_idx : int
63
        index from initial string with the first in_char
64
    out_idx : int
65
        index from initial string with the first out_char
66
    """
67 151
    in_list = [m.start() for m in re.finditer(in_char, string)]
68 151
    out_list = [m.start() for m in re.finditer(out_char, string)]
69
    # If no occurance of the in_char return an empty string
70 151
    if len(in_list) == 0:
71 0
        return "", -1, -1
72
    # If no out_char returns the first in_char to the end
73 151
    if len(out_list) == 0:
74 0
        return string[in_list[0]:], in_list[0], -1
75

76
    # Otherwise find closure from the first in_char
77 151
    list_idx = 0
78 151
    while list_idx < len(in_list) - 1:
79 151
        if in_list[list_idx+1] > out_list[list_idx]:
80 151
            break
81 151
        list_idx+=1
82 151
    in_idx = in_list[0]
83 151
    out_idx = out_list[list_idx]
84 151
    return string[in_idx:out_idx+1], in_idx, out_idx
85

86

87 151
def _convert_embedded_smirks(smirks):
88
    """
89
    Converts a SMIRKS string with the $(...) in an atom to the
90
    form expected by the environment parser
91

92
    For example, if you provide initial_smirks = "[#1$(*~[#6]):1]"
93
    then new_smirks = _convert_embedded_smirks(initial_smirks)
94
    will return new_smirks = [#1:1]~[#6]
95

96
    Parameters
97
    -----------
98
    smirks : str
99
        any smirks string, if no $(...) then the original smirks is returned
100

101
    Returns
102
    --------
103
    updated_smirks: str
104
        smirks string with no recursive smirks
105
    """
106 151
    a_out = 0
107 151
    while smirks.find('$(') != -1:
108
        # Find first atom
109 151
        atom, a_in, a_out = _find_embedded_brackets(smirks, r'\[', r'\]')
110 151
        d = atom.find('$(')
111
        # Find atom with the $ string embedded
112 151
        while d == -1:
113 151
            atom, temp_in, temp_out = _find_embedded_brackets(smirks[a_out+1:], r'\[', r'\]')
114 151
            a_in = a_out + temp_in + 1
115 151
            a_out += temp_out + 1
116 151
            d = atom.find('$(')
117

118
        # Store the smirks pattern before and after the relevant atom
119 151
        pre_smirks = smirks[:a_in]
120 151
        post_smirks = smirks[a_out+1:]
121

122
        # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
123 151
        match = re.match(r'(\d+)',post_smirks)
124 151
        if match is not None: # leftover starts with int
125 151
            ring_out = re.findall(r'(\d+)',post_smirks)[0]
126
            # update post_smirks
127 151
            post_smirks = post_smirks[match.end():]
128
        else:
129 151
            ring_out = ''
130

131 151
        embedded, p_in, p_out = _find_embedded_brackets(atom, r'\(', r'\)')
132
        # two forms of embedded strings $(*~stuff) or $([..]~stuff)
133
        # in the latter case the first atom refers the current atom
134 151
        if embedded[1] == '[':
135 151
            first, f_in, f_out = _find_embedded_brackets(embedded, r'\[',r'\]')
136 151
            first = _convert_embedded_smirks(first)
137 151
            new_atom = atom[:d]+first[1:-1]+atom[p_out+1:]
138 151
            embedded = embedded[f_out+1:]
139
            # if embedded is empty between brackets, remove it
140 151
            if embedded.replace('(','').replace(')','') == '':
141 0
                embedded = ''
142

143 151
        elif embedded[1] == '*': # embedded[1] = *
144 151
            new_atom = atom[:d]+atom[p_out+1:]
145 151
            embedded = embedded[2:]
146

147
        else: # embedded starts with a "no bracket" atom such as 'C'
148 0
            embedded = embedded[1:] # remove leading '('
149
            # atoms by symbol don't need brackets, this covers atomic symbols and aromatic atoms
150 0
            no_bracket = r'(!?[A-Z][a-z]?|!?[cnops])'
151 0
            match = re.match(no_bracket, embedded)
152 0
            if match is not None:
153 0
                new_atom = atom[:d]+embedded[:match.end()]+atom[p_out+1:]
154 0
                embedded = embedded[match.end():]
155
            else:
156 0
                new_atom = atom[:d]+atom[p_out+1]
157

158
        # Look for ring inside embedded SMIRKS "[#6$(*1CCC1)]"
159 151
        match = re.match(r'(\d+)', embedded)
160 151
        if match is not None: # embedded starts with an int
161 151
            ring_in = re.findall(r'(\d+)', embedded)[0]
162 151
            embedded = '(' + embedded[match.end():]
163
        else:
164 151
            ring_in = ''
165 151
            if embedded != '':
166 151
                embedded = '(' + embedded
167

168
        # Make new smirks
169 151
        smirks = pre_smirks+new_atom+ring_out+ring_in+embedded+post_smirks
170

171 151
    return smirks
172

173

174 151
def _remove_blanks_repeats(init_list, remove_list = ['']):
175
    """
176
    Returns the input list 'init_list'
177
    without any repeating entries or blank strings ''
178

179
    Parameters
180
    -----------
181
    init_list : list
182
        This is a list of anything, but intended for decorator strings
183
    remove_list : list
184
        List of things you want removed from the init_list
185
        For decorators we don't need empty strings
186

187
    Returns
188
    --------
189
    final_list : list
190
        The init_list with no duplicates and nothing from the remove_list
191

192
    TODO: this changes the order of inputs potentially so this function
193
          will need to be updated if order of init_list is important.
194
    """
195 151
    final_list = [item for item in init_list if item not in remove_list]
196 151
    return list( set(final_list) )
197

198

199 151
class SMIRKSMismatchError(Exception):
200
    """
201
    Exception for cases where smirks are inappropriate
202
    for the environment type they are being parsed into
203
    """
204 151
    def __init__(self, msg):
205 0
        super(SMIRKSMismatchError, self).__init__(self,msg)
206 0
        self.msg = msg
207

208

209 151
class SMIRKSParsingError(Exception):
210
    """
211
    Exception for when SMIRKS are not parseable for any environment
212
    """
213 151
    def __init__(self, msg):
214 151
        super(SMIRKSParsingError, self).__init__(self, msg)
215 151
        self.msg = msg
216

217

218 151
class ChemicalEnvironment(object):
219
    """Chemical environment abstract base class that matches an atom, bond, angle, etc.
220
    """
221 151
    class Atom(object):
222
        """Atom representation, which may have some or_types and ANDtypes properties.
223

224
        Attributes
225
        ----------
226
        or_types : list of tuples in the form (base, [list of decorators])
227
            where bases and decorators are both strings
228
            The descriptor types that will be combined with logical OR
229
        and_types : list of string
230
            The descriptor types  that will be combined with logical AND
231
        """
232 151
        def __init__(self, or_types = None, and_types = None, index = 0, ring = None):
233
            """Initialize an Atom object with optional descriptors.
234

235
            Parameters
236
            -----------
237
            or_types : list of tuples for ORbases and ORdecorators,
238
                in the form (base, [list of decorators])
239
                optional, default = []
240
            and_types : list of str
241
                strings that will be AND'd together in a SMARTS
242
                optional, default = None
243
            index : int
244
                If greater than zero,
245
                the specified index will be attached as a SMIRKS index (e.g. '[#6:1]')
246
                otherwise, it is only used for accessing atom information
247
            ring : int, optional, default = None
248
                If not None, the specified ring index will be attached at the end of the atom i.e. '[#6:1]1'
249
            """
250
            # List of 2 tuples in the form [ (ORbase, ORdecorator), ...]
251 151
            if or_types is not None:
252 151
                self._or_types = copy.deepcopy(or_types)
253
            else:
254 151
                self._or_types = list()
255

256
            # Set of strings that will be AND'd to the the end
257 151
            if and_types is not None:
258 151
                self._and_types = list(copy.deepcopy(and_types))
259
            else:
260 151
                self._and_types = list()
261

262 151
            self.index = index
263 151
            self.ring = ring
264 151
            self.is_atom = True
265

266 151
        def is_generic(self):
267
            """
268
            returns True if there are no decorators on this atom
269
            (IMPORTANT: this is newly added and in chemper only as of 8/9/18)
270
            """
271 0
            if not self._or_types:
272 0
                if not self._and_types:
273 0
                    return True
274 0
            return False
275

276 151
        def as_smarts(self):
277
            """Return the atom representation as SMARTS.
278

279
            Returns
280
            --------
281
            smarts : str
282
            The SMARTS string for this atom, meaning it has no :n index
283
            """
284

285 151
            smarts = '['
286

287
            # Add the OR'd features
288 151
            if self._or_types:
289 151
                or_list = list()
290 151
                for (base, or_decorators) in self._or_types:
291 151
                    if len(base) > 0 and base[0] == '$':
292
                        # after a $base an explicit '&' is necessary
293 0
                        if or_decorators:
294 0
                            or_bit = base + '&' + ''.join(or_decorators)
295
                        else:
296 0
                            or_bit = base
297
                    else: # base doesn't start with $
298 151
                        or_bit = base + ''.join(or_decorators)
299 151
                    or_list.append(or_bit)
300 151
                smarts += ','.join(or_list)
301
            else:
302 151
                smarts += '*'
303

304 151
            if len(self._and_types) > 0:
305 151
                smarts += ';' + ';'.join(self._and_types)
306

307 151
            if self.ring is not None:
308 151
                return smarts + ']' + str(self.ring)
309
            else:
310 151
                return smarts + ']'
311

312 151
        def as_smirks(self):
313
            """Return the atom representation as SMIRKS.
314

315
            Returns
316
            --------
317
            smirks : str
318
            The SMIRKS string for this atom, same as SMARTS, but with :n index
319
            """
320 151
            smirks = self.as_smarts()
321

322
            # No index specified so SMIRKS = SMARTS
323 151
            if self.index <= 0:
324 151
                return smirks
325

326
            # Add label to the end of SMARTS
327 151
            sub_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]')
328 151
            end_string = smirks[end:]
329 151
            return sub_string[:-1] + ':' + str(self.index) + end_string
330

331 151
        def add_or_type(self, or_base, or_decorators):
332
            """
333
            Adds ORtype to the set for this atom.
334

335
            Parameters
336
            -----------
337
            or_base : string, such as '#6'
338
            or_decorators : list of strings, such as ['X4','+0']
339
            """
340 151
            or_decorators = _remove_blanks_repeats(or_decorators, ['', or_base])
341 151
            self._or_types.append((or_base, or_decorators))
342

343 151
        def add_and_type(self, and_type):
344
            """
345
            Adds ANDtype to the set for this atom.
346

347
            Parameters
348
            --------
349
            and_type : string
350
                added to the list of and_types for this atom
351
            """
352 151
            self._and_types.append(and_type)
353 151
            self._and_types = _remove_blanks_repeats(self._and_types)
354

355 151
        @property
356
        def or_types(self):
357
            """Provides the or_types in this atom"""
358 151
            return self._or_types
359

360 151
        @or_types.setter
361
        def or_types(self, new_or_types):
362
            """
363
            sets new or_types for this atom
364

365
            Parameters
366
            ----------
367
            new_or_types : list of tuples in the form (base, [ORdecorators])
368
                for example : ('#6', ['X4','H0','+0']) --> '#6X4H0+0'
369
            """
370 151
            self._or_types = list()
371 151
            if new_or_types is not None:
372 151
                for (base, decs) in new_or_types:
373 151
                    adjusted_decs = _remove_blanks_repeats(decs, ['', base])
374 151
                    self._or_types.append((base, adjusted_decs))
375

376 151
        @property
377
        def and_types(self):
378
            """
379
            returns a copy of the list of and_types for this atom
380
            """
381 151
            return list(copy.deepcopy(self._and_types))
382

383 151
        @and_types.setter
384
        def and_types(self, new_and_types):
385
            """
386
            sets new and_types for this atom
387

388
            Parameters
389
            ----------
390
            new_and_types : list of strings
391
                strings that will be AND'd together in a SMARTS
392
            """
393 151
            if new_and_types is None:
394 0
                self._and_types = list()
395
            else:
396 151
                self._and_types = _remove_blanks_repeats(new_and_types)
397

398 151
    class Bond(Atom):
399
        """Bond representation, which may have ORtype and ANDtype descriptors.
400

401
        Attributes
402
        ----------
403
        or_types : list of tuples of ORbases and ORdecorators
404
            in form (base: [list of decorators])
405
            The ORtype types that will be combined with logical OR
406
        and_types : list of string
407
            The and_types that will be combined with logical AND
408

409
        """
410
        # Implementation identical to atoms apart from what is put in the asSMARTS/asSMIRKS strings
411

412 151
        def __init__(self, or_types = None, and_types = None, index = 0):
413
            """
414
            Parameters
415
            -----------
416
            or_types : list of tuples, optional, default = None
417
                tuples have form (base, [ORdecorators])
418
                bond descriptors that will be OR'd together in a SMARTS
419
            and_types : list of str, optional, default = None
420
                strings that will be AND'd together in a SMARTS
421
            index : integer, default = 0
422
                This is for book keeping inside environments and will not be shown in SMARTS or SMIRKS
423
                example: bond1 in a Bond is the bond between atom1 and atom2
424
            """
425 151
            super(ChemicalEnvironment.Bond,self).__init__(or_types, and_types, index)
426 151
            self.is_atom = False
427 151
            return
428

429 151
        def as_smarts(self):
430
            """Return the atom representation as SMARTS.
431

432
            Returns
433
            --------
434
            smarts : str
435
                The SMARTS string for just this atom
436
            """
437 151
            if self._or_types:
438 151
                or_combos = list()
439 151
                for (OR_base, OR_decorators) in self._or_types:
440 151
                    or_combos.append(OR_base + ''.join(OR_decorators))
441 151
                smarts = ','.join(or_combos)
442
            else:
443 151
                smarts = '~'
444

445 151
            if len(self._and_types) > 0:
446 151
                smarts += ';' + ';'.join(self._and_types)
447

448 151
            return smarts
449

450 151
        def as_smirks(self):
451
            """
452
            Returns
453
            --------
454
            smarts : str
455
                The SMIRKS string for just this bond
456
            """
457
            # the same as as_smarts()
458
            #    for consistency as_smarts() or as_smirks() can be called
459
            #    for all environment objects
460 151
            return self.as_smarts()
461

462 151
        def get_order(self):
463
            """
464
            Returns a float for the order of this bond
465
            for multiple or_types or ~ it returns the minimum possible order
466
            the intended application is for checking valence around a given atom
467

468
            Returns
469
            --------
470
            min_order : float
471
                minimum order for this bond (i.e. 1 for a '-' decorator)
472
            """
473
            # Minimum order for empty or_types is 1:
474 151
            if not self._or_types:
475 0
                return 1
476

477 151
            order_dict = {'~':1.,
478
                    '-':1., ':': 1.5, '=':2., '#':3.,
479
                    '!-':1.5, '!:':1., '!=':1., '!#':1.}
480 151
            order_list = [order_dict.get(base,1) for (base, decor) in self._or_types]
481 151
            return min(order_list)
482

483 151
    def __init__(self, smirks = None, label = None, replacements = None):
484
        """Initialize a chemical environment abstract base class.
485

486
        Parameters
487
        -----------
488
        smirks : string, optional
489
            if smirks is not None, a chemical environment is built
490
            from the provided SMIRKS string
491
        label : anything, optional
492
            intended to be used to label this chemical environment
493
            could be a string, int, or float, or anything
494
        replacements : list of lists, optional,
495
            [substitution, smarts] form for parsing SMIRKS
496
        """
497
        # Define the regular expressions used for all SMIRKS decorators
498
        # There are a limited number of descriptors for smirks string they are:
499
        # That is a # followed by one or more ints w/or w/o at ! in front '!#16'
500 151
        element_num = r"!?[#]\d+"
501
        # covers element symbols, i.e. N,C,O,Br not followed by a number
502 151
        element_sym = "!?[A-Z][a-z]?"
503
        # covers element symbols that are aromatic:
504 151
        aro_sym = "!?[cnops]"
505
        # replacement strings
506 151
        replace_str = r"\$\w+"
507
        # a or A w/ or w/o a ! in front 'A'
508 151
        aro_ali = "!?[aA]"
509
        # the decorators (D,H,j,r,V,X,^) followed by one or more integers
510 151
        needs_int = r"!?[DjVX^]\d+"
511
        # R(x), +, - do not need to be followed by a integer w/ or w/o a ! 'R2'
512 151
        optional_int = r"!?[RHhrx+-]\d*"
513
        # chirality options, "@", "@@", "@int" w/ or w/o a ! in front
514 151
        chirality = r"!?[@]\d+|!?[@]@?"
515

516
        # Generate RegEx string for decorators:
517 151
        self.no_bracket_atom_reg = r'('+'|'.join([element_sym, aro_sym, replace_str])+')'
518 151
        self.atom_reg = '|'.join([element_num, aro_ali, needs_int,
519
                                  optional_int, chirality, replace_str,
520
                                  element_sym, aro_sym])
521 151
        self.atom_reg = r'('+self.atom_reg+')'
522

523
        # Define bond regular expression options below in order:
524
        # single, double, triple, aromatic, directional up bond, directional down bond
525
        # Each can have ! in from and directional can have ? after
526
        # up and down bonds have lots of \ to fit the python requirements
527 151
        self.bond_regs = ['!?[-]', '!?[=]', '!?[#]', '!?[:]', '!?[@]', '!?[\\\\]\\??', '!?[\\/]\\??']
528 151
        self.bond_regs = r'('+'|'.join(self.bond_regs)+')'
529
        # Note, not looking for ~ because that is used for empty bonds
530

531
        # Create an empty graph which will store Atom objects.
532 151
        self._graph = nx.Graph()
533 151
        self.label = label
534 151
        self.replacements = replacements
535

536 151
        if smirks is not None:
537
            # Check that it is a valid SMIRKS
538 151
            if not self.is_valid(smirks):
539 151
                raise SMIRKSParsingError("Error Provided SMIRKS ('%s') was \
540
not parseable with current toolkit" % smirks)
541

542
            # Check for SMIRKS not supported by Chemical Environments
543 151
            if smirks.find('.') != -1:
544 0
                raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \
545
contains a '.' indicating multiple molecules in the same pattern. This type \
546
of pattern is not parseable into ChemicalEnvironments" % smirks)
547 151
            if smirks.find('>') != -1:
548 0
                raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \
549
contains a '>' indicating a reaction. This type of pattern is not parseable \
550
into ChemicalEnvironments." % smirks)
551

552
            # try parsing into environment object
553 151
            try:
554 151
                self._parse_smirks(smirks)
555 0
            except:
556 0
                raise SMIRKSParsingError("Error SMIRKS (%s) was not parseable\
557
                        into a ChemicalEnvironment" % smirks)
558

559
        # Check that the created Environment is valid
560 151
        if not self.is_valid():
561 0
            raise SMIRKSParsingError("Input SMIRKS (%s), converted to %s \
562
                    is now invalid" % (smirks, self.as_smirks()))
563

564 151
        return
565

566 151
    def _graph_remove_node(self, node):
567
        """
568
        removes a node from the graph, kept separate from other
569
        functions so if (when) networkx has an API change we only
570
        have to change one place.
571

572
        Parameters
573
        -----------
574
        node : node in self._graph
575

576
        Returns
577
        --------
578
        node_removed : bool
579
        """
580 151
        if node not in self._graph:
581 0
            return False
582 151
        self._graph.remove_node(node)
583 151
        return True
584

585 151
    def _graph_nodes(self, data=False):
586
        """
587
        When data is False returns a list of nodes in graph
588
        otherwise returns a dictionary in the form {node: data}
589

590
        Parameters
591
        -----------
592
        data : bool
593
            include data for each node
594

595
        Returns
596
        --------
597
        nodes : list or dict
598
            if data is False, returns a list in the form:
599
                [node1, node2, ...]
600
            if data is True, returns a dictionary in the form:
601
                {node: {data_key: data, ...}, ... }
602
        """
603 151
        if data:
604 0
            return dict(self._graph.nodes(data=True))
605 151
        return list(self._graph.nodes())
606

607 151
    def _graph_edges(self, data=False, node=None):
608
        """
609
        Returns all edges (node=None) or edges associated
610
        with a specific node. We use a custom internal function
611
        so that if (when) networkx changes their API we only
612
        have to change one place in the script.
613

614
        Parameters
615
        -----------
616
        data : bool
617
            include data on edges (bonds)?
618
        node : graph node
619
            get only edges connected to that edge
620

621
        Returns
622
        --------
623
        edges : list of edges
624
            Returns all edges (node=None)
625
            or edges connected to the specified node.
626
            If data is False then the list has the form:
627
                [ (node1, node2), ... ]
628
            otherwise, if data is True is has the form:
629
                [ (node1, node2, {dictionary of data}), ...]
630
        """
631 151
        if node is None:
632 151
            return list(self._graph.edges(data=data))
633 151
        return list(self._graph.edges(node, data=data))
634

635 151
    def _graph_neighbors(self, node):
636
        """
637
        Returns a list of neighbors for the given node.
638
        This is done in a custom function so we have only
639
        one place to change if (when) networkx changes the API.
640

641
        Parameters
642
        -----------
643
        node : graph node
644

645
        Returns
646
        --------
647
        neighbors : list of neighboring nodes
648
        """
649 151
        return list(self._graph.neighbors(node))
650

651 151
    def _graph_get_edge_data(self, node1, node2):
652
        """
653
        Returns a dictionary for the data at the edged connecting
654
        node1 and node2 in graph. We set this in a custom function
655
        so we only have to change one place if (when) networkx
656
        changes their API.
657

658
        Parameters
659
        -----------
660
        node1 : graph node
661
        node2 : a different graph node
662

663
        Returns
664
        --------
665
        data_dict : dict
666
            dictionary of the data stored on the edge between node1 and node2
667
        """
668 151
        return self._graph.get_edge_data(node1, node2)
669

670 151
    def is_valid(self, smirks=None):
671
        """
672
        Checks if the provided SMIRKS or the one created
673
        by the environment is valid according to ChemPer rules.
674

675
        Parameters
676
        -----------
677
        smirks : str or None
678
            if None then we call self.as_smirks()
679

680
        Returns
681
        --------
682
        is_valid : bool
683
            True if this is a valid ChemPer SMIRKS
684
        """
685 151
        if smirks is None:
686 151
            smirks = self._as_smirks()
687 151
        from chemper.chemper_utils import is_valid_smirks
688 151
        return is_valid_smirks(smirks)
689

690 151
    def _parse_smirks(self,input_smirks):
691
        """
692
        This function converts a smirks string to a Chemical Environment
693
        """
694 151
        smirks = _convert_embedded_smirks(input_smirks)
695 151
        atoms = dict() # store created atom
696 151
        idx = 1 # current atom being created
697 151
        store = list() # to store indices while branching
698 151
        bonding_to = idx # which atom are we going to bond to
699

700 151
        atom_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]')
701

702 151
        if start != 0: # first atom is not in square brackets
703 0
            if start != -1:
704 0
                start_string = smirks[:start]
705
            else:
706 0
                start_string = smirks
707

708
            # Check for atoms not between square brackets
709 0
            split = re.split(self.no_bracket_atom_reg, start_string)
710 0
            atom_string = split[1]
711

712
            # update leftover for this condition
713 0
            if start != -1: # there is at least 1 more bracketed atom
714 0
                leftover = ''.join(split[2:])+smirks[start:]
715
            else:
716 0
                leftover = ''.join(split[2:])
717

718
        else: # First atom is in square brackets
719 151
            leftover = smirks[end+1:]
720
            # remove square brackets for parsing
721 151
            atom_string = atom_string[1:-1]
722

723
        # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
724 151
        match = re.match(r'(\d+)',leftover)
725 151
        if match is not None:  # leftover starts with int
726 151
            ring = re.findall(r'(\d+)',leftover)[0]
727 151
            leftover = leftover[match.end():]
728
        else:
729 151
            ring = None
730

731
        # Get atom information and create first atom
732 151
        ors, ands, index = self._get_atom_info(atom_string)
733 151
        new_atom = self.add_atom(None, new_or_types= ors, new_and_types= ands,
734
                                 new_atom_index= index, new_atom_ring= ring, beyond_beta= True)
735 151
        atoms[idx] = new_atom
736

737 151
        while len(leftover) > 0:
738 151
            idx += 1
739
            # Check for branching
740 151
            if leftover[0] == ')':
741 151
                bonding_to = store.pop()
742 151
                leftover = leftover[1:]
743 151
                continue
744

745 151
            if leftover[0] == '(':
746 151
                store.append(bonding_to)
747 151
                leftover = leftover[1:]
748 151
                continue
749

750
            # find beginning and end of next [atom]
751 151
            atom_string, start, end = _find_embedded_brackets(leftover, r'\[', r'\]')
752

753 151
            if start != -1: # no more square brackets
754 151
                bond_string = leftover[:start]
755
            else:
756 0
                bond_string = leftover
757

758
            # Check for atoms not between square brackets
759 151
            bond_split = re.split(self.no_bracket_atom_reg, bond_string)
760
            # Next atom is not in brackets for example C in "[#7:1]-C"
761 151
            if len(bond_split) > 1:
762 151
                bond_string = bond_split[0]
763 151
                atom_string = '['+bond_split[1]+']'
764
                # update leftover for this condition
765 151
                if start != -1: # ther is at least 1 more bracketed atom
766 151
                    leftover = ''.join(bond_split[2:])+leftover[start:]
767
                else:
768 0
                    leftover = ''.join(bond_split[2:])
769

770
            else: # next atom is in the brackets [atom]
771
                # bond and atom string stay the same, update leftover
772 151
                leftover = leftover[end+1:]
773

774
            # Get bond and atom info
775 151
            b_or, b_and = self._get_bond_info(bond_string)
776 151
            a_or, a_and, index = self._get_atom_info(atom_string[1:-1])
777

778
            # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
779 151
            match = re.match(r'(\d+)',leftover)
780 151
            if match is not None: # leftover starts with int
781 151
                ring = re.findall(r'(\d+)',leftover)[0]
782 151
                leftover = leftover[match.end():]
783
            else:
784 151
                ring = None
785

786
            # create new atom
787 151
            new_atom = self.add_atom(atoms[bonding_to], bond_or_types=b_or,
788
                                     bond_and_types=b_and, new_or_types=a_or, new_and_types=a_and,
789
                                     new_atom_index=index, new_atom_ring=ring, beyond_beta=True)
790

791
            # update state
792 151
            atoms[idx] = new_atom
793 151
            bonding_to = idx
794 151
        return
795

796 151
    def _get_atom_info(self, atom):
797
        """
798
        Parses string for one atom
799

800
        Parameters
801
        -----------
802
        atom : str
803
            string for one atom (the part between brackets)
804

805
        Returns
806
        --------
807
        or_types : list of tuples
808
            OR decorators are in the form [ (base, [decorators]), ...]
809
        and_types : list
810
        index : int
811
        """
812
        # Find atom index
813 151
        colon = atom.find(':')
814 151
        if colon == -1:
815 151
            index = None
816
        else:
817 151
            index = int(atom[colon+1:])
818 151
            atom = atom[:colon]
819

820 151
        split = atom.split(';')
821

822
        # Get and_types (and split them if they don't use ;)
823 151
        and_types = list()
824 151
        for a in split[1:]: