1
#!/usr/bin/env python
2

3
#==============================================================================
4
# MODULE DOCSTRING
5
#==============================================================================
6

7 100
"""
8
environment.py
9
this is adapted from the openforcefield ChemicalEnvironments class.
10

11
There have been some on going debates over where environment.py should live,
12
here or in the base openforcefield toolkit. Due to the want to update
13
this module for use in chemper amidst the openforcefield API overall,
14
this environment.py has been updated independently in this repository.
15
These updates have been fairly substantial, specifically the getter
16
and setter functions for decorators were removed and replaced with more
17
pythonic @property and @[property].setter functions instead.
18
All methods have also been renamed to use snake case.
19

20
The only function of real use in openforcefield.py is the `get_type` function,
21
which has also been updated here.
22
"""
23

24
#==============================================================================
25
# GLOBAL IMPORTS
26
#==============================================================================
27

28 100
import networkx as nx
29 100
import re
30 100
import copy
31

32 100
from numpy import random
33

34

35
#==============================================================================
36
# Functions
37
#==============================================================================
38

39 100
def _find_embedded_brackets(string, in_char, out_char):
40
    """
41
    Finds the substring surrounded by the in_char and out_char
42
    intended use is to identify embedded bracketed sequences
43

44
    For example, if you have the input
45
    string = "[#1$(*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br]):1]"
46
    sub_string, in_idx, out_idx = _find_embedded_brackets(string, '\(','\)')
47
    # sub_string = (*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br])  in_idx = 4, out_idx = 50
48

49
    Parameters
50
    -----------
51
    string : str
52
        a string you want separated
53
    in_char : str
54
        regular expression for the character you're looking for '\(' for '('
55
    out_char : str
56
        regular expression for the closing character such as '\)' for ')'
57

58
    Returns
59
    --------
60
    substring : str
61
        string between the first occurances of the in_char and out_char
62
    in_idx : int
63
        index from initial string with the first in_char
64
    out_idx : int
65
        index from initial string with the first out_char
66
    """
67 100
    in_list = [m.start() for m in re.finditer(in_char, string)]
68 100
    out_list = [m.start() for m in re.finditer(out_char, string)]
69
    # If no occurance of the in_char return an empty string
70 100
    if len(in_list) == 0:
71 0
        return "", -1, -1
72
    # If no out_char returns the first in_char to the end
73 100
    if len(out_list) == 0:
74 0
        return string[in_list[0]:], in_list[0], -1
75

76
    # Otherwise find closure from the first in_char
77 100
    list_idx = 0
78 100
    while list_idx < len(in_list) - 1:
79 100
        if in_list[list_idx+1] > out_list[list_idx]:
80 100
            break
81 100
        list_idx+=1
82 100
    in_idx = in_list[0]
83 100
    out_idx = out_list[list_idx]
84 100
    return string[in_idx:out_idx+1], in_idx, out_idx
85

86

87 100
def _convert_embedded_smirks(smirks):
88
    """
89
    Converts a SMIRKS string with the $(...) in an atom to the
90
    form expected by the environment parser
91

92
    For example, if you provide initial_smirks = "[#1$(*~[#6]):1]"
93
    then new_smirks = _convert_embedded_smirks(initial_smirks)
94
    will return new_smirks = [#1:1]~[#6]
95

96
    Parameters
97
    -----------
98
    smirks : str
99
        any smirks string, if no $(...) then the original smirks is returned
100

101
    Returns
102
    --------
103
    updated_smirks: str
104
        smirks string with no recursive smirks
105
    """
106 100
    a_out = 0
107 100
    while smirks.find('$(') != -1:
108
        # Find first atom
109 100
        atom, a_in, a_out = _find_embedded_brackets(smirks, r'\[', r'\]')
110 100
        d = atom.find('$(')
111
        # Find atom with the $ string embedded
112 100
        while d == -1:
113 100
            atom, temp_in, temp_out = _find_embedded_brackets(smirks[a_out+1:], r'\[', r'\]')
114 100
            a_in = a_out + temp_in + 1
115 100
            a_out += temp_out + 1
116 100
            d = atom.find('$(')
117

118
        # Store the smirks pattern before and after the relevant atom
119 100
        pre_smirks = smirks[:a_in]
120 100
        post_smirks = smirks[a_out+1:]
121

122
        # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
123 100
        match = re.match(r'(\d+)',post_smirks)
124 100
        if match is not None: # leftover starts with int
125 100
            ring_out = re.findall(r'(\d+)',post_smirks)[0]
126
            # update post_smirks
127 100
            post_smirks = post_smirks[match.end():]
128
        else:
129 100
            ring_out = ''
130

131 100
        embedded, p_in, p_out = _find_embedded_brackets(atom, r'\(', r'\)')
132
        # two forms of embedded strings $(*~stuff) or $([..]~stuff)
133
        # in the latter case the first atom refers the current atom
134 100
        if embedded[1] == '[':
135 100
            first, f_in, f_out = _find_embedded_brackets(embedded, r'\[',r'\]')
136 100
            first = _convert_embedded_smirks(first)
137 100
            new_atom = atom[:d]+first[1:-1]+atom[p_out+1:]
138 100
            embedded = embedded[f_out+1:]
139
            # if embedded is empty between brackets, remove it
140 100
            if embedded.replace('(','').replace(')','') == '':
141 0
                embedded = ''
142

143 100
        elif embedded[1] == '*': # embedded[1] = *
144 100
            new_atom = atom[:d]+atom[p_out+1:]
145 100
            embedded = embedded[2:]
146

147
        else: # embedded starts with a "no bracket" atom such as 'C'
148 0
            embedded = embedded[1:] # remove leading '('
149
            # atoms by symbol don't need brackets, this covers atomic symbols and aromatic atoms
150 0
            no_bracket = r'(!?[A-Z][a-z]?|!?[cnops])'
151 0
            match = re.match(no_bracket, embedded)
152 0
            if match is not None:
153 0
                new_atom = atom[:d]+embedded[:match.end()]+atom[p_out+1:]
154 0
                embedded = embedded[match.end():]
155
            else:
156 0
                new_atom = atom[:d]+atom[p_out+1]
157

158
        # Look for ring inside embedded SMIRKS "[#6$(*1CCC1)]"
159 100
        match = re.match(r'(\d+)', embedded)
160 100
        if match is not None: # embedded starts with an int
161 100
            ring_in = re.findall(r'(\d+)', embedded)[0]
162 100
            embedded = '(' + embedded[match.end():]
163
        else:
164 100
            ring_in = ''
165 100
            if embedded != '':
166 100
                embedded = '(' + embedded
167

168
        # Make new smirks
169 100
        smirks = pre_smirks+new_atom+ring_out+ring_in+embedded+post_smirks
170

171 100
    return smirks
172

173

174 100
def _remove_blanks_repeats(init_list, remove_list = ['']):
175
    """
176
    Returns the input list 'init_list'
177
    without any repeating entries or blank strings ''
178

179
    Parameters
180
    -----------
181
    init_list : list
182
        This is a list of anything, but intended for decorator strings
183
    remove_list : list
184
        List of things you want removed from the init_list
185
        For decorators we don't need empty strings
186

187
    Returns
188
    --------
189
    final_list : list
190
        The init_list with no duplicates and nothing from the remove_list
191

192
    TODO: this changes the order of inputs potentially so this function
193
          will need to be updated if order of init_list is important.
194
    """
195 100
    final_list = [item for item in init_list if item not in remove_list]
196 100
    return list( set(final_list) )
197

198

199 100
class SMIRKSMismatchError(Exception):
200
    """
201
    Exception for cases where smirks are inappropriate
202
    for the environment type they are being parsed into
203
    """
204 100
    def __init__(self, msg):
205 0
        super(SMIRKSMismatchError, self).__init__(self,msg)
206 0
        self.msg = msg
207

208

209 100
class SMIRKSParsingError(Exception):
210
    """
211
    Exception for when SMIRKS are not parseable for any environment
212
    """
213 100
    def __init__(self, msg):
214 100
        super(SMIRKSParsingError, self).__init__(self, msg)
215 100
        self.msg = msg
216

217

218 100
class ChemicalEnvironment(object):
219
    """Chemical environment abstract base class that matches an atom, bond, angle, etc.
220
    """
221 100
    class Atom(object):
222
        """Atom representation, which may have some or_types and ANDtypes properties.
223

224
        Attributes
225
        ----------
226
        or_types : list of tuples in the form (base, [list of decorators])
227
            where bases and decorators are both strings
228
            The descriptor types that will be combined with logical OR
229
        and_types : list of string
230
            The descriptor types  that will be combined with logical AND
231
        """
232 100
        def __init__(self, or_types = None, and_types = None, index = 0, ring = None):
233
            """Initialize an Atom object with optional descriptors.
234

235
            Parameters
236
            -----------
237
            or_types : list of tuples for ORbases and ORdecorators,
238
                in the form (base, [list of decorators])
239
                optional, default = []
240
            and_types : list of str
241
                strings that will be AND'd together in a SMARTS
242
                optional, default = None
243
            index : int
244
                If greater than zero,
245
                the specified index will be attached as a SMIRKS index (e.g. '[#6:1]')
246
                otherwise, it is only used for accessing atom information
247
            ring : int, optional, default = None
248
                If not None, the specified ring index will be attached at the end of the atom i.e. '[#6:1]1'
249
            """
250
            # List of 2 tuples in the form [ (ORbase, ORdecorator), ...]
251 100
            if or_types is not None:
252 100
                self._or_types = copy.deepcopy(or_types)
253
            else:
254 100
                self._or_types = list()
255

256
            # Set of strings that will be AND'd to the the end
257 100
            if and_types is not None:
258 100
                self._and_types = list(copy.deepcopy(and_types))
259
            else:
260 100
                self._and_types = list()
261

262 100
            self.index = index
263 100
            self.ring = ring
264 100
            self.is_atom = True
265

266 100
        def is_generic(self):
267
            """
268
            returns True if there are no decorators on this atom
269
            (IMPORTANT: this is newly added and in chemper only as of 8/9/18)
270
            """
271 0
            if not self._or_types:
272 0
                if not self._and_types:
273 0
                    return True
274 0
            return False
275

276 100
        def as_smarts(self):
277
            """Return the atom representation as SMARTS.
278

279
            Returns
280
            --------
281
            smarts : str
282
            The SMARTS string for this atom, meaning it has no :n index
283
            """
284

285 100
            smarts = '['
286

287
            # Add the OR'd features
288 100
            if self._or_types:
289 100
                or_list = list()
290 100
                for (base, or_decorators) in self._or_types:
291 100
                    if len(base) > 0 and base[0] == '$':
292
                        # after a $base an explicit '&' is necessary
293 0
                        if or_decorators:
294 0
                            or_bit = base + '&' + ''.join(or_decorators)
295
                        else:
296 0
                            or_bit = base
297
                    else: # base doesn't start with $
298 100
                        or_bit = base + ''.join(or_decorators)
299 100
                    or_list.append(or_bit)
300 100
                smarts += ','.join(or_list)
301
            else:
302 100
                smarts += '*'
303

304 100
            if len(self._and_types) > 0:
305 100
                smarts += ';' + ';'.join(self._and_types)
306

307 100
            if self.ring is not None:
308 100
                return smarts + ']' + str(self.ring)
309
            else:
310 100
                return smarts + ']'
311

312 100
        def as_smirks(self):
313
            """Return the atom representation as SMIRKS.
314

315
            Returns
316
            --------
317
            smirks : str
318
            The SMIRKS string for this atom, same as SMARTS, but with :n index
319
            """
320 100
            smirks = self.as_smarts()
321

322
            # No index specified so SMIRKS = SMARTS
323 100
            if self.index <= 0:
324 100
                return smirks
325

326
            # Add label to the end of SMARTS
327 100
            sub_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]')
328 100
            end_string = smirks[end:]
329 100
            return sub_string[:-1] + ':' + str(self.index) + end_string
330

331 100
        def add_or_type(self, or_base, or_decorators):
332
            """
333
            Adds ORtype to the set for this atom.
334

335
            Parameters
336
            -----------
337
            or_base : string, such as '#6'
338
            or_decorators : list of strings, such as ['X4','+0']
339
            """
340 100
            or_decorators = _remove_blanks_repeats(or_decorators, ['', or_base])
341 100
            self._or_types.append((or_base, or_decorators))
342

343 100
        def add_and_type(self, and_type):
344
            """
345
            Adds ANDtype to the set for this atom.
346

347
            Parameters
348
            --------
349
            and_type : string
350
                added to the list of and_types for this atom
351
            """
352 100
            self._and_types.append(and_type)
353 100
            self._and_types = _remove_blanks_repeats(self._and_types)
354

355 100
        @property
356
        def or_types(self):
357
            """Provides the or_types in this atom"""
358 100
            return self._or_types
359

360 100
        @or_types.setter
361
        def or_types(self, new_or_types):
362
            """
363
            sets new or_types for this atom
364

365
            Parameters
366
            ----------
367
            new_or_types : list of tuples in the form (base, [ORdecorators])
368
                for example : ('#6', ['X4','H0','+0']) --> '#6X4H0+0'
369
            """
370 100
            self._or_types = list()
371 100
            if new_or_types is not None:
372 100
                for (base, decs) in new_or_types:
373 100
                    adjusted_decs = _remove_blanks_repeats(decs, ['', base])
374 100
                    self._or_types.append((base, adjusted_decs))
375

376 100
        @property
377
        def and_types(self):
378
            """
379
            returns a copy of the list of and_types for this atom
380
            """
381 100
            return list(copy.deepcopy(self._and_types))
382

383 100
        @and_types.setter
384
        def and_types(self, new_and_types):
385
            """
386
            sets new and_types for this atom
387

388
            Parameters
389
            ----------
390
            new_and_types : list of strings
391
                strings that will be AND'd together in a SMARTS
392
            """
393 100
            if new_and_types is None:
394 0
                self._and_types = list()
395
            else:
396 100
                self._and_types = _remove_blanks_repeats(new_and_types)
397

398 100
    class Bond(Atom):
399
        """Bond representation, which may have ORtype and ANDtype descriptors.
400

401
        Attributes
402
        ----------
403
        or_types : list of tuples of ORbases and ORdecorators
404
            in form (base: [list of decorators])
405
            The ORtype types that will be combined with logical OR
406
        and_types : list of string
407
            The and_types that will be combined with logical AND
408

409
        """
410
        # Implementation identical to atoms apart from what is put in the asSMARTS/asSMIRKS strings
411

412 100
        def __init__(self, or_types = None, and_types = None, index = 0):
413
            """
414
            Parameters
415
            -----------
416
            or_types : list of tuples, optional, default = None
417
                tuples have form (base, [ORdecorators])
418
                bond descriptors that will be OR'd together in a SMARTS
419
            and_types : list of str, optional, default = None
420
                strings that will be AND'd together in a SMARTS
421
            index : integer, default = 0
422
                This is for book keeping inside environments and will not be shown in SMARTS or SMIRKS
423
                example: bond1 in a Bond is the bond between atom1 and atom2
424
            """
425 100
            super(ChemicalEnvironment.Bond,self).__init__(or_types, and_types, index)
426 100
            self.is_atom = False
427 100
            return
428

429 100
        def as_smarts(self):
430
            """Return the atom representation as SMARTS.
431

432
            Returns
433
            --------
434
            smarts : str
435
                The SMARTS string for just this atom
436
            """
437 100
            if self._or_types:
438 100
                or_combos = list()
439 100
                for (OR_base, OR_decorators) in self._or_types:
440 100
                    or_combos.append(OR_base + ''.join(OR_decorators))
441 100
                smarts = ','.join(or_combos)
442
            else:
443 100
                smarts = '~'
444

445 100
            if len(self._and_types) > 0:
446 100
                smarts += ';' + ';'.join(self._and_types)
447

448 100
            return smarts
449

450 100
        def as_smirks(self):
451
            """
452
            Returns
453
            --------
454
            smarts : str
455
                The SMIRKS string for just this bond
456
            """
457
            # the same as as_smarts()
458
            #    for consistency as_smarts() or as_smirks() can be called
459
            #    for all environment objects
460 100
            return self.as_smarts()
461

462 100
        def get_order(self):
463
            """
464
            Returns a float for the order of this bond
465
            for multiple or_types or ~ it returns the minimum possible order
466
            the intended application is for checking valence around a given atom
467

468
            Returns
469
            --------
470
            min_order : float
471
                minimum order for this bond (i.e. 1 for a '-' decorator)
472
            """
473
            # Minimum order for empty or_types is 1:
474 100
            if not self._or_types:
475 0
                return 1
476

477 100
            order_dict = {'~':1.,
478
                    '-':1., ':': 1.5, '=':2., '#':3.,
479
                    '!-':1.5, '!:':1., '!=':1., '!#':1.}
480 100
            order_list = [order_dict.get(base,1) for (base, decor) in self._or_types]
481 100
            return min(order_list)
482

483 100
    def __init__(self, smirks = None, label = None, replacements = None):
484
        """Initialize a chemical environment abstract base class.
485

486
        Parameters
487
        -----------
488
        smirks : string, optional
489
            if smirks is not None, a chemical environment is built
490
            from the provided SMIRKS string
491
        label : anything, optional
492
            intended to be used to label this chemical environment
493
            could be a string, int, or float, or anything
494
        replacements : list of lists, optional,
495
            [substitution, smarts] form for parsing SMIRKS
496
        """
497
        # Define the regular expressions used for all SMIRKS decorators
498
        # There are a limited number of descriptors for smirks string they are:
499
        # That is a # followed by one or more ints w/or w/o at ! in front '!#16'
500 100
        element_num = r"!?[#]\d+"
501
        # covers element symbols, i.e. N,C,O,Br not followed by a number
502 100
        element_sym = "!?[A-Z][a-z]?"
503
        # covers element symbols that are aromatic:
504 100
        aro_sym = "!?[cnops]"
505
        # replacement strings
506 100
        replace_str = r"\$\w+"
507
        # a or A w/ or w/o a ! in front 'A'
508 100
        aro_ali = "!?[aA]"
509
        # the decorators (D,H,j,r,V,X,^) followed by one or more integers
510 100
        needs_int = r"!?[DjVX^]\d+"
511
        # R(x), +, - do not need to be followed by a integer w/ or w/o a ! 'R2'
512 100
        optional_int = r"!?[RHhrx+-]\d*"
513
        # chirality options, "@", "@@", "@int" w/ or w/o a ! in front
514 100
        chirality = r"!?[@]\d+|!?[@]@?"
515

516
        # Generate RegEx string for decorators:
517 100
        self.no_bracket_atom_reg = r'('+'|'.join([element_sym, aro_sym, replace_str])+')'
518 100
        self.atom_reg = '|'.join([element_num, aro_ali, needs_int,
519
                                  optional_int, chirality, replace_str,
520
                                  element_sym, aro_sym])
521 100
        self.atom_reg = r'('+self.atom_reg+')'
522

523
        # Define bond regular expression options below in order:
524
        # single, double, triple, aromatic, directional up bond, directional down bond
525
        # Each can have ! in from and directional can have ? after
526
        # up and down bonds have lots of \ to fit the python requirements
527 100
        self.bond_regs = ['!?[-]', '!?[=]', '!?[#]', '!?[:]', '!?[@]', '!?[\\\\]\\??', '!?[\\/]\\??']
528 100
        self.bond_regs = r'('+'|'.join(self.bond_regs)+')'
529
        # Note, not looking for ~ because that is used for empty bonds
530

531
        # Create an empty graph which will store Atom objects.
532 100
        self._graph = nx.Graph()
533 100
        self.label = label
534 100
        self.replacements = replacements
535

536 100
        if smirks is not None:
537
            # Check that it is a valid SMIRKS
538 100
            if not self.is_valid(smirks):
539 100
                raise SMIRKSParsingError("Error Provided SMIRKS ('%s') was \
540
not parseable with current toolkit" % smirks)
541

542
            # Check for SMIRKS not supported by Chemical Environments
543 100
            if smirks.find('.') != -1:
544 0
                raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \
545
contains a '.' indicating multiple molecules in the same pattern. This type \
546
of pattern is not parseable into ChemicalEnvironments" % smirks)
547 100
            if smirks.find('>') != -1:
548 0
                raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \
549
contains a '>' indicating a reaction. This type of pattern is not parseable \
550
into ChemicalEnvironments." % smirks)
551

552
            # try parsing into environment object
553 100
            try:
554 100
                self._parse_smirks(smirks)
555 0
            except:
556 0
                raise SMIRKSParsingError("Error SMIRKS (%s) was not parseable\
557
                        into a ChemicalEnvironment" % smirks)
558

559
        # Check that the created Environment is valid
560 100
        if not self.is_valid():
561 0
            raise SMIRKSParsingError("Input SMIRKS (%s), converted to %s \
562
                    is now invalid" % (smirks, self.as_smirks()))
563

564 100
        return
565

566 100
    def _graph_remove_node(self, node):
567
        """
568
        removes a node from the graph, kept separate from other
569
        functions so if (when) networkx has an API change we only
570
        have to change one place.
571

572
        Parameters
573
        -----------
574
        node : node in self._graph
575

576
        Returns
577
        --------
578
        node_removed : bool
579
        """
580 100
        if node not in self._graph:
581 0
            return False
582 100
        self._graph.remove_node(node)
583 100
        return True
584

585 100
    def _graph_nodes(self, data=False):
586
        """
587
        When data is False returns a list of nodes in graph
588
        otherwise returns a dictionary in the form {node: data}
589

590
        Parameters
591
        -----------
592
        data : bool
593
            include data for each node
594

595
        Returns
596
        --------
597
        nodes : list or dict
598
            if data is False, returns a list in the form:
599
                [node1, node2, ...]
600
            if data is True, returns a dictionary in the form:
601
                {node: {data_key: data, ...}, ... }
602
        """
603 100
        if data:
604 0
            return dict(self._graph.nodes(data=True))
605 100
        return list(self._graph.nodes())
606

607 100
    def _graph_edges(self, data=False, node=None):
608
        """
609
        Returns all edges (node=None) or edges associated
610
        with a specific node. We use a custom internal function
611
        so that if (when) networkx changes their API we only
612
        have to change one place in the script.
613

614
        Parameters
615
        -----------
616
        data : bool
617
            include data on edges (bonds)?
618
        node : graph node
619
            get only edges connected to that edge
620

621
        Returns
622
        --------
623
        edges : list of edges
624
            Returns all edges (node=None)
625
            or edges connected to the specified node.
626
            If data is False then the list has the form:
627
                [ (node1, node2), ... ]
628
            otherwise, if data is True is has the form:
629
                [ (node1, node2, {dictionary of data}), ...]
630
        """
631 100
        if node is None:
632 100
            return list(self._graph.edges(data=data))
633 100
        return list(self._graph.edges(node, data=data))
634

635 100
    def _graph_neighbors(self, node):
636
        """
637
        Returns a list of neighbors for the given node.
638
        This is done in a custom function so we have only
639
        one place to change if (when) networkx changes the API.
640

641
        Parameters
642
        -----------
643
        node : graph node
644

645
        Returns
646
        --------
647
        neighbors : list of neighboring nodes
648
        """
649 100
        return list(self._graph.neighbors(node))
650

651 100
    def _graph_get_edge_data(self, node1, node2):
652
        """
653
        Returns a dictionary for the data at the edged connecting
654
        node1 and node2 in graph. We set this in a custom function
655
        so we only have to change one place if (when) networkx
656
        changes their API.
657

658
        Parameters
659
        -----------
660
        node1 : graph node
661
        node2 : a different graph node
662

663
        Returns
664
        --------
665
        data_dict : dict
666
            dictionary of the data stored on the edge between node1 and node2
667
        """
668 100
        return self._graph.get_edge_data(node1, node2)
669

670 100
    def is_valid(self, smirks=None):
671
        """
672
        Checks if the provided SMIRKS or the one created
673
        by the environment is valid according to ChemPer rules.
674

675
        Parameters
676
        -----------
677
        smirks : str or None
678
            if None then we call self.as_smirks()
679

680
        Returns
681
        --------
682
        is_valid : bool
683
            True if this is a valid ChemPer SMIRKS
684
        """
685 100
        if smirks is None:
686 100
            smirks = self._as_smirks()
687 100
        from chemper.chemper_utils import is_valid_smirks
688 100
        return is_valid_smirks(smirks)
689

690 100
    def _parse_smirks(self,input_smirks):
691
        """
692
        This function converts a smirks string to a Chemical Environment
693
        """
694 100
        smirks = _convert_embedded_smirks(input_smirks)
695 100
        atoms = dict() # store created atom
696 100
        idx = 1 # current atom being created
697 100
        store = list() # to store indices while branching
698 100
        bonding_to = idx # which atom are we going to bond to
699

700 100
        atom_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]')
701

702 100
        if start != 0: # first atom is not in square brackets
703 0
            if start != -1:
704 0
                start_string = smirks[:start]
705
            else:
706 0
                start_string = smirks
707

708
            # Check for atoms not between square brackets
709 0
            split = re.split(self.no_bracket_atom_reg, start_string)
710 0
            atom_string = split[1]
711

712
            # update leftover for this condition
713 0
            if start != -1: # there is at least 1 more bracketed atom
714 0
                leftover = ''.join(split[2:])+smirks[start:]
715
            else:
716 0
                leftover = ''.join(split[2:])
717

718
        else: # First atom is in square brackets
719 100
            leftover = smirks[end+1:]
720
            # remove square brackets for parsing
721 100
            atom_string = atom_string[1:-1]
722

723
        # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
724 100
        match = re.match(r'(\d+)',leftover)
725 100
        if match is not None:  # leftover starts with int
726 100
            ring = re.findall(r'(\d+)',leftover)[0]
727 100
            leftover = leftover[match.end():]
728
        else:
729 100
            ring = None
730

731
        # Get atom information and create first atom
732 100
        ors, ands, index = self._get_atom_info(atom_string)
733 100
        new_atom = self.add_atom(None, new_or_types= ors, new_and_types= ands,
734
                                 new_atom_index= index, new_atom_ring= ring, beyond_beta= True)
735 100
        atoms[idx] = new_atom
736

737 100
        while len(leftover) > 0:
738 100
            idx += 1
739
            # Check for branching
740 100
            if leftover[0] == ')':
741 100
                bonding_to = store.pop()
742 100
                leftover = leftover[1:]
743 100
                continue
744

745 100
            if leftover[0] == '(':
746 100
                store.append(bonding_to)
747 100
                leftover = leftover[1:]
748 100
                continue
749

750
            # find beginning and end of next [atom]
751 100
            atom_string, start, end = _find_embedded_brackets(leftover, r'\[', r'\]')
752

753 100
            if start != -1: # no more square brackets
754 100
                bond_string = leftover[:start]
755
            else:
756 0
                bond_string = leftover
757

758
            # Check for atoms not between square brackets
759 100
            bond_split = re.split(self.no_bracket_atom_reg, bond_string)
760
            # Next atom is not in brackets for example C in "[#7:1]-C"
761 100
            if len(bond_split) > 1:
762 100
                bond_string = bond_split[0]
763 100
                atom_string = '['+bond_split[1]+']'
764
                # update leftover for this condition
765 100
                if start != -1: # ther is at least 1 more bracketed atom
766 100
                    leftover = ''.join(bond_split[2:])+leftover[start:]
767
                else:
768 0
                    leftover = ''.join(bond_split[2:])
769

770
            else: # next atom is in the brackets [atom]
771
                # bond and atom string stay the same, update leftover
772 100
                leftover = leftover[end+1:]
773

774
            # Get bond and atom info
775 100
            b_or, b_and = self._get_bond_info(bond_string)
776 100
            a_or, a_and, index = self._get_atom_info(atom_string[1:-1])
777

778
            # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
779 100
            match = re.match(r'(\d+)',leftover)
780 100
            if match is not None: # leftover starts with int
781 100
                ring = re.findall(r'(\d+)',leftover)[0]
782 100
                leftover = leftover[match.end():]
783
            else:
784 100
                ring = None
785

786
            # create new atom
787 100
            new_atom = self.add_atom(atoms[bonding_to], bond_or_types=b_or,
788
                                     bond_and_types=b_and, new_or_types=a_or, new_and_types=a_and,
789
                                     new_atom_index=index, new_atom_ring=ring, beyond_beta=True)
790

791
            # update state
792 100
            atoms[idx] = new_atom
793 100
            bonding_to = idx
794 100
        return
795

796 100
    def _get_atom_info(self, atom):
797
        """
798
        Parses string for one atom
799

800
        Parameters
801
        -----------
802
        atom : str
803
            string for one atom (the part between brackets)
804

805
        Returns
806
        --------
807
        or_types : list of tuples
808
            OR decorators are in the form [ (base, [decorators]), ...]
809
        and_types : list
810
        index : int
811
        """
812
        # Find atom index
813 100
        colon = atom.find(':')
814 100
        if colon == -1:
815 100
            index = None
816
        else:
817 100
            index = int(atom[colon+1:])
818 100
            atom = atom[:colon]
819

820 100
        split = atom.split(';')
821

822
        # Get and_types (and split them if they don't use ;)
823 100
        and_types = list()
824 100
        for a in split[1:]:
825 100
            and_types += re.findall(self.atom_reg, a)
826

827
        # Get or_types
828 100
        or_list = split[0].split(',')
829 100
        or_types = list()
830
        # Separate or_types into bases and decorators
831 100
        for OR in or_list:
832 100
            or_base, or_decors = self._separate_or_types(OR)
833 100
            if or_base is not None:
834 100
                or_types.append((or_base, or_decors))
835

836 100
        return or_types, and_types, index
837

838 100
    def _separate_or_types(self, or_type):
839
        """
840
        Separates ORtype (i.e. "#6X4R+0") into
841
        a base and decorators (i.e. '#6', ['X4','R','+0'] )
842

843
        Parameters
844
        -----------
845
        or_type : str
846
            string for one or_type
847

848
        Returns
849
        --------
850
        base : str
851
            #n, element symbol, or *
852
        decs : list
853
            list of decorators
854
        """
855
        # special case 1: wild card
856 100
        if or_type == '*':
857 100
            return None, []
858

859
        # if OR base is a wildcard
860 100
        if or_type[0] == '*':
861 100
            return '*', re.findall(self.atom_reg, or_type[1:])
862

863
        # Split up decorators by RegEx strings for atoms
864 100
        split = re.findall(self.atom_reg, or_type)
865 100
        if len(split) == 0:
866 100
            return None, []
867

868 100
        base = split[0]
869 100
        decs = _remove_blanks_repeats(split[1:], ['',base])
870 100
        return base, decs
871

872 100
    def _get_bond_info(self, bond):
873
        """
874
        Given bond strings returns or_types and and_types
875

876
        Parameters
877
        -----------
878
        bond : str
879
            string for one bond (i.e. '-,:;!@')
880

881
        Returns
882
        --------
883
        or_types : list
884
            list of or_type decorators, following atom tuple format
885
            in the '-,:;!@' example you get [ ('-', []), (':', []) ]
886
        and_types : list
887
            list of and_type decorators
888
            in this example you get ['!@']
889
        """
890
        # blank bond string is single or aromatic
891
        # empty or_types in Chemical Environments are treated as ~ bonds
892 100
        if bond == "":
893 100
            and_types = list()
894 100
            or_types = [('-', []), (':', [])]
895 100
            return or_types, and_types
896

897
        # AND types indicated by ; at the end
898 100
        split = bond.split(';')
899 100
        and_types = list()
900 100
        for a in split[1:]:
901 100
            and_types += re.findall(self.bond_regs, a)
902

903
        # or_types are divided by ,
904 100
        or_list = split[0].split(',')
905 100
        or_types = list()
906 100
        for OR in or_list:
907 100
            if OR == '~':
908 100
                continue
909 100
            or_divide = re.findall(self.bond_regs, OR)
910 100
            if len(or_divide) > 0:
911 100
                or_types.append((or_divide[0], or_divide[1:]))
912

913 100
        return or_types, and_types
914

915 100
    def as_smirks(self, smarts = False):
916
        """
917
        Returns a SMIRKS representation of the chemical environment
918

919
        Parameters
920
        -----------
921
        smarts : optional, boolean
922
            if True, returns a SMARTS instead of SMIRKS without index labels
923

924
        Returns
925
        --------
926
        smirks : str
927
            SMIRKS string for this environment
928
        """
929 100
        init_atom = self.select_atom(1)
930 100
        return self._as_smirks(init_atom, None, smarts)
931

932 100
    def _as_smirks(self, initial_atom = None, neighbors = None, smarts = False):
933
        """
934
        Return a SMIRKS representation of the chemical environment.
935
        This uses a recursive structure to combine SMIRKS for every
936
        atom in this environment.
937
        TODO: figure out if this can be done with a while loop instead
938

939
        Parameters
940
        -----------
941
        inital_atom : optional, atom object
942
            This is randomly selected if not chosen.
943
        neighbors : optional, list of atom objects
944
            This is all of the initalAtom neighbors if not specified
945
            generally this is used only for the recursive calls
946
            so initial atoms are not reprinted
947
        smarts : optional, boolean
948
            if True, returns a SMARTS string instead of SMIRKS
949
        """
950
        # If empty chemical environment
951 100
        if len(self._graph_nodes()) == 0:
952 0
            return ""
953

954 100
        if initial_atom is None:
955 100
            initial_atom = self.get_atoms()[0]
956

957 100
        if neighbors is None:
958 100
            neighbors = self._graph_neighbors(initial_atom)
959

960
        # sort neighbors to guarantee order is constant
961 100
        neighbors = sorted(neighbors, key=lambda atom: atom.as_smirks())
962

963
        # initialize smirks for starting atom
964 100
        if smarts:
965 100
            smirks = initial_atom.as_smarts()
966
        else:
967 100
            smirks = initial_atom.as_smirks()
968

969
        # loop through neighbors
970 100
        for idx, neighbor in enumerate(neighbors):
971
            # get the SMIRKS for the bond between these atoms
972
            # bonds are the same if smarts or smirks
973 100
            bond_edge = self._graph_get_edge_data(initial_atom, neighbor)
974 100
            bond_smirks = bond_edge['bond'].as_smirks()
975

976
            # Get the neighbors for this neighbor
977 100
            new_neighbors = self._graph_neighbors(neighbor)
978
            # Remove initialAtom so it doesn't get reprinted
979 100
            new_neighbors.remove(initial_atom)
980

981
            # Call asSMIRKS again to get the details for that atom
982 100
            atom_smirks = self._as_smirks(neighbor, new_neighbors, smarts)
983

984
            # Use ( ) for branch atoms (all but last)
985 100
            if idx < len(neighbors) - 1:
986 100
                smirks += '(' + bond_smirks + atom_smirks + ')'
987
            # This is for the atoms that are a part of the main chain
988
            else:
989 100
                smirks += bond_smirks + atom_smirks
990

991 100
        return smirks
992

993 100
    def select_atom(self, descriptor=None):
994
        """
995
        Select a random atom fitting the provided descriptor.
996

997
        Parameters
998
        ----------
999
        descriptor : optional, None
1000
            describe what type of atom you want with the
1001
            follow options:
1002
                None - returns any atom with equal probability
1003
                int - will return an atom with that index
1004
                'Indexed' - returns a random indexed atom
1005
                'Unindexed' - returns a random unindexed atom
1006
                'Alpha' - returns a random alpha atom
1007
                'Beta' - returns a random beta atom
1008

1009
        Returns
1010
        --------
1011
        atom : Atom
1012
            one atom from this chemical environment which
1013
            fits the provided description. If no atom matched
1014
            the description then None is returned.
1015
        """
1016 100
        if descriptor is None or isinstance(descriptor,str):
1017 100
            atoms = self.get_component_list('atom', descriptor)
1018 100
            if len(atoms) == 0:
1019 100
                return None
1020 100
            return random.choice(atoms)
1021

1022 100
        if not isinstance(descriptor, int):
1023 0
            return None
1024

1025 100
        for atom in self.get_atoms():
1026 100
            if atom.index == descriptor:
1027 100
                return atom
1028 100
        return None
1029

1030 100
    def get_component_list(self, component_type, descriptor = None):
1031
        """
1032
        Returns a list of atoms or bonds matching the descriptor
1033

1034
        Parameters
1035
        -----------
1036
        component_type : string: 'atom' or 'bond'
1037
        descriptor : string, optional
1038
            'all', 'Indexed', 'Unindexed', 'Alpha', 'Beta'
1039

1040
        Returns
1041
        -------
1042
        component_list : list
1043
            list of atoms or bonds (from component type)
1044
            which match the provided descriptor
1045
        """
1046 100
        des_list = ['indexed', 'unindexed', 'alpha', 'beta', 'all']
1047 100
        if descriptor is not None:
1048 100
            d = descriptor.lower()
1049 100
            if d not in des_list:
1050 0
                raise LookupError("Error: descriptor must be in the list [%s]" %
1051
                                ', '.join(des_list))
1052
        else:
1053 100
            d = None
1054

1055 100
        if not component_type.lower() in ['atom', 'bond']:
1056 0
            raise LookupError("Error: component_type must be 'atom' or 'bond'")
1057

1058 100
        if component_type.lower() == 'atom':
1059 100
            if d == 'indexed':
1060 100
                return self.get_indexed_atoms()
1061 100
            elif d == 'unindexed':
1062 100
                return self.get_unindexed_atoms()
1063 100
            elif d == 'alpha':
1064 100
                return self.get_alpha_atoms()
1065 100
            elif d == 'beta':
1066 100
                return self.get_beta_atoms()
1067
            else:
1068 100
                return self.get_atoms()
1069

1070 100
        elif component_type.lower() == 'bond':
1071 100
            if d == 'indexed':
1072 100
                return self.get_indexed_bonds()
1073 100
            elif d == 'unindexed':
1074 100
                return self.get_unindexed_bonds()
1075 100
            elif d == 'alpha':
1076 100
                return self.get_alpha_bonds()
1077 100
            elif d == 'beta':
1078 100
                return self.get_beta_bonds()
1079

1080 100
            return self.get_bonds()
1081

1082 0
        return None
1083

1084 100
    def select_bond(self, descriptor = None):
1085
        """Select a random bond fitting the descriptor.
1086

1087
        Parameters
1088
        ----------
1089
        descriptor : optional, None
1090
            describe what type of atom you want with the
1091
            follow options:
1092
                None - returns any bond with equal probability
1093
                int - will return an bond with that index
1094
                'Indexed' - returns a random indexed bond
1095
                'Unindexed' - returns a random unindexed bond
1096
                'Alpha' - returns a random alpha bond
1097
                'Beta' - returns a random beta bond
1098

1099
        Returns
1100
        --------
1101
        bond : Bond
1102
            one bond from this chemical environment which
1103
            fits the provided description. If no bond matched
1104
            the description then None is returned.
1105
        """
1106 100
        if descriptor is None or isinstance(descriptor,str):
1107 100
            bonds = self.get_component_list('bond', descriptor)
1108 100
            if len(bonds) == 0:
1109 100
                return None
1110 100
            return random.choice(bonds)
1111

1112