1
# Nucleic Acid
2
# ============
3
#
4
# DNA and RNA types.
5
#
6
# This file is a part of the BioJulia ecosystem.
7
# License is MIT: https://github.com/BioJulia/NucleicAcids.jl/blob/master/LICENSE.md
8

9
# NucleicAcid Encoding
10
# -------------------
11
#
12
# Unambiguous nucleotides are represented in one-hot encoding as follows:
13
#
14
#   | NucleicAcid | Bits |
15
#   | ----------- | ---- |
16
#   |     A       | 0001 |
17
#   |     C       | 0010 |
18
#   |     G       | 0100 |
19
#   |    T/U      | 1000 |
20
#
21
# Ambiguous nucleotides are bitwise OR of these four nucleotides. For example, R
22
# , A or G, is represented as 0101 (= A: 0001 | G: 0100). The gap symbol is
23
# always 0000.  The meaningful four bits are stored in the least significant
24
# bits of a byte.
25

26
"""
27
An abstract nucleic acid type.
28
"""
29
abstract type NucleicAcid <: BioSymbol end
30

31
"""
32
A deoxyribonucleic acid type.
33
"""
34
primitive type DNA <: NucleicAcid 8 end
35

36
"""
37
A ribonucleic acid type.
38
"""
39
primitive type RNA <: NucleicAcid 8 end
40

41 3
prefix(::DNA) = "DNA"
42 3
prefix(::RNA) = "RNA"
43 3
type_text(x::NucleicAcid) = prefix(x)
44 3
isterm(symbol::NucleicAcid) = false
45

46

47
# Conversion from/to integers
48
# ---------------------------
49

50 5
Base.convert(::Type{T}, nt::UInt8) where T <: NucleicAcid = reinterpret(T, nt)
51 5
Base.convert(::Type{UInt8}, nt::T) where T <: NucleicAcid = reinterpret(UInt8, nt)
52 5
Base.convert(::Type{T}, nt::S) where {T <: Number, S <: NucleicAcid} = convert(T, convert(UInt8, nt))
53 3
Base.convert(::Type{S}, nt::T) where {T <: Number, S <: NucleicAcid} = convert(S, convert(UInt8, nt))
54 3
DNA(nt::Integer) = convert(DNA, nt)
55 3
RNA(nt::Integer) = convert(RNA, nt)
56

57 5
Base.convert(::Type{DNA}, nt::RNA) = DNA(convert(UInt8, nt))
58 5
DNA(nt::RNA) = convert(DNA, nt)
59 5
Base.convert(::Type{RNA}, nt::DNA) = RNA(convert(UInt8, nt))
60 5
RNA(nt::DNA) = convert(RNA, nt)
61

62

63
# Conversion from/to characters
64
# -----------------------------
65

66
function Base.convert(::Type{DNA}, c::Char)
67 5
    if c > '\uff'
68 5
        throw(InexactError(:convert, DNA, c))
69
    end
70 5
    @inbounds dna = char_to_dna[convert(Int, c) + 1]
71 5
    if !isvalid(DNA, dna)
72 5
        throw(InexactError(:convert, DNA, c))
73
    end
74 5
    return reinterpret(DNA, dna)
75
end
76 5
DNA(c::Char) = convert(DNA, c)
77

78
function Base.convert(::Type{RNA}, c::Char)
79 5
    if c > '\uff'
80 5
        throw(InexactError(:convert, RNA, c))
81
    end
82 5
    @inbounds rna = char_to_rna[convert(Int, c) + 1]
83 5
    if !isvalid(RNA, rna)
84 5
        throw(InexactError(:convert, RNA, c))
85
    end
86 5
    return reinterpret(RNA, rna)
87
end
88 5
RNA(c::Char) = convert(RNA, c)
89

90
function Base.convert(::Type{Char}, nt::DNA)
91 5
    return dna_to_char[convert(UInt8, nt) + 1]
92
end
93 3
Char(nt::DNA) = convert(Char, nt)
94

95
function Base.convert(::Type{Char}, nt::RNA)
96 5
    return rna_to_char[convert(UInt8, nt) + 1]
97
end
98 3
Char(nt::RNA) = convert(Char, nt)
99

100

101
# Encoding of DNA and RNA NucleicAcids
102
# ------------------------------------
103

104
"""
105
Lookup table used for converting characters to DNA symbol values
106

107
The provided `convert` method should be used rather than this table, but you can
108
use it if you insist and know what your are doing.
109

110
!!! note
111
    The array is indexed by converting a character to an integer. When indexed, it
112
    returns a UInt8 with the bit pattern on the corresponding nucleic acid.
113
    The `convert(DNA, x)` method does this for you.
114

115
!!! warning
116
    If you index this array with a character that is greater than '\uff', then
117
    you will get a bounds error. The `convert(DNA, x)` method checks such things
118
    to avoid this for you.
119

120
!!! warning
121
    If you index this array with a character that does not have a corresonding
122
    DNA symbol, then you get a byte with the bit pattern `0x80`, which is an
123
    invalid DNA symbol and will be of no use to you. The `convert(DNA, x)`
124
    checks such things for you and throws an exception gracefully if such a
125
    situation arises.
126
"""
127
const char_to_dna = [0x80 for _ in 0x00:0xff]
128

129
"""
130
Lookup table for converting DNA symbol values to characters
131

132
The provided `convert` method should be used rather than this table, but you can
133
use it if you insist and know what your are doing.
134

135
!!! note
136
    The array is indexed by reinterpreting a DNA symbol value as an UInt8.
137
    When indexed, it returns the character corresponding to the symbol.
138
    The `convert(Char, x::DNA)` method does this for you.
139

140
!!! warning
141
    If you index this array with an invalid DNA symbol, then you will hit a
142
    bounds error. If you construct DNA symbols properly, then this scenario
143
    should never occur.
144
"""
145
dna_to_char
146

147
# Derived from "The DDBJ/ENA/GenBank Feature Table Definition"
148
# §7.4.1 Nucleotide base code (IUPAC)
149
# http://www.insdc.org/documents/feature_table.html#7.4.1
150
const dna_to_char = let
151
    chararray = Vector{Char}(undef, 16)
152
    for (char, doc, bits) in [
153
        ('-', "DNA Gap",                                   0b0000),
154
        ('A', "DNA Adenine",                               0b0001),
155
        ('C', "DNA Cytosine",                              0b0010),
156
        ('G', "DNA Guanine",                               0b0100),
157
        ('T', "DNA Thymine",                               0b1000),
158
        ('M', "DNA Adenine or Cytosine",                   0b0011),
159
        ('R', "DNA Adenine or Guanine",                    0b0101),
160
        ('W', "DNA Adenine or Thymine",                    0b1001),
161
        ('S', "DNA Cytosine or Guanine",                   0b0110),
162
        ('Y', "DNA Cytosine or Thymine",                   0b1010),
163
        ('K', "DNA Guanine or Thymine",                    0b1100),
164
        ('V', "DNA Adenine, Cytosine or Guanine",          0b0111),
165
        ('H', "DNA Adenine, Cytosine or Thymine",          0b1011),
166
        ('D', "DNA Adenine, Guanine or Thymine",           0b1101),
167
        ('B', "DNA Cytosine, Guanine or Thymine",          0b1110),
168
        ('N', "DNA Adenine, Cytosine, Guanine or Thymine", 0b1111)]
169
        var = Symbol("DNA_", char != '-' ? char : "Gap")
170
        @eval begin
171
            @doc $(doc) const $(var) = reinterpret(DNA, $(bits))
172
            char_to_dna[$(convert(Int, char) + 1)] = char_to_dna[$(convert(Int, lowercase(char)) + 1)] = $(bits)
173
            $(chararray)[$(convert(Int, bits) + 1)] = $(char)
174
        end
175
    end
176
    Tuple(chararray)
177
end
178

179
@eval function alphabet(::Type{DNA})
180 3
    return $(tuple([reinterpret(DNA, x) for x in 0b0000:0b1111]...))
181
end
182

183
"""
184
    alphabet(DNA)
185

186
Get all symbols of `DNA` in sorted order.
187

188
Examples
189
--------
190

191
```jldoctest
192
julia> alphabet(DNA)
193
(DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N)
194

195
julia> issorted(alphabet(DNA))
196
true
197

198
```
199
"""
200
alphabet(::Type{DNA})
201

202
"""
203
    ACGT
204

205
Unambiguous DNA.
206

207
Examples
208
--------
209

210
```jldoctest
211
julia> ACGT
212
(DNA_A, DNA_C, DNA_G, DNA_T)
213

214
```
215
"""
216
const ACGT = (DNA_A, DNA_C, DNA_G, DNA_T)
217

218
"""
219
    ACGTN
220

221
Unambiguous DNA and `DNA_N`.
222

223
Examples
224
--------
225

226
```jldoctest
227
julia> ACGTN
228
(DNA_A, DNA_C, DNA_G, DNA_T, DNA_N)
229

230
```
231
"""
232
const ACGTN = (DNA_A, DNA_C, DNA_G, DNA_T, DNA_N)
233

234
"""
235
Lookup table used for converting characters to RNA symbol values
236

237
The provided `convert` method should be used rather than this table, but you can
238
use it if you insist and know what your are doing.
239

240
!!! note
241
    The array is indexed by converting a character to an integer. When indexed, it
242
    returns a UInt8 with the bit pattern on the corresponding nucleic acid.
243
    The `convert(RNA, x)` method does this for you.
244

245
!!! warning
246
    If you index this array with a character that is greater than '\uff', then
247
    you will get a bounds error. The `convert(RNA, x)` method checks such things
248
    to avoid this for you.
249

250
!!! warning
251
    If you index this array with a character that does not have a corresonding
252
    RNA symbol, then you get a byte with the bit pattern `0x80`, which is an
253
    invalid RNA symbol and will be of no use to you. The `convert(RNA, x)`
254
    checks such things for you and throws an exception gracefully if such a
255
    situation arises.
256
"""
257
const char_to_rna = [0x80 for _ in 0x00:0xff]
258

259
"""
260
Lookup table for converting RNA symbol values to characters
261

262
The provided `convert` method should be used rather than this table, but you can
263
use it if you insist and know what your are doing.
264

265
!!! note
266
    The array is indexed by reinterpreting a RNA symbol value as an UInt8.
267
    When indexed, it returns the character corresponding to the symbol.
268
    The `convert(Char, x::RNA)` method does this for you.
269

270
!!! warning
271
    If you index this array with an invalid RNA symbol, then you will hit a
272
    bounds error. If you construct RNA symbols properly, then this scenario
273
    should never occur.
274
"""
275
rna_to_char
276

277

278
const rna_to_char = let
279
    chararray = Vector{Char}(undef, 16)
280
    for (char, doc, dna) in [
281
        ('-', "RNA Gap",                                  DNA_Gap),
282
        ('A', "RNA Adenine",                              DNA_A  ),
283
        ('C', "RNA Cytosine",                             DNA_C  ),
284
        ('G', "RNA Guanine",                              DNA_G  ),
285
        ('U', "RNA Uracil",                               DNA_T  ),
286
        ('M', "RNA Adenine or Cytosine",                  DNA_M  ),
287
        ('R', "RNA Adenine or Guanine",                   DNA_R  ),
288
        ('W', "RNA Adenine or Uracil",                    DNA_W  ),
289
        ('S', "RNA Cytosine or Guanine",                  DNA_S  ),
290
        ('Y', "RNA Cytosine or Uracil",                   DNA_Y  ),
291
        ('K', "RNA Guanine or Uracil",                    DNA_K  ),
292
        ('V', "RNA Adenine, Cytosine or Guanine",         DNA_V  ),
293
        ('H', "RNA Adenine, Cytosine or Uracil",          DNA_H  ),
294
        ('D', "RNA Adenine, Guanine or Uracil",           DNA_D  ),
295
        ('B', "RNA Cytosine, Guanine or Uracil",          DNA_B  ),
296
        ('N', "RNA Adenine, Cytosine, Guanine or Uracil", DNA_N  )]
297
        var = Symbol("RNA_", char != '-' ? char : "Gap")
298
        @eval begin
299
            @doc $(doc) const $(var) = reinterpret(RNA, $(dna))
300
            char_to_rna[$(convert(Int, char) + 1)] = char_to_rna[$(convert(Int, lowercase(char) + 1))] = reinterpret(UInt8, $(dna))
301
            $(chararray)[$(convert(Int, dna) + 1)] = $(char)
302
        end
303
    end
304
    Tuple(chararray)
305
end
306

307
@eval function alphabet(::Type{RNA})
308 3
    return $(tuple([reinterpret(RNA, x) for x in 0b0000:0b1111]...))
309
end
310

311
"""
312
    alphabet(RNA)
313

314
Get all symbols of `RNA` in sorted order.
315

316
Examples
317
--------
318

319
```jldoctest
320
julia> alphabet(RNA)
321
(RNA_Gap, RNA_A, RNA_C, RNA_M, RNA_G, RNA_R, RNA_S, RNA_V, RNA_U, RNA_W, RNA_Y, RNA_H, RNA_K, RNA_D, RNA_B, RNA_N)
322

323
julia> issorted(alphabet(RNA))
324
true
325

326
```
327
"""
328
alphabet(::Type{RNA})
329

330
"""
331
    ACGU
332

333
Unambiguous RNA.
334

335
Examples
336
--------
337

338
```jldoctest
339
julia> ACGU
340
(RNA_A, RNA_C, RNA_G, RNA_U)
341

342
```
343
"""
344
const ACGU = (RNA_A, RNA_C, RNA_G, RNA_U)
345

346
"""
347
    ACGUN
348

349
Unambiguous RNA and `RNA_N`.
350

351
Examples
352
--------
353

354
```jldoctest
355
julia> ACGUN
356
(RNA_A, RNA_C, RNA_G, RNA_U, RNA_N)
357

358
```
359
"""
360
const ACGUN = (RNA_A, RNA_C, RNA_G, RNA_U, RNA_N)
361

362
function Base.:~(x::N) where N <: NucleicAcid
363 3
    return reinterpret(N, ~reinterpret(UInt8, x) & 0b1111)
364
end
365

366
function Base.:|(x::N, y::N) where N <: NucleicAcid
367 3
    return reinterpret(N, reinterpret(UInt8, x) | reinterpret(UInt8, y))
368
end
369

370
function Base.:-(x::N, y::Integer) where N <: NucleicAcid
371 3
    return x + (-y)
372
end
373

374
function Base.:+(x::N, y::Integer) where N <: NucleicAcid
375 3
    return reinterpret(N, (convert(UInt8, x) + y % UInt8) & 0b1111)
376
end
377

378
"""
379
    gap(DNA)
380

381
Return `DNA_Gap`.
382
"""
383 5
gap(::Type{DNA}) = DNA_Gap
384

385
"""
386
    gap(RNA)
387

388
Return `RNA_Gap`.
389
"""
390 5
gap(::Type{RNA}) = RNA_Gap
391

392
"""
393
    isGC(nt::NucleicAcid)
394

395
Test if `nt` is surely either guanine or cytosine.
396
"""
397
function isGC(nt::NucleicAcid)
398 5
    bits = reinterpret(UInt8, nt)
399 5
    return bits != 0 && (bits & 0b1001) == 0
400
end
401

402
"""
403
    ispurine(nt::NucleicAcid)
404

405
Test if `nt` is surely a purine.
406
"""
407
@inline function ispurine(nt::NucleicAcid)
408 5
    bits = reinterpret(UInt8, nt)
409 5
    return bits != 0 && (bits & 0b1010) == 0
410
end
411

412
"""
413
    ispyrimidine(nt::NucleicAcid)
414

415
Test if `nt` is surely a pyrimidine.
416
"""
417
@inline function ispyrimidine(nt::NucleicAcid)
418 5
    bits = reinterpret(UInt8, nt)
419 5
    return bits != 0 && (bits & 0b0101) == 0
420
end
421

422
"""
423
    isambiguous(nt::NucleicAcid)
424

425
Test if `nt` is an ambiguous nucleotide.
426
"""
427
@inline function isambiguous(nt::NucleicAcid)
428 5
    return count_ones(nt) > 1
429
end
430

431
"""
432
    iscertain(nt::NucleicAcid)
433

434
Test if `nt` is a non-ambiguous nucleotide e.g. ACGT.
435
"""
436
@inline function iscertain(nt::NucleicAcid)
437 5
    return count_ones(nt) == 1
438
end
439

440
"""
441
    complement(nt::NucleicAcid)
442

443
Return the complementary nucleotide of `nt`.
444

445
This function returns the union of all possible complementary nucleotides.
446

447
Examples
448
--------
449

450
```jldoctest
451
julia> complement(DNA_A)
452
DNA_T
453

454
julia> complement(DNA_N)
455
DNA_N
456

457
julia> complement(RNA_U)
458
RNA_A
459

460
```
461
"""
462
function complement(nt::NucleicAcid)
463 3
    bits = compatbits(nt)
464 3
    return reinterpret(
465
        typeof(nt),
466
        (bits & 0x01) << 3 | (bits & 0x08) >> 3 |
467
        (bits & 0x02) << 1 | (bits & 0x04) >> 1)
468
end
469

470
function Base.isvalid(::Type{T}, x::Integer) where T <: NucleicAcid
471 5
    return 0  x < 16
472
end
473

474
function Base.isvalid(nt::NucleicAcid)
475 3
    return reinterpret(UInt8, nt)  0b1111
476
end
477

478

479

480
"""
481
    compatbits(nt::NucleicAcid)
482

483
Return the compatibility bits of `nt` as `UInt8`.
484

485
Examples
486
--------
487

488
```jldoctest
489
julia> compatbits(DNA_A)
490
0x01
491

492
julia> compatbits(DNA_C)
493
0x02
494

495
julia> compatbits(DNA_N)
496
0x0f
497

498
```
499
"""
500
@inline function compatbits(nt::NucleicAcid)
501 3
    return reinterpret(UInt8, nt)
502
end

Read our documentation on viewing source code .

Loading