1
# pylint: disable=invalid-name,too-many-lines
2 2
"""Density estimation functions for ArviZ."""
3 2
import warnings
4

5 2
import numpy as np
6 2
from scipy.fftpack import fft
7 2
from scipy.optimize import brentq
8 2
from scipy.signal import convolve, convolve2d, gaussian  # pylint: disable=no-name-in-module
9 2
from scipy.sparse import coo_matrix
10 2
from scipy.special import ive  # pylint: disable=no-name-in-module
11

12 2
from ..utils import _cov, _dot, _stack, conditional_jit
13

14 2
__all__ = ["kde"]
15

16

17 2
def _bw_scott(x, x_std=None, **kwargs):  # pylint: disable=unused-argument
18
    """Scott's Rule."""
19 0
    if x_std is None:
20 0
        x_std = np.std(x)
21 0
    bw = 1.06 * x_std * len(x) ** (-0.2)
22 0
    return bw
23

24

25 2
def _bw_silverman(x, x_std=None, **kwargs):  # pylint: disable=unused-argument
26
    """Silverman's Rule."""
27 2
    if x_std is None:
28 2
        x_std = np.std(x)
29 2
    q75, q25 = np.percentile(x, [75, 25])
30 2
    x_iqr = q75 - q25
31 2
    a = min(x_std, x_iqr / 1.34)
32 2
    bw = 0.9 * a * len(x) ** (-0.2)
33 2
    return bw
34

35

36 2
def _bw_isj(x, grid_counts=None, x_std=None, x_range=None):
    """Improved Sheather-Jones bandwidth estimation.

    Improved Sheather and Jones method as explained in [1]_.
    This is an internal version intended to be used by the KDE estimator.
    When used internally computation time is saved because things like minimums,
    maximums and the grid are pre-computed.

    Parameters
    ----------
    x : 1-D numpy array
        Sample data.
    grid_counts : 1-D numpy array, optional
        Precomputed bin counts for `x`. When None, a 256-bin histogram is built.
    x_std : float, optional
        Unused when `grid_counts` is given; recomputed otherwise.
    x_range : float, optional
        Precomputed range of `x` (max - min).

    Returns
    -------
    h : float
        Estimated bandwidth.

    References
    ----------
    .. [1] Kernel density estimation via diffusion.
       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
       Ann. Statist. 38 (2010), no. 5, 2916--2957.
    """
    x_len = len(x)
    if x_range is None:
        x_range = np.max(x) - np.min(x)

    # Relative frequency per bin
    if grid_counts is None:
        x_std = np.std(x)
        grid_len = 256
        # BUG FIX: the original used `x_min`/`x_max` here, which were only
        # defined inside the `x_range is None` branch above — passing
        # `x_range` without `grid_counts` raised NameError.
        grid_min = np.min(x) - 0.5 * x_std
        grid_max = np.max(x) + 0.5 * x_std
        grid_counts, _, _ = histogram(x, grid_len, (grid_min, grid_max))
    else:
        grid_len = len(grid_counts) - 1

    grid_relfreq = grid_counts / x_len

    # Discrete cosine transform of the binned relative frequencies
    a_k = _dct1d(grid_relfreq)

    k_sq = np.arange(1, grid_len) ** 2
    a_sq = a_k[1:grid_len] ** 2

    # Solve the fixed-point equation for the optimal (squared, normalized) bandwidth
    t = _root(_fixed_point, x_len, args=(x_len, k_sq, a_sq), x=x)
    h = t ** 0.5 * x_range
    return h
77

78

79 2
def _bw_experimental(x, grid_counts=None, x_std=None, x_range=None):
    """Experimental bandwidth estimator.

    Averages the Silverman and Improved Sheather-Jones estimates.
    """
    silverman = _bw_silverman(x, x_std=x_std)
    isj = _bw_isj(x, grid_counts=grid_counts, x_range=x_range)
    return (silverman + isj) * 0.5
84

85

86 2
def _bw_taylor(x):
    """Taylor's rule for circular bandwidth estimation.

    This function implements a rule-of-thumb for choosing the bandwidth of
    a von Mises kernel density estimator that assumes the underlying
    distribution is von Mises as introduced in [1]_.
    It is analogous to Scott's rule for the Gaussian KDE.

    Circular bandwidth has a different scale from linear bandwidth.
    Unlike linear scale, low bandwidths are associated with oversmoothing
    while high values are associated with undersmoothing.

    References
    ----------
    .. [1] C.C Taylor (2008). Automatic bandwidth selection for circular
           density estimation.
           Computational Statistics and Data Analysis, 52, 7, 3493–3500.
    """
    n_points = len(x)
    kappa = _kappa_mle(x)
    numerator = 3 * n_points * kappa ** 2 * ive(2, 2 * kappa)
    denominator = 4 * np.pi ** 0.5 * ive(0, kappa) ** 2
    return (numerator / denominator) ** 0.4
109

110

111 2
# Registry of bandwidth-selection methods accepted (by name) in `_get_bw`.
# Every entry accepts the same keyword arguments
# (`grid_counts`, `x_std`, `x_range`), ignoring the ones it does not need.
_BW_METHODS_LINEAR = {
    "scott": _bw_scott,
    "silverman": _bw_silverman,
    "isj": _bw_isj,
    "experimental": _bw_experimental,
}
117

118

119 2
def _get_bw(x, bw, grid_counts=None, x_std=None, x_range=None):
120
    """Compute bandwidth for a given data `x` and `bw`.
121

122
    Also checks `bw` is correctly specified.
123

124
    Parameters
125
    ----------
126
    x : 1-D numpy array
127
        1 dimensional array of sample data from the
128
        variable for which a density estimate is desired.
129
    bw: int, float or str
130
        If numeric, indicates the bandwidth and must be positive.
131
        If str, indicates the method to estimate the bandwidth.
132

133
    Returns
134
    -------
135
    bw: float
136
        Bandwidth
137
    """
138 2
    if isinstance(bw, bool):
139 0
        raise ValueError(
140
            (
141
                "`bw` must not be of type `bool`.\n"
142
                "Expected a positive numeric or one of the following strings:\n"
143
                "{}."
144
            ).format(list(_BW_METHODS_LINEAR.keys()))
145
        )
146 2
    if isinstance(bw, (int, float)):
147 0
        if bw < 0:
148 0
            raise ValueError("Numeric `bw` must be positive.\nInput: {:.4f}.".format(bw))
149 2
    elif isinstance(bw, str):
150 2
        bw_lower = bw.lower()
151

152 2
        if bw_lower not in _BW_METHODS_LINEAR.keys():
153 0
            raise ValueError(
154
                (
155
                    "Unrecognized bandwidth method.\n" "Input is: {}.\n" "Expected one of: {}."
156
                ).format(bw_lower, list(_BW_METHODS_LINEAR.keys()))
157
            )
158

159 2
        bw_fun = _BW_METHODS_LINEAR[bw_lower]
160 2
        bw = bw_fun(x, grid_counts=grid_counts, x_std=x_std, x_range=x_range)
161
    else:
162 0
        raise ValueError(
163
            (
164
                "Unrecognized `bw` argument.\n"
165
                "Expected a positive numeric or one of the following strings:\n"
166
                "{}."
167
            ).format(list(_BW_METHODS_LINEAR.keys()))
168
        )
169 2
    return bw
170

171

172 2
def _vonmises_pdf(x, mu, kappa):
173
    """Calculate vonmises_pdf."""
174 0
    if kappa <= 0:
175 0
        raise ValueError("Argument 'kappa' must be positive.")
176 0
    pdf = 1 / (2 * np.pi * ive(0, kappa)) * np.exp(np.cos(x - mu) - 1) ** kappa
177 0
    return pdf
178

179

180 2
def _a1inv(x):
181
    """Compute inverse function.
182

183
    Inverse function of the ratio of the first and
184
    zeroth order Bessel functions of the first kind.
185

186
    Returns the value k, such that a1inv(x) = k, i.e. a1(k) = x.
187
    """
188 0
    if 0 <= x < 0.53:
189 0
        return 2 * x + x ** 3 + (5 * x ** 5) / 6
190 0
    elif x < 0.85:
191 0
        return -0.4 + 1.39 * x + 0.43 / (1 - x)
192
    else:
193 0
        return 1 / (x ** 3 - 4 * x ** 2 + 3 * x)
194

195

196 2
def _kappa_mle(x):
    """Estimate the von Mises concentration parameter of `x` via `_a1inv`."""
    location = _circular_mean(x)
    return _a1inv(np.mean(np.cos(x - location)))
200

201

202 2
def _dct1d(x):
203
    """Discrete Cosine Transform in 1 Dimension.
204

205
    Parameters
206
    ----------
207
    x : numpy array
208
        1 dimensional array of values for which the
209
        DCT is desired
210

211
    Returns
212
    -------
213
    output : DTC transformed values
214
    """
215 2
    x_len = len(x)
216

217 2
    even_increasing = np.arange(0, x_len, 2)
218 2
    odd_decreasing = np.arange(x_len - 1, 0, -2)
219

220 2
    x = np.concatenate((x[even_increasing], x[odd_decreasing]))
221

222 2
    w_1k = np.r_[1, (2 * np.exp(-(0 + 1j) * (np.arange(1, x_len)) * np.pi / (2 * x_len)))]
223 2
    output = np.real(w_1k * fft(x))
224

225 2
    return output
226

227

228 2
def _fixed_point(t, N, k_sq, a_sq):
229
    """Calculate t-zeta*gamma^[l](t).
230

231
    Implementation of the function t-zeta*gamma^[l](t) derived from equation (30) in [1].
232

233
    References
234
    ----------
235
    .. [1] Kernel density estimation via diffusion.
236
       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
237
       Ann. Statist. 38 (2010), no. 5, 2916--2957.
238
    """
239 2
    k_sq = np.asfarray(k_sq, dtype=np.float64)
240 2
    a_sq = np.asfarray(a_sq, dtype=np.float64)
241

242 2
    l = 7
243 2
    f = np.sum(np.power(k_sq, l) * a_sq * np.exp(-k_sq * np.pi ** 2 * t))
244 2
    f *= 0.5 * np.pi ** (2.0 * l)
245

246 2
    for j in np.arange(l - 1, 2 - 1, -1):
247 2
        c1 = (1 + 0.5 ** (j + 0.5)) / 3
248 2
        c2 = np.product(np.arange(1.0, 2 * j + 1, 2, dtype=np.float64))
249 2
        c2 /= (np.pi / 2) ** 0.5
250 2
        t_j = np.power((c1 * (c2 / (N * f))), (2.0 / (3.0 + 2.0 * j)))
251 2
        f = np.sum(k_sq ** j * a_sq * np.exp(-k_sq * np.pi ** 2.0 * t_j))
252 2
        f *= 0.5 * np.pi ** (2 * j)
253

254 2
    out = t - (2 * N * np.pi ** 0.5 * f) ** (-0.4)
255 2
    return out
256

257

258 2
def _root(function, N, args, x):
    """Find a root of `function` in (0, 0.01] for the ISJ bandwidth estimator.

    Uses Brent's method; when it fails (or returns a non-positive root),
    falls back to the squared, range-normalized Silverman bandwidth.

    Parameters
    ----------
    function : callable
        Function whose root is sought (here, `_fixed_point`).
    N : int
        Sample size; clipped to [50, 1050] before use.
    args : tuple
        Extra arguments forwarded to `function`.
    x : 1-D numpy array
        Original data; only used to compute the Silverman fallback.

    Returns
    -------
    bw : float
        Squared, normalized bandwidth (as consumed by `_bw_isj`).
    """
    # The right bound is at most 0.01
    found = False
    N = max(min(1050, N), 50)
    # NOTE(review): `tol` is doubled on each failure but is never passed to
    # `brentq`; it only drives the `tol >= 1` bail-out condition below.
    tol = 10e-12 + 0.01 * (N - 50) / 1000

    while not found:
        try:
            bw, res = brentq(function, 0, 0.01, args=args, full_output=True, disp=False)
            found = res.converged
        except ValueError:
            # brentq raises when f(0) and f(0.01) do not bracket a root.
            bw = 0
            tol *= 2.0
            found = False
        # Fallback path: non-positive root or tolerance blown past 1.
        if bw <= 0 or tol >= 1:
            # warnings.warn(
            #    "Improved Sheather-Jones did not converge as expected. "
            #    "Using Silverman's rule instead.",
            #    Warning
            # )
            bw = (_bw_silverman(x) / np.ptp(x)) ** 2
            return bw
    return bw
281

282

283 2
def _check_type(x):
284
    """Check the input is of the correct type.
285

286
    It only accepts numeric lists/numpy arrays of 1 dimension or something that
287
    can be flattened to 1 dimension.
288

289
    Parameters
290
    ----------
291
    x : Object whose type is checked before computing the KDE.
292

293
    Returns
294
    -------
295
    x : 1-D numpy array
296
        If no error is thrown, a 1 dimensional array of
297
        sample data from the variable for which a density estimate is desired.
298
    """
299
    # Will raise an error if `x` can't be casted to numeric or flattened to one dimension.
300 2
    try:
301 2
        x = np.asfarray(x).flatten()
302 0
    except Exception as e:
303 0
        warnings.warn(
304
            "The following exception occurred while trying to convert `x`"
305
            "to a 1 dimensional float array."
306
        )
307 0
        raise e
308

309 2
    x = x[np.isfinite(x)]
310

311 2
    if x.size == 0:
312 0
        raise ValueError("`x` does not contain any finite number.")
313 2
    if x.size == 1:
314 2
        raise ValueError("`x` is of length 1. Can't produce a KDE with only one data point.")
315 2
    return x
316

317

318 2
def _check_custom_lims(custom_lims, x_min, x_max):
319
    """Check if `custom_lims` are of the correct type.
320

321
    It accepts numeric lists/tuples of length 2.
322

323
    Parameters
324
    ----------
325
    custom_lims : Object whose type is checked.
326

327
    Returns
328
    -------
329
    None: Object of type None
330
    """
331 2
    if not isinstance(custom_lims, (list, tuple)):
332 0
        raise TypeError(
333
            (
334
                "`custom_lims` must be a numeric list or tuple of length 2.\n"
335
                "Not an object of {}."
336
            ).format(type(custom_lims))
337
        )
338

339 2
    if len(custom_lims) != 2:
340 0
        raise AttributeError("`len(custom_lims)` must be 2, not {}.".format(len(custom_lims)))
341

342 2
    any_bool = any(isinstance(i, bool) for i in custom_lims)
343 2
    if any_bool:
344 0
        raise TypeError("Elements of `custom_lims` must be numeric or None, not bool.")
345

346 2
    custom_lims = list(custom_lims)  # convert to a mutable object
347 2
    if custom_lims[0] is None:
348 2
        custom_lims[0] = x_min
349

350 2
    if custom_lims[1] is None:
351 2
        custom_lims[1] = x_max
352

353 2
    all_numeric = all(isinstance(i, (int, float, np.integer, np.float)) for i in custom_lims)
354 2
    if not all_numeric:
355 0
        raise TypeError(
356
            ("Elements of `custom_lims` must be numeric or None.\n" "At least one of them is not.")
357
        )
358

359 2
    if not custom_lims[0] < custom_lims[1]:
360 0
        raise AttributeError("`custom_lims[0]` must be smaller than `custom_lims[1]`.")
361

362 2
    return custom_lims
363

364

365 2
def _get_grid(
366
    x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend=True, bound_correction=False
367
):
368
    """Compute the grid that bins the data used to estimate the density function.
369

370
    Parameters
371
    ----------
372
    x_min : float
373
        Minimum value of the data
374
    x_max: float
375
        Maximum value of the data.
376
    x_std: float
377
        Standard deviation of the data.
378
    extend_fct: bool
379
        Indicates the factor by which `x_std` is multiplied
380
        to extend the range of the data.
381
    grid_len: int
382
        Number of bins
383
    custom_lims: tuple or list
384
        Custom limits for the domain of the density estimation.
385
        Must be numeric of length 2. Overrides `extend`.
386
    extend: bool, optional
387
        Whether to extend the range of the data or not.
388
        Default is True.
389
    bound_correction: bool, optional
390
        Whether the density estimations performs boundary correction or not.
391
        This does not impacts directly in the output, but is used
392
        to override `extend`. Overrides `extend`.
393
        Default is False.
394

395
    Returns
396
    -------
397
    grid_len: int
398
        Number of bins
399
    grid_min: float
400
        Minimum value of the grid
401
    grid_max: float
402
        Maximum value of the grid
403
    """
404
    # Set up number of bins.
405 2
    if grid_len < 100:
406 0
        grid_len = 100
407 2
    grid_len = int(grid_len)
408

409
    # Set up domain
410 2
    if custom_lims is not None:
411 2
        custom_lims = _check_custom_lims(custom_lims, x_min, x_max)
412 2
        grid_min = custom_lims[0]
413 2
        grid_max = custom_lims[1]
414 2
    elif extend and not bound_correction:
415 0
        grid_extend = extend_fct * x_std
416 0
        grid_min = x_min - grid_extend
417 0
        grid_max = x_max + grid_extend
418
    else:
419 2
        grid_min = x_min
420 2
        grid_max = x_max
421 2
    return grid_min, grid_max, grid_len
422

423

424 2
def kde(x, circular=False, **kwargs):
    """One dimensional density estimation.

    It is a wrapper around `kde_linear()` and `kde_circular()`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    circular : bool, optional
        Whether `x` is a circular variable or not. Defaults to False.
    **kwargs : Arguments passed to `kde_linear()` and `kde_circular()`.
        See their documentation for more info.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw : optional, the estimated bandwidth.

    Examples
    --------
    Default density estimation for linear data

    .. plot::
        :context: close-figs

        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> from arviz import kde
        >>>
        >>> rvs = np.random.gamma(shape=1.8, size=1000)
        >>> grid, pdf = kde(rvs)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with Silverman's rule bandwidth

    .. plot::
        :context: close-figs

        >>> grid, pdf = kde(rvs, bw="silverman")
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with scaled bandwidth

    .. plot::
        :context: close-figs

        >>> # bw_fct > 1 means more smoothness.
        >>> grid, pdf = kde(rvs, bw_fct=2.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with extended limits

    .. plot::
        :context: close-figs

        >>> grid, pdf = kde(rvs, bound_correction=False, extend=True, extend_fct=0.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with custom limits
    (it accepts tuples and lists of length 2)

    .. plot::
        :context: close-figs

        >>> grid, pdf = kde(rvs, bound_correction=False, custom_lims=(0, 10))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for circular data

    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500)
        >>> grid, pdf = kde(rvs, circular=True)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with scaled bandwidth

    .. plot::
        :context: close-figs

        >>> # bw_fct > 1 means less smoothness.
        >>> grid, pdf = kde(rvs, circular=True, bw_fct=3)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with custom limits
    (still experimental, does not always work)

    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=0, kappa=30, size=500)
        >>> grid, pdf = kde(rvs, circular=True, custom_lims=(-1, 1))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    See Also
    --------
    plot_kde : Compute and plot a kernel density estimate.
    """
    kde_fun = _kde_circular if circular else _kde_linear
    return kde_fun(x, **kwargs)
533

534

535 2
def _kde_linear(
    x,
    bw="experimental",
    adaptive=False,
    extend=False,
    bound_correction=True,
    extend_fct=0,
    bw_fct=1,
    bw_return=False,
    custom_lims=None,
    cumulative=False,
    grid_len=512,
    **kwargs,  # pylint: disable=unused-argument
):
    """One dimensional density estimation for linear data.

    Given an array of data points `x` it returns an estimate of
    the probability density function that generated the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    bw : int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be
        one of "scott", "silverman", "isj" or "experimental".
        Defaults to "experimental".
    adaptive : boolean, optional
        Indicates if the bandwidth is adaptative or not.
        It is the recommended approach when there are multiple modalities
        with different spread.
        It is not compatible with convolution. Defaults to False.
    extend : boolean, optional
        Whether to extend the observed range for `x` in the estimation.
        It extends each bound by a multiple of the standard deviation of `x`
        given by `extend_fct`. Defaults to False.
    bound_correction : boolean, optional
        Whether to perform boundary correction on the bounds of `x` or not.
        Defaults to True.
    extend_fct : float, optional
        Number of standard deviations used to widen the
        lower and upper bounds of `x`. Defaults to 0.
    bw_fct : float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand.
        Must be positive. Values below 1 decrease smoothness while values
        above 1 increase it. Defaults to 1 (no modification).
    bw_return : bool, optional
        Whether to return the estimated bandwidth in addition to the
        other objects. Defaults to False.
    custom_lims : list or tuple, optional
        A list or tuple of length 2 indicating custom bounds
        for the range of `x`. Defaults to None which disables custom bounds.
    cumulative : bool, optional
        Whether return the PDF or the cumulative PDF. Defaults to False.
    grid_len : int, optional
        The number of intervals used to bin the data points
        (a.k.a. the length of the grid used in the estimation)
        Defaults to 512.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw : optional, the estimated bandwidth.
    """
    # Validate and flatten the input; bail out with NaNs on failure.
    try:
        x = _check_type(x)
    except ValueError as e:
        warnings.warn("Something failed: " + str(e))
        return np.array([np.nan]), np.array([np.nan])

    # `bw_fct` must be a positive number.
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(
            "`bw_fct` must be a positive number, not an object of {}.".format(type(bw_fct))
        )
    if bw_fct <= 0:
        raise ValueError("`bw_fct` must be a positive number, not {}.".format(bw_fct))

    # Preliminary statistics of the data.
    n_points = len(x)
    x_min = x.min()
    x_max = x.max()
    # Population standard deviation via E[x^2] - E[x]^2.
    x_std = (((x ** 2).sum() / n_points) - (x.sum() / n_points) ** 2) ** 0.5
    x_range = x_max - x_min

    # Build the binning grid and histogram the data onto it.
    grid_min, grid_max, grid_len = _get_grid(
        x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend, bound_correction
    )
    grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max))

    # Bandwidth estimation (scaled by `bw_fct`).
    bw = bw_fct * _get_bw(x, bw, grid_counts, x_std, x_range)

    # Density estimation: adaptive (Abramson) or plain convolution.
    estimator = _kde_adaptive if adaptive else _kde_convolution
    grid, pdf = estimator(x, bw, grid_edges, grid_counts, grid_len, bound_correction)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    return (grid, pdf, bw) if bw_return else (grid, pdf)
648

649

650 2
def _kde_circular(
    x,
    bw="taylor",
    bw_fct=1,
    bw_return=False,
    custom_lims=None,
    cumulative=False,
    grid_len=512,
    **kwargs,  # pylint: disable=unused-argument
):
    """One dimensional density estimation for circular data.

    Given an array of data points `x` measured in radians,
    it returns an estimate of the probability density function that generated
    the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    bw : int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be
        "taylor" since it is the only option supported so far. Defaults to "taylor".
    bw_fct : float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand.
        Must be positive. Values above 1 decrease smoothness while values
        below 1 increase it. Defaults to 1 (no modification).
    bw_return : bool, optional
        Whether to return the estimated bandwidth in addition to the
        other objects. Defaults to False.
    custom_lims : list or tuple, optional
        A list or tuple of length 2 indicating custom bounds
        for the range of `x`. Defaults to None which means the estimation
        limits are [-pi, pi].
    cumulative : bool, optional
        Whether return the PDF or the cumulative PDF. Defaults to False.
    grid_len : int, optional
        The number of intervals used to bin the data points
        (a.k.a. the length of the grid used in the estimation)
        Defaults to 512.
    """
    # Validate and flatten the input; bail out with NaNs on failure.
    try:
        x = _check_type(x)
    except ValueError as e:
        warnings.warn("Something failed: " + str(e))
        return np.array([np.nan]), np.array([np.nan])

    # Map every observation into [-pi, pi].
    x = _normalize_angle(x)

    # `bw_fct` must be a positive number.
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(
            "`bw_fct` must be a positive number, not an object of {}.".format(type(bw_fct))
        )
    if bw_fct <= 0:
        raise ValueError("`bw_fct` must be a positive number, not {}.".format(bw_fct))

    # Determine bandwidth: either a positive number or the string "taylor".
    if isinstance(bw, bool):
        raise ValueError(
            ("`bw` can't be of type `bool`.\n" "Expected a positive numeric or 'taylor'")
        )
    if isinstance(bw, (int, float)):
        if bw < 0:
            raise ValueError("Numeric `bw` must be positive.\nInput: {:.4f}.".format(bw))
    if isinstance(bw, str):
        if bw != "taylor":
            raise ValueError(("`bw` must be a positive numeric or `taylor`, not {}".format(bw)))
        bw = _bw_taylor(x)
    bw *= bw_fct

    # Determine the grid limits, defaulting to the full circle.
    if custom_lims is None:
        grid_min = -np.pi
        grid_max = np.pi
    else:
        custom_lims = _check_custom_lims(custom_lims, x.min(), x.max())
        grid_min, grid_max = custom_lims
        assert grid_min >= -np.pi, "Lower limit can't be smaller than -pi"
        assert grid_max <= np.pi, "Upper limit can't be larger than pi"

    bins = np.linspace(grid_min, grid_max, grid_len + 1)
    bin_counts, _, bin_edges = histogram(x, bins=bins)
    grid = 0.5 * (bin_edges[1:] + bin_edges[:-1])

    # Von Mises kernel convolved with the counts via FFT (circular convolution).
    kern = _vonmises_pdf(x=grid, mu=0, kappa=bw)
    pdf = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kern) * np.fft.rfft(bin_counts)))
    pdf /= len(x)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    return (grid, pdf, bw) if bw_return else (grid, pdf)
753

754

755
# pylint: disable=unused-argument
756 2
def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs):
757
    """Kernel density with convolution.
758

759
    One dimensional Gaussian kernel density estimation via
760
    convolution of the binned relative frequencies and a Gaussian filter.
761
    This is an internal function used by `kde()`.
762
    """
763
    # Calculate relative frequencies per bin
764 2
    bin_width = grid_edges[1] - grid_edges[0]
765 2
    f = grid_counts / bin_width / len(x)
766

767
    # Bandwidth must consider the bin width
768 2
    bw /= bin_width
769

770
    # See: https://stackoverflow.com/questions/2773606/gaussian-filter-in-matlab
771 2
    kernel_n = int(bw * 2 * np.pi)
772

773
    # Temporal fix?
774 2
    if kernel_n == 0:
775 2
        kernel_n = 1
776 2
    kernel = gaussian(kernel_n, bw)
777

778 2
    if bound_correction:
779 2
        npad = int(grid_len / 5)
780 2
        f = np.concatenate([f[npad - 1 :: -1], f, f[grid_len : grid_len - npad - 1 : -1]])
781 2
        pdf = convolve(f, kernel, mode="same", method="direct")[npad : npad + grid_len]
782 2
        pdf /= bw * (2 * np.pi) ** 0.5
783
    else:
784 0
        pdf = convolve(f, kernel, mode="same", method="direct")
785 0
        pdf /= bw * (2 * np.pi) ** 0.5
786

787 2
    grid = (grid_edges[1:] + grid_edges[:-1]) / 2
788 2
    return grid, pdf
789

790

791 2
def _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs):
    """Compute Adaptive Kernel Density Estimation.

    One dimensional adaptive Gaussian kernel density estimation.
    The implementation uses the binning technique.
    Since there is not an unique `bw`, the convolution is not possible.
    The alternative implemented in this function is known as Abramson's method.
    This is an internal function used by `kde()`.
    """
    # Pilot computations used for bandwidth adjustment
    pilot_grid, pilot_pdf = _kde_convolution(
        x, bw, grid_edges, grid_counts, grid_len, bound_correction
    )

    # Adds to avoid np.log(0) and zero division
    pilot_pdf += 1e-9

    # Determine the modification factors: local bandwidths shrink where the
    # pilot density is above its geometric mean and grow where it is below.
    pdf_interp = np.interp(x, pilot_grid, pilot_pdf)
    geom_mean = np.exp(np.mean(np.log(pdf_interp)))

    # Power of c = 0.5 -> Abramson's method
    adj_factor = (geom_mean / pilot_pdf) ** 0.5
    bw_adj = bw * adj_factor

    # Estimation of Gaussian KDE via binned method (convolution not possible)
    grid = pilot_grid

    if bound_correction:
        # Reflect a fifth of the grid (counts and adjusted bandwidths alike)
        # at each boundary, evaluate the mixture on the padded grid, then keep
        # only the columns over the original support.
        grid_npad = int(grid_len / 5)
        grid_width = grid_edges[1] - grid_edges[0]
        grid_pad = grid_npad * grid_width
        grid_padded = np.linspace(
            grid_edges[0] - grid_pad,
            grid_edges[grid_len - 1] + grid_pad,
            num=grid_len + 2 * grid_npad,
        )
        grid_counts = np.concatenate(
            [
                grid_counts[grid_npad - 1 :: -1],
                grid_counts,
                grid_counts[grid_len : grid_len - grid_npad - 1 : -1],
            ]
        )
        bw_adj = np.concatenate(
            [bw_adj[grid_npad - 1 :: -1], bw_adj, bw_adj[grid_len : grid_len - grid_npad - 1 : -1]]
        )
        # pdf_mat[i, j]: Gaussian centered at padded grid point i, evaluated at
        # point j, weighted by the bin count at i.
        pdf_mat = (grid_padded - grid_padded[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat[:, grid_npad : grid_npad + grid_len], axis=0) / len(x)

    else:
        # Same mixture evaluation, without padding.
        pdf_mat = (grid - grid[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat, axis=0) / len(x)

    return grid, pdf
850

851

852 2
def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None):  # pylint: disable=unused-argument
    """Kernel Density Estimate, Deprecated."""
    # Translate the old (xmin, xmax) arguments into kde()'s custom_lims.
    custom_lims = None if (xmin is None and xmax is None) else (xmin, xmax)
    grid, pdf = kde(x, cumulative=cumulative, bw=bw, custom_lims=custom_lims)

    warnings.warn("_fast_kde() has been replaced by kde() in stats.density_utils.py", FutureWarning)
    return grid, pdf
862

863

864 2
def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False):
    """
    2D fft-based Gaussian kernel density estimate (KDE).

    The code was adapted from https://github.com/mfouesneau/faststats

    NOTE(review): despite the name/summary, this implementation smooths the
    binned data with a direct 2D convolution (``scipy.signal.convolve2d``),
    not an FFT.

    Parameters
    ----------
    x : Numpy array or list
    y : Numpy array or list
    gridsize : tuple
        Number of points used to discretize data. Use powers of 2 for fft optimization
    circular: bool
        If True, use circular boundaries. Defaults to False

    Returns
    -------
    grid: A gridded 2D KDE of the input points (x, y)
    xmin: minimum value of x
    xmax: maximum value of x
    ymin: minimum value of y
    ymax: maximum value of y
    """
    # Drop non-finite entries in each coordinate independently.
    # NOTE(review): x and y are filtered separately, so if NaNs occur at
    # different positions the (x[i], y[i]) pairing is silently broken —
    # presumably callers pass jointly finite samples; confirm upstream.
    x = np.asarray(x, dtype=float)
    x = x[np.isfinite(x)]
    y = np.asarray(y, dtype=float)
    y = y[np.isfinite(y)]

    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    len_x = len(x)
    weights = np.ones(len_x)  # every sample counts equally in its bin
    n_x, n_y = gridsize

    # Cell widths of the regular evaluation grid along each axis.
    d_x = (xmax - xmin) / (n_x - 1)
    d_y = (ymax - ymin) / (n_y - 1)

    # Map each sample to integer grid-cell indices; np.floor(xyi, xyi)
    # floors in place (second arg is the output buffer).
    xyi = _stack(x, y).T
    xyi -= [xmin, ymin]
    xyi /= [d_x, d_y]
    xyi = np.floor(xyi, xyi).T

    # Scott's factor for 2D data: n ** (-1 / (d + 4)) with d = 2.
    scotts_factor = len_x ** (-1 / 6)
    cov = _cov(xyi)
    std_devs = np.diag(cov) ** 0.5
    # Kernel stencil size (in grid cells) along each axis.
    kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)

    # Inverse covariance of the bandwidth-scaled Gaussian kernel.
    inv_cov = np.linalg.inv(cov * scotts_factor ** 2)

    # Evaluate the Gaussian kernel on a (kern_ny, kern_nx) stencil
    # centered at the origin.
    x_x = np.arange(kern_nx) - kern_nx / 2
    y_y = np.arange(kern_ny) - kern_ny / 2
    x_x, y_y = np.meshgrid(x_x, y_y)

    kernel = _stack(x_x.flatten(), y_y.flatten())
    kernel = _dot(inv_cov, kernel) * kernel
    kernel = np.exp(-kernel.sum(axis=0) / 2)
    kernel = kernel.reshape((int(kern_ny), int(kern_nx)))

    # "wrap" treats the domain as periodic (circular data); "symm"
    # reflects at the edges, reducing boundary bias for bounded data.
    boundary = "wrap" if circular else "symm"

    # Build the 2D histogram from the per-sample cell indices (sparse
    # accumulation of duplicate cells), then smooth with the kernel.
    grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray()
    grid = convolve2d(grid, kernel, mode="same", boundary=boundary)

    # Normalization constant of the 2D Gaussian, times the sample count
    # and the cell area, so the returned grid is a density estimate.
    norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2)
    norm_factor = len_x * d_x * d_y * norm_factor ** 0.5

    grid /= norm_factor

    return grid, xmin, xmax, ymin, ymax
933

934

935 2
def get_bins(values):
    """
    Automatically compute the number of bins for discrete variables.

    Parameters
    ----------
    values = numpy array
        values

    Returns
    -------
    array with the bins

    Notes
    -----
    The bin width is the (rounded) maximum of the Sturges and the
    Freedman-Diaconis estimators, with a floor of 1. According to numpy
    `np.histogram` this combination provides good all around performance.

    Sturges is a simplistic estimator that assumes normality and depends
    only on the data size; it degrades for non-normal or large data sets.
    Freedman-Diaconis uses the interquartile range (IQR), making it a
    robust variant of the Scott rule (less affected by outliers), though
    the IQR relies on fewer points than the standard deviation and is
    therefore less accurate, especially for long tailed distributions.
    """
    x_min = values.min().astype(int)
    x_max = values.max().astype(int)
    data_range = x_max - x_min

    # Sturges estimator: width shrinks logarithmically with sample size.
    width_sturges = data_range / (np.log2(values.size) + 1)

    # Freedman-Diaconis estimator: width driven by the IQR.
    q75, q25 = np.percentile(values, [75, 25])
    width_fd = 2 * (q75 - q25) * values.size ** (-1 / 3)

    # Width is at least 1 (variables are discrete).
    width = np.round(np.max([1, width_sturges, width_fd])).astype(int)

    return np.arange(x_min, x_max + width + 1, width)
975

976

977 2
def _sturges_formula(dataset, mult=1):
978
    """Use Sturges' formula to determine number of bins.
979

980
    See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula
981
    or https://doi.org/10.1080%2F01621459.1926.10502161
982

983
    Parameters
984
    ----------
985
    dataset: xarray.DataSet
986
       Must have the `draw` dimension
987

988
    mult: float
989
        Used to scale the number of bins up or down. Default is 1 for Sturges' formula.
990

991
    Returns
992
    -------
993
    int
994
        Number of bins to use
995
    """
996 2
    return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1)
997

998

999 2
def _circular_mean(x, na_rm=False):
1000
    """Compute mean of circular variable measured in radians.
1001

1002
    The result is between -pi and pi.
1003
    """
1004 2
    if na_rm:
1005 0
        x = x[~np.isnan(x)]
1006

1007 2
    sinr = np.sum(np.sin(x))
1008 2
    cosr = np.sum(np.cos(x))
1009 2
    mean = np.arctan2(sinr, cosr)
1010

1011 2
    return mean
1012

1013

1014 2
def _normalize_angle(x, zero_centered=True):
1015
    """Normalize angles.
1016

1017
    Take angles in radians and normalize them to [-pi, pi) or [0, 2 * pi)
1018
    depending on `zero_centered`.
1019
    """
1020 2
    if zero_centered:
1021 2
        return (x + np.pi) % (2 * np.pi) - np.pi
1022
    else:
1023 2
        return x % (2 * np.pi)
1024

1025

1026 2
@conditional_jit(cache=True)
def histogram(data, bins, range_hist=None):
    """Conditionally jitted histogram.

    Thin wrapper over ``np.histogram`` that also returns the density;
    kept numba-friendly so ``conditional_jit`` can compile it when
    numba is available.

    Parameters
    ----------
    data : array-like
        Input data. Passed as first positional argument to ``np.histogram``.
    bins : int or array-like
        Passed as keyword argument ``bins`` to ``np.histogram``.
    range_hist : (float, float), optional
        Passed as keyword argument ``range`` to ``np.histogram``.

    Returns
    -------
    hist : array
        The number of counts per bin.
    density : array
        The density corresponding to each bin.
    bin_edges : array
        The edges of the bins used.
    """
    hist, bin_edges = np.histogram(data, bins=bins, range=range_hist)
    # Counts normalized by (total count * bin width) so the histogram
    # integrates to 1 over the binned range.
    hist_dens = hist / (hist.sum() * np.diff(bin_edges))
    return hist, hist_dens, bin_edges

Read our documentation on viewing source code .

Loading