1
|
|
# pylint: disable=invalid-name,too-many-lines
|
2
|
2
|
"""Density estimation functions for ArviZ."""
|
3
|
2
|
import warnings
|
4
|
|
|
5
|
2
|
import numpy as np
|
6
|
2
|
from scipy.fftpack import fft
|
7
|
2
|
from scipy.optimize import brentq
|
8
|
2
|
from scipy.signal import convolve, convolve2d, gaussian # pylint: disable=no-name-in-module
|
9
|
2
|
from scipy.sparse import coo_matrix
|
10
|
2
|
from scipy.special import ive # pylint: disable=no-name-in-module
|
11
|
|
|
12
|
2
|
from ..utils import _cov, _dot, _stack, conditional_jit
|
13
|
|
|
14
|
2
|
__all__ = ["kde"]
|
15
|
|
|
16
|
|
|
17
|
2
|
def _bw_scott(x, x_std=None, **kwargs): # pylint: disable=unused-argument
|
18
|
|
"""Scott's Rule."""
|
19
|
0
|
if x_std is None:
|
20
|
0
|
x_std = np.std(x)
|
21
|
0
|
bw = 1.06 * x_std * len(x) ** (-0.2)
|
22
|
0
|
return bw
|
23
|
|
|
24
|
|
|
25
|
2
|
def _bw_silverman(x, x_std=None, **kwargs): # pylint: disable=unused-argument
|
26
|
|
"""Silverman's Rule."""
|
27
|
2
|
if x_std is None:
|
28
|
2
|
x_std = np.std(x)
|
29
|
2
|
q75, q25 = np.percentile(x, [75, 25])
|
30
|
2
|
x_iqr = q75 - q25
|
31
|
2
|
a = min(x_std, x_iqr / 1.34)
|
32
|
2
|
bw = 0.9 * a * len(x) ** (-0.2)
|
33
|
2
|
return bw
|
34
|
|
|
35
|
|
|
36
|
2
|
def _bw_isj(x, grid_counts=None, x_std=None, x_range=None):
    """Improved Sheather-Jones bandwidth estimation.

    Improved Sheather and Jones method as explained in [1]_.
    This is an internal version pretended to be used by the KDE estimator.
    When used internally computation time is saved because things like minimums,
    maximums and the grid are pre-computed.

    References
    ----------
    .. [1] Kernel density estimation via diffusion.
       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
       Ann. Statist. 38 (2010), no. 5, 2916--2957.
    """
    x_len = len(x)
    if x_range is None:
        x_range = np.max(x) - np.min(x)

    # Relative frequency per bin
    if grid_counts is None:
        # Bug fix: the grid bounds are computed here from `x` directly.
        # Previously they relied on `x_min`/`x_max` that were only defined in
        # the `x_range is None` branch above, so passing `x_range` without
        # `grid_counts` raised a NameError. A caller-provided `x_std` is now
        # honored as well instead of being unconditionally recomputed.
        if x_std is None:
            x_std = np.std(x)
        grid_len = 256
        grid_min = np.min(x) - 0.5 * x_std
        grid_max = np.max(x) + 0.5 * x_std
        grid_counts, _, _ = histogram(x, grid_len, (grid_min, grid_max))
    else:
        grid_len = len(grid_counts) - 1

    grid_relfreq = grid_counts / x_len

    # Discrete cosine transform of the binned data
    a_k = _dct1d(grid_relfreq)

    k_sq = np.arange(1, grid_len) ** 2
    a_sq = a_k[range(1, grid_len)] ** 2

    # Solve the fixed-point equation for `t`, then map it back to the
    # original data scale.
    t = _root(_fixed_point, x_len, args=(x_len, k_sq, a_sq), x=x)
    h = t ** 0.5 * x_range
    return h
|
77
|
|
|
78
|
|
|
79
|
2
|
def _bw_experimental(x, grid_counts=None, x_std=None, x_range=None):
    """Experimental bandwidth estimator: average of Silverman's rule and ISJ."""
    return 0.5 * (
        _bw_silverman(x, x_std=x_std) + _bw_isj(x, grid_counts=grid_counts, x_range=x_range)
    )
|
84
|
|
|
85
|
|
|
86
|
2
|
def _bw_taylor(x):
    """Taylor's rule for circular bandwidth estimation.

    This function implements a rule-of-thumb for choosing the bandwidth of
    a von Mises kernel density estimator that assumes the underlying
    distribution is von Mises as introduced in [1]_.
    It is analogous to Scott's rule for the Gaussian KDE.

    Circular bandwidth has a different scale from linear bandwidth.
    Unlike linear scale, low bandwidths are associated with oversmoothing
    while high values are associated with undersmoothing.

    References
    ----------
    .. [1] C.C Taylor (2008). Automatic bandwidth selection for circular
       density estimation.
       Computational Statistics and Data Analysis, 52, 7, 3493–3500.
    """
    n_obs = len(x)
    kappa = _kappa_mle(x)
    # Exponentially scaled Bessel functions (`ive`) keep the ratio finite
    # for large kappa; the exponential factors cancel in the quotient.
    numerator = 3 * n_obs * kappa ** 2 * ive(2, 2 * kappa)
    denominator = 4 * np.pi ** 0.5 * ive(0, kappa) ** 2
    return (numerator / denominator) ** 0.4
|
109
|
|
|
110
|
|
|
111
|
2
|
# Maps the user-facing `bw` method names accepted by `_get_bw` to the
# corresponding linear-data bandwidth estimators defined above.
_BW_METHODS_LINEAR = {
    "scott": _bw_scott,
    "silverman": _bw_silverman,
    "isj": _bw_isj,
    "experimental": _bw_experimental,
}
|
117
|
|
|
118
|
|
|
119
|
2
|
def _get_bw(x, bw, grid_counts=None, x_std=None, x_range=None):
|
120
|
|
"""Compute bandwidth for a given data `x` and `bw`.
|
121
|
|
|
122
|
|
Also checks `bw` is correctly specified.
|
123
|
|
|
124
|
|
Parameters
|
125
|
|
----------
|
126
|
|
x : 1-D numpy array
|
127
|
|
1 dimensional array of sample data from the
|
128
|
|
variable for which a density estimate is desired.
|
129
|
|
bw: int, float or str
|
130
|
|
If numeric, indicates the bandwidth and must be positive.
|
131
|
|
If str, indicates the method to estimate the bandwidth.
|
132
|
|
|
133
|
|
Returns
|
134
|
|
-------
|
135
|
|
bw: float
|
136
|
|
Bandwidth
|
137
|
|
"""
|
138
|
2
|
if isinstance(bw, bool):
|
139
|
0
|
raise ValueError(
|
140
|
|
(
|
141
|
|
"`bw` must not be of type `bool`.\n"
|
142
|
|
"Expected a positive numeric or one of the following strings:\n"
|
143
|
|
"{}."
|
144
|
|
).format(list(_BW_METHODS_LINEAR.keys()))
|
145
|
|
)
|
146
|
2
|
if isinstance(bw, (int, float)):
|
147
|
0
|
if bw < 0:
|
148
|
0
|
raise ValueError("Numeric `bw` must be positive.\nInput: {:.4f}.".format(bw))
|
149
|
2
|
elif isinstance(bw, str):
|
150
|
2
|
bw_lower = bw.lower()
|
151
|
|
|
152
|
2
|
if bw_lower not in _BW_METHODS_LINEAR.keys():
|
153
|
0
|
raise ValueError(
|
154
|
|
(
|
155
|
|
"Unrecognized bandwidth method.\n" "Input is: {}.\n" "Expected one of: {}."
|
156
|
|
).format(bw_lower, list(_BW_METHODS_LINEAR.keys()))
|
157
|
|
)
|
158
|
|
|
159
|
2
|
bw_fun = _BW_METHODS_LINEAR[bw_lower]
|
160
|
2
|
bw = bw_fun(x, grid_counts=grid_counts, x_std=x_std, x_range=x_range)
|
161
|
|
else:
|
162
|
0
|
raise ValueError(
|
163
|
|
(
|
164
|
|
"Unrecognized `bw` argument.\n"
|
165
|
|
"Expected a positive numeric or one of the following strings:\n"
|
166
|
|
"{}."
|
167
|
|
).format(list(_BW_METHODS_LINEAR.keys()))
|
168
|
|
)
|
169
|
2
|
return bw
|
170
|
|
|
171
|
|
|
172
|
2
|
def _vonmises_pdf(x, mu, kappa):
|
173
|
|
"""Calculate vonmises_pdf."""
|
174
|
0
|
if kappa <= 0:
|
175
|
0
|
raise ValueError("Argument 'kappa' must be positive.")
|
176
|
0
|
pdf = 1 / (2 * np.pi * ive(0, kappa)) * np.exp(np.cos(x - mu) - 1) ** kappa
|
177
|
0
|
return pdf
|
178
|
|
|
179
|
|
|
180
|
2
|
def _a1inv(x):
|
181
|
|
"""Compute inverse function.
|
182
|
|
|
183
|
|
Inverse function of the ratio of the first and
|
184
|
|
zeroth order Bessel functions of the first kind.
|
185
|
|
|
186
|
|
Returns the value k, such that a1inv(x) = k, i.e. a1(k) = x.
|
187
|
|
"""
|
188
|
0
|
if 0 <= x < 0.53:
|
189
|
0
|
return 2 * x + x ** 3 + (5 * x ** 5) / 6
|
190
|
0
|
elif x < 0.85:
|
191
|
0
|
return -0.4 + 1.39 * x + 0.43 / (1 - x)
|
192
|
|
else:
|
193
|
0
|
return 1 / (x ** 3 - 4 * x ** 2 + 3 * x)
|
194
|
|
|
195
|
|
|
196
|
2
|
def _kappa_mle(x):
    """Maximum-likelihood estimate of the von Mises concentration of `x`."""
    # Mean resultant length around the circular mean, inverted through a1.
    return _a1inv(np.mean(np.cos(x - _circular_mean(x))))
|
200
|
|
|
201
|
|
|
202
|
2
|
def _dct1d(x):
|
203
|
|
"""Discrete Cosine Transform in 1 Dimension.
|
204
|
|
|
205
|
|
Parameters
|
206
|
|
----------
|
207
|
|
x : numpy array
|
208
|
|
1 dimensional array of values for which the
|
209
|
|
DCT is desired
|
210
|
|
|
211
|
|
Returns
|
212
|
|
-------
|
213
|
|
output : DTC transformed values
|
214
|
|
"""
|
215
|
2
|
x_len = len(x)
|
216
|
|
|
217
|
2
|
even_increasing = np.arange(0, x_len, 2)
|
218
|
2
|
odd_decreasing = np.arange(x_len - 1, 0, -2)
|
219
|
|
|
220
|
2
|
x = np.concatenate((x[even_increasing], x[odd_decreasing]))
|
221
|
|
|
222
|
2
|
w_1k = np.r_[1, (2 * np.exp(-(0 + 1j) * (np.arange(1, x_len)) * np.pi / (2 * x_len)))]
|
223
|
2
|
output = np.real(w_1k * fft(x))
|
224
|
|
|
225
|
2
|
return output
|
226
|
|
|
227
|
|
|
228
|
2
|
def _fixed_point(t, N, k_sq, a_sq):
|
229
|
|
"""Calculate t-zeta*gamma^[l](t).
|
230
|
|
|
231
|
|
Implementation of the function t-zeta*gamma^[l](t) derived from equation (30) in [1].
|
232
|
|
|
233
|
|
References
|
234
|
|
----------
|
235
|
|
.. [1] Kernel density estimation via diffusion.
|
236
|
|
Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
|
237
|
|
Ann. Statist. 38 (2010), no. 5, 2916--2957.
|
238
|
|
"""
|
239
|
2
|
k_sq = np.asfarray(k_sq, dtype=np.float64)
|
240
|
2
|
a_sq = np.asfarray(a_sq, dtype=np.float64)
|
241
|
|
|
242
|
2
|
l = 7
|
243
|
2
|
f = np.sum(np.power(k_sq, l) * a_sq * np.exp(-k_sq * np.pi ** 2 * t))
|
244
|
2
|
f *= 0.5 * np.pi ** (2.0 * l)
|
245
|
|
|
246
|
2
|
for j in np.arange(l - 1, 2 - 1, -1):
|
247
|
2
|
c1 = (1 + 0.5 ** (j + 0.5)) / 3
|
248
|
2
|
c2 = np.product(np.arange(1.0, 2 * j + 1, 2, dtype=np.float64))
|
249
|
2
|
c2 /= (np.pi / 2) ** 0.5
|
250
|
2
|
t_j = np.power((c1 * (c2 / (N * f))), (2.0 / (3.0 + 2.0 * j)))
|
251
|
2
|
f = np.sum(k_sq ** j * a_sq * np.exp(-k_sq * np.pi ** 2.0 * t_j))
|
252
|
2
|
f *= 0.5 * np.pi ** (2 * j)
|
253
|
|
|
254
|
2
|
out = t - (2 * N * np.pi ** 0.5 * f) ** (-0.4)
|
255
|
2
|
return out
|
256
|
|
|
257
|
|
|
258
|
2
|
def _root(function, N, args, x):
|
259
|
|
# The right bound is at most 0.01
|
260
|
2
|
found = False
|
261
|
2
|
N = max(min(1050, N), 50)
|
262
|
2
|
tol = 10e-12 + 0.01 * (N - 50) / 1000
|
263
|
|
|
264
|
2
|
while not found:
|
265
|
2
|
try:
|
266
|
2
|
bw, res = brentq(function, 0, 0.01, args=args, full_output=True, disp=False)
|
267
|
2
|
found = res.converged
|
268
|
2
|
except ValueError:
|
269
|
2
|
bw = 0
|
270
|
2
|
tol *= 2.0
|
271
|
2
|
found = False
|
272
|
2
|
if bw <= 0 or tol >= 1:
|
273
|
|
# warnings.warn(
|
274
|
|
# "Improved Sheather-Jones did not converge as expected. "
|
275
|
|
# "Using Silverman's rule instead.",
|
276
|
|
# Warning
|
277
|
|
# )
|
278
|
2
|
bw = (_bw_silverman(x) / np.ptp(x)) ** 2
|
279
|
2
|
return bw
|
280
|
2
|
return bw
|
281
|
|
|
282
|
|
|
283
|
2
|
def _check_type(x):
|
284
|
|
"""Check the input is of the correct type.
|
285
|
|
|
286
|
|
It only accepts numeric lists/numpy arrays of 1 dimension or something that
|
287
|
|
can be flattened to 1 dimension.
|
288
|
|
|
289
|
|
Parameters
|
290
|
|
----------
|
291
|
|
x : Object whose type is checked before computing the KDE.
|
292
|
|
|
293
|
|
Returns
|
294
|
|
-------
|
295
|
|
x : 1-D numpy array
|
296
|
|
If no error is thrown, a 1 dimensional array of
|
297
|
|
sample data from the variable for which a density estimate is desired.
|
298
|
|
"""
|
299
|
|
# Will raise an error if `x` can't be casted to numeric or flattened to one dimension.
|
300
|
2
|
try:
|
301
|
2
|
x = np.asfarray(x).flatten()
|
302
|
0
|
except Exception as e:
|
303
|
0
|
warnings.warn(
|
304
|
|
"The following exception occurred while trying to convert `x`"
|
305
|
|
"to a 1 dimensional float array."
|
306
|
|
)
|
307
|
0
|
raise e
|
308
|
|
|
309
|
2
|
x = x[np.isfinite(x)]
|
310
|
|
|
311
|
2
|
if x.size == 0:
|
312
|
0
|
raise ValueError("`x` does not contain any finite number.")
|
313
|
2
|
if x.size == 1:
|
314
|
2
|
raise ValueError("`x` is of length 1. Can't produce a KDE with only one data point.")
|
315
|
2
|
return x
|
316
|
|
|
317
|
|
|
318
|
2
|
def _check_custom_lims(custom_lims, x_min, x_max):
|
319
|
|
"""Check if `custom_lims` are of the correct type.
|
320
|
|
|
321
|
|
It accepts numeric lists/tuples of length 2.
|
322
|
|
|
323
|
|
Parameters
|
324
|
|
----------
|
325
|
|
custom_lims : Object whose type is checked.
|
326
|
|
|
327
|
|
Returns
|
328
|
|
-------
|
329
|
|
None: Object of type None
|
330
|
|
"""
|
331
|
2
|
if not isinstance(custom_lims, (list, tuple)):
|
332
|
0
|
raise TypeError(
|
333
|
|
(
|
334
|
|
"`custom_lims` must be a numeric list or tuple of length 2.\n"
|
335
|
|
"Not an object of {}."
|
336
|
|
).format(type(custom_lims))
|
337
|
|
)
|
338
|
|
|
339
|
2
|
if len(custom_lims) != 2:
|
340
|
0
|
raise AttributeError("`len(custom_lims)` must be 2, not {}.".format(len(custom_lims)))
|
341
|
|
|
342
|
2
|
any_bool = any(isinstance(i, bool) for i in custom_lims)
|
343
|
2
|
if any_bool:
|
344
|
0
|
raise TypeError("Elements of `custom_lims` must be numeric or None, not bool.")
|
345
|
|
|
346
|
2
|
custom_lims = list(custom_lims) # convert to a mutable object
|
347
|
2
|
if custom_lims[0] is None:
|
348
|
2
|
custom_lims[0] = x_min
|
349
|
|
|
350
|
2
|
if custom_lims[1] is None:
|
351
|
2
|
custom_lims[1] = x_max
|
352
|
|
|
353
|
2
|
all_numeric = all(isinstance(i, (int, float, np.integer, np.float)) for i in custom_lims)
|
354
|
2
|
if not all_numeric:
|
355
|
0
|
raise TypeError(
|
356
|
|
("Elements of `custom_lims` must be numeric or None.\n" "At least one of them is not.")
|
357
|
|
)
|
358
|
|
|
359
|
2
|
if not custom_lims[0] < custom_lims[1]:
|
360
|
0
|
raise AttributeError("`custom_lims[0]` must be smaller than `custom_lims[1]`.")
|
361
|
|
|
362
|
2
|
return custom_lims
|
363
|
|
|
364
|
|
|
365
|
2
|
def _get_grid(
|
366
|
|
x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend=True, bound_correction=False
|
367
|
|
):
|
368
|
|
"""Compute the grid that bins the data used to estimate the density function.
|
369
|
|
|
370
|
|
Parameters
|
371
|
|
----------
|
372
|
|
x_min : float
|
373
|
|
Minimum value of the data
|
374
|
|
x_max: float
|
375
|
|
Maximum value of the data.
|
376
|
|
x_std: float
|
377
|
|
Standard deviation of the data.
|
378
|
|
extend_fct: bool
|
379
|
|
Indicates the factor by which `x_std` is multiplied
|
380
|
|
to extend the range of the data.
|
381
|
|
grid_len: int
|
382
|
|
Number of bins
|
383
|
|
custom_lims: tuple or list
|
384
|
|
Custom limits for the domain of the density estimation.
|
385
|
|
Must be numeric of length 2. Overrides `extend`.
|
386
|
|
extend: bool, optional
|
387
|
|
Whether to extend the range of the data or not.
|
388
|
|
Default is True.
|
389
|
|
bound_correction: bool, optional
|
390
|
|
Whether the density estimations performs boundary correction or not.
|
391
|
|
This does not impacts directly in the output, but is used
|
392
|
|
to override `extend`. Overrides `extend`.
|
393
|
|
Default is False.
|
394
|
|
|
395
|
|
Returns
|
396
|
|
-------
|
397
|
|
grid_len: int
|
398
|
|
Number of bins
|
399
|
|
grid_min: float
|
400
|
|
Minimum value of the grid
|
401
|
|
grid_max: float
|
402
|
|
Maximum value of the grid
|
403
|
|
"""
|
404
|
|
# Set up number of bins.
|
405
|
2
|
if grid_len < 100:
|
406
|
0
|
grid_len = 100
|
407
|
2
|
grid_len = int(grid_len)
|
408
|
|
|
409
|
|
# Set up domain
|
410
|
2
|
if custom_lims is not None:
|
411
|
2
|
custom_lims = _check_custom_lims(custom_lims, x_min, x_max)
|
412
|
2
|
grid_min = custom_lims[0]
|
413
|
2
|
grid_max = custom_lims[1]
|
414
|
2
|
elif extend and not bound_correction:
|
415
|
0
|
grid_extend = extend_fct * x_std
|
416
|
0
|
grid_min = x_min - grid_extend
|
417
|
0
|
grid_max = x_max + grid_extend
|
418
|
|
else:
|
419
|
2
|
grid_min = x_min
|
420
|
2
|
grid_max = x_max
|
421
|
2
|
return grid_min, grid_max, grid_len
|
422
|
|
|
423
|
|
|
424
|
2
|
def kde(x, circular=False, **kwargs):
    """One dimensional density estimation.

    It is a wrapper around `kde_linear()` and `kde_circular()`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    circular: bool, optional
        Whether `x` is a circular variable or not. Defaults to False.
    **kwargs: Arguments passed to `kde_linear()` and `kde_circular()`.
        See their documentation for more info.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.

    Examples
    --------
    Default density estimation for linear data
    .. plot::
        :context: close-figs

        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> from arviz import kde
        >>>
        >>> rvs = np.random.gamma(shape=1.8, size=1000)
        >>> grid, pdf = kde(rvs)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with Silverman's rule bandwidth
    .. plot::
        :context: close-figs

        >>> grid, pdf = kde(rvs, bw="silverman")
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with scaled bandwidth
    .. plot::
        :context: close-figs

        >>> # bw_fct > 1 means more smoothness.
        >>> grid, pdf = kde(rvs, bw_fct=2.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with extended limits
    .. plot::
        :context: close-figs

        >>> grid, pdf = kde(rvs, bound_correction=False, extend=True, extend_fct=0.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with custom limits
    .. plot::
        :context: close-figs
        # It accepts tuples and lists of length 2.
        >>> grid, pdf = kde(rvs, bound_correction=False, custom_lims=(0, 10))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for circular data
    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500)
        >>> grid, pdf = kde(rvs, circular=True)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with scaled bandwidth
    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500)
        >>> # bw_fct > 1 means less smoothness.
        >>> grid, pdf = kde(rvs, circular=True, bw_fct=3)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with custom limits
    .. plot::
        :context: close-figs
        >>> # This is still experimental, does not always work.
        >>> rvs = np.random.vonmises(mu=0, kappa=30, size=500)
        >>> grid, pdf = kde(rvs, circular=True, custom_lims=(-1, 1))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    See Also
    --------
    plot_kde : Compute and plot a kernel density estimate.
    arviz.stats.density_utils.kde: Arviz KDE estimator
    """
    # Dispatch to the circular or linear estimator; all other options are
    # forwarded untouched.
    estimator = _kde_circular if circular else _kde_linear
    return estimator(x, **kwargs)
|
533
|
|
|
534
|
|
|
535
|
2
|
def _kde_linear(
    x,
    bw="experimental",
    adaptive=False,
    extend=False,
    bound_correction=True,
    extend_fct=0,
    bw_fct=1,
    bw_return=False,
    custom_lims=None,
    cumulative=False,
    grid_len=512,
    **kwargs,  # pylint: disable=unused-argument
):
    """One dimensional density estimation for linear data.

    Given an array of data points `x` it returns an estimate of
    the probability density function that generated the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    bw: int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be
        one of "scott", "silverman", "isj" or "experimental".
        Defaults to "experimental".
    adaptive: boolean, optional
        Indicates if the bandwidth is adaptative or not.
        It is the recommended approach when there are multiple modalities
        with different spread.
        It is not compatible with convolution. Defaults to False.
    extend: boolean, optional
        Whether to extend the observed range for `x` in the estimation.
        It extends each bound by a multiple of the standard deviation of `x`
        given by `extend_fct`. Defaults to False.
    bound_correction: boolean, optional
        Whether to perform boundary correction on the bounds of `x` or not.
        Defaults to True.
    extend_fct: float, optional
        Number of standard deviations used to widen the
        lower and upper bounds of `x`. Defaults to 0.
    bw_fct: float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand.
        Must be positive. Values below 1 decrease smoothness while values
        above 1 increase it. Defaults to 1 (no modification).
    bw_return: bool, optional
        Whether to return the estimated bandwidth in addition to the
        other objects. Defaults to False.
    custom_lims: list or tuple, optional
        A list or tuple of length 2 indicating custom bounds
        for the range of `x`. Defaults to None which disables custom bounds.
    cumulative: bool, optional
        Whether return the PDF or the cumulative PDF. Defaults to False.
    grid_len: int, optional
        The number of intervals used to bin the data points
        (a.k.a. the length of the grid used in the estimation)
        Defaults to 512.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.
    """
    # Check `x` is from appropriate type
    try:
        x = _check_type(x)
    except ValueError as e:
        warnings.warn("Something failed: " + str(e))
        return np.array([np.nan]), np.array([np.nan])

    # Check `bw_fct` is numeric and positive
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(
            "`bw_fct` must be a positive number, not an object of {}.".format(type(bw_fct))
        )

    if bw_fct <= 0:
        raise ValueError("`bw_fct` must be a positive number, not {}.".format(bw_fct))

    # Preliminary calculations
    x_len = len(x)
    x_min = x.min()
    x_max = x.max()
    # Use np.std instead of the textbook E[x^2] - E[x]^2 formula: the latter
    # suffers catastrophic cancellation for data with large mean relative to
    # its spread; np.std computes the mean of squared deviations instead.
    x_std = np.std(x)
    x_range = x_max - x_min

    # Determine grid
    grid_min, grid_max, grid_len = _get_grid(
        x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend, bound_correction
    )
    grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max))

    # Bandwidth estimation
    bw = bw_fct * _get_bw(x, bw, grid_counts, x_std, x_range)

    # Density estimation
    if adaptive:
        grid, pdf = _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction)
    else:
        grid, pdf = _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    if bw_return:
        return grid, pdf, bw
    else:
        return grid, pdf
|
648
|
|
|
649
|
|
|
650
|
2
|
def _kde_circular(
    x,
    bw="taylor",
    bw_fct=1,
    bw_return=False,
    custom_lims=None,
    cumulative=False,
    grid_len=512,
    **kwargs,  # pylint: disable=unused-argument
):
    """One dimensional density estimation for circular data.

    Given an array of data points `x` measured in radians,
    it returns an estimate of the probability density function that generated
    the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    bw: int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be
        "taylor" since it is the only option supported so far. Defaults to "taylor".
    bw_fct: float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand.
        Must be positive. Values above 1 decrease smoothness while values
        below 1 increase it. Defaults to 1 (no modification).
    bw_return: bool, optional
        Whether to return the estimated bandwidth in addition to the
        other objects. Defaults to False.
    custom_lims: list or tuple, optional
        A list or tuple of length 2 indicating custom bounds
        for the range of `x`. Defaults to None which means the estimation
        limits are [-pi, pi].
    cumulative: bool, optional
        Whether return the PDF or the cumulative PDF. Defaults to False.
    grid_len: int, optional
        The number of intervals used to bin the data points
        (a.k.a. the length of the grid used in the estimation)
        Defaults to 512.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw : optional, the estimated bandwidth (only when `bw_return` is True).
    """
    # Validate/flatten input; on failure warn and return NaN placeholders
    # instead of raising, mirroring `_kde_linear`.
    try:
        x = _check_type(x)
    except ValueError as e:
        warnings.warn("Something failed: " + str(e))
        return np.array([np.nan]), np.array([np.nan])

    # All values between -pi and pi
    x = _normalize_angle(x)

    # Check `bw_fct` is numeric and positive
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(
            "`bw_fct` must be a positive number, not an object of {}.".format(type(bw_fct))
        )

    if bw_fct <= 0:
        raise ValueError("`bw_fct` must be a positive number, not {}.".format(bw_fct))

    # Determine bandwidth
    # `bool` is a subclass of `int`, so it is rejected before the numeric check.
    if isinstance(bw, bool):
        raise ValueError(
            ("`bw` can't be of type `bool`.\n" "Expected a positive numeric or 'taylor'")
        )
    if isinstance(bw, (int, float)):
        if bw < 0:
            raise ValueError("Numeric `bw` must be positive.\nInput: {:.4f}.".format(bw))
    if isinstance(bw, str):
        if bw == "taylor":
            bw = _bw_taylor(x)
        else:
            raise ValueError(("`bw` must be a positive numeric or `taylor`, not {}".format(bw)))
    bw *= bw_fct

    # Determine grid
    if custom_lims is not None:
        custom_lims = _check_custom_lims(custom_lims, x.min(), x.max())
        grid_min = custom_lims[0]
        grid_max = custom_lims[1]
        # NOTE(review): `assert` is stripped under `python -O`; these bounds
        # checks silently disappear in optimized runs.
        assert grid_min >= -np.pi, "Lower limit can't be smaller than -pi"
        assert grid_max <= np.pi, "Upper limit can't be larger than pi"
    else:
        grid_min = -np.pi
        grid_max = np.pi

    # Bin the data on the circular domain; the grid is made of bin midpoints.
    bins = np.linspace(grid_min, grid_max, grid_len + 1)
    bin_counts, _, bin_edges = histogram(x, bins=bins)
    grid = 0.5 * (bin_edges[1:] + bin_edges[:-1])

    # Circular convolution of the binned counts with a von Mises kernel,
    # performed in the frequency domain (rfft product = circular convolution);
    # fftshift re-centers the result on the grid.
    kern = _vonmises_pdf(x=grid, mu=0, kappa=bw)
    pdf = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kern) * np.fft.rfft(bin_counts)))
    pdf /= len(x)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    if bw_return:
        return grid, pdf, bw
    else:
        return grid, pdf
|
753
|
|
|
754
|
|
|
755
|
|
# pylint: disable=unused-argument
|
756
|
2
|
def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs):
|
757
|
|
"""Kernel density with convolution.
|
758
|
|
|
759
|
|
One dimensional Gaussian kernel density estimation via
|
760
|
|
convolution of the binned relative frequencies and a Gaussian filter.
|
761
|
|
This is an internal function used by `kde()`.
|
762
|
|
"""
|
763
|
|
# Calculate relative frequencies per bin
|
764
|
2
|
bin_width = grid_edges[1] - grid_edges[0]
|
765
|
2
|
f = grid_counts / bin_width / len(x)
|
766
|
|
|
767
|
|
# Bandwidth must consider the bin width
|
768
|
2
|
bw /= bin_width
|
769
|
|
|
770
|
|
# See: https://stackoverflow.com/questions/2773606/gaussian-filter-in-matlab
|
771
|
2
|
kernel_n = int(bw * 2 * np.pi)
|
772
|
|
|
773
|
|
# Temporal fix?
|
774
|
2
|
if kernel_n == 0:
|
775
|
2
|
kernel_n = 1
|
776
|
2
|
kernel = gaussian(kernel_n, bw)
|
777
|
|
|
778
|
2
|
if bound_correction:
|
779
|
2
|
npad = int(grid_len / 5)
|
780
|
2
|
f = np.concatenate([f[npad - 1 :: -1], f, f[grid_len : grid_len - npad - 1 : -1]])
|
781
|
2
|
pdf = convolve(f, kernel, mode="same", method="direct")[npad : npad + grid_len]
|
782
|
2
|
pdf /= bw * (2 * np.pi) ** 0.5
|
783
|
|
else:
|
784
|
0
|
pdf = convolve(f, kernel, mode="same", method="direct")
|
785
|
0
|
pdf /= bw * (2 * np.pi) ** 0.5
|
786
|
|
|
787
|
2
|
grid = (grid_edges[1:] + grid_edges[:-1]) / 2
|
788
|
2
|
return grid, pdf
|
789
|
|
|
790
|
|
|
791
|
2
|
def _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs):
    """Compute Adaptive Kernel Density Estimation.

    One dimensional adaptive Gaussian kernel density estimation.
    The implementation uses the binning technique.
    Since there is not an unique `bw`, the convolution is not possible.
    The alternative implemented in this function is known as Abramson's method.
    This is an internal function used by `kde()`.
    """
    # Pilot computations used for bandwidth adjustment
    pilot_grid, pilot_pdf = _kde_convolution(
        x, bw, grid_edges, grid_counts, grid_len, bound_correction
    )

    # Adds to avoid np.log(0) and zero division
    pilot_pdf += 1e-9

    # Determine the modification factors
    # Geometric mean of the pilot density evaluated at the data points.
    pdf_interp = np.interp(x, pilot_grid, pilot_pdf)
    geom_mean = np.exp(np.mean(np.log(pdf_interp)))

    # Power of c = 0.5 -> Abramson's method
    # Per-bin bandwidths: wider where the pilot density is low.
    adj_factor = (geom_mean / pilot_pdf) ** 0.5
    bw_adj = bw * adj_factor

    # Estimation of Gaussian KDE via binned method (convolution not possible)
    grid = pilot_grid

    if bound_correction:
        # Reflect counts and bandwidths over each boundary so mass near the
        # edges is not lost, evaluate on the padded grid, then crop back.
        grid_npad = int(grid_len / 5)
        grid_width = grid_edges[1] - grid_edges[0]
        grid_pad = grid_npad * grid_width
        grid_padded = np.linspace(
            grid_edges[0] - grid_pad,
            grid_edges[grid_len - 1] + grid_pad,
            num=grid_len + 2 * grid_npad,
        )
        grid_counts = np.concatenate(
            [
                grid_counts[grid_npad - 1 :: -1],
                grid_counts,
                grid_counts[grid_len : grid_len - grid_npad - 1 : -1],
            ]
        )
        bw_adj = np.concatenate(
            [bw_adj[grid_npad - 1 :: -1], bw_adj, bw_adj[grid_len : grid_len - grid_npad - 1 : -1]]
        )
        # One Gaussian per padded bin: rows index bin centers, columns index
        # evaluation points; the final density is the column-wise sum.
        pdf_mat = (grid_padded - grid_padded[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat[:, grid_npad : grid_npad + grid_len], axis=0) / len(x)

    else:
        # Same evaluation without padding: one Gaussian per bin center.
        pdf_mat = (grid - grid[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat, axis=0) / len(x)

    return grid, pdf
|
850
|
|
|
851
|
|
|
852
|
2
|
def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None):  # pylint: disable=unused-argument
    """Kernel Density Estimate, Deprecated.

    Thin backward-compatibility shim: forwards to ``kde()`` and emits a
    ``FutureWarning``. ``xmin``/``xmax`` are packed into ``custom_lims``
    only when at least one of them was supplied.
    """
    custom_lims = None if (xmin is None and xmax is None) else (xmin, xmax)

    grid, pdf = kde(x, cumulative=cumulative, bw=bw, custom_lims=custom_lims)

    warnings.warn("_fast_kde() has been replaced by kde() in stats.density_utils.py", FutureWarning)

    return grid, pdf
|
862
|
|
|
863
|
|
|
864
|
2
|
def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False):
    """
    2D fft-based Gaussian kernel density estimate (KDE).

    The code was adapted from https://github.com/mfouesneau/faststats

    Parameters
    ----------
    x : Numpy array or list
    y : Numpy array or list
    gridsize : tuple
        Number of points used to discretize data. Use powers of 2 for fft optimization
    circular: bool
        If True, use circular boundaries. Defaults to False
    Returns
    -------
    grid: A gridded 2D KDE of the input points (x, y)
    xmin: minimum value of x
    xmax: maximum value of x
    ymin: minimum value of y
    ymax: maximum value of y
    """
    # Drop non-finite entries from each coordinate independently.
    # NOTE(review): x and y are filtered separately, so if NaNs occur at
    # different positions the (x, y) pairs become misaligned — presumably the
    # caller passes equal-length, jointly-finite arrays; confirm at call sites.
    x = np.asarray(x, dtype=float)
    x = x[np.isfinite(x)]
    y = np.asarray(y, dtype=float)
    y = y[np.isfinite(y)]

    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    len_x = len(x)
    # Uniform weights: every sample contributes equally to its grid cell.
    weights = np.ones(len_x)
    n_x, n_y = gridsize

    # Grid cell sizes along each axis (n - 1 intervals spanning the data range).
    d_x = (xmax - xmin) / (n_x - 1)
    d_y = (ymax - ymin) / (n_y - 1)

    # Convert samples to (fractional) grid indices, then truncate in place
    # with np.floor(xyi, xyi) so each point maps to its containing cell.
    xyi = _stack(x, y).T
    xyi -= [xmin, ymin]
    xyi /= [d_x, d_y]
    xyi = np.floor(xyi, xyi).T

    # Scott's rule bandwidth factor for 2D data: n**(-1/6).
    scotts_factor = len_x ** (-1 / 6)
    # Covariance of the gridded points; its diagonal gives per-axis variances.
    cov = _cov(xyi)
    std_devs = np.diag(cov) ** 0.5
    # Kernel support size (in grid cells) along each axis.
    kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)

    # Inverse of the bandwidth-scaled covariance, used as the Mahalanobis
    # metric when evaluating the Gaussian kernel below.
    inv_cov = np.linalg.inv(cov * scotts_factor ** 2)

    # Kernel evaluation coordinates, centered on the kernel's midpoint.
    x_x = np.arange(kern_nx) - kern_nx / 2
    y_y = np.arange(kern_ny) - kern_ny / 2
    x_x, y_y = np.meshgrid(x_x, y_y)

    # Evaluate the (unnormalized) Gaussian kernel: exp(-0.5 * d' C^-1 d)
    # for every offset d on the kernel grid, then reshape to 2D.
    kernel = _stack(x_x.flatten(), y_y.flatten())
    kernel = _dot(inv_cov, kernel) * kernel
    kernel = np.exp(-kernel.sum(axis=0) / 2)
    kernel = kernel.reshape((int(kern_ny), int(kern_nx)))

    # "wrap" treats the domain as periodic (circular data); "symm" mirrors
    # the edges, which reduces boundary bias for linear data.
    boundary = "wrap" if circular else "symm"

    # Bin the points: sparse accumulation of weights at integer grid indices,
    # then smooth by convolving the histogram with the Gaussian kernel.
    # NOTE(review): despite the docstring's "fft-based", this uses
    # scipy.signal.convolve2d, which may compute a direct convolution.
    grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray()
    grid = convolve2d(grid, kernel, mode="same", boundary=boundary)

    # Normalize so the result integrates (approximately) to one:
    # sqrt(det(2*pi*H)) is the Gaussian normalizing constant, and
    # len_x * d_x * d_y accounts for sample count and cell area.
    norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2)
    norm_factor = len_x * d_x * d_y * norm_factor ** 0.5

    grid /= norm_factor

    return grid, xmin, xmax, ymin, ymax
|
933
|
|
|
934
|
|
|
935
|
2
|
def get_bins(values):
    """
    Automatically compute the number of bins for discrete variables.

    Parameters
    ----------
    values = numpy array
        values

    Returns
    -------
    array with the bins

    Notes
    -----
    Computes the width of the bins by taking the maximun of the Sturges and the Freedman-Diaconis
    estimators. According to numpy `np.histogram` this provides good all around performance.

    The Sturges is a very simplistic estimator based on the assumption of normality of the data.
    This estimator has poor performance for non-normal data, which becomes especially obvious for
    large data sets. The estimate depends only on size of the data.

    The Freedman-Diaconis rule uses interquartile range (IQR) to estimate the binwidth.
    It is considered a robusts version of the Scott rule as the IQR is less affected by outliers
    than the standard deviation. However, the IQR depends on fewer points than the standard
    deviation, so it is less accurate, especially for long tailed distributions.
    """
    x_min = values.min().astype(int)
    x_max = values.max().astype(int)

    # Sturges' estimator: data range divided by log2(n) + 1 intervals.
    width_sturges = (x_max - x_min) / (np.log2(values.size) + 1)

    # Freedman-Diaconis estimator: based on the interquartile range.
    q75, q25 = np.percentile(values, [75, 25])
    width_fd = 2 * (q75 - q25) * values.size ** (-1 / 3)

    # Use the widest candidate, but never a width below 1.
    width = int(np.round(max(1, width_sturges, width_fd)))

    return np.arange(x_min, x_max + width + 1, width)
|
975
|
|
|
976
|
|
|
977
|
2
|
def _sturges_formula(dataset, mult=1):
|
978
|
|
"""Use Sturges' formula to determine number of bins.
|
979
|
|
|
980
|
|
See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula
|
981
|
|
or https://doi.org/10.1080%2F01621459.1926.10502161
|
982
|
|
|
983
|
|
Parameters
|
984
|
|
----------
|
985
|
|
dataset: xarray.DataSet
|
986
|
|
Must have the `draw` dimension
|
987
|
|
|
988
|
|
mult: float
|
989
|
|
Used to scale the number of bins up or down. Default is 1 for Sturges' formula.
|
990
|
|
|
991
|
|
Returns
|
992
|
|
-------
|
993
|
|
int
|
994
|
|
Number of bins to use
|
995
|
|
"""
|
996
|
2
|
return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1)
|
997
|
|
|
998
|
|
|
999
|
2
|
def _circular_mean(x, na_rm=False):
|
1000
|
|
"""Compute mean of circular variable measured in radians.
|
1001
|
|
|
1002
|
|
The result is between -pi and pi.
|
1003
|
|
"""
|
1004
|
2
|
if na_rm:
|
1005
|
0
|
x = x[~np.isnan(x)]
|
1006
|
|
|
1007
|
2
|
sinr = np.sum(np.sin(x))
|
1008
|
2
|
cosr = np.sum(np.cos(x))
|
1009
|
2
|
mean = np.arctan2(sinr, cosr)
|
1010
|
|
|
1011
|
2
|
return mean
|
1012
|
|
|
1013
|
|
|
1014
|
2
|
def _normalize_angle(x, zero_centered=True):
|
1015
|
|
"""Normalize angles.
|
1016
|
|
|
1017
|
|
Take angles in radians and normalize them to [-pi, pi) or [0, 2 * pi)
|
1018
|
|
depending on `zero_centered`.
|
1019
|
|
"""
|
1020
|
2
|
if zero_centered:
|
1021
|
2
|
return (x + np.pi) % (2 * np.pi) - np.pi
|
1022
|
|
else:
|
1023
|
2
|
return x % (2 * np.pi)
|
1024
|
|
|
1025
|
|
|
1026
|
2
|
@conditional_jit(cache=True)
|
1027
|
2
|
def histogram(data, bins, range_hist=None):
|
1028
|
|
"""Conditionally jitted histogram.
|
1029
|
|
|
1030
|
|
Parameters
|
1031
|
|
----------
|
1032
|
|
data : array-like
|
1033
|
|
Input data. Passed as first positional argument to ``np.histogram``.
|
1034
|
|
bins : int or array-like
|
1035
|
|
Passed as keyword argument ``bins`` to ``np.histogram``.
|
1036
|
|
range_hist : (float, float), optional
|
1037
|
|
Passed as keyword argument ``range`` to ``np.histogram``.
|
1038
|
|
|
1039
|
|
Returns
|
1040
|
|
-------
|
1041
|
|
hist : array
|
1042
|
|
The number of counts per bin.
|
1043
|
|
density : array
|
1044
|
|
The density corresponding to each bin.
|
1045
|
|
bin_edges : array
|
1046
|
|
The edges of the bins used.
|
1047
|
|
"""
|
1048
|
2
|
hist, bin_edges = np.histogram(data, bins=bins, range=range_hist)
|
1049
|
2
|
hist_dens = hist / (hist.sum() * np.diff(bin_edges))
|
1050
|
2
|
return hist, hist_dens, bin_edges
|