#include "numpy/random/distributions.h"
#include "logfactorial.h"
#include <stdint.h>

/*
 *  Generate a sample from the hypergeometric distribution.
 *
 *  Assume sample is not greater than half the total.  See below
 *  for how the opposite case is handled.
 *
 *  We initialize the following:
 *      computed_sample = sample
 *      remaining_good = good
 *      remaining_total = good + bad
 *
 *  In the loop:
 *  * computed_sample counts down to 0;
 *  * remaining_good is the number of good choices not selected yet;
 *  * remaining_total is the total number of choices not selected yet.
 *
 *  In the loop, we select items by choosing a random integer in
 *  the interval [0, remaining_total), and if the value is less
 *  than remaining_good, it means we have selected a good one,
 *  so remaining_good is decremented.  Then, regardless of that
 *  result, computed_sample is decremented.  The loop continues
 *  until either computed_sample is 0, remaining_good is 0, or
 *  remaining_total == remaining_good.  In the latter case, it
 *  means there are only good choices left, so we can stop the
 *  loop early and select what is left of computed_sample from
 *  the good choices (i.e. decrease remaining_good by computed_sample).
 *
 *  When the loop exits, the actual number of good choices is
 *  good - remaining_good.
 *
 *  If sample is more than half the total, then initially we set
 *      computed_sample = total - sample
 *  and at the end we return remaining_good (i.e. the loop in effect
 *  selects the complement of the result).
 *
 *  It is assumed that when this function is called:
 *    * good, bad and sample are nonnegative;
 *    * the sum good+bad will not result in overflow;
 *    * sample <= good+bad.
 */
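/*
 *  A small worked illustration of the process described above: with good = 4,
 *  bad = 6 and sample = 3, the loop draws without replacement from 10 items.
 *  The first draw is "good" with probability 4/10; if it was, the second draw
 *  is "good" with probability 3/9, and so on.  This is exactly the
 *  hypergeometric (urn) process, so the count of "good" draws returned below
 *  has the desired distribution.
 */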

static int64_t hypergeometric_sample(bitgen_t *bitgen_state,
                                     int64_t good, int64_t bad, int64_t sample)
{
    int64_t remaining_total, remaining_good, result, computed_sample;
    int64_t total = good + bad;

    if (sample > total/2) {
        computed_sample = total - sample;
    }
    else {
        computed_sample = sample;
    }

    remaining_total = total;
    remaining_good = good;

    while ((computed_sample > 0) && (remaining_good > 0) &&
           (remaining_total > remaining_good)) {
         // random_interval(bitgen_state, max) returns an integer in
         // [0, max] *inclusive*, so we decrement remaining_total before
         // passing it to random_interval().
        --remaining_total;
        if ((int64_t) random_interval(bitgen_state,
                                      remaining_total) < remaining_good) {
            // Selected a "good" one, so decrement remaining_good.
            --remaining_good;
        }
        --computed_sample;
    }

    if (remaining_total == remaining_good) {
        // Only "good" choices are left.
        remaining_good -= computed_sample;
    }

    if (sample > total/2) {
        result = remaining_good;
    }
    else {
        result = good - remaining_good;
    }

    return result;
}


// D1 = 2*sqrt(2/e)
// D2 = 3 - 2*sqrt(3/e)
#define D1 1.7155277699214135
#define D2 0.8989161620588988

/*
 *  Generate variates from the hypergeometric distribution
 *  using the ratio-of-uniforms method.
 *
 *  In the code, the variable names a, b, c, g, h, m, p, q, K, T,
 *  U and X match the names used in "Algorithm HRUA" beginning on
 *  page 82 of Stadlober's 1989 thesis.
 *
 *  It is assumed that when this function is called:
 *    * good, bad and sample are nonnegative;
 *    * the sum good+bad will not result in overflow;
 *    * sample <= good+bad.
 *
 *  References:
 *  -  Ernst Stadlober's thesis "Sampling from Poisson, Binomial and
 *     Hypergeometric Distributions: Ratio of Uniforms as a Simple and
 *     Fast Alternative" (1989)
 *  -  Ernst Stadlober, "The ratio of uniforms approach for generating
 *     discrete random variates", Journal of Computational and Applied
 *     Mathematics, 31, pp. 181-189 (1990).
 */
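/*
 *  For reference, with N = popsize, M = mingoodbad and n = computed_sample
 *  (Stadlober's notation), the standard hypergeometric results used below are
 *      mean      mu  = n*M/N
 *      variance  var = n*(M/N)*(1 - M/N)*(N - n)/(N - 1)
 *      mode      m   = floor((n + 1)*(M + 1)/(N + 2))
 *  which correspond to the variables mu, var and m computed in the function.
 */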

static int64_t hypergeometric_hrua(bitgen_t *bitgen_state,
                                   int64_t good, int64_t bad, int64_t sample)
{
    int64_t mingoodbad, maxgoodbad, popsize;
    int64_t computed_sample;
    double p, q;
    double mu, var;
    double a, c, b, h, g;
    int64_t m, K;

    popsize = good + bad;
    computed_sample = MIN(sample, popsize - sample);
    mingoodbad = MIN(good, bad);
    maxgoodbad = MAX(good, bad);

    /*
     *  Variables that do not match Stadlober (1989)
     *    Here               Stadlober
     *    ----------------   ---------
     *    mingoodbad            M
     *    popsize               N
     *    computed_sample       n
     */

    p = ((double) mingoodbad) / popsize;
    q = ((double) maxgoodbad) / popsize;

    // mu is the mean of the distribution.
    mu = computed_sample * p;

    a = mu + 0.5;

    // var is the variance of the distribution.
    var = ((double)(popsize - computed_sample) *
           computed_sample * p * q / (popsize - 1));

    c = sqrt(var + 0.5);

    /*
     *  h is 2*s_hat (See Stadlober's thesis (1989), Eq. (5.17); or
     *  Stadlober (1990), Eq. 8).  s_hat is the scale of the "table mountain"
     *  function that dominates the scaled hypergeometric PMF ("scaled" means
     *  normalized to have a maximum value of 1).
     */
    h = D1*c + D2;

    m = (int64_t) floor((double)(computed_sample + 1) * (mingoodbad + 1) /
                        (popsize + 2));

    g = (logfactorial(m) +
         logfactorial(mingoodbad - m) +
         logfactorial(computed_sample - m) +
         logfactorial(maxgoodbad - computed_sample + m));

    /*
     *  b is the upper bound for random samples:
     *  ... min(computed_sample, mingoodbad) + 1 is the length of the support.
     *  ... floor(a + 16*c) is 16 standard deviations beyond the mean.
     *
     *  The idea behind the second upper bound is that values that far out in
     *  the tail have negligible probabilities.
     *
     *  There is a comment in a previous version of this algorithm that says
     *      "16 for 16-decimal-digit precision in D1 and D2",
     *  but there is no documented justification for this value.  A lower value
     *  might work just as well, but I've kept the value 16 here.
     */
    b = MIN(MIN(computed_sample, mingoodbad) + 1, floor(a + 16*c));

    while (1) {
        double U, V, X, T;
        double gp;
        U = next_double(bitgen_state);
        V = next_double(bitgen_state);  // "U star" in Stadlober (1989)
        X = a + h*(V - 0.5) / U;

        // fast rejection:
        if ((X < 0.0) || (X >= b)) {
            continue;
        }

        K = (int64_t) floor(X);

        gp = (logfactorial(K) +
              logfactorial(mingoodbad - K) +
              logfactorial(computed_sample - K) +
              logfactorial(maxgoodbad - computed_sample + K));

        T = g - gp;

        // fast acceptance:
        // U*(4.0 - U) - 3.0 is an upper bound on 2*log(U) for U in (0, 1],
        // so this squeeze accepts without evaluating the logarithm.
        if ((U*(4.0 - U) - 3.0) <= T) {
            break;
        }

        // fast rejection:
        // U - 1/U is a lower bound on 2*log(U), so U*(U - T) >= 1 implies
        // 2*log(U) >= T and the candidate can be rejected early.
        if (U*(U - T) >= 1) {
            continue;
        }

        if (2.0*log(U) <= T) {
            // acceptance
            break;
        }
    }

    if (good > bad) {
        K = computed_sample - K;
    }

    if (computed_sample < sample) {
        K = good - K;
    }

    return K;
}


/*
 *  Draw a sample from the hypergeometric distribution.
 *
 *  It is assumed that when this function is called:
 *    * good, bad and sample are nonnegative;
 *    * the sum good+bad will not result in overflow;
 *    * sample <= good+bad.
 */

int64_t random_hypergeometric(bitgen_t *bitgen_state,
                              int64_t good, int64_t bad, int64_t sample)
{
    int64_t r;

    if ((sample >= 10) && (sample <= good + bad - 10)) {
        // This will use the ratio-of-uniforms method.
        r = hypergeometric_hrua(bitgen_state, good, bad, sample);
    }
    else {
        // The simpler implementation is faster for small samples.
        r = hypergeometric_sample(bitgen_state, good, bad, sample);
    }
    return r;
}
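
/*
 *  Usage sketch (illustrative only, compiled out by default): one way to call
 *  random_hypergeometric() directly from C is to fill in a bitgen_t with a
 *  simple generator.  The splitmix64 generator and the HYPERGEOM_EXAMPLE
 *  guard below are hypothetical stand-ins, not part of this file or of the
 *  NumPy API, and the bitgen_t field layout is assumed to match
 *  numpy/random/bitgen.h.
 */
#ifdef HYPERGEOM_EXAMPLE
#include <stdio.h>

static uint64_t example_state;

static uint64_t example_next_uint64(void *st) {
    // One splitmix64 step; st points at example_state.
    uint64_t *s = (uint64_t *)st;
    uint64_t z = (*s += 0x9E3779B97F4A7C15ULL);
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
    return z ^ (z >> 31);
}

static uint32_t example_next_uint32(void *st) {
    return (uint32_t)(example_next_uint64(st) >> 32);
}

static double example_next_double(void *st) {
    // 53 random bits mapped to [0, 1).
    return (example_next_uint64(st) >> 11) * (1.0 / 9007199254740992.0);
}

int main(void) {
    bitgen_t bg;
    example_state = 12345;
    bg.state = &example_state;
    bg.next_uint64 = example_next_uint64;
    bg.next_uint32 = example_next_uint32;
    bg.next_double = example_next_double;
    bg.next_raw = example_next_uint64;

    // Draw 5 variates: 10 draws from an urn with 50 "good" and 50 "bad" items.
    for (int i = 0; i < 5; i++) {
        printf("%lld\n", (long long)random_hypergeometric(&bg, 50, 50, 10));
    }
    return 0;
}
#endif  /* HYPERGEOM_EXAMPLE */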
