1
#define PY_SSIZE_T_CLEAN
2
#include <Python.h>
3

4
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
5
#define _MULTIARRAYMODULE
6
#include "npy_config.h"
7
#include "numpy/arrayobject.h"
8

9
#define NPY_NUMBER_MAX(a, b) ((a) > (b) ? (a) : (b))
10
#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
11

12
/*
13
 * Functions used to try to avoid/elide temporaries in python expressions
14
 * of type a + b + b by translating some operations into in-place operations.
15
 * This example translates to this bytecode:
16
 *
17
 *        0 LOAD_FAST                0 (a)
18
 *        3 LOAD_FAST                1 (b)
19
 *        6 BINARY_ADD
20
 *        7 LOAD_FAST                1 (b)
21
 *       10 BINARY_ADD
22
 *
23
 * The two named variables get their reference count increased by the load
24
 * instructions so they always have a reference count larger than 1.
25
 * The temporary of the first BINARY_ADD on the other hand only has a count of
26
 * 1. Only temporaries can have a count of 1 in python so we can use this to
27
 * transform the second operation into an in-place operation and not affect the
28
 * output of the program.
29
 * CPython does the same thing to resize memory instead of copying when doing
30
 * string concatenation.
31
 * The gain can be very significant (4x-6x) when avoiding the temporary allows
32
 * the operation to remain in the cpu caches and can still be 50% faster for
33
 * array larger than cpu cache size.
34
 *
35
 * A complication is that a DSO (dynamic shared object) module (e.g. cython)
36
 * could call the PyNumber functions directly on arrays with reference count of
37
 * 1.
38
 * This is handled by checking the call stack to verify that we have been
39
 * called directly from the cpython interpreter.
40
 * To achieve this we check that all functions in the callstack until the
41
 * cpython frame evaluation function are located in cpython or numpy.
42
 * This is an expensive operation so temporaries are only avoided for rather
43
 * large arrays.
44
 *
45
 * A possible future improvement would be to change cpython to give us access
46
 * to the top of the stack. Then we could just check that the objects involved
47
 * are on the cpython stack instead of checking the function callstack.
48
 *
49
 * Elision can be applied to all operations that do have in-place variants and
50
 * do not change types (addition, subtraction, multiplication, float division,
51
 * logical and bitwise operations ...)
52
 * For commutative operations (addition, multiplication, ...) if eliding into
53
 * the lefthand side fails it can succeed on the righthand side by swapping the
54
 * arguments. E.g. b * (a * 2) can be elided by changing it to (2 * a) * b.
55
 *
56
 * TODO only supports systems with backtrace(), Windows can probably be
57
 * supported too by using the appropriate Windows APIs.
58
 */
59

60
#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
61
/* 1 prints elided operations, 2 prints stacktraces */
62
#define NPY_ELIDE_DEBUG 0
63
#define NPY_MAX_STACKSIZE 10
64

65
/* TODO can pep523 be used to somehow? */
66
#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
67
/*
68
 * Heuristic size of the array in bytes at which backtrace overhead generation
69
 * becomes less than speed gained by in-place operations. Depends on stack depth
70
 * being checked.  Measurements with 10 stacks show it getting worthwhile
71
 * around 100KiB but to be conservative put it higher around where the L2 cache
72
 * spills.
73
 */
74
#ifndef Py_DEBUG
75
#define NPY_MIN_ELIDE_BYTES (256 * 1024)
76
#else
77
/*
78
 * in debug mode always elide but skip scalars as these can convert to 0d array
79
 * during in-place operations
80
 */
81
#define NPY_MIN_ELIDE_BYTES (32)
82
#endif
83
#include <dlfcn.h>
84
#include <execinfo.h>
85

86
/*
87
 * linear search pointer in table
88
 * number of pointers is usually quite small but if a performance impact can be
89
 * measured this could be converted to a binary search
90
 */
91
static int
92
find_addr(void * addresses[], npy_intp naddr, void * addr)
93
{
94
    npy_intp j;
95 1
    for (j = 0; j < naddr; j++) {
96 1
        if (addr == addresses[j]) {
97
            return 1;
98
        }
99
    }
100
    return 0;
101
}
102

103
static int
104 1
check_callers(int * cannot)
105
{
106
    /*
107
     * get base addresses of multiarray and python, check if
108
     * backtrace is in these libraries only calling dladdr if a new max address
109
     * is found.
110
     * When after the initial multiarray stack everything is inside python we
111
     * can elide as no C-API user could have messed up the reference counts.
112
     * Only check until the python frame evaluation function is found
113
     * approx 10us overhead for stack size of 10
114
     *
115
     * TODO some calls go over scalarmath in umath but we cannot get the base
116
     * address of it from multiarraymodule as it is not linked against it
117
     */
118
    static int init = 0;
119
    /*
120
     * measured DSO object memory start and end, if an address is located
121
     * inside these bounds it is part of that library so we don't need to call
122
     * dladdr on it (assuming linear memory)
123
     */
124
    static void * pos_python_start;
125
    static void * pos_python_end;
126
    static void * pos_ma_start;
127
    static void * pos_ma_end;
128

129
    /* known address storage to save dladdr calls */
130
    static void * py_addr[64];
131
    static void * pyeval_addr[64];
132
    static npy_intp n_py_addr = 0;
133
    static npy_intp n_pyeval = 0;
134

135
    void *buffer[NPY_MAX_STACKSIZE];
136
    int i, nptrs;
137 1
    int ok = 0;
138
    /* cannot determine callers */
139 1
    if (init == -1) {
140 0
        *cannot = 1;
141 0
        return 0;
142
    }
143

144 1
    nptrs = backtrace(buffer, NPY_MAX_STACKSIZE);
145 1
    if (nptrs == 0) {
146
        /* complete failure, disable elision */
147 0
        init = -1;
148 0
        *cannot = 1;
149 0
        return 0;
150
    }
151

152
    /* setup DSO base addresses, ends updated later */
153 1
    if (NPY_UNLIKELY(init == 0)) {
154
        Dl_info info;
155
        /* get python base address */
156 1
        if (dladdr(&PyNumber_Or, &info)) {
157 1
            pos_python_start = info.dli_fbase;
158 1
            pos_python_end = info.dli_fbase;
159
        }
160
        else {
161 0
            init = -1;
162 0
            return 0;
163
        }
164
        /* get multiarray base address */
165 1
        if (dladdr(&PyArray_INCREF, &info)) {
166 1
            pos_ma_start = info.dli_fbase;
167 1
            pos_ma_end = info.dli_fbase;
168
        }
169
        else {
170 0
            init = -1;
171 0
            return 0;
172
        }
173 1
        init = 1;
174
    }
175

176
    /* loop over callstack addresses to check if they leave numpy or cpython */
177 1
    for (i = 0; i < nptrs; i++) {
178
        Dl_info info;
179 1
        int in_python = 0;
180 1
        int in_multiarray = 0;
181

182
#if NPY_ELIDE_DEBUG >= 2
183
        dladdr(buffer[i], &info);
184
        printf("%s(%p) %s(%p)\n", info.dli_fname, info.dli_fbase,
185
               info.dli_sname, info.dli_saddr);
186
#endif
187

188
        /* check stored DSO boundaries first */
189 1
        if (buffer[i] >= pos_python_start && buffer[i] <= pos_python_end) {
190
            in_python = 1;
191
        }
192 1
        else if (buffer[i] >= pos_ma_start && buffer[i] <= pos_ma_end) {
193 1
            in_multiarray = 1;
194
        }
195

196
        /* update DSO boundaries via dladdr if necessary */
197 1
        if (!in_python && !in_multiarray) {
198 1
            if (dladdr(buffer[i], &info) == 0) {
199 0
                init = -1;
200 0
                ok = 0;
201 1
                break;
202
            }
203
            /* update DSO end */
204 1
            if (info.dli_fbase == pos_python_start) {
205 1
                pos_python_end = NPY_NUMBER_MAX(buffer[i], pos_python_end);
206 1
                in_python = 1;
207
            }
208 1
            else if (info.dli_fbase == pos_ma_start) {
209 1
                pos_ma_end = NPY_NUMBER_MAX(buffer[i], pos_ma_end);
210 1
                in_multiarray = 1;
211
            }
212
        }
213

214
        /* no longer in ok libraries and not reached PyEval -> no elide */
215 1
        if (!in_python && !in_multiarray) {
216
            ok = 0;
217
            break;
218
        }
219

220
        /* in python check if the frame eval function was reached */
221 1
        if (in_python) {
222
            /* if reached eval we are done */
223 1
            if (find_addr(pyeval_addr, n_pyeval, buffer[i])) {
224
                ok = 1;
225
                break;
226
            }
227
            /*
228
             * check if its some other function, use pointer lookup table to
229
             * save expensive dladdr calls
230
             */
231 1
            if (find_addr(py_addr, n_py_addr, buffer[i])) {
232 1
                continue;
233
            }
234

235
            /* new python address, check for PyEvalFrame */
236 1
            if (dladdr(buffer[i], &info) == 0) {
237 0
                init = -1;
238 0
                ok = 0;
239
                break;
240
            }
241 1
            if (info.dli_sname &&
242 1
                    strcmp(info.dli_sname, PYFRAMEEVAL_FUNC) == 0) {
243 1
                if (n_pyeval < (npy_intp)ARRAY_SIZE(pyeval_addr)) {
244
                    /* store address to not have to dladdr it again */
245 1
                    pyeval_addr[n_pyeval++] = buffer[i];
246
                }
247 1
                ok = 1;
248
                break;
249
            }
250 1
            else if (n_py_addr < (npy_intp)ARRAY_SIZE(py_addr)) {
251
                /* store other py function to not have to dladdr it again */
252 1
                py_addr[n_py_addr++] = buffer[i];
253
            }
254
        }
255
    }
256

257
    /* all stacks after numpy are from python, we can elide */
258 1
    if (ok) {
259 1
        *cannot = 0;
260 1
        return 1;
261
    }
262
    else {
263
#if NPY_ELIDE_DEBUG != 0
264
        puts("cannot elide due to c-api usage");
265
#endif
266 1
        *cannot = 1;
267 1
        return 0;
268
    }
269
}
270

271
/*
272
 * check if in "alhs @op@ orhs" that alhs is a temporary (refcnt == 1) so we
273
 * can do in-place operations instead of creating a new temporary
274
 * "cannot" is set to true if it cannot be done even with swapped arguments
275
 */
276
static int
277 1
can_elide_temp(PyArrayObject * alhs, PyObject * orhs, int * cannot)
278
{
279
    /*
280
     * to be a candidate the array needs to have reference count 1, be an exact
281
     * array of a basic type, own its data and size larger than threshold
282
     */
283 1
    if (Py_REFCNT(alhs) != 1 || !PyArray_CheckExact(alhs) ||
284 1
            !PyArray_ISNUMBER(alhs) ||
285 1
            !PyArray_CHKFLAGS(alhs, NPY_ARRAY_OWNDATA) ||
286 1
            !PyArray_ISWRITEABLE(alhs) ||
287 1
            PyArray_CHKFLAGS(alhs, NPY_ARRAY_UPDATEIFCOPY) ||
288 1
            PyArray_CHKFLAGS(alhs, NPY_ARRAY_WRITEBACKIFCOPY) ||
289 1
            PyArray_NBYTES(alhs) < NPY_MIN_ELIDE_BYTES) {
290
        return 0;
291
    }
292 1
    if (PyArray_CheckExact(orhs) ||
293 1
        PyArray_CheckAnyScalar(orhs)) {
294
        PyArrayObject * arhs;
295

296
        /* create array from right hand side */
297 1
        Py_INCREF(orhs);
298 1
        arhs = (PyArrayObject *)PyArray_EnsureArray(orhs);
299 1
        if (arhs == NULL) {
300
            return 0;
301
        }
302

303
        /*
304
         * if rhs is not a scalar dimensions must match
305
         * TODO: one could allow broadcasting on equal types
306
         */
307 1
        if (!(PyArray_NDIM(arhs) == 0 ||
308 1
              (PyArray_NDIM(arhs) == PyArray_NDIM(alhs) &&
309 1
               PyArray_CompareLists(PyArray_DIMS(alhs), PyArray_DIMS(arhs),
310
                                    PyArray_NDIM(arhs))))) {
311 0
                Py_DECREF(arhs);
312
                return 0;
313
        }
314

315
        /* must be safe to cast (checks values for scalar in rhs) */
316 1
        if (PyArray_CanCastArrayTo(arhs, PyArray_DESCR(alhs),
317
                                   NPY_SAFE_CASTING)) {
318 1
            Py_DECREF(arhs);
319 1
            return check_callers(cannot);
320
        }
321 1
        Py_DECREF(arhs);
322
    }
323

324
    return 0;
325
}
326

327
/*
328
 * try eliding a binary op, if commutative is true also try swapped arguments
329
 */
330
NPY_NO_EXPORT int
331 1
try_binary_elide(PyArrayObject * m1, PyObject * m2,
332
                 PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
333
                 PyObject ** res, int commutative)
334
{
335
    /* set when no elision can be done independent of argument order */
336 1
    int cannot = 0;
337 1
    if (can_elide_temp(m1, m2, &cannot)) {
338 1
        *res = inplace_op(m1, m2);
339
#if NPY_ELIDE_DEBUG != 0
340
        puts("elided temporary in binary op");
341
#endif
342 1
        return 1;
343
    }
344 1
    else if (commutative && !cannot) {
345 1
        if (can_elide_temp((PyArrayObject *)m2, (PyObject *)m1, &cannot)) {
346 1
            *res = inplace_op((PyArrayObject *)m2, (PyObject *)m1);
347
#if NPY_ELIDE_DEBUG != 0
348
            puts("elided temporary in commutative binary op");
349
#endif
350 1
            return 1;
351
        }
352
    }
353 1
    *res = NULL;
354 1
    return 0;
355
}
356

357
/* try elide unary temporary */
358
NPY_NO_EXPORT int
359 1
can_elide_temp_unary(PyArrayObject * m1)
360
{
361
    int cannot;
362 1
    if (Py_REFCNT(m1) != 1 || !PyArray_CheckExact(m1) ||
363 1
            !PyArray_ISNUMBER(m1) ||
364 1
            !PyArray_CHKFLAGS(m1, NPY_ARRAY_OWNDATA) ||
365 1
            !PyArray_ISWRITEABLE(m1) ||
366 1
            PyArray_CHKFLAGS(m1, NPY_ARRAY_UPDATEIFCOPY) ||
367 1
            PyArray_NBYTES(m1) < NPY_MIN_ELIDE_BYTES) {
368
        return 0;
369
    }
370 1
    if (check_callers(&cannot)) {
371
#if NPY_ELIDE_DEBUG != 0
372
        puts("elided temporary in unary op");
373
#endif
374
        return 1;
375
    }
376
    else {
377 0
        return 0;
378
    }
379
}
380
#else /* unsupported interpreter or missing backtrace */
381
NPY_NO_EXPORT int
382
can_elide_temp_unary(PyArrayObject * m1)
383
{
384
    return 0;
385
}
386

387
NPY_NO_EXPORT int
388
try_binary_elide(PyArrayObject * m1, PyObject * m2,
389
                 PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
390
                 PyObject ** res, int commutative)
391
{
392
    *res = NULL;
393
    return 0;
394
}
395
#endif

Read our documentation on viewing source code .

Loading