1
#ifndef __LOWLEVEL_STRIDED_LOOPS_H
2
#define __LOWLEVEL_STRIDED_LOOPS_H
3
#include "common.h"
4
#include <npy_config.h>
5
#include "mem_overlap.h"
6

7
/* For PyArray_ macros used below */
8
#include "numpy/ndarrayobject.h"
9

10
/*
11
 * NOTE: This API should remain private for the time being, to allow
12
 *       for further refinement.  I think the 'aligned' mechanism
13
 *       needs changing, for example. 
14
 *
15
 *       Note: Updated in 2018 to distinguish "true" from "uint" alignment.
16
 */
17

18
/*
19
 * This function pointer is for unary operations that input an
20
 * arbitrarily strided one-dimensional array segment and output
21
 * an arbitrarily strided array segment of the same size.
22
 * It may be a fully general function, or a specialized function
23
 * when the strides or item size have particular known values.
24
 *
25
 * Examples of unary operations are a straight copy, a byte-swap,
26
 * and a casting operation.
27
 *
28
 * The 'transferdata' parameter is slightly special, following a
29
 * generic auxiliary data pattern defined in ndarraytypes.h
30
 * Use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
31
 *
32
 */
33
typedef int (PyArray_StridedUnaryOp)(
34
        char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
35
        npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);
36

37
/*
38
 * This is for pointers to functions which behave exactly as
39
 * for PyArray_StridedUnaryOp, but with an additional mask controlling
40
 * which values are transformed.
41
 *
42
 * In particular, the 'i'-th element is operated on if and only if
43
 * mask[i*mask_stride] is true.
44
 */
45
typedef int (PyArray_MaskedStridedUnaryOp)(
46
        char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
47
        npy_bool *mask, npy_intp mask_stride,
48
        npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);
49

50
/*
51
 * Gives back a function pointer to a specialized function for copying
52
 * strided memory.  Returns NULL if there is a problem with the inputs.
53
 *
54
 * aligned:
55
 *      Should be 1 if the src and dst pointers always point to
56
 *      locations at which a uint of equal size to dtype->elsize
57
 *      would be aligned, 0 otherwise.
58
 * src_stride:
59
 *      Should be the src stride if it will always be the same,
60
 *      NPY_MAX_INTP otherwise.
61
 * dst_stride:
62
 *      Should be the dst stride if it will always be the same,
63
 *      NPY_MAX_INTP otherwise.
64
 * itemsize:
65
 *      Should be the item size if it will always be the same, 0 otherwise.
66
 *
67
 */
68
NPY_NO_EXPORT PyArray_StridedUnaryOp *
69
PyArray_GetStridedCopyFn(int aligned,
70
                        npy_intp src_stride, npy_intp dst_stride,
71
                        npy_intp itemsize);
72

73
/*
74
 * Gives back a function pointer to a specialized function for copying
75
 * and swapping strided memory.  This assumes each element is a single
76
 * value to be swapped.
77
 *
78
 * For information on the 'aligned', 'src_stride' and 'dst_stride' parameters
79
 * see above.
80
 *
81
 * Parameters are as for PyArray_GetStridedCopyFn.
82
 */
83
NPY_NO_EXPORT PyArray_StridedUnaryOp *
84
PyArray_GetStridedCopySwapFn(int aligned,
85
                            npy_intp src_stride, npy_intp dst_stride,
86
                            npy_intp itemsize);
87

88
/*
89
 * Gives back a function pointer to a specialized function for copying
90
 * and swapping strided memory.  This assumes each element is a pair
91
 * of values, each of which needs to be swapped.
92
 *
93
 * For information on the 'aligned', 'src_stride' and 'dst_stride' parameters
94
 * see above.
95
 *
96
 * Parameters are as for PyArray_GetStridedCopyFn.
97
 */
98
NPY_NO_EXPORT PyArray_StridedUnaryOp *
99
PyArray_GetStridedCopySwapPairFn(int aligned,
100
                            npy_intp src_stride, npy_intp dst_stride,
101
                            npy_intp itemsize);
102

103
/*
104
 * Gives back a transfer function and transfer data pair which copies
105
 * the data from source to dest, truncating it if the data doesn't
106
 * fit, and padding with zero bytes if there's too much space.
107
 *
108
 * For information on the 'aligned', 'src_stride' and 'dst_stride' parameters
109
 * see above.
110
 *
111
 * Returns NPY_SUCCEED or NPY_FAIL
112
 */
113
NPY_NO_EXPORT int
114
PyArray_GetStridedZeroPadCopyFn(int aligned, int unicode_swap,
115
                            npy_intp src_stride, npy_intp dst_stride,
116
                            npy_intp src_itemsize, npy_intp dst_itemsize,
117
                            PyArray_StridedUnaryOp **outstransfer,
118
                            NpyAuxData **outtransferdata);
119

120
/*
121
 * For casts between built-in numeric types,
122
 * this produces a function pointer for casting from src_type_num
123
 * to dst_type_num.  If a conversion is unsupported, returns NULL
124
 * without setting a Python exception.
125
 */
126
NPY_NO_EXPORT PyArray_StridedUnaryOp *
127
PyArray_GetStridedNumericCastFn(int aligned,
128
                            npy_intp src_stride, npy_intp dst_stride,
129
                            int src_type_num, int dst_type_num);
130

131
/*
132
 * Gets an operation which copies elements of the given dtype,
133
 * swapping if the dtype isn't in NBO.
134
 *
135
 * Returns NPY_SUCCEED or NPY_FAIL
136
 */
137
NPY_NO_EXPORT int
138
PyArray_GetDTypeCopySwapFn(int aligned,
139
                            npy_intp src_stride, npy_intp dst_stride,
140
                            PyArray_Descr *dtype,
141
                            PyArray_StridedUnaryOp **outstransfer,
142
                            NpyAuxData **outtransferdata);
143

144
/*
145
 * If it's possible, gives back a transfer function which casts and/or
146
 * byte swaps data with the dtype 'src_dtype' into data with the dtype
147
 * 'dst_dtype'.  If the outtransferdata is populated with a non-NULL value,
148
 * it must be deallocated with the NPY_AUXDATA_FREE
149
 * function when the transfer function is no longer required.
150
 *
151
 * aligned:
152
 *      Should be 1 if the src and dst pointers always point to
153
 *      locations at which a uint of equal size to dtype->elsize
154
 *      would be aligned, 0 otherwise.
155
 * src_stride:
156
 *      Should be the src stride if it will always be the same,
157
 *      NPY_MAX_INTP otherwise.
158
 * dst_stride:
159
 *      Should be the dst stride if it will always be the same,
160
 *      NPY_MAX_INTP otherwise.
161
 * src_dtype:
162
 *      The data type of source data.  If this is NULL, a transfer
163
 *      function which sets the destination to zeros is produced.
164
 * dst_dtype:
165
 *      The data type of destination data.  If this is NULL and
166
 *      move_references is 1, a transfer function which decrements
167
 *      source data references is produced.
168
 * move_references:
169
 *      If 0, the destination data gets new reference ownership.
170
 *      If 1, the references from the source data are moved to
171
 *      the destination data.
172
 * out_stransfer:
173
 *      The resulting transfer function is placed here.
174
 * out_transferdata:
175
 *      The auxiliary data for the transfer function is placed here.
176
 *      When finished with the transfer function, the caller must call
177
 *      NPY_AUXDATA_FREE on this data.
178
 * out_needs_api:
179
 *      If this is non-NULL, and the transfer function produced needs
180
 *      to call into the (Python) API, this gets set to 1.  This
181
 *      remains untouched if no API access is required.
182
 *
183
 * WARNING: If you set move_references to 1, it is best that src_stride is
184
 *          never zero when calling the transfer function.  Otherwise, the
185
 *          first destination reference will get the value and all the rest
186
 *          will get NULL.
187
 *
188
 * Returns NPY_SUCCEED or NPY_FAIL.
189
 */
190
NPY_NO_EXPORT int
191
PyArray_GetDTypeTransferFunction(int aligned,
192
                            npy_intp src_stride, npy_intp dst_stride,
193
                            PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
194
                            int move_references,
195
                            PyArray_StridedUnaryOp **out_stransfer,
196
                            NpyAuxData **out_transferdata,
197
                            int *out_needs_api);
198

199
/*
200
 * This is identical to PyArray_GetDTypeTransferFunction, but returns a
201
 * transfer function which also takes a mask as a parameter.  The mask is used
202
 * to determine which values to copy, and data is transferred exactly when
203
 * mask[i*mask_stride] is true.
204
 *
205
 * If move_references is true, values which are not copied to the
206
 * destination will still have their source reference decremented.
207
 *
208
 * If mask_dtype is NPY_BOOL or NPY_UINT8, each full element is either
209
 * transferred or not according to the mask as described above. If
210
 * dst_dtype and mask_dtype are both struct dtypes, their names must
211
 * match exactly, and the dtype of each leaf field in mask_dtype must
212
 * be either NPY_BOOL or NPY_UINT8.
213
 */
214
NPY_NO_EXPORT int
215
PyArray_GetMaskedDTypeTransferFunction(int aligned,
216
                            npy_intp src_stride,
217
                            npy_intp dst_stride,
218
                            npy_intp mask_stride,
219
                            PyArray_Descr *src_dtype,
220
                            PyArray_Descr *dst_dtype,
221
                            PyArray_Descr *mask_dtype,
222
                            int move_references,
223
                            PyArray_MaskedStridedUnaryOp **out_stransfer,
224
                            NpyAuxData **out_transferdata,
225
                            int *out_needs_api);
226

227
/*
228
 * Casts the specified number of elements from 'src' with data type
229
 * 'src_dtype' to 'dst' with 'dst_dtype'. See
230
 * PyArray_GetDTypeTransferFunction for more details.
231
 *
232
 * Returns NPY_SUCCEED or NPY_FAIL.
233
 */
234
NPY_NO_EXPORT int
235
PyArray_CastRawArrays(npy_intp count,
236
                      char *src, char *dst,
237
                      npy_intp src_stride, npy_intp dst_stride,
238
                      PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
239
                      int move_references);
240

241
/*
242
 * These two functions copy or convert the data of an n-dimensional array
243
 * to/from a 1-dimensional strided buffer.  These functions will only call
244
 * 'stransfer' with the provided dst_stride/src_stride and
245
 * dst_strides[0]/src_strides[0], so the caller can use those values to
246
 * specialize the function.
247
 * Note that even if ndim == 0, everything needs to be set as if ndim == 1.
248
 *
249
 * The return value is the number of elements it couldn't copy.  A return value
250
 * of 0 means all elements were copied, a larger value means the end of
251
 * the n-dimensional array was reached before 'count' elements were copied.
252
 * A negative return value indicates an error occurred.
253
 *
254
 * ndim:
255
 *      The number of dimensions of the n-dimensional array.
256
 * dst/src/mask:
257
 *      The destination, source or mask starting pointer.
258
 * dst_stride/src_stride/mask_stride:
259
 *      The stride of the 1-dimensional strided buffer
260
 * dst_strides/src_strides:
261
 *      The strides of the n-dimensional array.
262
 * dst_strides_inc/src_strides_inc:
263
 *      How much to add to the ..._strides pointer to get to the next stride.
264
 * coords:
265
 *      The starting coordinates in the n-dimensional array.
266
 * coords_inc:
267
 *      How much to add to the coords pointer to get to the next coordinate.
268
 * shape:
269
 *      The shape of the n-dimensional array.
270
 * shape_inc:
271
 *      How much to add to the shape pointer to get to the next shape entry.
272
 * count:
273
 *      How many elements to transfer
274
 * src_itemsize:
275
 *      How big each element is.  If transferring between elements of different
276
 *      sizes, for example a casting operation, the 'stransfer' function
277
 *      should be specialized for that, in which case 'stransfer' will use
278
 *      this parameter as the source item size.
279
 * stransfer:
280
 *      The strided transfer function.
281
 * transferdata:
282
 *      An auxiliary data pointer passed to the strided transfer function.
283
 *      This follows the conventions of NpyAuxData objects.
284
 */
285
NPY_NO_EXPORT npy_intp
286
PyArray_TransferNDimToStrided(npy_intp ndim,
287
                char *dst, npy_intp dst_stride,
288
                char *src, npy_intp const *src_strides, npy_intp src_strides_inc,
289
                npy_intp const *coords, npy_intp coords_inc,
290
                npy_intp const *shape, npy_intp shape_inc,
291
                npy_intp count, npy_intp src_itemsize,
292
                PyArray_StridedUnaryOp *stransfer,
293
                NpyAuxData *transferdata);
294

295
NPY_NO_EXPORT npy_intp
296
PyArray_TransferStridedToNDim(npy_intp ndim,
297
                char *dst, npy_intp const *dst_strides, npy_intp dst_strides_inc,
298
                char *src, npy_intp src_stride,
299
                npy_intp const *coords, npy_intp coords_inc,
300
                npy_intp const *shape, npy_intp shape_inc,
301
                npy_intp count, npy_intp src_itemsize,
302
                PyArray_StridedUnaryOp *stransfer,
303
                NpyAuxData *transferdata);
304

305
NPY_NO_EXPORT npy_intp
306
PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
307
                char *dst, npy_intp const *dst_strides, npy_intp dst_strides_inc,
308
                char *src, npy_intp src_stride,
309
                npy_bool *mask, npy_intp mask_stride,
310
                npy_intp const *coords, npy_intp coords_inc,
311
                npy_intp const *shape, npy_intp shape_inc,
312
                npy_intp count, npy_intp src_itemsize,
313
                PyArray_MaskedStridedUnaryOp *stransfer,
314
                NpyAuxData *data);
315

316
NPY_NO_EXPORT int
317
mapiter_trivial_get(PyArrayObject *self, PyArrayObject *ind,
318
                       PyArrayObject *result);
319

320
NPY_NO_EXPORT int
321
mapiter_trivial_set(PyArrayObject *self, PyArrayObject *ind,
322
                       PyArrayObject *result);
323

324
NPY_NO_EXPORT int
325
mapiter_get(PyArrayMapIterObject *mit);
326

327
NPY_NO_EXPORT int
328
mapiter_set(PyArrayMapIterObject *mit);
329

330
/*
331
 * Prepares shape and strides for a simple raw array iteration.
332
 * This sorts the strides into FORTRAN order, reverses any negative
333
 * strides, then coalesces axes where possible. The results are
334
 * filled in the output parameters.
335
 *
336
 * This is intended for simple, lightweight iteration over arrays
337
 * where no buffering of any kind is needed, and the array may
338
 * not be stored as a PyArrayObject.
339
 *
340
 * You can use this together with NPY_RAW_ITER_START and
341
 * NPY_RAW_ITER_ONE_NEXT to handle the looping boilerplate of everything
342
 * but the innermost loop (which is for idim == 0).
343
 *
344
 * Returns 0 on success, -1 on failure.
345
 */
346
NPY_NO_EXPORT int
347
PyArray_PrepareOneRawArrayIter(int ndim, npy_intp const *shape,
348
                            char *data, npy_intp const *strides,
349
                            int *out_ndim, npy_intp *out_shape,
350
                            char **out_data, npy_intp *out_strides);
351

352
/*
353
 * The same as PyArray_PrepareOneRawArrayIter, but for two
354
 * operands instead of one. Any broadcasting of the two operands
355
 * should have already been done before calling this function,
356
 * as the ndim and shape is only specified once for both operands.
357
 *
358
 * Only the strides of the first operand are used to reorder
359
 * the dimensions, no attempt to consider all the strides together
360
 * is made, as is done in the NpyIter object.
361
 *
362
 * You can use this together with NPY_RAW_ITER_START and
363
 * NPY_RAW_ITER_TWO_NEXT to handle the looping boilerplate of everything
364
 * but the innermost loop (which is for idim == 0).
365
 *
366
 * Returns 0 on success, -1 on failure.
367
 */
368
NPY_NO_EXPORT int
369
PyArray_PrepareTwoRawArrayIter(int ndim, npy_intp const *shape,
370
                            char *dataA, npy_intp const *stridesA,
371
                            char *dataB, npy_intp const *stridesB,
372
                            int *out_ndim, npy_intp *out_shape,
373
                            char **out_dataA, npy_intp *out_stridesA,
374
                            char **out_dataB, npy_intp *out_stridesB);
375

376
/*
377
 * The same as PyArray_PrepareOneRawArrayIter, but for three
378
 * operands instead of one. Any broadcasting of the three operands
379
 * should have already been done before calling this function,
380
 * as the ndim and shape is only specified once for all operands.
381
 *
382
 * Only the strides of the first operand are used to reorder
383
 * the dimensions, no attempt to consider all the strides together
384
 * is made, as is done in the NpyIter object.
385
 *
386
 * You can use this together with NPY_RAW_ITER_START and
387
 * NPY_RAW_ITER_THREE_NEXT to handle the looping boilerplate of everything
388
 * but the innermost loop (which is for idim == 0).
389
 *
390
 * Returns 0 on success, -1 on failure.
391
 */
392
NPY_NO_EXPORT int
393
PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp const *shape,
394
                            char *dataA, npy_intp const *stridesA,
395
                            char *dataB, npy_intp const *stridesB,
396
                            char *dataC, npy_intp const *stridesC,
397
                            int *out_ndim, npy_intp *out_shape,
398
                            char **out_dataA, npy_intp *out_stridesA,
399
                            char **out_dataB, npy_intp *out_stridesB,
400
                            char **out_dataC, npy_intp *out_stridesC);
401

402
/*
403
 * Return number of elements that must be peeled from the start of 'addr' with
404
 * 'nvals' elements of size 'esize' in order to reach blockable alignment.
405
 * The required alignment in bytes is passed as the 'alignment' argument and
406
 * must be a power of two. This function is used to prepare an array for
407
 * blocking. See the 'npy_blocked_end' function documentation below for an
408
 * example of how this function is used.
409
 */
410
static NPY_INLINE npy_intp
411
npy_aligned_block_offset(const void * addr, const npy_uintp esize,
412
                         const npy_uintp alignment, const npy_uintp nvals)
413
{
414
    npy_uintp offset, peel;
415

416 1
    offset = (npy_uintp)addr & (alignment - 1);
417 1
    peel = offset ? (alignment - offset) / esize : 0;
418 1
    peel = (peel <= nvals) ? peel : nvals;
419
    assert(peel <= NPY_MAX_INTP);
420 1
    return (npy_intp)peel;
421
}
422

423
/*
424
 * Return upper loop bound for an array of 'nvals' elements
425
 * of size 'esize' peeled by 'offset' elements and blocking to
426
 * a vector size of 'vsz' in bytes
427
 *
428
 * example usage:
429
 * npy_intp i;
430
 * double v[101];
431
 * npy_intp esize = sizeof(v[0]);
432
 * npy_intp peel = npy_aligned_block_offset(v, esize, 16, n);
433
 * // peel to alignment 16
434
 * for (i = 0; i < peel; i++)
435
 *   <scalar-op>
436
 * // simd vectorized operation
437
 * for (; i < npy_blocked_end(peel, esize, 16, n); i += 16 / esize)
438
 *   <blocked-op>
439
 * // handle scalar rest
440
 * for(; i < n; i++)
441
 *   <scalar-op>
442
 */
443
static NPY_INLINE npy_intp
444
npy_blocked_end(const npy_uintp peel, const npy_uintp esize,
445
                const npy_uintp vsz, const npy_uintp nvals)
446
{
447 1
    npy_uintp ndiff = nvals - peel;
448 1
    npy_uintp res = (ndiff - ndiff % (vsz / esize));
449

450
    assert(nvals >= peel);
451
    assert(res <= NPY_MAX_INTP);
452 1
    return (npy_intp)(res);
453
}
454

455

456
/* byte swapping functions */
457
static NPY_INLINE npy_uint16
458
npy_bswap2(npy_uint16 x)
459
{
460 1
    return ((x & 0xffu) << 8) | (x >> 8);
461
}
462

463
/*
464
 * treat as int16 and byteswap unaligned memory,
465
 * some cpus don't support unaligned access
466
 */
467
static NPY_INLINE void
npy_bswap2_unaligned(char * x)
{
    /* Exchange the two bytes through a temporary. */
    const char tmp = x[1];
    x[1] = x[0];
    x[0] = tmp;
}
474

475
/* Byteswap an aligned 32-bit value. */
static NPY_INLINE npy_uint32
npy_bswap4(npy_uint32 x)
{
#ifdef HAVE___BUILTIN_BSWAP32
    /* Use the compiler intrinsic when the build detected it. */
    return __builtin_bswap32(x);
#else
    /* Portable fallback: move each byte to its mirrored position. */
    return ((x & 0xffu) << 24) | ((x & 0xff00u) << 8) |
           ((x & 0xff0000u) >> 8) | (x >> 24);
#endif
}
485

486
/* Byteswap 4 bytes in place; safe for unaligned memory. */
static NPY_INLINE void
npy_bswap4_unaligned(char * x)
{
    const char b0 = x[0];
    const char b1 = x[1];
    x[0] = x[3];
    x[1] = x[2];
    x[2] = b1;
    x[3] = b0;
}
496

497
/* Byteswap an aligned 64-bit value. */
static NPY_INLINE npy_uint64
npy_bswap8(npy_uint64 x)
{
#ifdef HAVE___BUILTIN_BSWAP64
    /* Use the compiler intrinsic when the build detected it. */
    return __builtin_bswap64(x);
#else
    /* Portable fallback: move each of the 8 bytes to its mirrored position. */
    return ((x & 0xffULL) << 56) |
           ((x & 0xff00ULL) << 40) |
           ((x & 0xff0000ULL) << 24) |
           ((x & 0xff000000ULL) << 8) |
           ((x & 0xff00000000ULL) >> 8) |
           ((x & 0xff0000000000ULL) >> 24) |
           ((x & 0xff000000000000ULL) >> 40) |
           ( x >> 56);
#endif
}
513

514
/* Byteswap 8 bytes in place; safe for unaligned memory. */
static NPY_INLINE void
npy_bswap8_unaligned(char * x)
{
    int i;
    /* Mirror the bytes pairwise: x[i] <-> x[7 - i]. */
    for (i = 0; i < 4; i++) {
        const char tmp = x[i];
        x[i] = x[7 - i];
        x[7 - i] = tmp;
    }
}
522

523

524
/*
 * Start raw iteration: zero the n-dimensional coordinate array and open
 * the do {} loop that a matching NPY_RAW_ITER_*_NEXT macro closes.
 * The caller's innermost loop (idim == 0) goes inside the do {} body.
 */
#define NPY_RAW_ITER_START(idim, ndim, coord, shape) \
        memset((coord), 0, (ndim) * sizeof(coord[0])); \
        do {

/*
 * Increment to the next n-dimensional coordinate for one raw array.
 * Each dimension that overflows is reset to zero and its data pointer
 * rewound; the loop condition terminates the do {} once every dimension
 * has wrapped (idim reaches ndim).
 */
#define NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (data) -= ((shape)[idim] - 1) * (strides)[idim]; \
                } \
                else { \
                    (data) += (strides)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))

/*
 * Increment to the next n-dimensional coordinate for two raw arrays.
 * Both data pointers are advanced/rewound in lockstep using their own
 * strides; the shared coord/shape drive the iteration.
 */
#define NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape, \
                              dataA, stridesA, dataB, stridesB) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
                } \
                else { \
                    (dataA) += (stridesA)[idim]; \
                    (dataB) += (stridesB)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))

/* Increment to the next n-dimensional coordinate for three raw arrays. */
#define NPY_RAW_ITER_THREE_NEXT(idim, ndim, coord, shape, \
                              dataA, stridesA, \
                              dataB, stridesB, \
                              dataC, stridesC) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
                    (dataC) -= ((shape)[idim] - 1) * (stridesC)[idim]; \
                } \
                else { \
                    (dataA) += (stridesA)[idim]; \
                    (dataB) += (stridesB)[idim]; \
                    (dataC) += (stridesC)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))

/* Increment to the next n-dimensional coordinate for four raw arrays. */
#define NPY_RAW_ITER_FOUR_NEXT(idim, ndim, coord, shape, \
                              dataA, stridesA, \
                              dataB, stridesB, \
                              dataC, stridesC, \
                              dataD, stridesD) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
                    (dataC) -= ((shape)[idim] - 1) * (stridesC)[idim]; \
                    (dataD) -= ((shape)[idim] - 1) * (stridesD)[idim]; \
                } \
                else { \
                    (dataA) += (stridesA)[idim]; \
                    (dataB) += (stridesB)[idim]; \
                    (dataC) += (stridesC)[idim]; \
                    (dataD) += (stridesD)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))
604

605

606
/*
607
 *            TRIVIAL ITERATION
608
 *
609
 * In some cases when the iteration order isn't important, iteration over
610
 * arrays is trivial.  This is the case when:
611
 *   * The array has 0 or 1 dimensions.
612
 *   * The array is C or Fortran contiguous.
613
 * Use of an iterator can be skipped when this occurs.  These macros assist
614
 * in detecting and taking advantage of the situation.  Note that it may
615
 * be worthwhile to further check if the stride is a contiguous stride
616
 * and take advantage of that.
617
 *
618
 * Here is example code for a single array:
619
 *
620
 *      if (PyArray_TRIVIALLY_ITERABLE(self)) {
621
 *          char *data;
622
 *          npy_intp count, stride;
623
 *
624
 *          PyArray_PREPARE_TRIVIAL_ITERATION(self, count, data, stride);
625
 *
626
 *          while (count--) {
627
 *              // Use the data pointer
628
 *
629
 *              data += stride;
630
 *          }
631
 *      }
632
 *      else {
633
 *          // Create iterator, etc...
634
 *      }
635
 *
636
 * Here is example code for a pair of arrays:
637
 *
638
 *      if (PyArray_TRIVIALLY_ITERABLE_PAIR(a1, a2)) {
639
 *          char *data1, *data2;
640
 *          npy_intp count, stride1, stride2;
641
 *
642
 *          PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(a1, a2, count,
643
 *                                  data1, data2, stride1, stride2);
644
 *
645
 *          while (count--) {
646
 *              // Use the data1 and data2 pointers
647
 *
648
 *              data1 += stride1;
649
 *              data2 += stride2;
650
 *          }
651
 *      }
652
 *      else {
653
 *          // Create iterator, etc...
654
 *      }
655
 */
656

657
/*
658
 * Note: Equivalently iterable macro requires one of arr1 or arr2 be
659
 *       trivially iterable to be valid.
660
 */
661

662
/**
663
 * Determine whether two arrays are safe for trivial iteration in cases where
664
 * some of the arrays may be modified.
665
 *
666
 * In-place iteration is safe if one of the following is true:
667
 *
668
 * - Both arrays are read-only
669
 * - The arrays do not have overlapping memory (based on a check that may be too
670
 *   strict)
671
 * - The strides match, and the non-read-only array base addresses are equal or
672
 *   before the read-only one, ensuring correct data dependency.
673
 */
674

675
/* Values for the *_read arguments of the overlap/equivalence checks below. */
#define PyArray_TRIVIALLY_ITERABLE_OP_NOREAD 0
#define PyArray_TRIVIALLY_ITERABLE_OP_READ 1

/*
 * Iteration without an iterator object is possible for 0-d/1-d arrays
 * and for C- or F-contiguous arrays of any dimension.
 */
#define PyArray_TRIVIALLY_ITERABLE(arr) ( \
                    PyArray_NDIM(arr) <= 1 || \
                    PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \
                    PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \
                    )

/*
 * Stride to use when trivially iterating an operand of a pair/triple:
 * 0 for a single-element (broadcast) operand, the actual stride for 1-d
 * arrays, and the item size for contiguous higher-dimensional arrays.
 */
#define PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size, arr) ( \
        assert(PyArray_TRIVIALLY_ITERABLE(arr)), \
        size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \
                             PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr)))
688

689
/*
 * Returns 1 when it is safe to iterate arr1 and arr2 together in trivial
 * (flat, memory-order) fashion even though one of them may be written.
 * See the comment block above for the three conditions that make in-place
 * iteration safe.
 */
static NPY_INLINE int
PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2,
                                         int arr1_read, int arr2_read)
{
    npy_intp size1, size2, stride1, stride2;
    int arr1_ahead = 0, arr2_ahead = 0;

    /* Both operands read-only: nothing is written, always safe. */
    if (arr1_read && arr2_read) {
        return 1;
    }

    /* No possible memory overlap (bounds-based check, may be too strict). */
    if (solve_may_share_memory(arr1, arr2, 1) == 0) {
        return 1;
    }

    /*
     * Arrays overlapping in memory may be equivalently iterable if input
     * arrays stride ahead faster than output arrays.
     */

    size1 = PyArray_SIZE(arr1);
    size2 = PyArray_SIZE(arr2);

    stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1);
    stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2);

    /*
     * Arrays with zero stride are never "ahead" since the element is reused
     * (at this point we know the array extents overlap).
     */

    /*
     * "Ahead" means: at every step this operand's read position is at or
     * past the other operand's position, so it only sees not-yet-written
     * data.  The comparison direction flips with the sign of the stride.
     */
    if (stride1 > 0) {
        arr1_ahead = (stride1 >= stride2 &&
                      PyArray_BYTES(arr1) >= PyArray_BYTES(arr2));
    }
    else if (stride1 < 0) {
        arr1_ahead = (stride1 <= stride2 &&
                      PyArray_BYTES(arr1) <= PyArray_BYTES(arr2));
    }

    if (stride2 > 0) {
        arr2_ahead = (stride2 >= stride1 &&
                      PyArray_BYTES(arr2) >= PyArray_BYTES(arr1));
    }
    else if (stride2 < 0) {
        arr2_ahead = (stride2 <= stride1 &&
                      PyArray_BYTES(arr2) <= PyArray_BYTES(arr1));
    }

    /* Safe when every read-only operand is "ahead" of the written one. */
    return (!arr1_read || arr1_ahead) && (!arr2_read || arr2_ahead);
}
740

741
/*
 * True when arr1 and arr2 have identical ndim and dims, and share at least
 * one of the C/F contiguity flags (the final '&' intersects the flag bits
 * of the two operands).
 */
#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) (            \
                        PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \
                        PyArray_CompareLists(PyArray_DIMS(arr1), \
                                             PyArray_DIMS(arr2), \
                                             PyArray_NDIM(arr1)) && \
                        (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \
                                      NPY_ARRAY_F_CONTIGUOUS)) & \
                                (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \
                                              NPY_ARRAY_F_CONTIGUOUS)) \
                        )

/*
 * Shape/contiguity equivalence plus the memory-overlap safety check
 * implemented by PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK above.
 */
#define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2, arr1_read, arr2_read) ( \
                        PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \
                        PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK( \
                            arr1, arr2, arr1_read, arr2_read))

/*
 * Fill 'count', 'data' and 'stride' for trivially iterating over 'arr':
 * 0-d arrays get stride 0, 1-d arrays their real stride, and contiguous
 * higher-dimensional arrays the item size.
 */
#define PyArray_PREPARE_TRIVIAL_ITERATION(arr, count, data, stride) \
                    count = PyArray_SIZE(arr); \
                    data = PyArray_BYTES(arr); \
                    stride = ((PyArray_NDIM(arr) == 0) ? 0 : \
                                    ((PyArray_NDIM(arr) == 1) ? \
                                            PyArray_STRIDE(arr, 0) : \
                                            PyArray_ITEMSIZE(arr)));

/*
 * True when arr1/arr2 can be iterated together without an iterator:
 * arr1 trivially iterable, arr2 a scalar or shape/contiguity-equivalent
 * (or arr1 a scalar and arr2 trivially iterable), plus the overlap check.
 */
#define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2, arr1_read, arr2_read) (   \
                    PyArray_TRIVIALLY_ITERABLE(arr1) && \
                        (PyArray_NDIM(arr2) == 0 || \
                         PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) ||  \
                         (PyArray_NDIM(arr1) == 0 && \
                             PyArray_TRIVIALLY_ITERABLE(arr2) \
                         ) \
                        ) && \
                        PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr2, arr1_read, arr2_read) \
                        )
/*
 * Fill count/data/stride for a pair; 'count' is the larger (broadcast)
 * size unless the first operand is empty.
 */
#define PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(arr1, arr2, \
                                        count, \
                                        data1, data2, \
                                        stride1, stride2) { \
                    npy_intp size1 = PyArray_SIZE(arr1); \
                    npy_intp size2 = PyArray_SIZE(arr2); \
                    count = ((size1 > size2) || size1 == 0) ? size1 : size2; \
                    data1 = PyArray_BYTES(arr1); \
                    data2 = PyArray_BYTES(arr2); \
                    stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1); \
                    stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \
                }

/*
 * Same as the pair check, extended to three operands: every non-scalar
 * operand must be shape/contiguity-equivalent to the others, and every
 * pair must pass the overlap check.
 */
#define PyArray_TRIVIALLY_ITERABLE_TRIPLE(arr1, arr2, arr3, arr1_read, arr2_read, arr3_read) ( \
                PyArray_TRIVIALLY_ITERABLE(arr1) && \
                    ((PyArray_NDIM(arr2) == 0 && \
                        (PyArray_NDIM(arr3) == 0 || \
                            PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr3) \
                        ) \
                     ) || \
                     (PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \
                        (PyArray_NDIM(arr3) == 0 || \
                            PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr3) \
                        ) \
                     ) || \
                     (PyArray_NDIM(arr1) == 0 && \
                        PyArray_TRIVIALLY_ITERABLE(arr2) && \
                            (PyArray_NDIM(arr3) == 0 || \
                                PyArray_EQUIVALENTLY_ITERABLE_BASE(arr2, arr3) \
                            ) \
                     ) \
                    ) && \
                    PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr2, arr1_read, arr2_read) && \
                    PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr3, arr1_read, arr3_read) && \
                    PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr2, arr3, arr2_read, arr3_read) \
                )

/*
 * Fill count/data/stride for a triple; 'count' is the largest (broadcast)
 * size, with the same empty-operand handling as the pair version.
 */
#define PyArray_PREPARE_TRIVIAL_TRIPLE_ITERATION(arr1, arr2, arr3, \
                                        count, \
                                        data1, data2, data3, \
                                        stride1, stride2, stride3) { \
                    npy_intp size1 = PyArray_SIZE(arr1); \
                    npy_intp size2 = PyArray_SIZE(arr2); \
                    npy_intp size3 = PyArray_SIZE(arr3); \
                    count = ((size1 > size2) || size1 == 0) ? size1 : size2; \
                    count = ((size3 > count) || size3 == 0) ? size3 : count; \
                    data1 = PyArray_BYTES(arr1); \
                    data2 = PyArray_BYTES(arr2); \
                    data3 = PyArray_BYTES(arr3); \
                    stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1); \
                    stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \
                    stride3 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size3, arr3); \
                }
828

829
#endif
