#ifndef __LOWLEVEL_STRIDED_LOOPS_H
#define __LOWLEVEL_STRIDED_LOOPS_H
#include "common.h"
#include <npy_config.h>
#include "mem_overlap.h"

/* For PyArray_ macros used below */
#include "numpy/ndarrayobject.h"

/*
 * NOTE: This API should remain private for the time being, to allow
 *       for further refinement.  I think the 'aligned' mechanism
 *       needs changing, for example.
 *
 * Note: Updated in 2018 to distinguish "true" from "uint" alignment.
 */

/*
 * This function pointer is for unary operations that input an
 * arbitrarily strided one-dimensional array segment and output
 * an arbitrarily strided array segment of the same size.
 * It may be a fully general function, or a specialized function
 * when the strides or item size have particular known values.
 *
 * Examples of unary operations are a straight copy, a byte-swap,
 * and a casting operation.
 *
 * The 'transferdata' parameter is slightly special, following a
 * generic auxiliary data pattern defined in ndarraytypes.h:
 * use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
 */
typedef int (PyArray_StridedUnaryOp)(
        char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
        npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);

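/*
 * Example (illustrative sketch, not part of this header's API): a
 * conforming element-wise copy for fixed 4-byte items.  Any function
 * with this shape can be used where a PyArray_StridedUnaryOp is
 * expected; 'transferdata' is unused here.
 *
 *   static int
 *   copy_4byte_strided(char *dst, npy_intp dst_stride,
 *                      char *src, npy_intp src_stride,
 *                      npy_intp N, npy_intp src_itemsize,
 *                      NpyAuxData *transferdata)
 *   {
 *       npy_intp i;
 *       for (i = 0; i < N; ++i) {
 *           memcpy(dst, src, 4);    // requires <string.h>
 *           dst += dst_stride;
 *           src += src_stride;
 *       }
 *       return 0;
 *   }
 */
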
/*
 * This is for pointers to functions which behave exactly as
 * for PyArray_StridedUnaryOp, but with an additional mask controlling
 * which values are transformed.
 *
 * In particular, the 'i'-th element is operated on if and only if
 * mask[i*mask_stride] is true.
 */
typedef int (PyArray_MaskedStridedUnaryOp)(
        char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
        npy_bool *mask, npy_intp mask_stride,
        npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);

/*
 * Gives back a function pointer to a specialized function for copying
 * strided memory.  Returns NULL if there is a problem with the inputs.
 *
 * aligned:
 *      Should be 1 if the src and dst pointers always point to
 *      locations at which a uint of equal size to dtype->elsize
 *      would be aligned, 0 otherwise.
 * src_stride:
 *      Should be the src stride if it will always be the same,
 *      NPY_MAX_INTP otherwise.
 * dst_stride:
 *      Should be the dst stride if it will always be the same,
 *      NPY_MAX_INTP otherwise.
 * itemsize:
 *      Should be the item size if it will always be the same, 0 otherwise.
 */
NPY_NO_EXPORT PyArray_StridedUnaryOp *
PyArray_GetStridedCopyFn(int aligned,
                        npy_intp src_stride, npy_intp dst_stride,
                        npy_intp itemsize);

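/*
 * Example (illustrative sketch): selecting and calling a specialized
 * copy for N contiguous doubles.  'dst', 'src' and 'N' stand in for
 * values the caller already has; a NULL 'transferdata' is assumed to
 * be acceptable for the plain copy specializations, which do not use
 * auxiliary data.
 *
 *   PyArray_StridedUnaryOp *copy_fn;
 *
 *   copy_fn = PyArray_GetStridedCopyFn(1, sizeof(npy_double),
 *                       sizeof(npy_double), sizeof(npy_double));
 *   if (copy_fn == NULL) {
 *       // handle the error
 *   }
 *   copy_fn(dst, sizeof(npy_double), src, sizeof(npy_double),
 *           N, sizeof(npy_double), NULL);
 */
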
/*
 * Gives back a function pointer to a specialized function for copying
 * and swapping strided memory.  This assumes each element is a single
 * value to be swapped.
 *
 * Parameters are as for PyArray_GetStridedCopyFn.
 */
NPY_NO_EXPORT PyArray_StridedUnaryOp *
PyArray_GetStridedCopySwapFn(int aligned,
                        npy_intp src_stride, npy_intp dst_stride,
                        npy_intp itemsize);

/*
 * Gives back a function pointer to a specialized function for copying
 * and swapping strided memory.  This assumes each element is a pair
 * of values, each of which needs to be swapped.
 *
 * Parameters are as for PyArray_GetStridedCopyFn.
 */
NPY_NO_EXPORT PyArray_StridedUnaryOp *
PyArray_GetStridedCopySwapPairFn(int aligned,
                        npy_intp src_stride, npy_intp dst_stride,
                        npy_intp itemsize);

/*
 * Gives back a transfer function and transfer data pair which copies
 * the data from source to dest, truncating it if the data doesn't
 * fit, and padding with zero bytes if there's too much space.
 *
 * For information on the 'aligned', 'src_stride' and 'dst_stride'
 * parameters, see above.
 *
 * Returns NPY_SUCCEED or NPY_FAIL.
 */
NPY_NO_EXPORT int
PyArray_GetStridedZeroPadCopyFn(int aligned, int unicode_swap,
                        npy_intp src_stride, npy_intp dst_stride,
                        npy_intp src_itemsize, npy_intp dst_itemsize,
                        PyArray_StridedUnaryOp **out_stransfer,
                        NpyAuxData **out_transferdata);

/*
 * For casts between built-in numeric types,
 * this produces a function pointer for casting from src_type_num
 * to dst_type_num.  If a conversion is unsupported, returns NULL
 * without setting a Python exception.
 */
NPY_NO_EXPORT PyArray_StridedUnaryOp *
PyArray_GetStridedNumericCastFn(int aligned,
                        npy_intp src_stride, npy_intp dst_stride,
                        int src_type_num, int dst_type_num);

/*
 * Gets an operation which copies elements of the given dtype,
 * swapping if the dtype isn't in native byte order (NBO).
 *
 * Returns NPY_SUCCEED or NPY_FAIL.
 */
NPY_NO_EXPORT int
PyArray_GetDTypeCopySwapFn(int aligned,
                        npy_intp src_stride, npy_intp dst_stride,
                        PyArray_Descr *dtype,
                        PyArray_StridedUnaryOp **out_stransfer,
                        NpyAuxData **out_transferdata);

/*
 * If it's possible, gives back a transfer function which casts and/or
 * byte swaps data with the dtype 'src_dtype' into data with the dtype
 * 'dst_dtype'.  If 'out_transferdata' is populated with a non-NULL value,
 * it must be deallocated with the NPY_AUXDATA_FREE
 * function when the transfer function is no longer required.
 *
 * aligned:
 *      Should be 1 if the src and dst pointers always point to
 *      locations at which a uint of equal size to dtype->elsize
 *      would be aligned, 0 otherwise.
 * src_stride:
 *      Should be the src stride if it will always be the same,
 *      NPY_MAX_INTP otherwise.
 * dst_stride:
 *      Should be the dst stride if it will always be the same,
 *      NPY_MAX_INTP otherwise.
 * src_dtype:
 *      The data type of source data.  If this is NULL, a transfer
 *      function which sets the destination to zeros is produced.
 * dst_dtype:
 *      The data type of destination data.  If this is NULL and
 *      move_references is 1, a transfer function which decrements
 *      source data references is produced.
 * move_references:
 *      If 0, the destination data gets new reference ownership.
 *      If 1, the references from the source data are moved to
 *      the destination data.
 * out_stransfer:
 *      The resulting transfer function is placed here.
 * out_transferdata:
 *      The auxiliary data for the transfer function is placed here.
 *      When finished with the transfer function, the caller must call
 *      NPY_AUXDATA_FREE on this data.
 * out_needs_api:
 *      If this is non-NULL, and the transfer function produced needs
 *      to call into the (Python) API, this gets set to 1.  This
 *      remains untouched if no API access is required.
 *
 * WARNING: If you set move_references to 1, it is best that src_stride is
 *          never zero when calling the transfer function.  Otherwise, the
 *          first destination reference will get the value and all the rest
 *          will get NULL.
 *
 * Returns NPY_SUCCEED or NPY_FAIL.
 */
NPY_NO_EXPORT int
PyArray_GetDTypeTransferFunction(int aligned,
                        npy_intp src_stride, npy_intp dst_stride,
                        PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                        int move_references,
                        PyArray_StridedUnaryOp **out_stransfer,
                        NpyAuxData **out_transferdata,
                        int *out_needs_api);

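/*
 * Example (illustrative sketch): the typical acquire/use/release cycle
 * for a transfer function.  'src', 'dst', 'src_dtype', 'dst_dtype' and
 * 'N' stand in for values the caller already has; the buffers are
 * assumed contiguous, so the element sizes serve as the strides.
 *
 *   PyArray_StridedUnaryOp *stransfer = NULL;
 *   NpyAuxData *transferdata = NULL;
 *   int needs_api = 0;
 *
 *   if (PyArray_GetDTypeTransferFunction(0,
 *                       src_dtype->elsize, dst_dtype->elsize,
 *                       src_dtype, dst_dtype, 0,
 *                       &stransfer, &transferdata,
 *                       &needs_api) != NPY_SUCCEED) {
 *       return -1;
 *   }
 *   stransfer(dst, dst_dtype->elsize, src, src_dtype->elsize,
 *             N, src_dtype->elsize, transferdata);
 *   NPY_AUXDATA_FREE(transferdata);
 */
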
/*
 * This is identical to PyArray_GetDTypeTransferFunction, but returns a
 * transfer function which also takes a mask as a parameter.  The mask is used
 * to determine which values to copy, and data is transferred exactly when
 * mask[i*mask_stride] is true.
 *
 * If move_references is true, values which are not copied to the
 * destination will still have their source reference decremented.
 *
 * If mask_dtype is NPY_BOOL or NPY_UINT8, each full element is either
 * transferred or not according to the mask as described above.  If
 * dst_dtype and mask_dtype are both struct dtypes, their names must
 * match exactly, and the dtype of each leaf field in mask_dtype must
 * be either NPY_BOOL or NPY_UINT8.
 */
NPY_NO_EXPORT int
PyArray_GetMaskedDTypeTransferFunction(int aligned,
                        npy_intp src_stride,
                        npy_intp dst_stride,
                        npy_intp mask_stride,
                        PyArray_Descr *src_dtype,
                        PyArray_Descr *dst_dtype,
                        PyArray_Descr *mask_dtype,
                        int move_references,
                        PyArray_MaskedStridedUnaryOp **out_stransfer,
                        NpyAuxData **out_transferdata,
                        int *out_needs_api);

/*
 * Casts the specified number of elements from 'src' with data type
 * 'src_dtype' to 'dst' with 'dst_dtype'.  See
 * PyArray_GetDTypeTransferFunction for more details.
 *
 * Returns NPY_SUCCEED or NPY_FAIL.
 */
NPY_NO_EXPORT int
PyArray_CastRawArrays(npy_intp count,
                        char *src, char *dst,
                        npy_intp src_stride, npy_intp dst_stride,
                        PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                        int move_references);

/*
 * These two functions copy or convert the data of an n-dimensional array
 * to/from a 1-dimensional strided buffer.  These functions will only call
 * 'stransfer' with the provided dst_stride/src_stride and
 * dst_strides[0]/src_strides[0], so the caller can use those values to
 * specialize the function.
 * Note that even if ndim == 0, everything needs to be set as if ndim == 1.
 *
 * The return value is the number of elements it couldn't copy.  A return value
 * of 0 means all elements were copied, a larger value means the end of
 * the n-dimensional array was reached before 'count' elements were copied.
 * A negative return value indicates an error occurred.
 *
 * ndim:
 *      The number of dimensions of the n-dimensional array.
 * dst/src/mask:
 *      The destination, source or mask starting pointer.
 * dst_stride/src_stride/mask_stride:
 *      The stride of the 1-dimensional strided buffer.
 * dst_strides/src_strides:
 *      The strides of the n-dimensional array.
 * dst_strides_inc/src_strides_inc:
 *      How much to add to the ..._strides pointer to get to the next stride.
 * coords:
 *      The starting coordinates in the n-dimensional array.
 * coords_inc:
 *      How much to add to the coords pointer to get to the next coordinate.
 * shape:
 *      The shape of the n-dimensional array.
 * shape_inc:
 *      How much to add to the shape pointer to get to the next shape entry.
 * count:
 *      How many elements to transfer.
 * src_itemsize:
 *      How big each element is.  If transferring between elements of different
 *      sizes, for example a casting operation, the 'stransfer' function
 *      should be specialized for that, in which case 'stransfer' will use
 *      this parameter as the source item size.
 * stransfer:
 *      The strided transfer function.
 * transferdata:
 *      An auxiliary data pointer passed to the strided transfer function.
 *      This follows the conventions of NpyAuxData objects.
 */
NPY_NO_EXPORT npy_intp
PyArray_TransferNDimToStrided(npy_intp ndim,
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp const *src_strides, npy_intp src_strides_inc,
                        npy_intp const *coords, npy_intp coords_inc,
                        npy_intp const *shape, npy_intp shape_inc,
                        npy_intp count, npy_intp src_itemsize,
                        PyArray_StridedUnaryOp *stransfer,
                        NpyAuxData *transferdata);

NPY_NO_EXPORT npy_intp
PyArray_TransferStridedToNDim(npy_intp ndim,
                        char *dst, npy_intp const *dst_strides, npy_intp dst_strides_inc,
                        char *src, npy_intp src_stride,
                        npy_intp const *coords, npy_intp coords_inc,
                        npy_intp const *shape, npy_intp shape_inc,
                        npy_intp count, npy_intp src_itemsize,
                        PyArray_StridedUnaryOp *stransfer,
                        NpyAuxData *transferdata);

NPY_NO_EXPORT npy_intp
PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
                        char *dst, npy_intp const *dst_strides, npy_intp dst_strides_inc,
                        char *src, npy_intp src_stride,
                        npy_bool *mask, npy_intp mask_stride,
                        npy_intp const *coords, npy_intp coords_inc,
                        npy_intp const *shape, npy_intp shape_inc,
                        npy_intp count, npy_intp src_itemsize,
                        PyArray_MaskedStridedUnaryOp *stransfer,
                        NpyAuxData *data);

NPY_NO_EXPORT int
mapiter_trivial_get(PyArrayObject *self, PyArrayObject *ind,
                    PyArrayObject *result);

NPY_NO_EXPORT int
mapiter_trivial_set(PyArrayObject *self, PyArrayObject *ind,
                    PyArrayObject *result);

NPY_NO_EXPORT int
mapiter_get(PyArrayMapIterObject *mit);

NPY_NO_EXPORT int
mapiter_set(PyArrayMapIterObject *mit);

/*
 * Prepares shape and strides for a simple raw array iteration.
 * This sorts the strides into FORTRAN order, reverses any negative
 * strides, then coalesces axes where possible.  The results are
 * filled in the output parameters.
 *
 * This is intended for simple, lightweight iteration over arrays
 * where no buffering of any kind is needed, and the array may
 * not be stored as a PyArrayObject.
 *
 * You can use this together with NPY_RAW_ITER_START and
 * NPY_RAW_ITER_ONE_NEXT to handle the looping boilerplate of everything
 * but the innermost loop (which is for idim == 0).  See the example
 * after the NPY_RAW_ITER_ONE_NEXT macro below.
 *
 * Returns 0 on success, -1 on failure.
 */
NPY_NO_EXPORT int
PyArray_PrepareOneRawArrayIter(int ndim, npy_intp const *shape,
                            char *data, npy_intp const *strides,
                            int *out_ndim, npy_intp *out_shape,
                            char **out_data, npy_intp *out_strides);

/*
 * The same as PyArray_PrepareOneRawArrayIter, but for two
 * operands instead of one.  Any broadcasting of the two operands
 * should have already been done before calling this function,
 * as the ndim and shape are only specified once for both operands.
 *
 * Only the strides of the first operand are used to reorder
 * the dimensions; no attempt to consider all the strides together
 * is made, as is done in the NpyIter object.
 *
 * You can use this together with NPY_RAW_ITER_START and
 * NPY_RAW_ITER_TWO_NEXT to handle the looping boilerplate of everything
 * but the innermost loop (which is for idim == 0).
 *
 * Returns 0 on success, -1 on failure.
 */
NPY_NO_EXPORT int
PyArray_PrepareTwoRawArrayIter(int ndim, npy_intp const *shape,
                            char *dataA, npy_intp const *stridesA,
                            char *dataB, npy_intp const *stridesB,
                            int *out_ndim, npy_intp *out_shape,
                            char **out_dataA, npy_intp *out_stridesA,
                            char **out_dataB, npy_intp *out_stridesB);

/*
 * The same as PyArray_PrepareOneRawArrayIter, but for three
 * operands instead of one.  Any broadcasting of the three operands
 * should have already been done before calling this function,
 * as the ndim and shape are only specified once for all operands.
 *
 * Only the strides of the first operand are used to reorder
 * the dimensions; no attempt to consider all the strides together
 * is made, as is done in the NpyIter object.
 *
 * You can use this together with NPY_RAW_ITER_START and
 * NPY_RAW_ITER_THREE_NEXT to handle the looping boilerplate of everything
 * but the innermost loop (which is for idim == 0).
 *
 * Returns 0 on success, -1 on failure.
 */
NPY_NO_EXPORT int
PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp const *shape,
                            char *dataA, npy_intp const *stridesA,
                            char *dataB, npy_intp const *stridesB,
                            char *dataC, npy_intp const *stridesC,
                            int *out_ndim, npy_intp *out_shape,
                            char **out_dataA, npy_intp *out_stridesA,
                            char **out_dataB, npy_intp *out_stridesB,
                            char **out_dataC, npy_intp *out_stridesC);

/*
 * Return number of elements that must be peeled from the start of 'addr' with
 * 'nvals' elements of size 'esize' in order to reach blockable alignment.
 * The required alignment in bytes is passed as the 'alignment' argument and
 * must be a power of two.  This function is used to prepare an array for
 * blocking.  See the 'npy_blocked_end' function documentation below for an
 * example of how this function is used.
 */
static NPY_INLINE npy_intp
npy_aligned_block_offset(const void * addr, const npy_uintp esize,
                         const npy_uintp alignment, const npy_uintp nvals)
{
    npy_uintp offset, peel;

    offset = (npy_uintp)addr & (alignment - 1);
    peel = offset ? (alignment - offset) / esize : 0;
    peel = (peel <= nvals) ? peel : nvals;
    assert(peel <= NPY_MAX_INTP);
    return (npy_intp)peel;
}

/*
 * Return upper loop bound for an array of 'nvals' elements
 * of size 'esize' peeled by 'offset' elements and blocking to
 * a vector size of 'vsz' in bytes.
 *
 * example usage:
 * npy_intp i;
 * double v[101];
 * npy_intp n = sizeof(v) / sizeof(v[0]);
 * npy_intp esize = sizeof(v[0]);
 * npy_intp peel = npy_aligned_block_offset(v, esize, 16, n);
 * // peel to alignment 16
 * for (i = 0; i < peel; i++)
 *   <scalar-op>
 * // simd vectorized operation
 * for (; i < npy_blocked_end(peel, esize, 16, n); i += 16 / esize)
 *   <blocked-op>
 * // handle scalar rest
 * for (; i < n; i++)
 *   <scalar-op>
 */
static NPY_INLINE npy_intp
npy_blocked_end(const npy_uintp peel, const npy_uintp esize,
                const npy_uintp vsz, const npy_uintp nvals)
{
    npy_uintp ndiff = nvals - peel;
    npy_uintp res = (ndiff - ndiff % (vsz / esize));

    assert(nvals >= peel);
    assert(res <= NPY_MAX_INTP);
    return (npy_intp)(res);
}

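/*
 * Worked example (illustrative): with esize == 8 and alignment == 16,
 * an addr whose low four bits are 0x8 gives offset == 8 in
 * npy_aligned_block_offset, so peel == (16 - 8) / 8 == 1 element must
 * be handled by the scalar prologue.  npy_blocked_end then rounds the
 * remaining nvals - 1 elements down to a multiple of vsz / esize == 2
 * elements, the width of one 16-byte vector.
 */
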

/* byte swapping functions */
static NPY_INLINE npy_uint16
npy_bswap2(npy_uint16 x)
{
    return ((x & 0xffu) << 8) | (x >> 8);
}

/*
 * Treat the memory as an int16 and byteswap it without assuming alignment;
 * some CPUs don't support unaligned access.
 */
static NPY_INLINE void
npy_bswap2_unaligned(char * x)
{
    char a = x[0];
    x[0] = x[1];
    x[1] = a;
}

static NPY_INLINE npy_uint32
npy_bswap4(npy_uint32 x)
{
#ifdef HAVE___BUILTIN_BSWAP32
    return __builtin_bswap32(x);
#else
    return ((x & 0xffu) << 24) | ((x & 0xff00u) << 8) |
           ((x & 0xff0000u) >> 8) | (x >> 24);
#endif
}

static NPY_INLINE void
npy_bswap4_unaligned(char * x)
{
    char a = x[0];
    x[0] = x[3];
    x[3] = a;
    a = x[1];
    x[1] = x[2];
    x[2] = a;
}

static NPY_INLINE npy_uint64
npy_bswap8(npy_uint64 x)
{
#ifdef HAVE___BUILTIN_BSWAP64
    return __builtin_bswap64(x);
#else
    return ((x & 0xffULL) << 56) |
           ((x & 0xff00ULL) << 40) |
           ((x & 0xff0000ULL) << 24) |
           ((x & 0xff000000ULL) << 8) |
           ((x & 0xff00000000ULL) >> 8) |
           ((x & 0xff0000000000ULL) >> 24) |
           ((x & 0xff000000000000ULL) >> 40) |
           ( x >> 56);
#endif
}

static NPY_INLINE void
npy_bswap8_unaligned(char * x)
{
    char a = x[0]; x[0] = x[7]; x[7] = a;
    a = x[1]; x[1] = x[6]; x[6] = a;
    a = x[2]; x[2] = x[5]; x[5] = a;
    a = x[3]; x[3] = x[4]; x[4] = a;
}

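/*
 * Example (illustrative sketch): byte-swapping a strided run of 4-byte
 * elements in place with the helpers above.  'data', 'stride' and 'N'
 * stand in for values the caller already has; the unaligned variant is
 * used because strided element addresses need not be 4-byte aligned.
 *
 *   npy_intp i;
 *   for (i = 0; i < N; ++i) {
 *       npy_bswap4_unaligned(data);
 *       data += stride;
 *   }
 */
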
/* Start raw iteration */
#define NPY_RAW_ITER_START(idim, ndim, coord, shape) \
        memset((coord), 0, (ndim) * sizeof(coord[0])); \
        do {

/* Increment to the next n-dimensional coordinate for one raw array */
#define NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (data) -= ((shape)[idim] - 1) * (strides)[idim]; \
                } \
                else { \
                    (data) += (strides)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))

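/*
 * Example (illustrative sketch): visiting every element of one raw
 * array with the macros above.  'ndim', 'shape', 'data' and 'strides'
 * describe the array; process_inner_loop is a hypothetical helper for
 * the innermost loop, which the macros intentionally leave to the
 * caller.
 *
 *   int idim, iter_ndim;
 *   npy_intp iter_shape[NPY_MAXDIMS], iter_strides[NPY_MAXDIMS];
 *   npy_intp coord[NPY_MAXDIMS];
 *   char *iter_data;
 *
 *   if (PyArray_PrepareOneRawArrayIter(ndim, shape, data, strides,
 *                       &iter_ndim, iter_shape,
 *                       &iter_data, iter_strides) < 0) {
 *       return -1;
 *   }
 *   NPY_RAW_ITER_START(idim, iter_ndim, coord, iter_shape) {
 *       // innermost loop: iter_shape[0] elements, stride iter_strides[0]
 *       process_inner_loop(iter_data, iter_strides[0], iter_shape[0]);
 *   } NPY_RAW_ITER_ONE_NEXT(idim, iter_ndim, coord, iter_shape,
 *                           iter_data, iter_strides);
 */
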
/* Increment to the next n-dimensional coordinate for two raw arrays */
#define NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape, \
                              dataA, stridesA, dataB, stridesB) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
                } \
                else { \
                    (dataA) += (stridesA)[idim]; \
                    (dataB) += (stridesB)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))

/* Increment to the next n-dimensional coordinate for three raw arrays */
#define NPY_RAW_ITER_THREE_NEXT(idim, ndim, coord, shape, \
                              dataA, stridesA, \
                              dataB, stridesB, \
                              dataC, stridesC) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
                    (dataC) -= ((shape)[idim] - 1) * (stridesC)[idim]; \
                } \
                else { \
                    (dataA) += (stridesA)[idim]; \
                    (dataB) += (stridesB)[idim]; \
                    (dataC) += (stridesC)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))

/* Increment to the next n-dimensional coordinate for four raw arrays */
#define NPY_RAW_ITER_FOUR_NEXT(idim, ndim, coord, shape, \
                              dataA, stridesA, \
                              dataB, stridesB, \
                              dataC, stridesC, \
                              dataD, stridesD) \
            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
                if (++(coord)[idim] == (shape)[idim]) { \
                    (coord)[idim] = 0; \
                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
                    (dataC) -= ((shape)[idim] - 1) * (stridesC)[idim]; \
                    (dataD) -= ((shape)[idim] - 1) * (stridesD)[idim]; \
                } \
                else { \
                    (dataA) += (stridesA)[idim]; \
                    (dataB) += (stridesB)[idim]; \
                    (dataC) += (stridesC)[idim]; \
                    (dataD) += (stridesD)[idim]; \
                    break; \
                } \
            } \
        } while ((idim) < (ndim))

/*
 * TRIVIAL ITERATION
 *
 * In some cases when the iteration order isn't important, iteration over
 * arrays is trivial.  This is the case when:
 *   * The array has 0 or 1 dimensions.
 *   * The array is C or Fortran contiguous.
 * Use of an iterator can be skipped when this occurs.  These macros assist
 * in detecting and taking advantage of the situation.  Note that it may
 * be worthwhile to further check if the stride is a contiguous stride
 * and take advantage of that.
 *
 * Here is example code for a single array:
 *
 *      if (PyArray_TRIVIALLY_ITERABLE(self)) {
 *          char *data;
 *          npy_intp count, stride;
 *
 *          PyArray_PREPARE_TRIVIAL_ITERATION(self, count, data, stride);
 *
 *          while (count--) {
 *              // Use the data pointer
 *
 *              data += stride;
 *          }
 *      }
 *      else {
 *          // Create iterator, etc...
 *      }
 *
 * Here is example code for a pair of arrays:
 *
 *      if (PyArray_TRIVIALLY_ITERABLE_PAIR(a1, a2)) {
 *          char *data1, *data2;
 *          npy_intp count, stride1, stride2;
 *
 *          PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(a1, a2, count,
 *                                  data1, data2, stride1, stride2);
 *
 *          while (count--) {
 *              // Use the data1 and data2 pointers
 *
 *              data1 += stride1;
 *              data2 += stride2;
 *          }
 *      }
 *      else {
 *          // Create iterator, etc...
 *      }
 */

/*
 * Note: the equivalently-iterable macro requires one of arr1 or arr2 to
 * be trivially iterable in order to be valid.
 */

/**
 * Determine whether two arrays are safe for trivial iteration in cases where
 * some of the arrays may be modified.
 *
 * In-place iteration is safe if one of the following is true:
 *
 * - Both arrays are read-only
 * - The arrays do not have overlapping memory (based on a check that may be
 *   too strict)
 * - The strides match, and the non-read-only array base addresses are equal
 *   or before the read-only one, ensuring correct data dependency.
 */

#define PyArray_TRIVIALLY_ITERABLE_OP_NOREAD 0
#define PyArray_TRIVIALLY_ITERABLE_OP_READ 1

#define PyArray_TRIVIALLY_ITERABLE(arr) ( \
                    PyArray_NDIM(arr) <= 1 || \
                    PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \
                    PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \
                    )

#define PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size, arr) ( \
        assert(PyArray_TRIVIALLY_ITERABLE(arr)), \
        size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \
                            PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr)))

static NPY_INLINE int
PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2,
                                         int arr1_read, int arr2_read)
{
    npy_intp size1, size2, stride1, stride2;
    int arr1_ahead = 0, arr2_ahead = 0;

    if (arr1_read && arr2_read) {
        return 1;
    }

    if (solve_may_share_memory(arr1, arr2, 1) == 0) {
        return 1;
    }

    /*
     * Arrays overlapping in memory may be equivalently iterable if the
     * input arrays stride ahead faster than the output arrays.
     */

    size1 = PyArray_SIZE(arr1);
    size2 = PyArray_SIZE(arr2);

    stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1);
    stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2);

    /*
     * Arrays with zero stride are never "ahead" since the element is reused
     * (at this point we know the array extents overlap).
     */

    if (stride1 > 0) {
        arr1_ahead = (stride1 >= stride2 &&
                      PyArray_BYTES(arr1) >= PyArray_BYTES(arr2));
    }
    else if (stride1 < 0) {
        arr1_ahead = (stride1 <= stride2 &&
                      PyArray_BYTES(arr1) <= PyArray_BYTES(arr2));
    }

    if (stride2 > 0) {
        arr2_ahead = (stride2 >= stride1 &&
                      PyArray_BYTES(arr2) >= PyArray_BYTES(arr1));
    }
    else if (stride2 < 0) {
        arr2_ahead = (stride2 <= stride1 &&
                      PyArray_BYTES(arr2) <= PyArray_BYTES(arr1));
    }

    return (!arr1_read || arr1_ahead) && (!arr2_read || arr2_ahead);
}

#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) ( \
                        PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \
                        PyArray_CompareLists(PyArray_DIMS(arr1), \
                                             PyArray_DIMS(arr2), \
                                             PyArray_NDIM(arr1)) && \
                        (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \
                                              NPY_ARRAY_F_CONTIGUOUS)) & \
                        (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \
                                              NPY_ARRAY_F_CONTIGUOUS)) \
                        )

#define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2, arr1_read, arr2_read) ( \
                        PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \
                        PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK( \
                            arr1, arr2, arr1_read, arr2_read))

#define PyArray_PREPARE_TRIVIAL_ITERATION(arr, count, data, stride) \
                    count = PyArray_SIZE(arr); \
                    data = PyArray_BYTES(arr); \
                    stride = ((PyArray_NDIM(arr) == 0) ? 0 : \
                                    ((PyArray_NDIM(arr) == 1) ? \
                                            PyArray_STRIDE(arr, 0) : \
                                            PyArray_ITEMSIZE(arr)));

#define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2, arr1_read, arr2_read) ( \
                    PyArray_TRIVIALLY_ITERABLE(arr1) && \
                        (PyArray_NDIM(arr2) == 0 || \
                         PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) || \
                         (PyArray_NDIM(arr1) == 0 && \
                             PyArray_TRIVIALLY_ITERABLE(arr2) \
                         ) \
                        ) && \
                        PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr2, arr1_read, arr2_read) \
                        )

#define PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(arr1, arr2, \
                                        count, \
                                        data1, data2, \
                                        stride1, stride2) { \
                    npy_intp size1 = PyArray_SIZE(arr1); \
                    npy_intp size2 = PyArray_SIZE(arr2); \
                    count = ((size1 > size2) || size1 == 0) ? size1 : size2; \
                    data1 = PyArray_BYTES(arr1); \
                    data2 = PyArray_BYTES(arr2); \
                    stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1); \
                    stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \
                }

#define PyArray_TRIVIALLY_ITERABLE_TRIPLE(arr1, arr2, arr3, arr1_read, arr2_read, arr3_read) ( \
                PyArray_TRIVIALLY_ITERABLE(arr1) && \
                    ((PyArray_NDIM(arr2) == 0 && \
                        (PyArray_NDIM(arr3) == 0 || \
                            PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr3) \
                        ) \
                     ) || \
                     (PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \
                        (PyArray_NDIM(arr3) == 0 || \
                            PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr3) \
                        ) \
                     ) || \
                     (PyArray_NDIM(arr1) == 0 && \
                        PyArray_TRIVIALLY_ITERABLE(arr2) && \
                        (PyArray_NDIM(arr3) == 0 || \
                            PyArray_EQUIVALENTLY_ITERABLE_BASE(arr2, arr3) \
                        ) \
                     ) \
                    ) && \
                PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr2, arr1_read, arr2_read) && \
                PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr3, arr1_read, arr3_read) && \
                PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr2, arr3, arr2_read, arr3_read) \
                )

#define PyArray_PREPARE_TRIVIAL_TRIPLE_ITERATION(arr1, arr2, arr3, \
                                        count, \
                                        data1, data2, data3, \
                                        stride1, stride2, stride3) { \
                    npy_intp size1 = PyArray_SIZE(arr1); \
                    npy_intp size2 = PyArray_SIZE(arr2); \
                    npy_intp size3 = PyArray_SIZE(arr3); \
                    count = ((size1 > size2) || size1 == 0) ? size1 : size2; \
                    count = ((size3 > count) || size3 == 0) ? size3 : count; \
                    data1 = PyArray_BYTES(arr1); \
                    data2 = PyArray_BYTES(arr2); \
                    data3 = PyArray_BYTES(arr3); \
                    stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1); \
                    stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \
                    stride3 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size3, arr3); \
                }

#endif