1
|
|
/**
|
2
|
|
* Macros to help build fast ufunc inner loops.
|
3
|
|
*
|
4
|
|
* These expect to have access to the arguments of a typical ufunc loop,
|
5
|
|
*
|
6
|
|
* char **args
|
7
|
|
* npy_intp const *dimensions
|
8
|
|
* npy_intp const *steps
|
9
|
|
*/
|
10
|
|
#ifndef _NPY_UMATH_FAST_LOOP_MACROS_H_
|
11
|
|
#define _NPY_UMATH_FAST_LOOP_MACROS_H_
|
12
|
|
|
13
|
|
static NPY_INLINE npy_uintp
|
14
|
|
abs_ptrdiff(char *a, char *b)
|
15
|
|
{
|
16
|
1
|
return (a > b) ? (a - b) : (b - a);
|
17
|
|
}
|
18
|
|
|
19
|
|
/**
|
20
|
|
* Simple unoptimized loop macros that iterate over the ufunc arguments in
|
21
|
|
* parallel.
|
22
|
|
* @{
|
23
|
|
*/
|
24
|
|
|
25
|
|
/**
 * (ignored) -> (op1)
 *
 * Declares `op1` (from args[1]), its byte step `os1` (from steps[1]), the
 * iteration count `n` (from dimensions[0]) and the counter `i`, then opens
 * a `for` statement. The statement or block written after the macro is the
 * loop body. args[0] and steps[0] are not read.
 */
#define OUTPUT_LOOP \
    npy_intp i; \
    npy_intp n = dimensions[0]; \
    npy_intp os1 = steps[1]; \
    char *op1 = args[1]; \
    for (i = 0; i < n; ++i, op1 += os1)
|
32
|
|
|
33
|
|
/**
 * (ip1) -> (op1)
 *
 * Declares the input cursor `ip1` (args[0]) and the output cursor `op1`
 * (args[1]) with their byte steps `is1` / `os1` (steps[0] / steps[1]),
 * the iteration count `n` (dimensions[0]) and the counter `i`, then opens
 * a `for` statement that advances both cursors once per iteration.
 */
#define UNARY_LOOP \
    char *ip1 = args[0]; \
    char *op1 = args[1]; \
    npy_intp is1 = steps[0]; \
    npy_intp os1 = steps[1]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for (i = 0; i < n; ++i, ip1 += is1, op1 += os1)
|
40
|
|
|
41
|
|
/**
 * (ip1) -> (op1, op2)
 *
 * One input cursor `ip1` (args[0], step `is1`) and two output cursors
 * `op1` / `op2` (args[1] / args[2], steps `os1` / `os2`). Also declares
 * `n` (dimensions[0]) and the counter `i`, then opens a `for` statement
 * that advances all three cursors once per iteration.
 */
#define UNARY_LOOP_TWO_OUT \
    char *ip1 = args[0]; \
    char *op1 = args[1]; \
    char *op2 = args[2]; \
    npy_intp is1 = steps[0]; \
    npy_intp os1 = steps[1]; \
    npy_intp os2 = steps[2]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for (i = 0; i < n; ++i, ip1 += is1, op1 += os1, op2 += os2)
|
48
|
|
|
49
|
|
/**
 * (ip1, ip2) -> (op1)
 *
 * Two input cursors `ip1` / `ip2` (args[0] / args[1], steps `is1` / `is2`)
 * and one output cursor `op1` (args[2], step `os1`). Also declares `n`
 * (dimensions[0]) and the counter `i`, then opens a `for` statement that
 * advances all three cursors once per iteration.
 */
#define BINARY_LOOP \
    char *ip1 = args[0]; \
    char *ip2 = args[1]; \
    char *op1 = args[2]; \
    npy_intp is1 = steps[0]; \
    npy_intp is2 = steps[1]; \
    npy_intp os1 = steps[2]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for (i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1)
|
56
|
|
|
57
|
|
/**
 * (ip1, ip2) -> (op1, op2)
 *
 * Two input cursors `ip1` / `ip2` (args[0] / args[1], steps `is1` / `is2`)
 * and two output cursors `op1` / `op2` (args[2] / args[3], steps `os1` /
 * `os2`). Also declares `n` (dimensions[0]) and the counter `i`, then opens
 * a `for` statement that advances all four cursors once per iteration.
 */
#define BINARY_LOOP_TWO_OUT \
    char *ip1 = args[0]; \
    char *ip2 = args[1]; \
    char *op1 = args[2]; \
    char *op2 = args[3]; \
    npy_intp is1 = steps[0]; \
    npy_intp is2 = steps[1]; \
    npy_intp os1 = steps[2]; \
    npy_intp os2 = steps[3]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for (i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)
|
64
|
|
|
65
|
|
/**
 * (ip1, ip2, ip3) -> (op1)
 *
 * Three input cursors `ip1` / `ip2` / `ip3` (args[0..2], steps `is1` /
 * `is2` / `is3`) and one output cursor `op1` (args[3], step `os1`). Also
 * declares `n` (dimensions[0]) and the counter `i`, then opens a `for`
 * statement that advances all four cursors once per iteration.
 */
#define TERNARY_LOOP \
    char *ip1 = args[0]; \
    char *ip2 = args[1]; \
    char *ip3 = args[2]; \
    char *op1 = args[3]; \
    npy_intp is1 = steps[0]; \
    npy_intp is2 = steps[1]; \
    npy_intp is3 = steps[2]; \
    npy_intp os1 = steps[3]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for (i = 0; i < n; ++i, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
|
72
|
|
|
73
|
|
/** @} */
|
74
|
|
|
75
|
|
/*
 * Unary loop with contiguous input and output: true when steps[0] equals
 * sizeof(tin) and steps[1] equals sizeof(tout).
 */
#define IS_UNARY_CONT(tin, tout) \
    ((steps[0] == sizeof(tin)) && (steps[1] == sizeof(tout)))
|
78
|
|
|
79
|
|
/* True when the operand described by steps[1] advances by sizeof(tout). */
#define IS_OUTPUT_CONT(tout) (sizeof(tout) == steps[1])
|
80
|
|
|
81
|
|
/*
 * True when args[0] and args[2] hold the same pointer and both of their
 * steps (steps[0] and steps[2]) are zero.
 */
#define IS_BINARY_REDUCE \
    ((args[0] == args[2]) && (steps[0] == 0) && (steps[2] == 0))
|
84
|
|
|
85
|
|
/*
 * Binary loop with contiguous inputs and output: true when steps[0] and
 * steps[1] both equal sizeof(tin) and steps[2] equals sizeof(tout).
 */
#define IS_BINARY_CONT(tin, tout) \
    ((steps[0] == sizeof(tin)) && \
     (steps[1] == sizeof(tin)) && \
     (steps[2] == sizeof(tout)))
|
89
|
|
|
90
|
|
/*
 * Binary loop whose first input is a scalar and whose second input and
 * output are contiguous: true when steps[0] is 0, steps[1] equals
 * sizeof(tin) and steps[2] equals sizeof(tout).
 */
#define IS_BINARY_CONT_S1(tin, tout) \
    ((steps[0] == 0) && \
     (steps[1] == sizeof(tin)) && \
     (steps[2] == sizeof(tout)))
|
94
|
|
|
95
|
|
/*
 * Binary loop whose second input is a scalar and whose first input and
 * output are contiguous: true when steps[0] equals sizeof(tin), steps[1]
 * is 0 and steps[2] equals sizeof(tout).
 */
#define IS_BINARY_CONT_S2(tin, tout) \
    ((steps[0] == sizeof(tin)) && \
     (steps[1] == 0) && \
     (steps[2] == sizeof(tout)))
|
99
|
|
|
100
|
|
/*
 * Unary loop with contiguous specialization.
 *
 * `op` is the code that works on the input value `tin in` and stores its
 * result through `tout *out`.
 * Combine with NPY_GCC_OPT_3 to allow autovectorization.
 * Should only be used where it's worthwhile, to avoid code bloat.
 *
 * BASE_UNARY_LOOP is the plain loop: on every UNARY_LOOP iteration it
 * points `out` at the current output element, loads the current input
 * element into `in`, and then runs `op`.
 */
#define BASE_UNARY_LOOP(tin, tout, op) \
    UNARY_LOOP { \
        tout *out = (tout *)op1; \
        const tin in = *(tin *)ip1; \
        op; \
    }
|
113
|
|
|
114
|
|
/*
 * Runs BASE_UNARY_LOOP(tin, tout, op) exactly once, but reaches it through
 * three separate branches. The loop text is the same in each branch; the
 * branch conditions allow the compiler to optimize each copy of the
 * generic macro for the case it is known to handle.
 */
#define UNARY_LOOP_FAST(tin, tout, op) \
    do { \
        if (!IS_UNARY_CONT(tin, tout)) { \
            /* arbitrary steps */ \
            BASE_UNARY_LOOP(tin, tout, op) \
        } \
        else if (args[0] != args[1]) { \
            /* contiguous, input and output start at different addresses */ \
            BASE_UNARY_LOOP(tin, tout, op) \
        } \
        else { \
            /* contiguous, input and output start at the same address */ \
            BASE_UNARY_LOOP(tin, tout, op) \
        } \
    } while (0)
|
130
|
|
|
131
|
|
/*
 * Binary loop with contiguous specialization.
 *
 * `op` is the code that works on the input values `tin in1` and `tin in2`
 * and stores its result through `tout *out`.
 * Combine with NPY_GCC_OPT_3 to allow autovectorization.
 * Should only be used where it's worthwhile, to avoid code bloat.
 *
 * BASE_BINARY_LOOP is the plain loop: on every BINARY_LOOP iteration it
 * points `out` at the current output element, loads both current input
 * elements into `in1` / `in2`, and then runs `op`.
 */
#define BASE_BINARY_LOOP(tin, tout, op) \
    BINARY_LOOP { \
        tout *out = (tout *)op1; \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        op; \
    }
|
145
|
|
|
146
|
|
/*
 * Unfortunately gcc 6/7 regressed and needs an additional hint to
 * vectorize in-place operations (PR80198). IVDEP_LOOP expands to that hint
 * (`#pragma GCC ivdep`) when __GNUC__ is defined and at least 6, and to
 * nothing otherwise.
 * It must only be used after op1 == ip1 or ip2 has been checked.
 * TODO: using ivdep might allow other compilers to vectorize too
 */
#if !defined(__GNUC__) || (__GNUC__ < 6)
#define IVDEP_LOOP
#else
#define IVDEP_LOOP _Pragma("GCC ivdep")
#endif
|
157
|
|
/*
 * In-place flavour of the plain binary loop. It declares the same cursors,
 * steps, `n` and `i` as BINARY_LOOP, but spells the `for` statement out so
 * that IVDEP_LOOP can be placed directly in front of it. Each iteration
 * points `out` at the current output element, loads `in1` / `in2`, and
 * runs `op`.
 */
#define BASE_BINARY_LOOP_INP(tin, tout, op) \
    char *ip1 = args[0]; \
    char *ip2 = args[1]; \
    char *op1 = args[2]; \
    npy_intp is1 = steps[0]; \
    npy_intp is2 = steps[1]; \
    npy_intp os1 = steps[2]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    IVDEP_LOOP \
    for (i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1) { \
        tout *out = (tout *)op1; \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        op; \
    }
|
169
|
|
|
170
|
|
/*
 * Binary loop in which one operand is a scalar.
 *
 *   cin / cinp : name of, and pointer expression for, the constant operand;
 *                it is loaded once, before the loop starts.
 *   vin / vinp : name of, and pointer expression for, the varying operand;
 *                it is reloaded on every BINARY_LOOP iteration.
 *
 * `op` runs once per iteration with `cin`, `vin` and `tout *out` (pointing
 * at the current output element, op1) in scope.
 */
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)cinp; \
    BINARY_LOOP { \
        tout *out = (tout *)op1; \
        const tin vin = *(tin *)vinp; \
        op; \
    }
|
177
|
|
|
178
|
|
/*
 * PR80198 again; the scalar case works without the pragma.
 *
 * In-place flavour of BASE_BINARY_LOOP_S: the constant operand `cin` is
 * loaded once from `cinp`, the varying operand `vin` is reloaded from
 * `vinp` on every BINARY_LOOP iteration, and `out` is that same `vinp`
 * address reinterpreted as `tout *`. Results are therefore stored through
 * the varying input pointer, not through op1.
 */
#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)cinp; \
    BINARY_LOOP { \
        tout *out = (tout *)vinp; \
        const tin vin = *(tin *)vinp; \
        op; \
    }
|
186
|
|
|
187
|
|
/*
 * Binary loop with contiguous specializations.
 *
 * `op` is the code that works on `tin in1` and `tin in2` and stores its
 * result through `tout *out`. Exactly one of the loops below is executed;
 * the surrounding conditions allow the compiler to optimize each copy of
 * the generic loop for the layout it is known to handle:
 *
 *   - IS_BINARY_CONT:    both inputs and the output are contiguous. If the
 *                        output starts at one input and is at least
 *                        NPY_MAX_SIMD_SIZE bytes away from the other, the
 *                        in-place loop (BASE_BINARY_LOOP_INP) is used.
 *   - IS_BINARY_CONT_S1: the first input is a scalar (step 0), read as in1.
 *   - IS_BINARY_CONT_S2: the second input is a scalar (step 0), read as in2.
 *   - otherwise:         the plain strided loop.
 *
 * The scalar in-place loop (BASE_BINARY_LOOP_S_INP) stores its results
 * through the *input* pointer, which advances by sizeof(tin). That only
 * addresses the right output elements when sizeof(tin) == sizeof(tout), so
 * it is selected only in that case. With differing element sizes the
 * ordinary scalar loop, which stores through op1, is used instead.
 */
#define BINARY_LOOP_FAST(tin, tout, op) \
    do { \
        /* condition allows compiler to optimize the generic macro */ \
        if (IS_BINARY_CONT(tin, tout)) { \
            if (abs_ptrdiff(args[2], args[0]) == 0 && \
                    abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
                /* output starts at the first input */ \
                BASE_BINARY_LOOP_INP(tin, tout, op) \
            } \
            else if (abs_ptrdiff(args[2], args[1]) == 0 && \
                         abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
                /* output starts at the second input */ \
                BASE_BINARY_LOOP_INP(tin, tout, op) \
            } \
            else { \
                BASE_BINARY_LOOP(tin, tout, op) \
            } \
        } \
        else if (IS_BINARY_CONT_S1(tin, tout)) { \
            /* in-place store through ip2 needs equally sized elements */ \
            if (sizeof(tin) == sizeof(tout) && \
                    abs_ptrdiff(args[2], args[1]) == 0) { \
                BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
            } \
            else { \
                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
            } \
        } \
        else if (IS_BINARY_CONT_S2(tin, tout)) { \
            /* in-place store through ip1 needs equally sized elements */ \
            if (sizeof(tin) == sizeof(tout) && \
                    abs_ptrdiff(args[2], args[0]) == 0) { \
                BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
            } \
            else { \
                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
            } \
        } \
        else { \
            BASE_BINARY_LOOP(tin, tout, op) \
        } \
    } while (0)
|
224
|
|
|
225
|
|
/*
 * Loop header that walks only the second operand: declares the cursor
 * `ip2` (args[1]) with its byte step `is2` (steps[1]), the iteration count
 * `n` (dimensions[0]) and the counter `i`, then opens a `for` statement
 * that advances `ip2` once per iteration.
 */
#define BINARY_REDUCE_LOOP_INNER \
    npy_intp i; \
    npy_intp n = dimensions[0]; \
    npy_intp is2 = steps[1]; \
    char *ip2 = args[1]; \
    for (i = 0; i < n; ++i, ip2 += is2)
|
231
|
|
|
232
|
|
/*
 * Reduction loop header: keeps args[0] as `iop1`, loads the starting value
 * `io1` of type TYPE from it, and then opens BINARY_REDUCE_LOOP_INNER.
 * The macro itself never stores `io1` back through `iop1`.
 */
#define BINARY_REDUCE_LOOP(TYPE) \
    char *iop1 = args[0]; \
    TYPE io1 = *(TYPE *)iop1; \
    BINARY_REDUCE_LOOP_INNER
|
236
|
|
|
237
|
|
|
238
|
|
#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */
|