/**
 * Macros to help build fast ufunc inner loops.
 *
 * These expect to have access to the arguments of a typical ufunc loop,
 *
 *     char **args
 *     npy_intp const *dimensions
 *     npy_intp const *steps
 */
#ifndef _NPY_UMATH_FAST_LOOP_MACROS_H_
#define _NPY_UMATH_FAST_LOOP_MACROS_H_

13
static NPY_INLINE npy_uintp
14
abs_ptrdiff(char *a, char *b)
15
{
16 1
    return (a > b) ? (a - b) : (b - a);
17
}
18

/**
 * Simple unoptimized loop macros that iterate over the ufunc arguments in
 * parallel.
 * @{
 */

/**
 * (<ignored>) -> (op1)
 *
 * Loop header that walks only the output operand: `op1` starts at args[1]
 * and advances by steps[1] bytes for dimensions[0] iterations; args[0] is
 * not read.  Declares `op1`, `os1`, `n` and `i` in the current scope and
 * must be followed by the loop body.
 */
#define OUTPUT_LOOP\
    char *op1 = args[1];\
    npy_intp os1 = steps[1];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, op1 += os1)

/**
 * (ip1) -> (op1)
 *
 * Loop header for one input and one output.  `ip1`/`op1` start at
 * args[0]/args[1] and advance by steps[0]/steps[1] bytes on each of the
 * dimensions[0] iterations.  Declares `ip1`, `op1`, `is1`, `os1`, `n` and
 * `i` in the current scope and must be followed by the loop body.
 */
#define UNARY_LOOP\
    char *ip1 = args[0], *op1 = args[1];\
    npy_intp is1 = steps[0], os1 = steps[1];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, op1 += os1)

/**
 * (ip1) -> (op1, op2)
 *
 * Loop header for one input and two outputs.  `ip1`, `op1` and `op2` start
 * at args[0], args[1] and args[2] and advance by steps[0], steps[1] and
 * steps[2] bytes on each of the dimensions[0] iterations.  Declares the
 * pointers, their strides (`is1`, `os1`, `os2`), `n` and `i` in the current
 * scope and must be followed by the loop body.
 */
#define UNARY_LOOP_TWO_OUT\
    char *ip1 = args[0], *op1 = args[1], *op2 = args[2];\
    npy_intp is1 = steps[0], os1 = steps[1], os2 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)

/**
 * (ip1, ip2) -> (op1)
 *
 * Loop header for two inputs and one output.  `ip1`, `ip2` and `op1` start
 * at args[0], args[1] and args[2] and advance by steps[0], steps[1] and
 * steps[2] bytes on each of the dimensions[0] iterations.  Declares the
 * pointers, their strides (`is1`, `is2`, `os1`), `n` and `i` in the current
 * scope and must be followed by the loop body.
 */
#define BINARY_LOOP\
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)

/**
 * (ip1, ip2) -> (op1, op2)
 *
 * Loop header for two inputs and two outputs.  `ip1`, `ip2`, `op1` and
 * `op2` start at args[0..3] and advance by steps[0..3] bytes on each of the
 * dimensions[0] iterations.  Declares the pointers, their strides (`is1`,
 * `is2`, `os1`, `os2`), `n` and `i` in the current scope and must be
 * followed by the loop body.
 */
#define BINARY_LOOP_TWO_OUT\
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2], *op2 = args[3];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], os2 = steps[3];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)

/**
 * (ip1, ip2, ip3) -> (op1)
 *
 * Loop header for three inputs and one output.  `ip1`, `ip2`, `ip3` and
 * `op1` start at args[0..3] and advance by steps[0..3] bytes on each of the
 * dimensions[0] iterations.  Declares the pointers, their strides (`is1`,
 * `is2`, `is3`, `os1`), `n` and `i` in the current scope and must be
 * followed by the loop body.
 */
#define TERNARY_LOOP\
    char *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3];\
    npy_intp is1 = steps[0], is2 = steps[1], is3 = steps[2], os1 = steps[3];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)

/** @} */

/*
 * unary loop input and output contiguous:
 * true when the input stride (steps[0]) equals sizeof(tin) and the output
 * stride (steps[1]) equals sizeof(tout)
 */
#define IS_UNARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
                                  steps[1] == sizeof(tout))

/* output contiguous: true when steps[1] equals sizeof(tout) */
#define IS_OUTPUT_CONT(tout) (steps[1] == sizeof(tout))

/*
 * binary loop used as a reduction:
 * true when the first input and the output are the same address (args[0] ==
 * args[2]) and both have a stride of 0
 */
#define IS_BINARY_REDUCE ((args[0] == args[2])\
        && (steps[0] == steps[2])\
        && (steps[0] == 0))

/*
 * binary loop input and output contiguous:
 * true when both input strides (steps[0], steps[1]) equal sizeof(tin) and
 * the output stride (steps[2]) equals sizeof(tout)
 */
#define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
                                   steps[1] == sizeof(tin) && \
                                   steps[2] == sizeof(tout))

/*
 * binary loop input and output contiguous with first scalar:
 * true when the first input has stride 0, the second input stride equals
 * sizeof(tin) and the output stride equals sizeof(tout)
 */
#define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
                                   steps[1] == sizeof(tin) && \
                                   steps[2] == sizeof(tout))

/*
 * binary loop input and output contiguous with second scalar:
 * true when the first input stride equals sizeof(tin), the second input has
 * stride 0 and the output stride equals sizeof(tout)
 */
#define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
                                   steps[1] == 0 && \
                                   steps[2] == sizeof(tout))

/*
 * loop with contiguous specialization
 * op should be the code working on `tin in` and
 * storing the result in `tout *out`
 * combine with NPY_GCC_OPT_3 to allow autovectorization
 * should only be used where it's worthwhile to avoid code bloat
 *
 * BASE_UNARY_LOOP is the shared loop body: on every UNARY_LOOP iteration it
 * loads the current input element into `in`, points `out` at the current
 * output element and then runs `op`.
 */
#define BASE_UNARY_LOOP(tin, tout, op) \
    UNARY_LOOP { \
        const tin in = *(tin *)ip1; \
        tout *out = (tout *)op1; \
        op; \
    }

/*
 * Unary loop with contiguous specialization.
 *
 * `op` is the code working on `tin in` and storing its result through
 * `tout *out`.  The three branches expand to the same loop text; they
 * differ only in what the enclosing condition has established (contiguous
 * and in place, contiguous with distinct pointers, or neither).
 */
#define UNARY_LOOP_FAST(tin, tout, op)          \
    do { \
        /* condition allows compiler to optimize the generic macro */ \
        if (IS_UNARY_CONT(tin, tout)) { \
            if (args[0] == args[1]) { \
                /* contiguous, input and output start at the same address */ \
                BASE_UNARY_LOOP(tin, tout, op) \
            } \
            else { \
                /* contiguous, different start addresses */ \
                BASE_UNARY_LOOP(tin, tout, op) \
            } \
        } \
        else { \
            /* arbitrary strides */ \
            BASE_UNARY_LOOP(tin, tout, op) \
        } \
    } \
    while (0)

/*
 * loop with contiguous specialization
 * op should be the code working on `tin in1`, `tin in2` and
 * storing the result in `tout *out`
 * combine with NPY_GCC_OPT_3 to allow autovectorization
 * should only be used where it's worthwhile to avoid code bloat
 *
 * BASE_BINARY_LOOP is the shared loop body: on every BINARY_LOOP iteration
 * it loads the current input elements into `in1` and `in2`, points `out` at
 * the current output element and then runs `op`.
 */
#define BASE_BINARY_LOOP(tin, tout, op) \
    BINARY_LOOP { \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        tout *out = (tout *)op1; \
        op; \
    }

/*
 * unfortunately gcc 6/7 regressed and we need to give it additional hints to
 * vectorize inplace operations (PR80198)
 * must only be used after op1 == ip1 or ip2 has been checked
 * TODO: using ivdep might allow other compilers to vectorize too
 *
 * IVDEP_LOOP expands to `#pragma GCC ivdep` when __GNUC__ is defined and at
 * least 6, and to nothing otherwise.  Place it directly before a `for`.
 */
#if defined(__GNUC__) && __GNUC__ >= 6
#define IVDEP_LOOP _Pragma("GCC ivdep")
#else
#define IVDEP_LOOP
#endif
/*
 * Binary loop body for in-place operation, i.e. the output pointer equals
 * one of the input pointers.
 *
 * Same declarations and per-iteration work as a BINARY_LOOP that loads
 * `in1`/`in2`, sets `out` and runs `op`, but the `for` is prefixed with
 * IVDEP_LOOP.  The caller must have checked the aliasing of op1 against
 * ip1/ip2 before selecting this variant.
 */
#define BASE_BINARY_LOOP_INP(tin, tout, op) \
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    IVDEP_LOOP \
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        tout *out = (tout *)op1; \
        op; \
    }

/*
 * Binary loop body where one input is a scalar.
 *
 * `cin` names the scalar operand; it is loaded once from `cinp` before the
 * loop.  `vin` names the varying operand; it is reloaded from `vinp` on
 * every BINARY_LOOP iteration.  `out` points at the current output element
 * (op1) and `op` is then executed.
 */
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)(cinp); \
    BINARY_LOOP { \
        const tin vin = *(tin *)(vinp); \
        tout *out = (tout *)op1; \
        op; \
    }

/*
 * PR80198 again, scalar works without the pragma
 *
 * In-place variant of the scalar binary loop: `cin` is loaded once from
 * `cinp`, `vin` is reloaded from `vinp` on every BINARY_LOOP iteration, and
 * `out` is set to `vinp` itself, so `op` writes its result over the varying
 * input element.
 */
#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)(cinp); \
    BINARY_LOOP { \
        const tin vin = *(tin *)(vinp); \
        tout *out = (tout *)(vinp); \
        op; \
    }

/*
 * Binary loop with contiguous and scalar specializations.
 *
 * `op` is the code working on `tin in1`, `tin in2` and storing its result
 * through `tout *out`.  The loop variant is selected as follows:
 *
 *  - all operands contiguous: if the output starts at the same address as
 *    exactly one input and the other input is at least NPY_MAX_SIMD_SIZE
 *    bytes away, use the in-place loop (BASE_BINARY_LOOP_INP); otherwise
 *    the plain loop.
 *  - first input scalar (stride 0), rest contiguous: in-place scalar loop
 *    when the output starts at the second input, plain scalar loop
 *    otherwise.
 *  - second input scalar (stride 0), rest contiguous: likewise, with the
 *    roles of the inputs swapped.
 *  - anything else: the plain strided loop.
 */
#define BINARY_LOOP_FAST(tin, tout, op)         \
    do { \
        /* condition allows compiler to optimize the generic macro */ \
        if (IS_BINARY_CONT(tin, tout)) { \
            if (abs_ptrdiff(args[2], args[0]) == 0 && \
                    abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
                BASE_BINARY_LOOP_INP(tin, tout, op) \
            } \
            else if (abs_ptrdiff(args[2], args[1]) == 0 && \
                         abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
                BASE_BINARY_LOOP_INP(tin, tout, op) \
            } \
            else { \
                BASE_BINARY_LOOP(tin, tout, op) \
            } \
        } \
        else if (IS_BINARY_CONT_S1(tin, tout)) { \
            if (abs_ptrdiff(args[2], args[1]) == 0) { \
                BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
            } \
            else { \
                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
            } \
        } \
        else if (IS_BINARY_CONT_S2(tin, tout)) { \
            if (abs_ptrdiff(args[2], args[0]) == 0) { \
                BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
            } \
            else { \
                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
            }\
        } \
        else { \
            BASE_BINARY_LOOP(tin, tout, op) \
        } \
    } \
    while (0)

/*
 * Loop header for the inner part of a binary reduction: only the second
 * operand is walked.  `ip2` starts at args[1] and advances by steps[1]
 * bytes for dimensions[0] iterations.  Declares `ip2`, `is2`, `n` and `i`
 * in the current scope and must be followed by the loop body.
 */
#define BINARY_REDUCE_LOOP_INNER\
    char *ip2 = args[1]; \
    npy_intp is2 = steps[1]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for(i = 0; i < n; i++, ip2 += is2)

/*
 * Loop header for a binary reduction.  `iop1` is set to args[0] and its
 * current value is loaded into the local `io1` of type TYPE; the loop then
 * walks the second operand via BINARY_REDUCE_LOOP_INNER.  The macro does
 * not write `io1` back to `iop1`; that is left to the code using it.
 */
#define BINARY_REDUCE_LOOP(TYPE)\
    char *iop1 = args[0]; \
    TYPE io1 = *(TYPE *)iop1; \
    BINARY_REDUCE_LOOP_INNER

#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */