

/*
 * This file contains the definitions of the simd vectorized operations.
 *
 * Currently it contains sse2 functions that are built on amd64, x32 or
 * non-generic builds (CFLAGS=-march=...).
 * In the future it may contain other instruction sets like AVX or NEON, detected
 * at runtime, in which case it needs to be included indirectly via a file
 * compiled with special options (or use gcc target attributes) so the binary
 * stays portable.
 */


#ifndef __NPY_SIMD_INC
#define __NPY_SIMD_INC

#include "lowlevel_strided_loops.h"
#include "numpy/npy_common.h"
#include "numpy/npy_math.h"
#include "npy_simd_data.h"
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#if !defined(_MSC_VER) || _MSC_VER >= 1600
#include <immintrin.h>
#else
#undef __AVX2__
#undef __AVX512F__
#endif
#endif
#include "simd/simd.h"
#include <assert.h>
#include <stdlib.h>
#include <float.h>
#include <string.h> /* for memcpy */

#define VECTOR_SIZE_BYTES 16

/*
 * MAX_STEP_SIZE is used to determine whether we need to use the SIMD version of a ufunc.
 * A very large step size can be as slow as scalar processing. The
 * value of 2097152 (= 2MB) was chosen based on two considerations:
 * 1) The typical Linux kernel page size is 4kB, but it can also be 2MB,
 *    which is == 2097152 bytes. For a step size this large, all the
 *    loads/stores of the gather/scatter instructions fall on 16 different pages,
 *    which is expected to slow gather/scatter down.
 * 2) It additionally satisfies MAX_STEP_SIZE*16/esize < NPY_MAX_INT32, which
 *    allows us to use the i32 version of gather/scatter (as opposed to the i64 version)
 *    without problems (a step larger than NPY_MAX_INT32*esize/16 would require
 *    i64gather/scatter). esize = element size = 4/8 bytes for float/double.
 */
#define MAX_STEP_SIZE 2097152
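/*
 * A quick sanity check of constraint 2) above: with esize = 4 (float) the
 * largest 32-bit gather index is MAX_STEP_SIZE * 16 / 4 = 2097152 * 4 =
 * 8388608, and with esize = 8 (double) it is 2097152 * 2 = 4194304; both are
 * well below NPY_MAX_INT32 (2147483647), so the i32 gather/scatter variants
 * are always safe under this limit.
 */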

/*
 * nomemoverlap - returns true if two strided arrays have no overlapping
 * region in memory. ip_size/op_size = size of the arrays, which can be
 * negative, indicating negative steps.
 */
static NPY_INLINE npy_bool
nomemoverlap(char *ip,
             npy_intp ip_size,
             char *op,
             npy_intp op_size)
{
    char *ip_start, *ip_end, *op_start, *op_end;
    if (ip_size < 0) {
        ip_start = ip + ip_size;
        ip_end = ip;
    }
    else {
        ip_start = ip;
        ip_end = ip + ip_size;
    }
    if (op_size < 0) {
        op_start = op + op_size;
        op_end = op;
    }
    else {
        op_start = op;
        op_end = op + op_size;
    }
    return (ip_start > op_end) | (op_start > ip_end);
}
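
/*
 * Example (illustrative values only): with ip = 0x1000, ip_size = 64,
 * op = 0x1020 and op_size = 64 the spans [0x1000, 0x1040) and
 * [0x1020, 0x1060) overlap, so nomemoverlap() returns false; moving op up
 * to 0x1050 makes op_start > ip_end and the function returns true.
 */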

#define IS_BINARY_STRIDE_ONE(esize, vsize) \
    ((steps[0] == esize) && \
     (steps[1] == esize) && \
     (steps[2] == esize) && \
     (abs_ptrdiff(args[2], args[0]) >= vsize) && \
     (abs_ptrdiff(args[2], args[1]) >= vsize))

/*
 * stride is equal to element size and input and destination are equal or
 * don't overlap within one register. The check of the steps against
 * esize also guarantees that steps are >= 0.
 */
#define IS_BLOCKABLE_UNARY(esize, vsize) \
    (steps[0] == (esize) && steps[0] == steps[1] && \
     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
      ((abs_ptrdiff(args[1], args[0]) == 0))))

/*
 * Avoid using SIMD for very large step sizes for several reasons:
 * 1) Supporting large step sizes requires the use of i64gather/scatter_ps instructions,
 *    in which case we need two i64gather instructions and an additional vinsertf32x8
 *    instruction to load a single zmm register (since one i64gather instruction
 *    loads into a ymm register). This is not ideal for performance.
 * 2) Gather and scatter instructions can be slow when the loads/stores
 *    cross page boundaries.
 *
 * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
 * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
 * ensures this. The condition also requires that the input and output arrays
 * have no overlap in memory.
 */
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
    ((labs(steps[0]) < MAX_STEP_SIZE)  && \
     (labs(steps[1]) < MAX_STEP_SIZE)  && \
     (labs(steps[2]) < MAX_STEP_SIZE)  && \
     (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
     (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))

#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
    ((labs(steps[0]) < MAX_STEP_SIZE)  && \
     (labs(steps[1]) < MAX_STEP_SIZE)  && \
     (labs(steps[2]) < MAX_STEP_SIZE)  && \
     (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
     (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))

/*
 * 1) Output should be contiguous, can handle strided input data
 * 2) Input step should be smaller than MAX_STEP_SIZE for performance
 * 3) Input and output arrays should have no overlap in memory
 */
#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
    ((steps[0] & (esizein-1)) == 0 && \
     steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
     (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))

#define IS_BLOCKABLE_REDUCE(esize, vsize) \
    (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
     npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)))

#define IS_BLOCKABLE_BINARY(esize, vsize) \
    (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)) && \
     (abs_ptrdiff(args[2], args[0]) >= (vsize) || \
      abs_ptrdiff(args[2], args[0]) == 0) && \
     (abs_ptrdiff(args[2], args[1]) >= (vsize) || \
      abs_ptrdiff(args[2], args[1]) == 0))

#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
    (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
     ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
      (abs_ptrdiff(args[2], args[1]) == 0)) && \
     abs_ptrdiff(args[2], args[0]) >= (esize))

#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
    (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
     ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
      (abs_ptrdiff(args[2], args[0]) == 0)) && \
     abs_ptrdiff(args[2], args[1]) >= (esize))

#undef abs_ptrdiff

#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
    (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
     npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)))

#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
    (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
     npy_is_aligned(args[1], (esize)))

#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
    (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
     npy_is_aligned(args[0], (esize)))

/* align var to alignment */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
    npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
                                                alignment, n);\
    for(i = 0; i < peel; i++)

#define LOOP_BLOCKED(type, vsize)\
    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
            i += (vsize / sizeof(type)))

#define LOOP_BLOCKED_END\
    for (; i < n; i++)
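
/*
 * A minimal usage sketch of the three blocking macros (it mirrors the pattern
 * of the SSE2 loops further below; scalar_op is only a placeholder):
 *
 *     LOOP_BLOCK_ALIGN_VAR(op, npy_float, VECTOR_SIZE_BYTES)
 *         op[i] = scalar_op(ip[i]);      // peel until op[i] is aligned
 *     LOOP_BLOCKED(npy_float, VECTOR_SIZE_BYTES) {
 *         // process one full vector per iteration, op[i] is aligned here
 *     }
 *     LOOP_BLOCKED_END {
 *         op[i] = scalar_op(ip[i]);      // scalar tail
 *     }
 */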


/*
 * Dispatcher functions
 * decide whether the operation can be vectorized and, if so, run it;
 * they return 1 if the vectorized code ran and 0 if nothing was done.
 */
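
/*
 * Sketch of how a ufunc inner loop is expected to call a dispatcher (the real
 * callers live outside this file; the names below are illustrative only):
 *
 *     if (!run_binary_simd_add_FLOAT(args, dimensions, steps)) {
 *         // fall back to the plain scalar loop
 *     }
 */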

/*
 *****************************************************************************
 **                           CMPLX DISPATCHERS
 *****************************************************************************
 */
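
/*
 * The "begin repeat" blocks in this file are expanded by NumPy's .src
 * template preprocessor: each @TYPE@/@type@/@esize@ placeholder is
 * substituted once per listed value, so the block below generates, for
 * example, run_binary_avx512f_add_CFLOAT and run_binary_avx512f_add_CDOUBLE.
 */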

/**begin repeat
 * #TYPE = CFLOAT, CDOUBLE#
 * #type = npy_float, npy_double#
 * #esize = 8, 16#
 */

/**begin repeat1
 *  #func = add, subtract, multiply#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps);
#endif

static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
        AVX512F_@func@_@TYPE@(args, dimensions, steps);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 *  #func = square, absolute, conjugate#
 *  #outsize = 1, 2, 1#
 *  #max_stride = 2, 8, 8#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
    if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
        AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat1**/
/**end repeat**/

/*
 *****************************************************************************
 **                           FLOAT DISPATCHERS
 *****************************************************************************
 */

/**begin repeat
 * #type = npy_float, npy_double, npy_longdouble#
 * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 * #EXISTS = 1, 1, 0#
 */

/**begin repeat1
 *  #func = maximum, minimum#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
#endif

static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512F_@func@_@TYPE@(args, dimensions, steps);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}


/**end repeat1**/

#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);

static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
#endif

static NPY_INLINE int
run_binary_avx512_skx_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

static NPY_INLINE int
run_unary_two_out_avx512_skx_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}
/**end repeat**/

/**begin repeat
 * #type = npy_float, npy_double, npy_longdouble#
 * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 * #EXISTS = 1, 1, 0#
 */

/**begin repeat1
 * #func = isnan, isfinite, isinf, signbit#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) {
        AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else {
        return 0;
    }
#endif
    return 0;
}


/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #ISA = FMA, AVX512F#
 * #isa = fma, avx512f#
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 * #REGISTER_SIZE = 32, 64#
 */

/* prototypes */

/**begin repeat1
 * #type = npy_float, npy_double#
 * #TYPE = FLOAT, DOUBLE#
 */

/**begin repeat2
 *  #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
 */

#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_@ISA@ void
@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(@type@), @REGISTER_SIZE@)) {
        @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat2**/
/**end repeat1**/

/**begin repeat1
 * #func = exp, log#
 */

#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), @REGISTER_SIZE@)) {
        @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat1**/

#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
#endif

static NPY_INLINE int
run_unary_@isa@_sincos_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps, NPY_TRIG_OP my_trig_op)
{
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), @REGISTER_SIZE@)) {
        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat**/

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
AVX512F_exp_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride);
#endif
static NPY_INLINE int
run_unary_avx512f_exp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
        AVX512F_exp_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
#endif
    return 0;
}

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
AVX512F_log_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride);
#endif
static NPY_INLINE int
run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
        AVX512F_log_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
#endif
    return 0;
}

/**begin repeat
 * Float types
 *  #type = npy_float, npy_double, npy_longdouble#
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 *  #vector = 1, 1, 0#
 *  #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
 */

/**begin repeat1
 * #func = sqrt, absolute, negative, minimum, maximum#
 * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
 * #name = unary*3, unary_reduce*2#
 */

#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);

#endif

static NPY_INLINE int
run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 */

#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                          npy_intp n);
static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);
static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);

#elif @VECTOR@

static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                          npy_intp n);
static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);
static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);

#endif

static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
    @type@ * ip1 = (@type@ *)args[0];
    @type@ * ip2 = (@type@ *)args[1];
    @type@ * op = (@type@ *)args[2];
    npy_intp n = dimensions[0];
#if defined __AVX512F__
    const npy_uintp vector_size_bytes = 64;
#elif defined __AVX2__
    const npy_uintp vector_size_bytes = 32;
#else
    /* plain SSE2: the kernels below use 16-byte vectors */
    const npy_uintp vector_size_bytes = VECTOR_SIZE_BYTES;
#endif
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
#elif @VECTOR@
    @type@ * ip1 = (@type@ *)args[0];
    @type@ * ip2 = (@type@ *)args[1];
    @type@ * op = (@type@ *)args[2];
    npy_intp n = dimensions[0];
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
        simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
 *         logical_and, logical_or#
 * #simd = 1, 1, 1, 1, 1, 1, 0, 0#
 */

#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
                          npy_intp n);
static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);
static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);

#endif

static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
    @type@ * ip1 = (@type@ *)args[0];
    @type@ * ip2 = (@type@ *)args[1];
    npy_bool * op = (npy_bool *)args[2];
    npy_intp n = dimensions[0];
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 * #kind = isnan, isfinite, isinf, signbit#
 */

#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

static void
sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);

#endif

static NPY_INLINE int
run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
        npy_is_aligned(args[0], sizeof(@type@))) {
        sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**end repeat**/

/*
 *****************************************************************************
 **                           BOOL DISPATCHERS
 *****************************************************************************
 */

/**begin repeat
 * # kind = logical_or, logical_and#
 */

#if defined NPY_HAVE_SSE2_INTRINSICS
static void
sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
                        npy_intp n);

static void
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
#endif

static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
        sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
                               (npy_bool*)args[1], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}


static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
        sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
                                dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat**/

/**begin repeat
 * # kind = absolute, logical_not#
 */

#if defined NPY_HAVE_SSE2_INTRINSICS
static void
sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
#endif

static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
        sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat**/

#ifdef NPY_HAVE_SSE2_INTRINSICS

/*
 * Vectorized operations
 */
/*
 *****************************************************************************
 **                           FLOAT LOOPS
 *****************************************************************************
 */

/**begin repeat
* horizontal reductions on a vector
* # VOP = min, max#
*/

static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
{
    npy_float r;
    __m128 tmp = _mm_movehl_ps(v, v);                   /* c     d     ... */
    __m128 m = _mm_@VOP@_ps(v, tmp);                    /* m(ac) m(bd) ... */
    tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
    _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m));             /* m(acbd) ... */
    return r;
}

static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
{
    npy_double r;
    __m128d tmp = _mm_unpackhi_pd(v, v);    /* b     b */
    _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
    return r;
}

/**end repeat**/

/**begin repeat
 *  #type = npy_float, npy_double#
 *  #TYPE = FLOAT, DOUBLE#
 *  #scalarf = npy_sqrtf, npy_sqrt#
 *  #c = f, #
 *  #vtype = __m128, __m128d#
 *  #vtype256 = __m256, __m256d#
 *  #vtype512 = __m512, __m512d#
 *  #vpre = _mm, _mm#
 *  #vpre256 = _mm256, _mm256#
 *  #vpre512 = _mm512, _mm512#
 *  #vsuf = ps, pd#
 *  #vsufs = ss, sd#
 *  #nan = NPY_NANF, NPY_NAN#
 *  #double = 0, 1#
 *  #cast = _mm_castps_si128, _mm_castpd_si128#
 */


/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/

static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
    }
#elif __AVX2__
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
            npy_is_aligned(&ip2[i], vector_size_bytes)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
    }
#else
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
            npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
                @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
                @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[i];
    }
}


static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
    const npy_intp vector_size_bytes = 64;
    const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }


#elif __AVX2__
    const npy_intp vector_size_bytes = 32;
    const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
#else
    const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[0] @OP@ ip2[i];
    }
}


static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
    const npy_intp vector_size_bytes = 64;
    const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }

#elif __AVX2__
    const npy_intp vector_size_bytes = 32;
    const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
#else
    const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[0];
    }
}

/**end repeat1**/

/*
 * compress 4 vectors to 4/8 bytes in op, filled with 0 or 1;
 * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
 * calling convention, leading to C2719 on 32 bit, see #4795
 */
static NPY_INLINE void
sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
                              npy_bool * op)
{
    const __m128i mask = @vpre@_set1_epi8(0x1);
    __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
    __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
    __m128i rr = @vpre@_packs_epi16(ir1, ir2);
#if @double@
    rr = @vpre@_packs_epi16(rr, rr);
    rr = @vpre@_and_si128(rr, mask);
    @vpre@_storel_epi64((__m128i*)op, rr);
#else
    rr = @vpre@_and_si128(rr, mask);
    @vpre@_storeu_si128((__m128i*)op, rr);
#endif
}

static void
sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = npy_signbit(ip1[i]) != 0;
    }
    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
        int r = @vpre@_movemask_@vsuf@(a);
        if (sizeof(@type@) == 8) {
            op[i] = r & 1;
            op[i + 1] = (r >> 1);
        }
        else {
            op[i] = r & 1;
            op[i + 1] = (r >> 1) & 1;
            op[i + 2] = (r >> 2) & 1;
            op[i + 3] = (r >> 3);
        }
    }
    LOOP_BLOCKED_END {
        op[i] = npy_signbit(ip1[i]) != 0;
    }
}

/**begin repeat1
 * #kind = isnan, isfinite, isinf#
 * #var = 0, 1, 2#
 */

static void
sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
{
#if @var@ != 0 /* isinf/isfinite */
    /* signbit mask 0x7FFFFFFF after andnot */
    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
    const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
                                             @vpre@_setzero_@vsuf@());
#if @double@
    const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
#else
    const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
#endif
#endif
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = npy_@kind@(ip1[i]) != 0;
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1, r2, r3, r4;
#if @var@ != 0 /* isinf/isfinite */
        /* fabs via masking of sign bit */
        r1 = @vpre@_andnot_@vsuf@(mask, a);
        r2 = @vpre@_andnot_@vsuf@(mask, b);
        r3 = @vpre@_andnot_@vsuf@(mask, c);
        r4 = @vpre@_andnot_@vsuf@(mask, d);
#if @var@ == 1 /* isfinite */
        /* negative compare against max float, nan is always true */
        r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
        r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
        r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
        r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
#else /* isinf */
        r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
        r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
        r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
        r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
#endif
        /* flip results to what we want (andnot as there is no sse not) */
        r1 = @vpre@_andnot_@vsuf@(r1, ones);
        r2 = @vpre@_andnot_@vsuf@(r2, ones);
        r3 = @vpre@_andnot_@vsuf@(r3, ones);
        r4 = @vpre@_andnot_@vsuf@(r4, ones);
#endif
#if @var@ == 0 /* isnan */
        r1 = @vpre@_cmpneq_@vsuf@(a, a);
        r2 = @vpre@_cmpneq_@vsuf@(b, b);
        r3 = @vpre@_cmpneq_@vsuf@(c, c);
        r4 = @vpre@_cmpneq_@vsuf@(d, d);
#endif
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = npy_@kind@(ip1[i]) != 0;
    }
}

/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
 * #OP = ==, !=, <, <=, >, >=#
 * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
*/

/* sets invalid fpu flag on QNaN for consistency with packed compare */
static NPY_INLINE int
sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
{
    @vtype@ one = @vpre@_set1_@vsuf@(1);
    @type@ tmp;
    @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a),
                                     @vpre@_load_@vsufs@(&b));
    v = @vpre@_and_@vsuf@(v, one);
    @vpre@_store_@vsufs@(&tmp, v);
    return tmp;
}

static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
    }
}


static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a);
        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d);
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
    }
}


static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s);
        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s);
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
    }
}
/**end repeat1**/

static void
sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
    /* align output to VECTOR_SIZE_BYTES bytes */
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
        op[i] = @scalarf@(ip[i]);
    }
    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = @scalarf@(ip[i]);
    }
}


static NPY_INLINE
@type@ scalar_abs_@type@(@type@ v)
{
    /* add 0 to clear -0.0 */
    return (v > 0 ? v: -v) + 0;
}

static NPY_INLINE
@type@ scalar_neg_@type@(@type@ v)
{
    return -v;
}

/**begin repeat1
 * #kind = absolute, negative#
 * #VOP = andnot, xor#
 * #scalar = scalar_abs, scalar_neg#
 **/
static void
sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
    /*
     * get 0x7FFFFFFF mask (everything but signbit set)
     * float & ~mask will remove the sign, float ^ mask flips the sign
     * this is equivalent to how the compiler implements fabs on amd64
     */
    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);

    /* align output to VECTOR_SIZE_BYTES bytes */
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
        op[i] = @scalar@_@type@(ip[i]);
    }
    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = @scalar@_@type@(ip[i]);
    }
}
/**end repeat1**/


/**begin repeat1
 * #kind = maximum, minimum#
 * #VOP = max, min#
 * #OP = >=, <=#
 **/
/* arguments swapped as unary reduce has them swapped compared to unary */
static void
sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
{
    const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
    LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
        /* Order of operations important for MSVC 2015 */
        *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
    }
    assert(n < stride || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
    if (i + 3 * stride <= n) {
        /* load the first elements */
        @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
        @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
        i += 2 * stride;

        /* minps/minpd will set invalid flag if nan is encountered */
        npy_clear_floatstatus_barrier((char*)&c1);
        LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) {
            @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
            @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
            c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
            c2 = @vpre@_@VOP@_@vsuf@(c2, v2);
        }
        c1 = @vpre@_@VOP@_@vsuf@(c1, c2);

        if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
            *op = @nan@;
        }
        else {
            @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
            /* Order of operations important for MSVC 2015 */
            *op  = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
        }
    }
    LOOP_BLOCKED_END {
        /* Order of operations important for MSVC 2015 */
        *op  = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
    }
    npy_clear_floatstatus_barrier((char*)op);
}
/**end repeat1**/

/**end repeat**/

/* bunch of helper functions used in ISA_exp/log_FLOAT */

#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_full_load_mask_ps(void)
{
    return _mm256_set1_ps(-1.0);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_full_load_mask_pd(void)
{
    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
    float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                            1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
    float* addr = maskint + num_lanes - num_elem;
    return _mm256_loadu_ps(addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
    npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
    npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
    return _mm256_loadu_si256((__m256i*) addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_gather_ps(__m256 src,
                     npy_float* addr,
                     __m256i vindex,
                     __m256 mask)
{
    return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_gather_pd(__m256d src,
                     npy_double* addr,
                     __m128i vindex,
                     __m256d mask)
{
    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
    return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
    return _mm256_maskload_pd(addr, mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
    return _mm256_blendv_ps(x, val, mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
    return _mm256_blendv_pd(x, val, mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
    return _mm256_blendv_ps(x, y, ymask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_invert_mask_ps(__m256 ymask)
{
    return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_invert_mask_pd(__m256i ymask)
{
    return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
{
    return _mm256_cvtepi32_ps(
                _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
{
    return fma_should_calculate_sine(k, andop, cmp);
1593
}
1594

1595
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
1596
fma_get_exponent(__m256 x)
1597
{
1598
    /*
1599
     * Special handling of denormals:
1600
     * 1) Multiply denormal elements with 2**100 (0x71800000)
1601
     * 2) Get the 8 bits of unbiased exponent
1602
     * 3) Subtract 100 from exponent of denormals
1603
     */
1604

1605 0
    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
1606 0
    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
1607 0
    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);
1608

1609 0
    __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
1610 0
    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
1611 0
    x = _mm256_blendv_ps(x, temp, denormal_mask);
1612

1613 0
    __m256 exp = _mm256_cvtepi32_ps(
1614
                    _mm256_sub_epi32(
1615
                        _mm256_srli_epi32(
1616
                            _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E)));
1617

1618 0
    __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f));
1619 0
    return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
1620
}
1621

1622
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
1623
fma_get_mantissa(__m256 x)
1624
{
1625
    /*
1626
     * Special handling of denormals:
1627
     * 1) Multiply denormal elements with 2**100 (0x71800000)
1628
     * 2) Get the 23 bits of mantissa
1629
     * 3) Mantissa for denormals is not affected by the multiplication
1630
     */
1631

1632 0
    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
1633 0
    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
1634 0
    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);
1635

1636 0
    __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
1637 0
    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
1638 0
    x = _mm256_blendv_ps(x, temp, denormal_mask);
1639

1640 0
    __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff);
1641 0
    __m256i exp_126_bits  = _mm256_set1_epi32(126 << 23);
1642 0
    return _mm256_castsi256_ps(
1643
                _mm256_or_si256(
1644
                    _mm256_and_si256(
1645
                        _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
1646
}
1647

1648
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
1649 0
fma_scalef_ps(__m256 poly, __m256 quadrant)
1650
{
1651
    /*
1652
     * Handle denormals (which occur when quadrant <= -125):
1653
     * 1) This function computes poly*(2^quad) by adding the exponent of
1654
     poly to quad
1655
     * 2) When quad <= -125, the output is a denormal and the above logic
1656
     breaks down
1657
     * 3) To handle such cases, we split quadrant: -125 + (quadrant + 125)
1658
     * 4) poly*(2^-125) is computed the usual way
1659
     * 5) 2^(quad-125) can be computed by: 2 << abs(quad-125)
1660
     * 6) The final div operation generates the denormal
1661
     */
1662 0
     __m256 minquadrant = _mm256_set1_ps(-125.0f);
1663 0
     __m256 denormal_mask = _mm256_cmp_ps(quadrant, minquadrant, _CMP_LE_OQ);
1664 0
     if (_mm256_movemask_ps(denormal_mask) != 0x0000) {
1665 0
        __m256 quad_diff = _mm256_sub_ps(quadrant, minquadrant);
1666 0
        quad_diff = _mm256_sub_ps(_mm256_setzero_ps(), quad_diff);
1667 0
        quad_diff = _mm256_blendv_ps(_mm256_setzero_ps(), quad_diff, denormal_mask);
1668 0
        __m256i two_power_diff = _mm256_sllv_epi32(
1669
                                   _mm256_set1_epi32(1), _mm256_cvtps_epi32(quad_diff));
1670 0
        quadrant = _mm256_max_ps(quadrant, minquadrant); //keep quadrant >= -126
1671 0
        __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23);
1672 0
        poly = _mm256_castsi256_ps(
1673
                   _mm256_add_epi32(
1674
                       _mm256_castps_si256(poly), exponent));
1675 0
        __m256 denorm_poly = _mm256_div_ps(poly, _mm256_cvtepi32_ps(two_power_diff));
1676 0
        return _mm256_blendv_ps(poly, denorm_poly, denormal_mask);
1677
     }
1678
     else {
1679 0
        __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23);
1680 0
        poly = _mm256_castsi256_ps(
1681
                   _mm256_add_epi32(
1682
                       _mm256_castps_si256(poly), exponent));
1683 0
        return poly;
1684
     }
1685
}
1686

1687
/**begin repeat
1688
 *  #vsub = ps, pd#
1689
 *  #vtype = __m256, __m256d#
1690
 */
1691
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
1692
fma_abs_@vsub@(@vtype@ x)
1693
{
1694 0
    return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
1695
}
1696

1697
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
1698
fma_reciprocal_@vsub@(@vtype@ x)
1699
{
1700 0
    return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
1701
}
1702

1703
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
1704
fma_rint_@vsub@(@vtype@ x)
1705
{
1706 0
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
1707
}
1708

1709
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
1710
fma_floor_@vsub@(@vtype@ x)
1711
{
1712 0
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
1713
}
1714

1715
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
1716
fma_ceil_@vsub@(@vtype@ x)
1717
{
1718 0
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
1719
}
1720

1721
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
1722
fma_trunc_@vsub@(@vtype@ x)
1723
{
1724 0
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
1725
}
1726
/**end repeat**/
1727
#endif
1728

1729
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
1730
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
1731
avx512_get_full_load_mask_ps(void)
1732
{
1733
    return 0xFFFF;
1734
}
1735

1736
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
1737
avx512_get_full_load_mask_pd(void)
1738
{
1739
    return 0xFF;
1740
}
1741

1742
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
1743
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
1744
{
1745 1
    return (0x0001 << num_elem) - 0x0001;
1746
}
1747

1748
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
1749
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
1750
{
1751 1
    return (0x01 << num_elem) - 0x01;
1752
}
1753

1754
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
1755
avx512_masked_gather_ps(__m512 src,
1756
                        npy_float* addr,
1757
                        __m512i vindex,
1758
                        __mmask16 kmask)
1759
{
1760 1
    return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
1761
}
1762

1763
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
1764
avx512_masked_gather_pd(__m512d src,
1765
                        npy_double* addr,
1766
                        __m256i vindex,
1767
                        __mmask8 kmask)
1768
{
1769 1
    return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
1770
}
1771

1772
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
1773
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
1774
{
1775 1
    return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
1776
}
1777

1778
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
1779
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
1780
{
1781 1
    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
1782
}
1783

1784
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
1785
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
1786
{
1787 1
    return _mm512_mask_blend_ps(mask, x, val);
1788
}
1789

1790
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
1791
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
1792
{
1793 1
    return _mm512_mask_blend_pd(mask, x, val);
1794
}
1795

1796
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
1797
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
1798
{
1799 1
    return _mm512_mask_mov_ps(x, ymask, y);
1800
}
1801

1802
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
1803
avx512_invert_mask_ps(__mmask16 ymask)
1804
{
1805 1
    return _mm512_knot(ymask);
1806
}
1807

1808
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
1809
avx512_invert_mask_pd(__mmask8 ymask)
1810
{
1811 1
    return _mm512_knot(ymask);
1812
}
1813

1814
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
1815
avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
1816
{
1817 1
    return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
1818
}
1819

1820
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
1821
avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
1822
{
1823 1
    return avx512_should_calculate_sine(k, andop, cmp);
1824
}
1825

1826
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
1827
avx512_get_exponent(__m512 x)
1828
{
1829 1
    return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
1830
}
1831

1832
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
1833
avx512_get_mantissa(__m512 x)
1834
{
1835 1
    return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
1836
}
1837

1838
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
1839
avx512_scalef_ps(__m512 poly, __m512 quadrant)
1840
{
1841 1
    return _mm512_scalef_ps(poly, quadrant);
1842
}
1843

1844
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
1845
avx512_permute_x4var_pd(__m512d t0,
1846
                        __m512d t1,
1847
                        __m512d t2,
1848
                        __m512d t3,
1849
                        __m512i index)
1850
{
1851 1
    __mmask8 lut_mask = _mm512_cmp_epi64_mask(
1852
                          _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index),
1853
                          _mm512_set1_epi64(0), _MM_CMPINT_GT);
1854 1
    __m512d res1 = _mm512_permutex2var_pd(t0, index, t1);
1855 1
    __m512d res2 = _mm512_permutex2var_pd(t2, index, t3);
1856 1
    return _mm512_mask_blend_pd(lut_mask, res1, res2);
1857
}
1858

1859
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
1860
avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
1861
                        __m512d t4, __m512d t5, __m512d t6, __m512d t7,
1862
                        __m512i index)
1863
{
1864 1
    __mmask8 lut_mask = _mm512_cmp_epi64_mask(
1865
                          _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index),
1866
                          _mm512_set1_epi64(0), _MM_CMPINT_GT);
1867 1
    __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index);
1868 1
    __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index);
1869 1
    return _mm512_mask_blend_pd(lut_mask, res1, res2);
1870
}
1871

1872
/**begin repeat
1873
 *  #vsub  = ps, pd#
1874
 *  #type= npy_float, npy_double#
1875
 *  #epi_vsub  = epi32, epi64#
1876
 *  #vtype = __m512, __m512d#
1877
 *  #mask = __mmask16, __mmask8#
1878
 *  #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
1879
 *  #neg_mask = 0x80000000, 0x8000000000000000#
1880
 *  #perm_ = 0xb1, 0x55#
1881
 *  #cmpx_img_mask = 0xAAAA, 0xAA#
1882
 *  #cmpx_re_mask = 0x5555, 0x55#
1883
 *  #INF = NPY_INFINITYF, NPY_INFINITY#
1884
 *  #NAN = NPY_NANF, NPY_NAN#
1885
 */
1886
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1887
avx512_abs_@vsub@(@vtype@ x)
1888
{
1889 1
    return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
1890
				    _mm512_set1_@epi_vsub@ (@and_const@));
1891
}
1892

1893
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1894
avx512_reciprocal_@vsub@(@vtype@ x)
1895
{
1896 1
    return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
1897
}
1898

1899
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1900
avx512_rint_@vsub@(@vtype@ x)
1901
{
1902 1
    return _mm512_roundscale_@vsub@(x, 0x08);
1903
}
1904

1905
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1906
avx512_floor_@vsub@(@vtype@ x)
1907
{
1908 1
    return _mm512_roundscale_@vsub@(x, 0x09);
1909
}
1910

1911
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1912
avx512_ceil_@vsub@(@vtype@ x)
1913
{
1914 1
    return _mm512_roundscale_@vsub@(x, 0x0A);
1915
}
1916

1917
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1918
avx512_trunc_@vsub@(@vtype@ x)
1919
{
1920 1
    return _mm512_roundscale_@vsub@(x, 0x0B);
1921
}
1922

1923
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1924
avx512_hadd_@vsub@(const @vtype@ x)
1925
{
1926 1
    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
1927
}
1928

1929
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1930
avx512_hsub_@vsub@(const @vtype@ x)
1931
{
1932 1
    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
1933
}
1934

1935
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1936 1
avx512_cabsolute_@vsub@(const @vtype@ x1,
1937
                        const @vtype@ x2,
1938
                        const __m512i re_indices,
1939
                        const __m512i im_indices)
1940
{
1941 1
    @vtype@ inf = _mm512_set1_@vsub@(@INF@);
1942 1
    @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
1943 1
    @vtype@ x1_abs = avx512_abs_@vsub@(x1);
1944 1
    @vtype@ x2_abs = avx512_abs_@vsub@(x2);
1945 1
    @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
1946 1
    @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
1947
    /*
1948
     * If real or imag = INF, then convert it to inf + j*inf
1949
     * Handles: inf + j*nan, nan + j*inf
1950
     */
1951 1
    @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
1952 1
    @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
1953 1
    im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
1954 1
    re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
1955

1956
    /*
1957
     * If real or imag = NAN, then convert it to nan + j*nan
1958
     * Handles: x + j*nan, nan + j*x
1959
     */
1960 1
    @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
1961 1
    @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
1962 1
    im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
1963 1
    re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
1964

1965 1
    @vtype@ larger  = _mm512_max_@vsub@(re, im);
1966 1
    @vtype@ smaller = _mm512_min_@vsub@(im, re);
1967

1968
    /*
1969
     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
1970
     */
1971 1
    @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
1972 1
    @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
1973 1
    @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
1974 1
    @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
1975 1
    @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
1976
                                        ratio, ratio, _mm512_set1_@vsub@(1.0f)));
1977 1
    return _mm512_mul_@vsub@(hypot, larger);
1978
}
1979

1980
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1981
avx512_conjugate_@vsub@(const @vtype@ x)
1982
{
1983
    /*
1984
     * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
1985
     * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction
1986
     */
1987 1
    __m512i cast_x = _mm512_cast@vsub@_si512(x);
1988 1
    __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
1989
                                        cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
1990 1
    return _mm512_castsi512_@vsub@(res);
1991
}
1992

1993
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
1994
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
1995
{
1996
    // x1 = r1, i1
1997
    // x2 = r2, i2
1998 1
    @vtype@ x3  = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
1999 1
    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);            // r1*r2, i1*i2
2000 1
    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);            // r1*i2, r2*i1
2001 1
    @vtype@ outreal = avx512_hsub_@vsub@(x12);          // r1*r2 - i1*i2, r1*r2 - i1*i2
2002 1
    @vtype@ outimg  = avx512_hadd_@vsub@(x13);          // r1*i2 + i1*r2, r1*i2 + i1*r2
2003 1
    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
2004
}
2005

2006
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
2007
avx512_csquare_@vsub@(@vtype@ x)
2008
{
2009 1
    return avx512_cmul_@vsub@(x, x);
2010
}
2011

2012
/**end repeat**/
2013
#endif
2014

2015
/**begin repeat
2016
 * #ISA = FMA, AVX512F#
2017
 * #isa = fma, avx512#
2018
 * #vtype = __m256, __m512#
2019
 * #vsize = 256, 512#
2020
 * #or = or_ps, kor#
2021
 * #vsub = , _mask#
2022
 * #mask = __m256, __mmask16#
2023
 * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
2024
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
2025
 **/
2026

2027
#if defined @CHK@
2028

2029
/*
2030
 * Vectorized Cody-Waite range reduction technique
2031
 * Performs the reduction step x* = x - y*C in three steps:
2032
 * 1) x* = x - y*c1
2033
 * 2) x* = x - y*c2
2034
 * 3) x* = x - y*c3
2035
 * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
2036
 */
2037

2038
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
2039
@isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
2040
{
2041 1
    @vtype@ reduced_x = @fmadd@(y, c1, x);
2042 1
    reduced_x = @fmadd@(y, c2, reduced_x);
2043 1
    reduced_x = @fmadd@(y, c3, reduced_x);
2044
    return reduced_x;
2045
}
2046

2047
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
2048
@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin)
2049
{
2050 1
    @mask@ m1 = _mm@vsize@_cmp_ps@vsub@(
2051
                                x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ);
2052 1
    @mask@ m2 = _mm@vsize@_cmp_ps@vsub@(
2053
                                x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ);
2054 1
    return _mm@vsize@_@or@(m1,m2);
2055
}
2056

2057
/*
2058
 * Approximate cosine algorithm for x \in [-PI/4, PI/4]
2059
 * Maximum ULP across all 32-bit floats = 0.875
2060
 */
2061

2062
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
2063
@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
2064
                                                @vtype@ invf2, @vtype@ invf0)
2065
{
2066 1
    @vtype@ cos = @fmadd@(invf8, x2, invf6);
2067 1
    cos = @fmadd@(cos, x2, invf4);
2068 1
    cos = @fmadd@(cos, x2, invf2);
2069 1
    cos = @fmadd@(cos, x2, invf0);
2070
    return cos;
2071
}
2072

2073
/*
2074
 * Approximate sine algorithm for x \in [-PI/4, PI/4]
2075
 * Maximum ULP across all 32-bit floats = 0.647
2076
 */
2077

2078
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
2079
@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
2080
                                          @vtype@ invf5, @vtype@ invf3,
2081
                                          @vtype@ zero)
2082
{
2083 1
    @vtype@ sin = @fmadd@(invf9, x2, invf7);
2084 1
    sin = @fmadd@(sin, x2, invf5);
2085 1
    sin = @fmadd@(sin, x2, invf3);
2086 1
    sin = @fmadd@(sin, x2, zero);
2087 1
    sin = @fmadd@(sin, x, x);
2088
    return sin;
2089
}
2090

2091
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
2092
@isa@_sqrt_ps(@vtype@ x)
2093
{
2094 1
    return _mm@vsize@_sqrt_ps(x);
2095
}
2096

2097
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
2098
@isa@_sqrt_pd(@vtype@d x)
2099
{
2100 1
    return _mm@vsize@_sqrt_pd(x);
2101
}
2102

2103
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
2104
@isa@_square_ps(@vtype@ x)
2105
{
2106 1
    return _mm@vsize@_mul_ps(x,x);
2107
}
2108

2109
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
2110
@isa@_square_pd(@vtype@d x)
2111
{
2112 1
    return _mm@vsize@_mul_pd(x,x);
2113
}
2114

2115
#endif
2116
/**end repeat**/
2117

2118
/**begin repeat
2119
 * #type = npy_float, npy_double#
2120
 * #TYPE = FLOAT, DOUBLE#
2121
 * #num_lanes = 16, 8#
2122
 * #vsuffix = ps, pd#
2123
 * #mask = __mmask16, __mmask8#
2124
 * #vtype = __m512, __m512d#
2125
 * #scale = 4, 8#
2126
 * #vindextype = __m512i, __m256i#
2127
 * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
2128
 * #episize = epi32, epi64#
2129
 */
2130

2131
/**begin repeat1
2132
 * #func = isnan, isfinite, isinf, signbit#
2133
 * #IMM8 = 0x81, 0x99, 0x18, 0x04#
2134
 * #is_finite = 0, 1, 0, 0#
2135
 * #is_signbit = 0, 0, 0, 1#
2136
 */
2137
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
2138
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
2139
AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
2140
{
2141
    const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
2142
    npy_intp num_remaining_elements = array_size;
2143

2144
    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
2145
#if @is_signbit@
2146
    @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
2147
#endif
2148

2149
    /*
2150
     * Note: while generally indices are npy_intp, we ensure that our maximum
2151
     * index will fit in an int32 as a precondition for this function via
2152
     * IS_OUTPUT_BLOCKABLE_UNARY
2153
     */
2154

2155
    npy_int32 index_ip[@num_lanes@];
2156
    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
2157
        index_ip[ii] = ii*stride_ip;
2158
    }
2159
    @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
2160
    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
2161
    __m512i ones = _mm512_set1_@episize@(1);
2162

2163
    while (num_remaining_elements > 0) {
2164
        if (num_remaining_elements < @num_lanes@) {
2165
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
2166
                                    num_remaining_elements, @num_lanes@);
2167
        }
2168
        @vtype@ x1;
2169
        if (stride_ip == 1) {
2170
            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
2171
        }
2172
        else {
2173
            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
2174
        }
2175
#if @is_signbit@
2176
        x1 = _mm512_and_@vsuffix@(x1,signbit); 
2177
#endif
2178

2179
        @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
2180
#if @is_finite@
2181
        fpclassmask = _mm512_knot(fpclassmask);
2182
#endif
2183

2184
        __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
2185
        _mm_mask_storeu_epi8(op, load_mask, out);
2186

2187
        ip += @num_lanes@*stride_ip;
2188
        op += @num_lanes@;
2189
        num_remaining_elements -= @num_lanes@;
2190
    }
2191
}
2192
#endif
2193
/**end repeat1**/
2194
/**end repeat**/
2195

2196
/**begin repeat
2197
 * #type = npy_float, npy_double#
2198
 * #TYPE = FLOAT, DOUBLE#
2199
 * #num_lanes = 16, 8#
2200
 * #vsuffix = ps, pd#
2201
 * #mask = __mmask16, __mmask8#
2202
 * #vtype1 = __m512, __m512d#
2203
 * #vtype2 = __m512i, __m256i#
2204
 * #scale = 4, 8#
2205
 * #vindextype = __m512i, __m256i#
2206
 * #vindexsize = 512, 256#
2207
 * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
2208
 * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
2209
 * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
2210
 * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
2211
 * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
2212
 * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
2213
 */
2214

2215
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
2216
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
2217
AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
2218
{
2219
    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
2220
    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int);
2221
    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
2222
    const npy_intp array_size = dimensions[0];
2223
    npy_intp num_remaining_elements = array_size;
2224
    @type@* ip1 = (@type@*) args[0];
2225
    int* ip2 = (int*) args[1];
2226
    @type@* op  = (@type@*) args[2];
2227

2228
    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
2229

2230
    /*
2231
     * Note: while generally indices are npy_intp, we ensure that our maximum index
2232
     * will fit in an int32 as a precondition for this function via
2233
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
2234
     */
2235

2236
    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
2237
    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
2238
        index_ip1[ii] = ii*stride_ip1;
2239
        index_ip2[ii] = ii*stride_ip2;
2240
        index_op[ii] = ii*stride_op;
2241
    }
2242
    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
2243
    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
2244
    @vindextype@ vindex_op  = @vindexload@((@vindextype@*)&index_op[0]);
2245
    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
2246
    @vtype2@ zeros = @setzero@();
2247

2248
    while (num_remaining_elements > 0) {
2249
        if (num_remaining_elements < @num_lanes@) {
2250
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
2251
                                    num_remaining_elements, @num_lanes@);
2252
        }
2253
        @vtype1@ x1;
2254
        @vtype2@ x2;
2255
        if (stride_ip1 == 1) {
2256
            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
2257
        }
2258
        else {
2259
            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
2260
        }
2261
        if (stride_ip2 == 1) {
2262
            x2 = @vtype2_load@(load_mask, ip2);
2263
        }
2264
        else {
2265
            x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4);
2266
        }
2267

2268
        @vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2));
2269
        
2270
        if (stride_op == 1) {
2271
            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
2272
        }
2273
        else {
2274
            /* scatter! */
2275
            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
2276
        }
2277

2278
        ip1 += @num_lanes@*stride_ip1;
2279
        ip2 += @num_lanes@*stride_ip2;
2280
        op += @num_lanes@*stride_op;
2281
        num_remaining_elements -= @num_lanes@;
2282
    }
2283
}
2284

2285
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
2286
AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
2287
{
2288
    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
2289
    const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@);
2290
    const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int);
2291
    const npy_intp array_size = dimensions[0];
2292
    npy_intp num_remaining_elements = array_size;
2293
    @type@* ip1 = (@type@*) args[0];
2294
    @type@* op1  = (@type@*) args[1];
2295
    int* op2 = (int*) args[2];
2296

2297
    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
2298

2299
    /*
2300
     * Note: while generally indices are npy_intp, we ensure that our maximum index
2301
     * will fit in an int32 as a precondition for this function via
2302
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
2303
     */
2304

2305
    npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@];
2306
    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
2307
        index_ip1[ii] = ii*stride_ip1;
2308
        index_op1[ii] = ii*stride_op1;
2309
        index_op2[ii] = ii*stride_op2;
2310
    }
2311
    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
2312
    @vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]);
2313
    @vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]);
2314
    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
2315

2316
    while (num_remaining_elements > 0) {
2317
        if (num_remaining_elements < @num_lanes@) {
2318
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
2319
                                    num_remaining_elements, @num_lanes@);
2320
        }
2321
        @vtype1@ x1;
2322
        if (stride_ip1 == 1) {
2323
            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
2324
        }
2325
        else {
2326
            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
2327
        }
2328

2329
        /*
2330
         * The x86 instructions vpgetmant and vpgetexp do not conform
2331
         * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0
2332
         * We mask these values with spmask to avoid invalid exceptions.
2333
         */
2334
        @mask@ spmask =_mm512_knot(_mm512_fpclass_@vsuffix@_mask(
2335
                                                x1, 0b10011111));
2336
        @vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@(
2337
                                spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
2338
        out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1);
2339
        @vtype2@ out2 = _mm512_cvt@vsuffix@_epi32(
2340
                            _mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0),
2341
                                _mm512_maskz_getexp_@vsuffix@(spmask, x1)));
2342
        if (stride_op1 == 1) {
2343
            _mm512_mask_storeu_@vsuffix@(op1, load_mask, out1);
2344
        }
2345
        else {
2346
            _mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@);
2347
        }
2348
        if (stride_op2 == 1) {
2349
            @vtype2_store@(op2, load_mask, out2);
2350
        }
2351
        else {
2352
            @vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4);
2353
        }
2354

2355
        ip1 += @num_lanes@*stride_ip1;
2356
        op1 += @num_lanes@*stride_op1;
2357
        op2 += @num_lanes@*stride_op2;
2358
        num_remaining_elements -= @num_lanes@;
2359
    }
2360
}
2361
#endif
2362

2363
/**begin repeat1
2364
 *  #func = maximum, minimum#
2365
 *  #vectorf = max, min#
2366
 */
2367

2368
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
2369
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
2370 1
AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
2371
{
2372 1
    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
2373 1
    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
2374 1
    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
2375 1
    const npy_intp array_size = dimensions[0];
2376 1
    npy_intp num_remaining_elements = array_size;
2377 1
    @type@* ip1 = (@type@*) args[0];
2378 1
    @type@* ip2 = (@type@*) args[1];
2379 1
    @type@* op  = (@type@*) args[2];
2380

2381 1
    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
2382

2383
    /*
2384
     * Note: while generally indices are npy_intp, we ensure that our maximum index
2385
     * will fit in an int32 as a precondition for this function via
2386
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
2387
     */
2388

2389
    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
2390 1
    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
2391 1
        index_ip1[ii] = ii*stride_ip1;
2392 1
        index_ip2[ii] = ii*stride_ip2;
2393 1
        index_op[ii] = ii*stride_op;
2394
    }
2395 1
    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
2396 1
    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
2397 1
    @vindextype@ vindex_op  = @vindexload@((@vindextype@*)&index_op[0]);
2398 1
    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
2399

2400 1
    while (num_remaining_elements > 0) {
2401 1
        if (num_remaining_elements < @num_lanes@) {
2402 1
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
2403
                                    num_remaining_elements, @num_lanes@);
2404
        }
2405
        @vtype1@ x1, x2;
2406 1
        if (stride_ip1 == 1) {
2407 1
            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
2408
        }
2409
        else {
2410 1
            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
2411
        }
2412 1
        if (stride_ip2 == 1) {
2413 1
            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
2414
        }
2415
        else {
2416 1
            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
2417
        }
2418

2419
        /*
2420
         * when only one of the argument is a nan, the maxps/maxpd instruction
2421
         * returns the second argument. The additional blend instruction fixes
2422
         * this issue to conform with NumPy behaviour.
2423
         */
2424 1
        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
2425 1
        @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
2426 1
        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
2427

2428 1
        if (stride_op == 1) {
2429 1
            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
2430
        }
2431
        else {
2432
            /* scatter! */
2433 1
            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
2434
        }
2435

2436 1
        ip1 += @num_lanes@*stride_ip1;
2437 1
        ip2 += @num_lanes@*stride_ip2;
2438 1
        op += @num_lanes@*stride_op;
2439 1
        num_remaining_elements -= @num_lanes@;
2440
    }
2441 1
}
2442
#endif
2443
/**end repeat1**/
2444
/**end repeat**/
2445

2446
/**begin repeat
2447
 * #ISA = FMA, AVX512F#
2448
 * #isa = fma, avx512#
2449
 * #vsize = 256, 512#
2450
 * #BYTES = 32, 64#
2451
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
2452
 * #mask = __m256, __mmask16#
2453
 * #vsub = , _mask#
2454
 * #vtype = __m256, __m512#
2455
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
2456
 * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
2457
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
2458
 */
2459

2460
/**begin repeat1
2461
 *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
2462
 *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
2463
 *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
2464
 */
2465

2466
#if defined @CHK@
2467
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
2468 1
@ISA@_@func@_FLOAT(npy_float* op,
2469
                   npy_float* ip,
2470
                   const npy_intp array_size,
2471
                   const npy_intp steps)
2472
{
2473 1
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
2474 1
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
2475 1
    npy_intp num_remaining_elements = array_size;
2476 1
    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
2477 1
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
2478
#if @replace_0_with_1@
2479 1
    @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
2480
#endif
2481

2482
    /*
2483
     * Note: while generally indices are npy_intp, we ensure that our maximum index
2484
     * will fit in an int32 as a precondition for this function via
2485
     * IS_OUTPUT_BLOCKABLE_UNARY
2486
     */
2487

2488
    npy_int32 indexarr[16];
2489 1
    for (npy_int32 ii = 0; ii < 16; ii++) {
2490 1
        indexarr[ii] = ii*stride;
2491
    }
2492 1
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
2493

2494 1
    while (num_remaining_elements > 0) {
2495 1
        if (num_remaining_elements < num_lanes) {
2496 1
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
2497
                                                       num_lanes);
2498
#if @replace_0_with_1@
2499 1
            inv_load_mask = @isa@_invert_mask_ps(load_mask);
2500
#endif
2501
        }
2502
        @vtype@ x;
2503 1
        if (stride == 1) {
2504 1
            x = @isa@_masked_load_ps(load_mask, ip);
2505
#if @replace_0_with_1@
2506
            /*
2507
             * Replace masked elements with 1.0f to avoid divide by zero fp
2508
             * exception in reciprocal
2509
             */
2510 1
            x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
2511
#endif
2512
        }
2513
        else {
2514 1
            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
2515
        }
2516 1
        @vtype@ out = @isa@_@vectorf@_ps(x);
2517 1
        @masked_store@(op, @cvtps_epi32@(load_mask), out);
2518

2519 1
        ip += num_lanes*stride;
2520 1
        op += num_lanes;
2521 1
        num_remaining_elements -= num_lanes;
2522
    }
2523 1
}
2524
#endif
2525
/**end repeat1**/
2526
/**end repeat**/
2527

2528
/**begin repeat
2529
 * #ISA = FMA, AVX512F#
2530
 * #isa = fma, avx512#
2531
 * #vsize = 256, 512#
2532
 * #BYTES = 32, 64#
2533
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
2534
 * #mask = __m256i, __mmask8#
2535
 * #vsub = , _mask#
2536
 * #vtype = __m256d, __m512d#
2537
 * #vindextype = __m128i, __m256i#
2538
 * #vindexsize = 128, 256#
2539
 * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
2540
 * #cvtps_epi32 = _mm256_cvtpd_epi32, #
2541
 * #castmask = _mm256_castsi256_pd, #
2542
 * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
2543
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
2544
 */
2545

2546
/**begin repeat1
2547
 *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
2548
 *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
2549
 *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
2550
 */
2551

2552
#if defined @CHK@
2553
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
2554 1
@ISA@_@func@_DOUBLE(npy_double* op,
2555
                    npy_double* ip,
2556
                    const npy_intp array_size,
2557
                    const npy_intp steps)
2558
{
2559 1
    const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
2560 1
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
2561 1
    npy_intp num_remaining_elements = array_size;
2562 1
    @mask@ load_mask = @isa@_get_full_load_mask_pd();
2563
#if @replace_0_with_1@
2564 1
    @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
2565
#endif
2566 1
    @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
2567

2568
    /*
2569
     * Note: while generally indices are npy_intp, we ensure that our maximum index
2570
     * will fit in an int32 as a precondition for this function via
2571
     * IS_OUTPUT_BLOCKABLE_UNARY
2572
     */
2573
    npy_int32 indexarr[8];
2574 1
    for (npy_int32 ii = 0; ii < 8; ii++) {
2575 1
        indexarr[ii] = ii*stride;
2576
    }
2577 1
    @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
2578

2579 1
    while (num_remaining_elements > 0) {
2580 1
        if (num_remaining_elements < num_lanes) {
2581 1
            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
2582
                                                       num_lanes);
2583
#if @replace_0_with_1@
2584 1
            inv_load_mask = @isa@_invert_mask_pd(load_mask);
2585
#endif
2586
        }
2587
        @vtype@ x;
2588 1
        if (stride == 1) {
2589 1
            x = @isa@_masked_load_pd(load_mask, ip);
2590
#if @replace_0_with_1@
2591
            /*
2592
             * Replace masked elements with 1.0f to avoid divide by zero fp
2593
             * exception in reciprocal
2594
             */
2595 1
            x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
2596
#endif
2597
        }
2598
        else {
2599 1
            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
2600
        }
2601 1
        @vtype@ out = @isa@_@vectorf@_pd(x);
2602 1
        @masked_store@(op, load_mask, out);
2603

2604 1
        ip += num_lanes*stride;
2605 1
        op += num_lanes;
2606 1
        num_remaining_elements -= num_lanes;
2607
    }
2608 1
}
2609
#endif
2610
/**end repeat1**/
2611
/**end repeat**/
2612

2613
/**begin repeat
2614
 * #ISA = FMA, AVX512F#
2615
 * #isa = fma, avx512#
2616
 * #vtype = __m256, __m512#
2617
 * #vsize = 256, 512#
2618
 * #BYTES = 32, 64#
2619
 * #mask = __m256, __mmask16#
2620
 * #vsub = , _mask#
2621
 * #or_masks =_mm256_or_ps, _mm512_kor#
2622
 * #and_masks =_mm256_and_ps, _mm512_kand#
2623
 * #xor_masks =_mm256_xor_ps, _mm512_kxor#
2624
 * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
2625
 * #mask_to_int = _mm256_movemask_ps, #
2626
 * #full_mask= 0xFF, 0xFFFF#
2627
 * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
2628
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
2629
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
2630
 */
2631

2632
/*
2633
 * Vectorized approximate sine/cosine algorithms: The following code is a
2634
 * vectorized version of the algorithm presented here:
2635
 * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
2636
 * (1) Load data in ZMM/YMM registers and generate mask for elements that are
2637
 * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
2638
 * 117435.992f] for sine.
2639
 * (2) For elements within range, perform range reduction using Cody-Waite's
2640
 * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
2641
 * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
2642
 * int(y).
2643
 * (4) For elements outside that range, Cody-Waite reduction performs poorly
2644
 * leading to catastrophic cancellation. We compute cosine by calling glibc in
2645
 * a scalar fashion.
2646
 * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
2647
 * 5-7x faster than scalar implementations when magnitude of all elements in
2648
 * the array < 71476.0625f (117435.992f for sine). Worst case performance is
2649
 * when all the elements are large leading to about 1-2% reduction in
2650
 * performance.
2651
 */
2652

2653
#if defined @CHK@
2654
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
2655 1
@ISA@_sincos_FLOAT(npy_float * op,
2656
                   npy_float * ip,
2657
                   const npy_intp array_size,
2658
                   const npy_intp steps,
2659
                   NPY_TRIG_OP my_trig_op)
2660
{
2661 1
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
2662 1
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
2663 1
    npy_float large_number = 71476.0625f;
2664 1
    if (my_trig_op == npy_compute_sin) {
2665 1
        large_number = 117435.992f;
2666
    }
2667

2668
    /* Load up frequently used constants */
2669 1
    @vtype@i zeros = _mm@vsize@_set1_epi32(0);
2670 1
    @vtype@i ones = _mm@vsize@_set1_epi32(1);
2671 1
    @vtype@i twos = _mm@vsize@_set1_epi32(2);
2672 1
    @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
2673 1
    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
2674 1
    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
2675 1
    @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
2676 1
    @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
2677 1
    @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
2678 1
    @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
2679 1
    @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
2680 1
    @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
2681 1
    @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
2682 1
    @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
2683 1
    @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
2684 1
    @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
2685 1
    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
2686 1
    @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
2687
    @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
2688
    @vtype@i iquadrant;
2689
    @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
2690 1
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
2691 1
    npy_intp num_remaining_elements = array_size;
2692

2693
    /*
2694
     * Note: while generally indices are npy_intp, we ensure that our maximum index
2695
     * will fit in an int32 as a precondition for this function via
2696
     * IS_OUTPUT_BLOCKABLE_UNARY
2697
     */
2698
    npy_int32 indexarr[16];
2699 1
    for (npy_int32 ii = 0; ii < 16; ii++) {
2700 1
        indexarr[ii] = ii*stride;
2701
    }
2702 1
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
2703

2704 1
    while (num_remaining_elements > 0) {
2705

2706 1
        if (num_remaining_elements < num_lanes) {
2707 1
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
2708
                                                         num_lanes);
2709
        }
2710

2711
        @vtype@ x;
2712 1
        if (stride == 1) {
2713 1
            x = @isa@_masked_load_ps(load_mask, ip);
2714
        }
2715
        else {
2716 1
            x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
2717
        }
2718

2719
        /*
2720
         * For elements outside of this range, Cody-Waite's range reduction
2721
         * becomes inaccurate and we will call glibc to compute cosine for
2722
         * these numbers
2723
         */
2724

2725 1
        glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
2726 1
        glibc_mask = @and_masks@(load_mask, glibc_mask);
2727 1
        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
2728 1
        x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
2729 1
        npy_int iglibc_mask = @mask_to_int@(glibc_mask);
2730

2731 1
        if (iglibc_mask != @full_mask@) {
2732 1
            quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
2733

2734
            /* round to nearest */
2735 1
            quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
2736 1
            quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
2737

2738
            /* Cody-Waite's range reduction algorithm */
2739 1
            reduced_x = @isa@_range_reduction(x, quadrant,
2740
                                                   codyw_c1, codyw_c2, codyw_c3);
2741 1
            reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
2742

2743
            /* compute cosine and sine */
2744 1
            cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
2745
                                                           cos_invf2, cos_invf0);
2746 1
            sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
2747
                                             sin_invf5, sin_invf3, zero_f);
2748

2749 1
            iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
2750 1
            if (my_trig_op == npy_compute_cos) {
2751 1
                iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
2752
            }
2753

2754
            /* blend sin and cos based on the quadrant */
2755 1
            sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
2756 1
            cos = @isa@_blend(cos, sin, sine_mask);
2757

2758
            /* multiply by -1 for appropriate elements */
2759 1
            negate_mask = @isa@_should_negate(iquadrant, twos, twos);
2760 1
            cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
2761 1
            cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
2762

2763 1
            @masked_store@(op, @cvtps_epi32@(load_mask), cos);
2764
        }
2765

2766
        /* process elements using glibc for large elements */
2767 1
        if (my_trig_op == npy_compute_cos) {
2768 1
            for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
2769 1
                if (iglibc_mask & 0x01) {
2770 1
                    op[ii] = npy_cosf(ip[jj]);
2771
                }
2772 1
                iglibc_mask  = iglibc_mask >> 1;
2773
            }
2774
        }
2775
        else {
2776 1
            for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
2777 1
                if (iglibc_mask & 0x01) {
2778 1
                    op[ii] = npy_sinf(ip[jj]);
2779
                }
2780 1
                iglibc_mask  = iglibc_mask >> 1;
2781
            }
2782
        }
2783 1
        ip += num_lanes*stride;
2784 1
        op += num_lanes;
2785 1
        num_remaining_elements -= num_lanes;
2786
    }
2787 1
}
2788

2789
/*
2790
 * Vectorized implementation of exp using AVX2 and AVX512:
2791
 * 1) if x >= xmax; return INF (overflow)
2792
 * 2) if x <= xmin; return 0.0f (underflow)
2793
 * 3) Range reduction (using Coyd-Waite):
2794
 *      a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)]
2795
 * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
2796
 *      b) P = 5th order and Q = 2nd order polynomials obtained from Remez's
2797
 *      algorithm (mini-max polynomial approximation)
2798
 * 5) Compute exp(x) = exp(y) * 2^k
2799
 * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
2800
 * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the
2801
 * same x = 0xc2781e37)
2802
 */

static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_exp_FLOAT(npy_float * op,
                npy_float * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
    npy_float xmax = 88.72283935546875f;
    npy_float xmin = -103.97208404541015625f;

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }

    /* Load up frequently used constants */
    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
    @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf);
    @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf);
    @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf);
    @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf);
    @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf);
    @vtype@ exp_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf);
    @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf);
    @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf);
    @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf);
    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
    @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef);
    @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
    @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
    @vtype@ poly, num_poly, denom_poly, quadrant;
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);

    @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
    @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
        }

        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_ps(load_mask, ip);
        }
        else {
            x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
        }

        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
        x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask);

        xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
        xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
        inf_mask = _mm@vsize@_cmp_ps@vsub@(x, inf, _CMP_EQ_OQ);
        overflow_mask = @or_masks@(overflow_mask,
                                    @xor_masks@(xmax_mask, inf_mask));

        x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@(
                                    @or_masks@(nan_mask, xmin_mask), xmax_mask));

        quadrant = _mm@vsize@_mul_ps(x, log2e);

        /* round to nearest */
        quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
        quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
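        /*
         * Adding and then subtracting the large "magic" constant pushes the
         * fractional bits out of the float significand, leaving quadrant
         * rounded to the nearest integer without a float->int->float round
         * trip.
         */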

        /* Cody-Waite's range reduction algorithm */
        x = @isa@_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f);

        num_poly = @fmadd@(exp_p5, x, exp_p4);
        num_poly = @fmadd@(num_poly, x, exp_p3);
        num_poly = @fmadd@(num_poly, x, exp_p2);
        num_poly = @fmadd@(num_poly, x, exp_p1);
        num_poly = @fmadd@(num_poly, x, exp_p0);
        denom_poly = @fmadd@(exp_q2, x, exp_q1);
        denom_poly = @fmadd@(denom_poly, x, exp_q0);
        poly = _mm@vsize@_div_ps(num_poly, denom_poly);

        /*
         * compute val = poly * 2^quadrant, which is the same as adding
         * quadrant (an integer-valued float) to the exponent of poly. Since
         * the float exponent field is only 8 bits wide, quadrant fits in it
         * directly.
         */
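        /*
         * Worked example: for poly = 1.5 and quadrant = 3.0 the scalef step
         * below yields 1.5 * 2^3 = 12.0, i.e. 3 is added directly to the
         * biased exponent field of poly.
         */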
        poly = @isa@_scalef_ps(poly, quadrant);

        /*
         * elem > xmax; return inf
         * elem < xmin; return 0.0f
         * elem = +/- nan, return nan
         */
        poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
        poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask);
        poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask);

        @masked_store@(op, @cvtps_epi32@(load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if (@mask_to_int@(overflow_mask)) {
        npy_set_floatstatus_overflow();
    }
}

/*
 * Vectorized implementation of log using AVX2 and AVX512
 * 1) if x < 0.0f; return -NAN (invalid input)
 * 2) Range reduction: y = x/2^k;
 *      a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
 * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 5th order polynomials obtained from Remez's
 *      algorithm (mini-max polynomial approximation)
 * 4) Compute log(x) = log(y) + k*ln(2)
 * 5) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
 * 6) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same
 * x = 0x3f486945)
 */
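
/*
 * Illustrative scalar sketch of the flow used by @ISA@_log_FLOAT below
 * (kept out of the build with #if 0). The helper name and the use of
 * frexpf() from <math.h> in place of the vector get_mantissa/get_exponent
 * helpers are assumptions for illustration only; special-value handling is
 * omitted. The mantissa is normalized into (1/sqrt(2), sqrt(2)], shifted to
 * t = y - 1, approximated by the rational P(t)/Q(t), and the exponent
 * contribution k*ln(2) is added back.
 */
#if 0
static float
scalar_log_sketch(float x)
{
    int k;
    float y = frexpf(x, &k);               /* x = y * 2^k, 0.5 <= y < 1 */
    if (y <= NPY_SQRT1_2f) {
        y = 2.0f * y;
        k = k - 1;
    }
    float t = y - 1.0f;
    float p = ((((NPY_COEFF_P5_LOGf * t + NPY_COEFF_P4_LOGf) * t +
                 NPY_COEFF_P3_LOGf) * t + NPY_COEFF_P2_LOGf) * t +
                 NPY_COEFF_P1_LOGf) * t + NPY_COEFF_P0_LOGf;
    float q = ((((NPY_COEFF_Q5_LOGf * t + NPY_COEFF_Q4_LOGf) * t +
                 NPY_COEFF_Q3_LOGf) * t + NPY_COEFF_Q2_LOGf) * t +
                 NPY_COEFF_Q1_LOGf) * t + NPY_COEFF_Q0_LOGf;
    return (float)k * NPY_LOGE2f + p / q;
}
#endif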
2935

2936
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
2937 1
@ISA@_log_FLOAT(npy_float * op,
2938
                npy_float * ip,
2939
                const npy_intp array_size,
2940
                const npy_intp steps)
2941
{
2942 1
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
2943 1
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
2944

2945
    /*
2946
     * Note: while generally indices are npy_intp, we ensure that our maximum index
2947
     * will fit in an int32 as a precondition for this function via
2948
     * IS_OUTPUT_BLOCKABLE_UNARY
2949
     */
2950
    npy_int32 indexarr[16];
2951 1
    for (npy_int32 ii = 0; ii < 16; ii++) {
2952 1
        indexarr[ii] = ii*stride;
2953
    }
2954

2955
    /* Load up frequently used constants */
2956 1
    @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
2957 1
    @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf);
2958 1
    @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf);
2959 1
    @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf);
2960 1
    @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf);
2961 1
    @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf);
2962 1
    @vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf);
2963 1
    @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf);
2964 1
    @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf);
2965 1
    @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf);
2966 1
    @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf);
2967 1
    @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf);
2968 1
    @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f);
2969 1
    @vtype@ nan = _mm@vsize@_set1_ps(NPY_NANF);
2970 1
    @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF);
2971 1
    @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF);
2972 1
    @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
2973 1
    @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
2974 1
    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
2975 1
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr);
2976
    @vtype@ poly, num_poly, denom_poly, exponent;
2977

2978
    @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
2979 1
    @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
2980 1
    @mask@ divide_by_zero_mask = invalid_mask;
2981 1
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
2982 1
    npy_intp num_remaining_elements = array_size;
2983

2984 1
    while (num_remaining_elements > 0) {
2985

2986 1
        if (num_remaining_elements < num_lanes) {
2987 1
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
2988
                                                       num_lanes);
2989
        }
2990

2991
        @vtype@ x_in;
2992 1
        if (stride == 1) {
2993 1
            x_in = @isa@_masked_load_ps(load_mask, ip);
2994
        }
2995
        else {
2996 1
            x_in  = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
2997
        }
2998

2999 1
        negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
3000 1
        zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
3001 1
        inf_mask = _mm@vsize@_cmp_ps@vsub@(x_in, inf, _CMP_EQ_OQ);
3002 1
        nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ);
3003 1
        divide_by_zero_mask = @or_masks@(divide_by_zero_mask,
3004 1
                                        @and_masks@(zero_mask, load_mask));
3005 1
        invalid_mask = @or_masks@(invalid_mask, negx_mask);
3006

3007 1
        @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask);
3008

3009
        /* set x = normalized mantissa */
3010 1
        exponent = @isa@_get_exponent(x);
3011 1
        x = @isa@_get_mantissa(x);
3012

        /* if x <= 1/sqrt(2) { exponent = exponent - 1; x = 2*x } */
        sqrt2_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ);
        x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask);
        exponent = @isa@_blend(exponent,
                               _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask);

        /* x = x - 1 */
        x = _mm@vsize@_sub_ps(x, ones_f);

        /* Polynomial approximation for log(1+x) */
        num_poly = @fmadd@(log_p5, x, log_p4);
        num_poly = @fmadd@(num_poly, x, log_p3);
        num_poly = @fmadd@(num_poly, x, log_p2);
        num_poly = @fmadd@(num_poly, x, log_p1);
        num_poly = @fmadd@(num_poly, x, log_p0);
        denom_poly = @fmadd@(log_q5, x, log_q4);
        denom_poly = @fmadd@(denom_poly, x, log_q3);
        denom_poly = @fmadd@(denom_poly, x, log_q2);
        denom_poly = @fmadd@(denom_poly, x, log_q1);
        denom_poly = @fmadd@(denom_poly, x, log_q0);
        poly = _mm@vsize@_div_ps(num_poly, denom_poly);
        poly = @fmadd@(exponent, loge2, poly);

        /*
         * x < 0.0f; return -NAN
         * x = +/- NAN; return NAN
         * x = 0.0f; return -INF
         */
        poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask);
        poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask);
        poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask);
        poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask);

        @masked_store@(op, @cvtps_epi32@(load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if (@mask_to_int@(invalid_mask)) {
        npy_set_floatstatus_invalid();
    }
    if (@mask_to_int@(divide_by_zero_mask)) {
        npy_set_floatstatus_divbyzero();
    }
}
#endif
/**end repeat**/

/*
 * Vectorized implementation of exp double using AVX512
 * Reference: Tang, P.T.P., "Table-driven implementation of the
 *  exponential function in IEEE floating-point
 *  arithmetic," ACM Transactions on Mathematical
 *  Software, vol. 15, pp. 144-157, 1989.
 * 1) if x > mTH_max or x is INF; return INF (overflow)
 * 2) if x < mTH_min; return 0.0 (underflow)
 * 3) if abs(x) < mTH_nearzero; return 1.0 + x
 * 4) if x is NaN; return NaN
 * 5) Range reduction:
 *    x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64]
 * 6) exp(r) - 1 is approximated by a polynomial function p(r)
 *    exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r));
 */
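
/*
 * Illustrative scalar sketch of the table-driven scheme above (kept out of
 * the build with #if 0). The helper name and the use of llrint()/ldexp()
 * from <math.h> in place of the magic-constant rounding and scalef steps
 * are assumptions for illustration only; the special cases listed above are
 * omitted. EXP_Table_top/EXP_Table_tail hold 2^(j/32) split into a high and
 * a low part so the reconstruction keeps extra precision.
 */
#if 0
static double
scalar_exp_double_sketch(double x)
{
    npy_int64 k = (npy_int64)llrint(x * NPY_INV_LN2_MUL_32); /* round(32*x/ln2) */
    npy_int64 j = k & 0x1f;                                  /* table index j   */
    npy_int64 m = (k - j) / 32;                              /* power of two m  */
    double r = (x + (double)k * NPY_TANG_NEG_L1) + (double)k * NPY_TANG_NEG_L2;
    /* p(r) ~ exp(r) - 1 on the reduced range */
    double p = ((((NPY_TANG_A5 * r + NPY_TANG_A4) * r + NPY_TANG_A3) * r +
                  NPY_TANG_A2) * r + NPY_TANG_A1) * r * r + r;
    double top = EXP_Table_top[j];
    double tail = EXP_Table_tail[j];
    /* exp(x) = 2^m * (top + (tail + (top + tail)*p)) */
    return ldexp(top + (tail + (top + tail) * p), (int)m);
}
#endif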
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void
AVX512F_exp_DOUBLE(npy_double * op,
                npy_double * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    npy_intp num_remaining_elements = array_size;
    const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
    const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
    npy_int32 indexarr[8];
    for (npy_int32 ii = 0; ii < 8; ii++) {
        indexarr[ii] = ii*stride;
    }

    __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32);
    __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC);
    __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1);
    __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2);
    __m512i mMod = _mm512_set1_epi64(0x1f);
    __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1);
    __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2);
    __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3);
    __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4);
    __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5);
    __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54);
    __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9);
    __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9);
    __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY);
    __m512d zeros_d = _mm512_set1_pd(0.0f);
    __m512d ones_d = _mm512_set1_pd(1.0f);
    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);

    __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0]));
    __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1]));
    __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2]));
    __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3]));
    __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0]));
    __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1]));
    __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2]));
    __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3]));

    __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
    __mmask8 load_mask = avx512_get_full_load_mask_pd();
    __mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask;

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < num_lanes) {
            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
                                                      num_lanes);
        }

        __m512d x;
        if (1 == stride) {
            x = avx512_masked_load_pd(load_mask, ip);
        }
        else {
            x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
        }

        nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ);
        x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask);
        xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ);
        xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ);
        inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ);
        __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
                                _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
        nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
                                    mTH_nearzero, _CMP_LT_OQ);
        nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask);
        overflow_mask = _mm512_kor(overflow_mask,
                                _mm512_kxor(xmax_mask, inf_mask));
        x = avx512_set_masked_lanes_pd(x, zeros_d,
                        _mm512_kor(_mm512_kor(nan_mask, xmin_mask),
                            _mm512_kor(xmax_mask, nearzero_mask)));

        /* z = x * 32/ln2 */
        __m512d z = _mm512_mul_pd(x, InvLn2N);

        /* round to nearest */
        __m512d kd = _mm512_add_pd(z, mShift);
        __m512i ki = _mm512_castpd_si512(kd);
        kd = _mm512_sub_pd(kd, mShift);

        /* r = (x + kd*mNegL1) + kd*mNegL2 */
        __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x);
        __m512d r2 = _mm512_mul_pd(kd, mNegL2);
        __m512d r = _mm512_add_pd(r1,r2);

        /* Polynomial approximation for exp(r) - 1 */
        __m512d q = _mm512_fmadd_pd(mA5, r, mA4);
        q = _mm512_fmadd_pd(q, r, mA3);
        q = _mm512_fmadd_pd(q, r, mA2);
        q = _mm512_fmadd_pd(q, r, mA1);
        q = _mm512_mul_pd(q, r);
        __m512d p = _mm512_fmadd_pd(r, q, r2);
        p = _mm512_add_pd(r1, p);

        /* Get 2^(j/32) from lookup table */
        __m512i j = _mm512_and_epi64(ki, mMod);
        __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1,
                                  mTable_top_2, mTable_top_3, j);
        __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1,
                                  mTable_tail_2, mTable_tail_3, j);

        /*
         * s = top + tail;
         * exp(x) = 2^m * (top + (tail + s * p));
         */
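        /*
         * top and tail are the high and low pieces of 2^(j/32) from the
         * table, kept separate until this point to preserve the extra
         * precision of the table entries.
         */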
        __m512d s = _mm512_add_pd(top, tail);
        __m512d res = _mm512_fmadd_pd(s, p, tail);
        res = _mm512_add_pd(res, top);
        res = _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32)));

        /* return special cases */
        res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
                                        nearzero_mask);
        res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
                                        nan_mask);
        res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask);
        res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask);

        _mm512_mask_storeu_pd(op, load_mask, res);

        ip += num_lanes * stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }
    if (overflow_mask) {
        npy_set_floatstatus_overflow();
    }
}
#endif
#endif

/*
 * Vectorized implementation of log double using AVX512
 * Reference:
 * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions
 *     and their error analysis. No. CONF-9106103-1. Argonne National Lab.,
 *     IL (USA), 1991.
 * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm
 *     function in IEEE floating-point arithmetic." ACM Transactions on
 *     Mathematical Software (TOMS) 16.4 (1990): 378-400.
 * [3] Muller, Jean-Michel. "Elementary functions: algorithms and
 *     implementation." (2016).
 * 1) if x = 0; return -INF
 * 2) if x < 0; return NAN
 * 3) if x is INF; return INF
 * 4) if x is NAN; return NAN
 * 5) if x is in (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), fall back to npy_log()
 * 6) Range reduction:
 *    log(x) = log(2^m * z)
 *           = m*ln2 + log(z)
 * 7) log(z) = log(z / c_k) + log(c_k);
 *    where c_k = 1 + k/64, k = 0,1,...,64
 *    s.t. |z - c_k| <= 1/128 when z is in [1, 2].
 * 8) r = 2(z - c_k)/(z + c_k)
 *    log(z/c_k) = log((1 + r/2) / (1 - r/2))
 *               = p(r)
 *               = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...)
 */
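
/*
 * Illustrative scalar sketch of the table-driven scheme above (kept out of
 * the build with #if 0). The helper name and the memcpy-based bit
 * manipulation standing in for the vector integer ops are assumptions for
 * illustration only; denormal handling, the special cases and the near-1.0
 * npy_log() fallback are omitted. LOG_TABLE_TOP/LOG_TABLE_TAIL hold
 * log(c_k) split into high and low parts.
 */
#if 0
static double
scalar_log_double_sketch(double x)
{
    npy_uint64 ix;
    memcpy(&ix, &x, sizeof(ix));
    npy_uint64 tmp = ix - 0x3ff0000000000000ULL;
    npy_int64 k = (npy_int64)tmp >> 52;                /* unbiased exponent m */
    npy_uint64 i = (tmp >> (52 - 6)) & 0x3f;           /* table index 0..63   */
    npy_uint64 iz = ix - (tmp & 0xfff0000000000000ULL);
    double z;
    memcpy(&z, &iz, sizeof(z));                        /* z in [1, 2)         */
    double c = 1.0 + (double)i / 64.0;                 /* c_k = 1 + i/64      */
    double u = 2.0 * (z - c) / (z + c);
    double v = u * u;
    /* log(z/c) ~ u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
    double poly = u + u * v * (NPY_TANG_LOG_A1 + v * (NPY_TANG_LOG_A2 +
                      v * (NPY_TANG_LOG_A3 + v * NPY_TANG_LOG_A4)));
    return (double)k * NPY_TANG_LOG_LN2HI + LOG_TABLE_TOP[i] +
           (double)k * NPY_TANG_LOG_LN2LO + LOG_TABLE_TAIL[i] + poly;
}
#endif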
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void
AVX512F_log_DOUBLE(npy_double * op,
                npy_double * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    npy_intp num_remaining_elements = array_size;
    const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
    const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
    npy_int32 indexarr[8];
    for (npy_int32 ii = 0; ii < 8; ii++) {
        indexarr[ii] = ii*stride;
    }

    __m512d zeros_d = _mm512_set1_pd(0.0f);
    __m512d ones_d = _mm512_set1_pd(1.0f);
    __m512d mInf = _mm512_set1_pd(NPY_INFINITY);
    __m512d mInv64 = (__m512d)(_mm512_set1_epi64(0x3f90000000000000));
    __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN);
    __m512d mNan = _mm512_set1_pd(NPY_NAN);
    __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY);
    __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1);
    __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2);
    __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3);
    __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4);
    __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI);
    __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO);

    __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4);
    __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4);
    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);

    /* Load lookup table data */
    /**begin repeat
     * #i = 0, 1, 2, 3, 4, 5, 6, 7#
     */

    __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@]));
    __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@]));

    /**end repeat**/

    __mmask8 load_mask = avx512_get_full_load_mask_pd();
    __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
    __mmask8 divide_by_zero_mask = invalid_mask;

    __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask,
             glibc_mask;

    __m512d x;
    while (num_remaining_elements > 0) {
        if (num_remaining_elements < num_lanes) {
            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
                                                      num_lanes);
        }

        if (1 == stride) {
            x = avx512_masked_load_pd(load_mask, ip);
        }
        else {
            x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
        }

        /* fall back to glibc's log when x is in (1.0 - 0x1p-4, 1.0 + 0x1.09p-4) */
        __mmask8 m1 = _mm512_cmp_pd_mask(x, mTo_glibc_max, _CMP_LT_OQ);
        __mmask8 m2 = _mm512_cmp_pd_mask(x, mTo_glibc_min, _CMP_GT_OQ);
        glibc_mask = m1 & m2;

        if (glibc_mask != 0xFF) {
            zero_mask = _mm512_cmp_pd_mask(x, zeros_d, _CMP_EQ_OQ);
            inf_mask = _mm512_cmp_pd_mask(x, mInf, _CMP_EQ_OQ);
            negx_mask = _mm512_cmp_pd_mask(x, zeros_d, _CMP_LT_OQ);
            nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ);

            divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask);
            invalid_mask = invalid_mask | negx_mask;

            x = avx512_set_masked_lanes_pd(x, zeros_d, negx_mask);
            __m512i ix = (__m512i)x;

            /* Normalize x when it is denormal */
            __m512i top12 = _mm512_and_epi64(ix,
                                _mm512_set1_epi64(0xfff0000000000000));
            denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0),
                                _CMP_EQ_OQ);
            denormal_mask = (~zero_mask) & denormal_mask;
            ix = (__m512i)_mm512_mask_mul_pd(x, denormal_mask,
                                    x, _mm512_set1_pd(0x1p52));
            ix = _mm512_mask_sub_epi64(ix, denormal_mask,
                                    ix, _mm512_set1_epi64(52ULL << 52));

            /*
             * x = 2^k * z; where z in range [1,2]
             */
            __m512i tmp = _mm512_sub_epi64(ix,
                              _mm512_set1_epi64(0x3ff0000000000000));
            __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6),
                            _mm512_set1_epi64(0x3fULL));
            __m512i ik = _mm512_srai_epi64(tmp, 52);
            __m512d z = (__m512d)(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp,
                            _mm512_set1_epi64(0xfff0000000000000))));
            /* c = i/64 + 1 */
            __m256i i_32 = _mm512_cvtepi64_epi32(i);
            __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d);

            /* u = 2 * (z - c) / (z + c) */
            __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c));
            u = _mm512_mul_pd(_mm512_set1_pd(2.0), u);

            /* v = u * u */
            __m512d v = _mm512_mul_pd(u,u);

            /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
            __m512d res = _mm512_fmadd_pd(v, mA4, mA3);
            res = _mm512_fmadd_pd(v, res, mA2);
            res = _mm512_fmadd_pd(v, res, mA1);
            res = _mm512_mul_pd(v, res);
            res = _mm512_fmadd_pd(u, res, u);

            /* Load lookup table data */
            __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1,
                            mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5,
                            mLUT_TOP_6, mLUT_TOP_7, i);
            __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1,
                              mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5,
                              mLUT_TAIL_6, mLUT_TAIL_7, i);

            /*
             * log(x) = k * ln2_hi + c_hi +
             *          k * ln2_lo + c_lo +
             *          log(z/c)
             */
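            /*
             * c_hi/c_lo come from the lookup table as the high and low parts
             * of log(c_k), and ln2 is likewise split into mLN2HI/mLN2LO, so
             * each of the two accumulations carries a small correction term.
             */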
            __m256i ik_32 = _mm512_cvtepi64_epi32(ik);
            __m512d k = _mm512_cvtepi32_pd(ik_32);
            __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi);
            __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo);
            tt = _mm512_add_pd(tt, tt2);
            res = _mm512_add_pd(tt, res);

            /* return special cases */
            res = avx512_set_masked_lanes_pd(res, mNan, nan_mask);
            res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask);
            res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask);
            res = avx512_set_masked_lanes_pd(res, mInf, inf_mask);

            _mm512_mask_storeu_pd(op, load_mask, res);
        }

        /* call glibc's log for lanes where x is near 1.0 */
        for (int ii = 0, jj = 0; glibc_mask != 0; ii++, jj += stride) {
            if (glibc_mask & 0x01) {
                op[ii] = npy_log(ip[jj]);
            }
            glibc_mask = glibc_mask >> 1;
        }

        ip += num_lanes * stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if (invalid_mask) {
        npy_set_floatstatus_invalid();
    }
    if (divide_by_zero_mask) {
        npy_set_floatstatus_divbyzero();
    }
}
#endif
#endif

/**begin repeat
 * #TYPE = CFLOAT, CDOUBLE#
 * #type = npy_float, npy_double#
 * #num_lanes = 16, 8#
 * #vsuffix = ps, pd#
 * #epi_vsub  = epi32, epi64#
 * #mask = __mmask16, __mmask8#
 * #vtype = __m512, __m512d#
 * #scale = 4, 8#
 * #vindextype = __m512i, __m256i#
 * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
 * #storemask = 0xFF, 0xF#
 * #IS_FLOAT = 1, 0#
 */

/**begin repeat1
 *  #func = add, subtract, multiply#
 *  #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    @type@* ip1 = (@type@*) args[0];
    @type@* ip2 = (@type@*) args[1];
    @type@* op  = (@type@*) args[2];

    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements, @num_lanes@);
        }
        @vtype@ x1, x2;
        x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
        x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);

        @vtype@ out = @vectorf@_@vsuffix@(x1, x2);

        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);

        ip1 += @num_lanes@;
        ip2 += @num_lanes@;
        op += @num_lanes@;
        num_remaining_elements -= @num_lanes@;
    }
}
#endif
/**end repeat1**/

/**begin repeat1
 *  #func = square, conjugate#
 *  #vectorf = avx512_csquare, avx512_conjugate#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(@type@ * op,
                      @type@ * ip,
                      const npy_intp array_size,
                      const npy_intp steps)
{
    npy_intp num_remaining_elements = 2*array_size;
    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;

     /*
      * Note: while generally indices are npy_intp, we ensure that our maximum index
      * will fit in an int32 as a precondition for this function via max_stride
      */
    npy_int32 index_ip1[16];
    for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
        index_ip1[ii] = ii*stride_ip1;
        index_ip1[ii+1] = ii*stride_ip1 + 1;
    }
    @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
    @vtype@ zeros = _mm512_setzero_@vsuffix@();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements, @num_lanes@);
        }
        @vtype@ x1;
        if (stride_ip1 == 1) {
            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
        }
        else {
            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
        }

        @vtype@ out = @vectorf@_@vsuffix@(x1);

        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
        op += @num_lanes@;
        ip += @num_lanes@*stride_ip1;
        num_remaining_elements -= @num_lanes@;
    }
}
#endif
/**end repeat1**/

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_absolute_@TYPE@(@type@ * op,
                        @type@ * ip,
                        const npy_intp array_size,
                        const npy_intp steps)
{
    npy_intp num_remaining_elements = 2*array_size;
    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via max_stride
     */
    npy_int32 index_ip[32];
    for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
        index_ip[ii] = ii*stride_ip1;
        index_ip[ii+1] = ii*stride_ip1 + 1;
    }
    @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
    @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));

    @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
    @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
    @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
    @vtype@ zeros = _mm512_setzero_@vsuffix@();

#if @IS_FLOAT@
    __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
    __m512i im_index  = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
#else
    __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
    __m512i im_index  = _mm512_set_epi64(15,13,11,9,7,5,3,1);
#endif

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements, @num_lanes@);
            load_mask2 = 0x0000;
            store_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements/2, @num_lanes@);
        } else if (num_remaining_elements < 2*@num_lanes@) {
            load_mask1 = avx512_get_full_load_mask_@vsuffix@();
            load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements - @num_lanes@, @num_lanes@);
            store_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements/2, @num_lanes@);
        }
        @vtype@ x1, x2;
        if (stride_ip1 == 1) {
            x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
            x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
        }
        else {
            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
            x2  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
        }

        @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);

        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
        op += @num_lanes@;
        ip += 2*@num_lanes@*stride_ip1;
        num_remaining_elements -= 2*@num_lanes@;
    }
    npy_clear_floatstatus_barrier((char*)op);
}

#endif
/**end repeat**/

/*
 *****************************************************************************
 **                           BOOL LOOPS
 *****************************************************************************
 */

/**begin repeat
 * # kind = logical_or, logical_and#
 * # and = 0, 1#
 * # op = ||, &&#
 * # sc = !=, ==#
 * # vpre = _mm*2#
 * # vsuf = si128*2#
 * # vtype = __m128i*2#
 * # type = npy_bool*2#
 * # vload = _mm_load_si128*2#
 * # vloadu = _mm_loadu_si128*2#
 * # vstore = _mm_store_si128*2#
 */

/*
 * convert any bit set to boolean true so vectorized and normal operations are
 * consistent; this should not be required if bool is used correctly everywhere,
 * but you never know
 */
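/*
 * Scalar equivalent of byte_to_true below (illustrative only):
 *     out = (in != 0) ? 1 : 0;
 * i.e. any non-zero byte pattern is canonicalized to the value 1.
 */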
#if !@and@
static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
{
    const @vtype@ zero = @vpre@_setzero_@vsuf@();
    const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
    /* get 0xFF for zeros */
    @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
    /* filled with 0xFF/0x00, negate and mask to boolean true */
    return @vpre@_andnot_@vsuf@(tmp, truemask);
}
#endif

static void
sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[i] @op@ ip2[i];
    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
        @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
        @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
#if @and@
        const @vtype@ zero = @vpre@_setzero_@vsuf@();
        /* get 0xFF for zeros */
        @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
        /* andnot -> 0x00 for zeros, 0xFF for non zeros, & with ip2 */
        tmp = @vpre@_andnot_@vsuf@(tmp, b);
#else
        @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
#endif

        @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
    }
    LOOP_BLOCKED_END {
        op[i] = (ip1[i] @op@ ip2[i]);
    }
}


static void
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
{
    const @vtype@ zero = @vpre@_setzero_@vsuf@();
    LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
        *op = *op @op@ ip[i];
        if (*op @sc@ 0) {
            return;
        }
    }
    /* unrolled once to replace a slow movmsk with a fast pmaxb */
    LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
        @vtype@ v = @vload@((@vtype@*)&ip[i]);
        @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
        v = @vpre@_cmpeq_epi8(v, zero);
        v2 = @vpre@_cmpeq_epi8(v2, zero);
#if @and@
        if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
            *op = 0;
#else
        if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
            *op = 1;
#endif
            return;
        }
    }
    LOOP_BLOCKED_END {
        *op = *op @op@ ip[i];
        if (*op @sc@ 0) {
            return;
        }
    }
}

/**end repeat**/

/**begin repeat
 * # kind = absolute, logical_not#
 * # op = !=, ==#
 * # not = 0, 1#
 * # vpre = _mm*2#
 * # vsuf = si128*2#
 * # vtype = __m128i*2#
 * # type = npy_bool*2#
 * # vloadu = _mm_loadu_si128*2#
 * # vstore = _mm_store_si128*2#
 */

static void
sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = (ip[i] @op@ 0);
    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
        @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
#if @not@
        const @vtype@ zero = @vpre@_setzero_@vsuf@();
        const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
        /* equivalent to byte_to_true but can skip the negation */
        a = @vpre@_cmpeq_epi8(a, zero);
        a = @vpre@_and_@vsuf@(a, truemask);
#else
        /* abs is kind of pointless but maybe it's used for byte_to_true */
        a = byte_to_true(a);
#endif
        @vstore@((@vtype@*)&op[i], a);
    }
    LOOP_BLOCKED_END {
        op[i] = (ip[i] @op@ 0);
    }
}

/**end repeat**/

#undef VECTOR_SIZE_BYTES
#else  /* NPY_HAVE_SSE2_INTRINSICS */

/**begin repeat
 *  #type = npy_float, npy_double#
 *  #TYPE = FLOAT, DOUBLE#
 *  #sfx = f32, f64#
 *  #CHK =    , _F64#
 */

#if NPY_SIMD@CHK@

/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/

static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] @OP@ ip2[i];
    }
    /* lots of specializations, to squeeze out max performance */
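    /*
     * When ip1 and ip2 alias the same buffer (e.g. adding an array to
     * itself), a single vector load per iteration feeds both operands.
     */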
    if (ip1 == ip2) {
        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
            npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
            npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
            npyv_store_@sfx@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
            npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
            npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
            npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
            npyv_store_@sfx@(&op[i], c);
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[i];
    }
}

static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[0] @OP@ ip2[i];
    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
        npyv_store_@sfx@(&op[i], v3);
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[0] @OP@ ip2[i];
    }
}

static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] @OP@ ip2[0];
    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
        npyv_store_@sfx@(&op[i], v3);
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[0];
    }
}
/**end repeat1**/
#endif /* NPY_SIMD@CHK@ */
/**end repeat**/
#endif
#endif
