#16247 ENH: Umath: replace raw SIMD of unary floating point (32-64) with NPYV - g0

New file: numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -0,0 +1,211 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 vsx2 neon
+ **/
+/**
+ * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter(AVX512F) and gather very costly
+ * to handle non-contiguous memory access comparing with SSE for
+ * such small operations that this file covers.
+*/
+#define NPY_SIMD_FORCE_128
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/**********************************************************
+ ** Scalars
+ **********************************************************/
+NPY_FINLINE float c_recip_f32(float a)
+{ return 1.0f / a; }
+NPY_FINLINE double c_recip_f64(double a)
+{ return 1.0 / a; }
+
+NPY_FINLINE float c_abs_f32(float a)
+{
+    const float tmp = a > 0 ? a : -a;
+    /* add 0 to clear -0.0 */
+    return tmp + 0;
+}
+NPY_FINLINE double c_abs_f64(double a)
+{
+    const double tmp = a > 0 ? a : -a;
+    /* add 0 to clear -0.0 */
+    return tmp + 0;
+}
+
+NPY_FINLINE float c_square_f32(float a)
+{ return a * a; }
+NPY_FINLINE double c_square_f64(double a)
+{ return a * a; }
+/**
+ * MSVC(32-bit mode) requires a clarified contiguous loop
+ * in order to use SSE, otherwise it uses a soft version of square root
+ * that doesn't raise a domain error.
+ */
+#if defined(_MSC_VER) && defined(_M_IX86)
+    #include <emmintrin.h>
+    NPY_FINLINE float c_sqrt_f32(float _a)
+    {
+        __m128 a = _mm_load_ss(&_a);
+        __m128 lower = _mm_sqrt_ss(a);
+        return _mm_cvtss_f32(lower);
+    }
+    NPY_FINLINE double c_sqrt_f64(double _a)
+    {
+        __m128d a = _mm_load_sd(&_a);
+        __m128d lower = _mm_sqrt_pd(a);
+        return _mm_cvtsd_f64(lower);
+    }
+#else
+    #define c_sqrt_f32 npy_sqrtf
+    #define c_sqrt_f64 npy_sqrt
+#endif
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_
+ *   to fill the remind lanes with 1.0 to avoid divide by zero fp
+ *   exception in reciprocal.
+ */
+#define CONTIG  0
+#define NCONTIG 1
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind     = sqrt, absolute, square, reciprocal#
+ * #intr     = sqrt, abs,      square, recip#
+ * #repl_0w1 = 0,    0,        0,      1#
+ */
+/**begin repeat2
+ * #STYPE  = CONTIG, NCONTIG, CONTIG,  NCONTIG#
+ * #DTYPE  = CONTIG, CONTIG,  NCONTIG, NCONTIG#
+ * #unroll = 4,      4,       2,       2#
+ */
+static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_@sfx@ *src = _src;
+          npyv_lanetype_@sfx@ *dst = _dst;
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * @unroll@;
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        /**begin repeat3
+         * #N  = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+            #if @STYPE@ == CONTIG
+                npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
+            #else
+                npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
+            #endif
+            npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
+        #endif
+        /**end repeat3**/
+        /**begin repeat3
+         * #N  = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+            #if @DTYPE@ == CONTIG
+                npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
+            #else
+                npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
+            #endif
+        #endif
+        /**end repeat3**/
+    }
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if @STYPE@ == CONTIG
+        #if @repl_0w1@
+            npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
+        #else
+            npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
+        #endif
+    #else
+        #if @repl_0w1@
+            npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
+        #else
+            npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+        #endif
+    #endif
+        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
+    #if @DTYPE@ == CONTIG
+        npyv_store_till_@sfx@(dst, len, v_unary0);
+    #else
+        npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
+    #endif
+    }
+    npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind  = sqrt, absolute, square, reciprocal#
+ * #intr  = sqrt, abs,      square, recip#
+ * #clear = 0,    1,        0,      0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0];
+    const npy_intp dst_step = steps[1];
+    npy_intp len = dimensions[0];
+#if @VCHK@
+    const int lsize = sizeof(npyv_lanetype_@sfx@);
+    assert(src_step % lsize == 0 && dst_step % lsize == 0);
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize;
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)) {
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) {
+        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear;
+no_unroll:
+#endif // @VCHK@
+    for (; len > 0; --len, src += src_step, dst += dst_step) {
+        const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
+        *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
+    }
+#if @VCHK@
+clear:;
+#endif
+#if @clear@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/

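For readers unfamiliar with the .src templating, the sketch below shows roughly what one instantiation of the kernel above expands to (FLOAT, sqrt, contiguous source and destination). It is an illustration only, not the exact generated code: the unroll factor is shortened from 4 to 2, and the stride arguments (both 1 in the contiguous case) are ignored.

/* Illustrative expansion only: @TYPE@=FLOAT, @kind@=sqrt, @STYPE@=@DTYPE@=CONTIG,
 * with the unroll shortened from 4 to 2. */
static void simd_FLOAT_sqrt_CONTIG_CONTIG
(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
{
    const npyv_lanetype_f32 *src = _src;
          npyv_lanetype_f32 *dst = _dst;
    const int vstep = npyv_nlanes_f32;   // lanes per vector register
    const int wstep = vstep * 2;         // lanes handled per unrolled iteration
    // ssrc/sdst are both 1 for the contiguous kernels, so they are not used here
    for (; len >= wstep; len -= wstep, src += wstep, dst += wstep) {
        npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
        npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
        npyv_store_f32(dst + vstep*0, npyv_sqrt_f32(v_src0));
        npyv_store_f32(dst + vstep*1, npyv_sqrt_f32(v_src1));
    }
    for (; len > 0; len -= vstep, src += vstep, dst += vstep) {
        // sqrt has @repl_0w1@ == 0, so the tail lanes are simply zero-filled
        npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
        npyv_store_till_f32(dst, len, npyv_sqrt_f32(v_src0));
    }
    npyv_cleanup();
}
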
New file: numpy/core/src/common/simd/sse/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MATH_H
+#define _NPY_SIMD_SSE_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm_sqrt_ps
+#define npyv_sqrt_f64 _mm_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm_div_ps(_mm_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm_div_pd(_mm_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+    return _mm_and_ps(
+        a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))
+    );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+    return _mm_and_pd(
+        a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+    );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm_mul_pd(a, a); }
+
+#endif

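The npyv_abs_f32/npyv_abs_f64 helpers above compute absolute value by clearing the IEEE-754 sign bit with a bitwise AND rather than branching. A scalar analogue of the same trick is sketched below, for illustration only; abs_by_mask_f32 is not part of the patch.

/* Scalar analogue of the sign-bit masking used by npyv_abs_f32 above. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static float abs_by_mask_f32(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits));   /* type-pun via memcpy to avoid aliasing issues */
    bits &= 0x7fffffffu;               /* clear the IEEE-754 sign bit */
    memcpy(&x, &bits, sizeof(x));
    return x;
}

int main(void)
{
    /* prints: 0 3.5 2 -- note that -0.0f also comes out as +0 */
    printf("%g %g %g\n", abs_by_mask_f32(-0.0f), abs_by_mask_f32(-3.5f), abs_by_mask_f32(2.0f));
    return 0;
}
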
numpy/core/include/numpy/npy_math.h
@@ -5,14 +5,12 @@
 extern "C" {
 #endif
 
+#include <numpy/npy_common.h>
+
 #include <math.h>
 #ifdef __SUNPRO_CC
 #include <sunmath.h>
 #endif
-#ifdef HAVE_NPY_CONFIG_H
-#include <npy_config.h>
-#endif
-#include <numpy/npy_common.h>
 
 /* By adding static inline specifiers to npy_math function definitions when
    appropriate, compiler is given the opportunity to optimize */

numpy/core/src/umath/loops.c.src
@@ -1490,26 +1490,6 @@
  *****************************************************************************
  */
 
-/**begin repeat
- * Float types
- *  #type = npy_float, npy_double#
- *  #TYPE = FLOAT, DOUBLE#
- *  #scalarf = npy_sqrtf, npy_sqrt#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@(in1);
-        }
-    }
-}
-
-/**end repeat**/
-
 /**begin repeat
  *  #func = rint, ceil, floor, trunc#
  *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -1579,53 +1559,6 @@
  *  #typesub = f, #
  */
 
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = npy_sqrt@typesub@(in1);
-        }
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ tmp = in1 > 0 ? in1 : -in1;
-            /* add 0 to clear -0.0 */
-            *((@type@ *)op1) = tmp + 0;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = in1*in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = 1.0f/in1;
-        }
-    }
-}
-
 /**begin repeat2
  *  #func = rint, ceil, floor, trunc#
  *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -2047,33 +1980,6 @@
     }
 }
 
-NPY_NO_EXPORT void
-@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    char * margs[] = {args[0], args[0], args[1]};
-    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
-    if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((@type@ *)op1) = in1*in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    @type@ one = 1.@c@;
-    char * margs[] = {(char*)&one, args[0], args[1]};
-    npy_intp msteps[] = {0, steps[0], steps[1]};
-    if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((@type@ *)op1) = 1/in1;
-        }
-    }
-}
-
 NPY_NO_EXPORT void
 @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
@@ -2091,20 +1997,6 @@
     }
 }
 
-NPY_NO_EXPORT void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ tmp = in1 > 0 ? in1 : -in1;
-            /* add 0 to clear -0.0 */
-            *((@type@ *)op1) = tmp + 0;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
 NPY_NO_EXPORT void
 @TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -2214,6 +2106,42 @@
 
 /**end repeat**/
 
+/*
+ *****************************************************************************
+ **                          LONGDOUBLE LOOPS                               **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble*)ip1;
+        *((npy_longdouble *)op1) = 1/in1;
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        const npy_longdouble tmp = in1 > 0 ? in1 : -in1;
+        /* add 0 to clear -0.0 */
+        *((npy_longdouble *)op1) = tmp + 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        *((npy_longdouble *)op1) = in1*in1;
+    }
+}
+
 /*
 *****************************************************************************
 **                          HALF-FLOAT LOOPS                               **

numpy/core/src/umath/simd.inc.src
@@ -29,6 +29,7 @@
 #endif
 #endif
 #include "simd/simd.h"
+#include "loops_utils.h" // nomemoverlap
 #include <assert.h>
 #include <stdlib.h>
 #include <float.h>
@@ -51,37 +52,6 @@
  */
 #define MAX_STEP_SIZE 2097152
 
-/*
- * nomemoverlap - returns true if two strided arrays have an overlapping
- * region in memory. ip_size/op_size = size of the arrays which can be negative
- * indicating negative steps.
- */
-static NPY_INLINE npy_bool
-nomemoverlap(char *ip,
-             npy_intp ip_size,
-             char *op,
-             npy_intp op_size)
-{
-    char *ip_start, *ip_end, *op_start, *op_end;
-    if (ip_size < 0) {
-        ip_start = ip + ip_size;
-        ip_end = ip;
-    }
-    else {
-        ip_start = ip;
-        ip_end = ip + ip_size;
-    }
-    if (op_size < 0) {
-        op_start = op + op_size;
-        op_end = op;
-    }
-    else {
-        op_start = op;
-        op_end = op + op_size;
-    }
-    return (ip_start > op_end) | (op_start > ip_end);
-}
-
 #define IS_BINARY_STRIDE_ONE(esize, vsize) \
     ((steps[0] == esize) && \
      (steps[1] == esize) && \
@@ -390,7 +360,7 @@
  */
 
 /**begin repeat2
- *  #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ *  #func = rint, floor, ceil, trunc#
  */
 
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -510,9 +480,9 @@
  */
 
 /**begin repeat1
- * #func = sqrt, absolute, negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary*3, unary_reduce*2#
+ * #func = absolute, negative, minimum, maximum#
+ * #check = IS_BLOCKABLE_UNARY*2, IS_BLOCKABLE_REDUCE*2 #
+ * #name = unary*2, unary_reduce*2#
  */
 
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -1357,33 +1327,6 @@
 }
 /**end repeat1**/
 
-static void
-sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
-    /* align output to VECTOR_SIZE_BYTES bytes */
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = @scalarf@(ip[i]);
-    }
-    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
-           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
-    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = @scalarf@(ip[i]);
-    }
-}
-
-
 static NPY_INLINE
 @type@ scalar_abs_@type@(@type@ v)
 {
@@ -2458,9 +2401,8 @@
  */
 
 /**begin repeat1
- *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ *  #func = rint, ceil, floor, trunc#
+ *  #vectorf = rint, ceil, floor, trunc#
  */
 
 #if defined @CHK@
@@ -2475,10 +2417,6 @@
     npy_intp num_remaining_elements = array_size;
     @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
     @mask@ load_mask = @isa@_get_full_load_mask_ps();
-#if @replace_0_with_1@
-    @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
-
     /*
      * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
@@ -2495,20 +2433,10 @@
         if (num_remaining_elements < num_lanes) {
             load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                        num_lanes);
-#if @replace_0_with_1@
-            inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
         }
         @vtype@ x;
         if (stride == 1) {
             x = @isa@_masked_load_ps(load_mask, ip);
-#if @replace_0_with_1@
-            /*
-             * Replace masked elements with 1.0f to avoid divide by zero fp
-             * exception in reciprocal
-             */
-            x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
-#endif
         }
         else {
             x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
@@ -2544,9 +2472,8 @@
  */
 
 /**begin repeat1
- *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ *  #func = rint, ceil, floor, trunc#
+ *  #vectorf =  rint, ceil, floor, trunc#
  */
 
 #if defined @CHK@
@@ -2560,9 +2487,6 @@
     const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
     npy_intp num_remaining_elements = array_size;
     @mask@ load_mask = @isa@_get_full_load_mask_pd();
-#if @replace_0_with_1@
-    @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
     @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
 
     /*
@@ -2580,20 +2504,10 @@
         if (num_remaining_elements < num_lanes) {
             load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
                                                        num_lanes);
-#if @replace_0_with_1@
-            inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
        }
        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_pd(load_mask, ip);
-#if @replace_0_with_1@
-            /*
-             * Replace masked elements with 1.0f to avoid divide by zero fp
-             * exception in reciprocal
-             */
-            x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
-#endif
        }
        else {
            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));

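The nomemoverlap() helper removed above is only relocated: the new include of "loops_utils.h" (annotated // nomemoverlap) pulls it in from there, and the new dispatch source guards its SIMD path with the is_mem_overlap() check, which presumably builds on the same test. At its core this is an interval-disjointness check over the two strided extents; a standalone sketch follows, where no_overlap() is illustrative and not the NumPy helper.

/* Illustration of the interval-disjointness test performed by the removed
 * nomemoverlap() helper (negative sizes model negative steps). */
#include <stdio.h>
#include <stdbool.h>

static bool no_overlap(const char *ip, long ip_size, const char *op, long op_size)
{
    const char *ip_start = ip_size < 0 ? ip + ip_size : ip;
    const char *ip_end   = ip_size < 0 ? ip : ip + ip_size;
    const char *op_start = op_size < 0 ? op + op_size : op;
    const char *op_end   = op_size < 0 ? op : op + op_size;
    return (ip_start > op_end) || (op_start > ip_end);
}

int main(void)
{
    char buf[64];
    printf("%d\n", no_overlap(buf, 16, buf + 32, 16));  /* disjoint regions    -> 1 */
    printf("%d\n", no_overlap(buf, 32, buf + 16, 16));  /* overlapping regions -> 0 */
    return 0;
}
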

Showing 5 files with coverage changes:

New file: numpy/core/src/umath/loops_unary_fp.dispatch.c.src
New file: numpy/core/src/common/simd/sse/math.h
New file: numpy/core/src/umath/loops_utils.h
New file: numpy/core/src/common/simd/sse/memory.h
Changes in numpy/core/src/umath/simd.inc.src (-7 / -6)

Files Coverage
numpy                        0.07%   84.16%
Project Totals (133 files)           84.16%