#17049 ENH: Improve the performance of einsum by using universal simd

Open



@@ -91,6 +91,31 @@
 // TODO: emulate integer division
 #define npyv_div_f32 _mm_div_ps
 #define npyv_div_f64 _mm_div_pd
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m128 a)
+{
+#ifdef NPY_HAVE_SSE3
+    __m128 sum_halves = _mm_hadd_ps(a, a);
+    return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves));
+#else
+    __m128 t1 = _mm_movehl_ps(a, a);
+    __m128 t2 = _mm_add_ps(a, t1);
+    __m128 t3 = _mm_shuffle_ps(t2, t2, 1);
+    __m128 t4 = _mm_add_ss(t2, t3);
+    return _mm_cvtss_f32(t4);
+#endif
+}
+
+NPY_FINLINE double npyv_sum_f64(__m128d a)
+{
+#ifdef NPY_HAVE_SSE3
+    return _mm_cvtsd_f64(_mm_hadd_pd(a, a));
+#else
+    return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
+#endif
+}
+
 /***************************
  * FUSED
  ***************************/
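
The non-SSE3 branch of npyv_sum_f32 above reduces the four lanes with a movehl/shuffle sequence. A minimal standalone sketch of that same reduction, for illustration only (the sum4_sse name and the test driver are not part of the patch):

    #include <stdio.h>
    #include <xmmintrin.h>

    /* Same reduction as the non-SSE3 branch of npyv_sum_f32. */
    static float sum4_sse(__m128 a)
    {
        __m128 hi   = _mm_movehl_ps(a, a);           /* lanes {2,3} moved down to {0,1} */
        __m128 sum2 = _mm_add_ps(a, hi);             /* {a0+a2, a1+a3, ...} */
        __m128 swap = _mm_shuffle_ps(sum2, sum2, 1); /* lane 1 brought into lane 0 */
        return _mm_cvtss_f32(_mm_add_ss(sum2, swap));
    }

    int main(void)
    {
        __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes: 1, 2, 3, 4 */
        printf("%g\n", sum4_sse(v));                   /* prints 10 */
        return 0;
    }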

@@ -10,38 +10,33 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
-
-#include <numpy/npy_common.h>
-#include <numpy/ndarraytypes.h>  /* for NPY_NTYPES */
 #include <numpy/halffloat.h>
-
 #include "einsum_sumprod.h"
 #include "einsum_debug.h"
+#include "simd/simd.h"
+#include "common.h"
 
-
-#ifdef NPY_HAVE_SSE_INTRINSICS
-#define EINSUM_USE_SSE1 1
+// ARM/Neon don't have instructions for aligned memory access
+#ifdef NPY_HAVE_NEON
+    #define EINSUM_IS_ALIGNED(x) 0
 #else
-#define EINSUM_USE_SSE1 0
-#endif
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#define EINSUM_USE_SSE2 1
-#else
-#define EINSUM_USE_SSE2 0
-#endif
-
-#if EINSUM_USE_SSE1
-#include <xmmintrin.h>
-#endif
-
-#if EINSUM_USE_SSE2
-#include <emmintrin.h>
+    #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
 #endif
 
-#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
-
-/**********************************************/
+/**
+ * This macro is used to enable a scalar loop which advances 4 elements at a
+ * time, which appears after a main SIMD loop gated by `CHK` that unrolls by
+ * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop
+ * that finishes up all the remaining scalars. The purpose of the unrolled loop
+ * is to enable auto-vectorization in cases when all of the following are true:
+ *
+ *  - optimization is allowed
+ *  - either:
+ *    - we did not run the SIMD loop at all, due to NPYV being disabled.
+ *    - the SIMD loop was larger than 128bit, so there are likely to be many
+ *      elements left to process.
+ */
+#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))
 
 /**begin repeat
  * #name = byte, short, int, long, longlong,
@@ -56,6 +51,10 @@
  *             npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *             npy_float, npy_float, npy_double, npy_longdouble,
  *             npy_float, npy_double, npy_longdouble#
+ * #sfx  = s8, s16, s32, long, s64,
+ *        u8, u16, u32, ulong, u64,
+ *        half, f32, f64, longdouble,
+ *        f32, f64, clongdouble#
  * #to = ,,,,,
  *       ,,,,,
  *       npy_float_to_half,,,,
@@ -76,8 +75,15 @@
  *            0*5,
  *            0,0,1,0,
  *            0*3#
+ * #NPYV_CHK = 0*5,
+ *             0*5,
+ *             0, NPY_SIMD, NPY_SIMD_F64, 0,
+ *             0*3#
+ * #unroll_by = 0*5,
+ *              0*5,
+ *              0, 2, 4, 0,
+ *              0*3#
  */
-
 /**begin repeat1
  * #nop = 1, 2, 3, 1000#
  * #noplabel = one, two, three, any#
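
Per the lists above, only f32 and f64 take the universal-SIMD path: f32 is gated on NPY_SIMD with unroll_by = 2, f64 on NPY_SIMD_F64 with unroll_by = 4, while the integer, half, long double and complex specializations keep NPYV_CHK = 0 and fall through to the scalar loops. A rough plain-C sketch of the three loop tiers the generated kernels follow (illustrative only; the real code substitutes @type@/@sfx@ and uses npyv_* intrinsics in the first tier, and the function name here is hypothetical):

    static void fma_contig_sketch(const float *a, const float *b, float *out, long count)
    {
        /* 1) main SIMD loop: gated by NPYV_CHK, advances NPY_SIMD_WIDTH * unroll_by
         *    elements per iteration using npyv_load/npyv_muladd/npyv_store (omitted here) */

        /* 2) 4-per-iteration scalar loop, enabled by EINSUM_UNROLL_4_SCALARS,
         *    written so the compiler can auto-vectorize it */
        for (; count >= 4; count -= 4, a += 4, b += 4, out += 4) {
            out[0] = a[0] * b[0] + out[0];
            out[1] = a[1] * b[1] + out[1];
            out[2] = a[2] * b[2] + out[2];
            out[3] = a[3] * b[3] + out[3];
        }
        /* 3) plain scalar tail for the remaining 0..3 elements */
        for (; count > 0; --count, ++a, ++b, ++out) {
            *out = *a * *b + *out;
        }
    }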
@@ -250,244 +256,180 @@
250 256
    @type@ *data0 = (@type@ *)dataptr[0];
251 257
    @type@ *data1 = (@type@ *)dataptr[1];
252 258
    @type@ *data_out = (@type@ *)dataptr[2];
253 -
254 -
#if EINSUM_USE_SSE1 && @float32@
255 -
    __m128 a, b;
256 -
#elif EINSUM_USE_SSE2 && @float64@
257 -
    __m128d a, b;
258 -
#endif
259 -
260 259
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
261 260
                                                            (int)count);
262 -
263 -
/* This is placed before the main loop to make small counts faster */
264 -
finish_after_unrolled_loop:
265 -
    switch (count) {
266 -
/**begin repeat2
267 -
 * #i = 6, 5, 4, 3, 2, 1, 0#
268 -
 */
269 -
        case @i@+1:
270 -
            data_out[@i@] = @to@(@from@(data0[@i@]) *
271 -
                                 @from@(data1[@i@]) +
272 -
                                 @from@(data_out[@i@]));
273 -
/**end repeat2**/
274 -
        case 0:
275 -
            return;
276 -
    }
277 -
278 -
#if EINSUM_USE_SSE1 && @float32@
261 +
#if @NPYV_CHK@ // NPYV check for @type@
279 262
    /* Use aligned instructions if possible */
280 -
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
281 -
        EINSUM_IS_SSE_ALIGNED(data_out)) {
282 -
        /* Unroll the loop by 8 */
283 -
        while (count >= 8) {
284 -
            count -= 8;
285 -
286 -
/**begin repeat2
287 -
 * #i = 0, 4#
288 -
 */
289 -
            a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
290 -
            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
291 -
            _mm_store_ps(data_out+@i@, b);
292 -
/**end repeat2**/
293 -
            data0 += 8;
294 -
            data1 += 8;
295 -
            data_out += 8;
263 +
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
264 +
                        EINSUM_IS_ALIGNED(data_out);
265 +
    const int vstep = npyv_nlanes_@sfx@;
266 +
267 +
    /**begin repeat2
268 +
     * #cond = if(is_aligned), else#
269 +
     * #ld = loada, load#
270 +
     * #st = storea, store#
271 +
     */
272 +
    @cond@ {
273 +
    #if @unroll_by@ == 4
274 +
        const npy_intp vstepx4 = vstep * 4;
275 +
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
276 +
            /**begin repeat3
277 +
             * #i = 0, 1, 2, 3#
278 +
             */
279 +
            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
280 +
            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
281 +
            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
282 +
            /**end repeat3**/
283 +
            /**begin repeat3
284 +
             * #i = 0, 1, 2, 3#
285 +
             */
286 +
            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@);
287 +
            /**end repeat3**/
288 +
            /**begin repeat3
289 +
             * #i = 0, 1, 2, 3#
290 +
             */
291 +
            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
292 +
            /**end repeat3**/
296 293
        }
297 -
298 -
        /* Finish off the loop */
299 -
        goto finish_after_unrolled_loop;
300 -
    }
301 -
#elif EINSUM_USE_SSE2 && @float64@
302 -
    /* Use aligned instructions if possible */
303 -
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
304 -
        EINSUM_IS_SSE_ALIGNED(data_out)) {
305 -
        /* Unroll the loop by 8 */
306 -
        while (count >= 8) {
307 -
            count -= 8;
308 -
309 -
/**begin repeat2
310 -
 * #i = 0, 2, 4, 6#
311 -
 */
312 -
            a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
313 -
            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
314 -
            _mm_store_pd(data_out+@i@, b);
315 -
/**end repeat2**/
316 -
            data0 += 8;
317 -
            data1 += 8;
318 -
            data_out += 8;
294 +
    #elif @unroll_by@ == 2
295 +
        const npy_intp vstepx2 = vstep * 2;
296 +
        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) {
297 +
            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
298 +
            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
299 +
            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
300 +
            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
301 +
            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
302 +
            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
303 +
            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, b0, c0);
304 +
            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, b1, c1);
305 +
            npyv_@st@_@sfx@(data_out, abc0);
306 +
            npyv_@st@_@sfx@(data_out + vstep, abc1);
319 307
        }
320 -
321 -
        /* Finish off the loop */
322 -
        goto finish_after_unrolled_loop;
308 +
    #else
309 +
        #error "Invalid unroll_by = @unroll_by@"
310 +
    #endif
311 +
    }
312 +
    /**end repeat2**/
313 +
    npyv_cleanup();
314 +
#endif // NPYV check for @type@
315 +
316 +
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
317 +
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
318 +
        /**begin repeat2
319 +
         * #i = 0, 1, 2, 3#
320 +
         */
321 +
        const @type@ a@i@ = @from@(data0[@i@]);
322 +
        const @type@ b@i@ = @from@(data1[@i@]);
323 +
        const @type@ c@i@ = @from@(data_out[@i@]);
324 +
        /**end repeat2**/
325 +
        /**begin repeat2
326 +
         * #i = 0, 1, 2, 3#
327 +
         */
328 +
        const @type@ abc@i@ = a@i@ * b@i@ + c@i@;
329 +
        /**end repeat2**/
330 +
        /**begin repeat2
331 +
         * #i = 0, 1, 2, 3#
332 +
         */
333 +
        data_out[@i@] = @to@(abc@i@);
334 +
        /**end repeat2**/
323 335
    }
324 336
#endif
325 -
326 -
    /* Unroll the loop by 8 */
327 -
    while (count >= 8) {
328 -
        count -= 8;
329 -
330 -
#if EINSUM_USE_SSE1 && @float32@
331 -
/**begin repeat2
332 -
 * #i = 0, 4#
333 -
 */
334 -
        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
335 -
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
336 -
        _mm_storeu_ps(data_out+@i@, b);
337 -
/**end repeat2**/
338 -
#elif EINSUM_USE_SSE2 && @float64@
339 -
/**begin repeat2
340 -
 * #i = 0, 2, 4, 6#
341 -
 */
342 -
        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
343 -
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
344 -
        _mm_storeu_pd(data_out+@i@, b);
345 -
/**end repeat2**/
346 -
#else
347 -
/**begin repeat2
348 -
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
349 -
 */
350 -
        data_out[@i@] = @to@(@from@(data0[@i@]) *
351 -
                             @from@(data1[@i@]) +
352 -
                             @from@(data_out[@i@]));
353 -
/**end repeat2**/
354 -
#endif
355 -
        data0 += 8;
356 -
        data1 += 8;
357 -
        data_out += 8;
337 +
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
338 +
        const @type@ a = @from@(*data0);
339 +
        const @type@ b = @from@(*data1);
340 +
        const @type@ c = @from@(*data_out);
341 +
        *data_out = @to@(a * b + c);
358 342
    }
359 -
360 -
    /* Finish off the loop */
361 -
    goto finish_after_unrolled_loop;
362 343
}
363 344
364 345
/* Some extra specializations for the two operand case */
365 346
static void
366 347
@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
367 348
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
368 349
{
369 -
    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
350 +
    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
370 351
    @type@ *data1 = (@type@ *)dataptr[1];
371 352
    @type@ *data_out = (@type@ *)dataptr[2];
372 353
373 -
#if EINSUM_USE_SSE1 && @float32@
374 -
    __m128 a, b, value0_sse;
375 -
#elif EINSUM_USE_SSE2 && @float64@
376 -
    __m128d a, b, value0_sse;
377 -
#endif
378 -
379 354
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
380 355
                                                    (int)count);
381 356
382 -
/* This is placed before the main loop to make small counts faster */
383 -
finish_after_unrolled_loop:
384 -
    switch (count) {
385 -
/**begin repeat2
386 -
 * #i = 6, 5, 4, 3, 2, 1, 0#
387 -
 */
388 -
        case @i@+1:
389 -
            data_out[@i@] = @to@(value0 *
390 -
                                 @from@(data1[@i@]) +
391 -
                                 @from@(data_out[@i@]));
392 -
/**end repeat2**/
393 -
        case 0:
394 -
            return;
395 -
    }
396 -
397 -
#if EINSUM_USE_SSE1 && @float32@
398 -
    value0_sse = _mm_set_ps1(value0);
399 -
357 +
#if @NPYV_CHK@ // NPYV check for @type@
400 358
    /* Use aligned instructions if possible */
401 -
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
402 -
        /* Unroll the loop by 8 */
403 -
        while (count >= 8) {
404 -
            count -= 8;
405 -
406 -
/**begin repeat2
407 -
 * #i = 0, 4#
408 -
 */
409 -
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
410 -
            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
411 -
            _mm_store_ps(data_out+@i@, b);
412 -
/**end repeat2**/
413 -
            data1 += 8;
414 -
            data_out += 8;
415 -
        }
416 -
417 -
        /* Finish off the loop */
418 -
        if (count > 0) {
419 -
            goto finish_after_unrolled_loop;
359 +
    const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out);
360 +
    const int vstep = npyv_nlanes_@sfx@;
361 +
    const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar);
362 +
363 +
    /**begin repeat2
364 +
     * #cond = if(is_aligned), else#
365 +
     * #ld = loada, load#
366 +
     * #st = storea, store#
367 +
     */
368 +
    @cond@ {
369 +
    #if @unroll_by@ == 4
370 +
        const npy_intp vstepx4 = vstep * 4;
371 +
        for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4, data_out += vstepx4) {
372 +
            /**begin repeat3
373 +
             * #i = 0, 1, 2, 3#
374 +
             */
375 +
            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
376 +
            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
377 +
            /**end repeat3**/
378 +
            /**begin repeat3
379 +
             * #i = 0, 1, 2, 3#
380 +
             */
381 +
            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(va_scalar, b@i@, c@i@);
382 +
            /**end repeat3**/
383 +
            /**begin repeat3
384 +
             * #i = 0, 1, 2, 3#
385 +
             */
386 +
            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
387 +
            /**end repeat3**/
420 388
        }
421 -
        else {
422 -
            return;
389 +
    #elif @unroll_by@ == 2
390 +
        const npy_intp vstepx2 = vstep * 2;
391 +
        for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2, data_out += vstepx2) {
392 +
            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
393 +
            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
394 +
            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
395 +
            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
396 +
            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(va_scalar, b0, c0);
397 +
            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(va_scalar, b1, c1);
398 +
            npyv_@st@_@sfx@(data_out, abc0);
399 +
            npyv_@st@_@sfx@(data_out + vstep, abc1);
423 400
        }
401 +
    #else
402 +
        #error "Invalid unroll_by = @unroll_by@"
403 +
    #endif
424 404
    }
425 -
#elif EINSUM_USE_SSE2 && @float64@
426 -
    value0_sse = _mm_set1_pd(value0);
427 -
428 -
    /* Use aligned instructions if possible */
429 -
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
430 -
        /* Unroll the loop by 8 */
431 -
        while (count >= 8) {
432 -
            count -= 8;
433 -
434 -
/**begin repeat2
435 -
 * #i = 0, 2, 4, 6#
436 -
 */
437 -
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
438 -
            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
439 -
            _mm_store_pd(data_out+@i@, b);
440 -
/**end repeat2**/
441 -
            data1 += 8;
442 -
            data_out += 8;
443 -
        }
444 -
445 -
        /* Finish off the loop */
446 -
        if (count > 0) {
447 -
            goto finish_after_unrolled_loop;
448 -
        }
449 -
        else {
450 -
            return;
451 -
        }
405 +
    /**end repeat2**/
406 +
    npyv_cleanup();
407 +
#endif // NPYV check for @type@
408 +
409 +
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
410 +
    for (; count >= 4; count -= 4, data1 += 4, data_out += 4) {
411 +
        /**begin repeat2
412 +
         * #i = 0, 1, 2, 3#
413 +
         */
414 +
        const @type@ b@i@ = @from@(data1[@i@]);
415 +
        const @type@ c@i@ = @from@(data_out[@i@]);
416 +
        /**end repeat2**/
417 +
        /**begin repeat2
418 +
         * #i = 0, 1, 2, 3#
419 +
         */
420 +
        const @type@ abc@i@ = a_scalar * b@i@ + c@i@;
421 +
        /**end repeat2**/
422 +
        /**begin repeat2
423 +
         * #i = 0, 1, 2, 3#
424 +
         */
425 +
        data_out[@i@] = @to@(abc@i@);
426 +
        /**end repeat2**/
452 427
    }
453 428
#endif
454 -
455 -
    /* Unroll the loop by 8 */
456 -
    while (count >= 8) {
457 -
        count -= 8;
458 -
459 -
#if EINSUM_USE_SSE1 && @float32@
460 -
/**begin repeat2
461 -
 * #i = 0, 4#
462 -
 */
463 -
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
464 -
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
465 -
        _mm_storeu_ps(data_out+@i@, b);
466 -
/**end repeat2**/
467 -
#elif EINSUM_USE_SSE2 && @float64@
468 -
/**begin repeat2
469 -
 * #i = 0, 2, 4, 6#
470 -
 */
471 -
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
472 -
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
473 -
        _mm_storeu_pd(data_out+@i@, b);
474 -
/**end repeat2**/
475 -
#else
476 -
/**begin repeat2
477 -
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
478 -
 */
479 -
        data_out[@i@] = @to@(value0 *
480 -
                             @from@(data1[@i@]) +
481 -
                             @from@(data_out[@i@]));
482 -
/**end repeat2**/
483 -
#endif
484 -
        data1 += 8;
485 -
        data_out += 8;
486 -
    }
487 -
488 -
    /* Finish off the loop */
489 -
    if (count > 0) {
490 -
        goto finish_after_unrolled_loop;
429 +
    for (; count > 0; --count, ++data1, ++data_out) {
430 +
        const @type@ b = @from@(*data1);
431 +
        const @type@ c = @from@(*data_out);
432 +
        *data_out = @to@(a_scalar * b + c);
491 433
    }
492 434
}
493 435
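
For reference, the unroll_by == 2 body above expands for f32 in the aligned case roughly as follows. This is a hand expansion of the template for illustration, assuming simd/simd.h is available, NPY_SIMD is enabled, and the sketch function name is hypothetical; the unaligned branch is identical except it uses npyv_load_f32/npyv_store_f32:

    #include "simd/simd.h"

    static void contig_two_f32_sketch(npy_float *data0, npy_float *data1,
                                      npy_float *data_out, npy_intp count)
    {
        const int vstep = npyv_nlanes_f32;    /* lanes per vector register */
        const npy_intp vstepx2 = vstep * 2;
        for (; count >= vstepx2; count -= vstepx2,
                data0 += vstepx2, data1 += vstepx2, data_out += vstepx2) {
            npyv_f32 a0 = npyv_loada_f32(data0);
            npyv_f32 a1 = npyv_loada_f32(data0 + vstep);
            npyv_f32 b0 = npyv_loada_f32(data1);
            npyv_f32 b1 = npyv_loada_f32(data1 + vstep);
            npyv_f32 c0 = npyv_loada_f32(data_out);
            npyv_f32 c1 = npyv_loada_f32(data_out + vstep);
            npyv_storea_f32(data_out,         npyv_muladd_f32(a0, b0, c0));
            npyv_storea_f32(data_out + vstep, npyv_muladd_f32(a1, b1, c1));
        }
        npyv_cleanup();
        /* scalar tails handled as in the generated code above */
    }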
@@ -496,116 +438,87 @@
496 438
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
497 439
{
498 440
    @type@ *data0 = (@type@ *)dataptr[0];
499 -
    @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
441 +
    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
500 442
    @type@ *data_out = (@type@ *)dataptr[2];
501 -
502 -
#if EINSUM_USE_SSE1 && @float32@
503 -
    __m128 a, b, value1_sse;
504 -
#elif EINSUM_USE_SSE2 && @float64@
505 -
    __m128d a, b, value1_sse;
506 -
#endif
507 -
508 443
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
509 444
                                                    (int)count);
510 -
511 -
/* This is placed before the main loop to make small counts faster */
512 -
finish_after_unrolled_loop:
513 -
    switch (count) {
514 -
/**begin repeat2
515 -
 * #i = 6, 5, 4, 3, 2, 1, 0#
516 -
 */
517 -
        case @i@+1:
518 -
            data_out[@i@] = @to@(@from@(data0[@i@])*
519 -
                                 value1  +
520 -
                                 @from@(data_out[@i@]));
521 -
/**end repeat2**/
522 -
        case 0:
523 -
            return;
524 -
    }
525 -
526 -
#if EINSUM_USE_SSE1 && @float32@
527 -
    value1_sse = _mm_set_ps1(value1);
528 -
445 +
#if @NPYV_CHK@ // NPYV check for @type@
529 446
    /* Use aligned instructions if possible */
530 -
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
531 -
        /* Unroll the loop by 8 */
532 -
        while (count >= 8) {
533 -
            count -= 8;
534 -
535 -
/**begin repeat2
536 -
 * #i = 0, 4#
537 -
 */
538 -
            a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
539 -
            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
540 -
            _mm_store_ps(data_out+@i@, b);
541 -
/**end repeat2**/
542 -
            data0 += 8;
543 -
            data_out += 8;
447 +
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out);
448 +
    const int vstep = npyv_nlanes_@sfx@;
449 +
    const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar);
450 +
451 +
    /**begin repeat2
452 +
     * #cond = if(is_aligned), else#
453 +
     * #ld = loada, load#
454 +
     * #st = storea, store#
455 +
     */
456 +
    @cond@ {
457 +
    #if @unroll_by@ == 4
458 +
        const npy_intp vstepx4 = vstep * 4;
459 +
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data_out += vstepx4) {
460 +
            /**begin repeat3
461 +
             * #i = 0, 1, 2, 3#
462 +
             */
463 +
            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
464 +
            npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
465 +
            /**end repeat3**/
466 +
            /**begin repeat3
467 +
             * #i = 0, 1, 2, 3#
468 +
             */
469 +
            npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, vb_scalar, c@i@);
470 +
            /**end repeat3**/
471 +
            /**begin repeat3
472 +
             * #i = 0, 1, 2, 3#
473 +
             */
474 +
            npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
475 +
            /**end repeat3**/
544 476
        }
545 -
546 -
        /* Finish off the loop */
547 -
        goto finish_after_unrolled_loop;
548 -
    }
549 -
#elif EINSUM_USE_SSE2 && @float64@
550 -
    value1_sse = _mm_set1_pd(value1);
551 -
552 -
    /* Use aligned instructions if possible */
553 -
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
554 -
        /* Unroll the loop by 8 */
555 -
        while (count >= 8) {
556 -
            count -= 8;
557 -
558 -
/**begin repeat2
559 -
 * #i = 0, 2, 4, 6#
560 -
 */
561 -
            a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
562 -
            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
563 -
            _mm_store_pd(data_out+@i@, b);
564 -
/**end repeat2**/
565 -
            data0 += 8;
566 -
            data_out += 8;
477 +
    #elif @unroll_by@ == 2
478 +
        const npy_intp vstepx2 = vstep * 2;
479 +
        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data_out += vstepx2) {
480 +
            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
481 +
            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
482 +
            npyv_@sfx@ c0 = npyv_@ld@_@sfx@(data_out);
483 +
            npyv_@sfx@ c1 = npyv_@ld@_@sfx@(data_out + vstep);
484 +
            npyv_@sfx@ abc0 = npyv_muladd_@sfx@(a0, vb_scalar, c0);
485 +
            npyv_@sfx@ abc1 = npyv_muladd_@sfx@(a1, vb_scalar, c1);
486 +
            npyv_@st@_@sfx@(data_out, abc0);
487 +
            npyv_@st@_@sfx@(data_out + vstep, abc1);
567 488
        }
568 -
569 -
        /* Finish off the loop */
570 -
        goto finish_after_unrolled_loop;
489 +
    #else
490 +
        #error "Invalid unroll_by = @unroll_by@"
491 +
    #endif
492 +
    }
493 +
    /**end repeat2**/
494 +
    npyv_cleanup();
495 +
#endif // NPYV check for @type@
496 +
497 +
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
498 +
    for (; count >= 4; count -= 4, data0 += 4, data_out += 4) {
499 +
        /**begin repeat2
500 +
         * #i = 0, 1, 2, 3#
501 +
         */
502 +
        const @type@ a@i@ = @from@(data0[@i@]);
503 +
        const @type@ c@i@ = @from@(data_out[@i@]);
504 +
        /**end repeat2**/
505 +
        /**begin repeat2
506 +
         * #i = 0, 1, 2, 3#
507 +
         */
508 +
        const @type@ abc@i@ = a@i@ * b_scalar + c@i@;
509 +
        /**end repeat2**/
510 +
        /**begin repeat2
511 +
         * #i = 0, 1, 2, 3#
512 +
         */
513 +
        data_out[@i@] = @to@(abc@i@);
514 +
        /**end repeat2**/
571 515
    }
572 516
#endif
573 -
574 -
    /* Unroll the loop by 8 */
575 -
    while (count >= 8) {
576 -
        count -= 8;
577 -
578 -
#if EINSUM_USE_SSE1 && @float32@
579 -
/**begin repeat2
580 -
 * #i = 0, 4#
581 -
 */
582 -
        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
583 -
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
584 -
        _mm_storeu_ps(data_out+@i@, b);
585 -
/**end repeat2**/
586 -
#elif EINSUM_USE_SSE2 && @float64@
587 -
/**begin repeat2
588 -
 * #i = 0, 2, 4, 6#
589 -
 */
590 -
        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
591 -
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
592 -
        _mm_storeu_pd(data_out+@i@, b);
593 -
/**end repeat2**/
594 -
#else
595 -
/**begin repeat2
596 -
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
597 -
 */
598 -
        data_out[@i@] = @to@(@from@(data0[@i@])*
599 -
                             value1  +
600 -
                             @from@(data_out[@i@]));
601 -
/**end repeat2**/
602 -
#endif
603 -
        data0 += 8;
604 -
        data_out += 8;
517 +
    for (; count > 0; --count, ++data0, ++data_out) {
518 +
        const @type@ a = @from@(*data0);
519 +
        const @type@ c = @from@(*data_out);
520 +
        *data_out = @to@(a * b_scalar + c);
605 521
    }
606 -
607 -
    /* Finish off the loop */
608 -
    goto finish_after_unrolled_loop;
609 522
}
610 523
611 524
static void
@@ -616,432 +529,198 @@
Loading
616 529
    @type@ *data1 = (@type@ *)dataptr[1];
617 530
    @temptype@ accum = 0;
618 531
619 -
#if EINSUM_USE_SSE1 && @float32@
620 -
    __m128 a, accum_sse = _mm_setzero_ps();
621 -
#elif EINSUM_USE_SSE2 && @float64@
622 -
    __m128d a, accum_sse = _mm_setzero_pd();
623 -
#endif
624 -
625 532
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
626 533
                                                    (int)count);
627 -
628 -
/* This is placed before the main loop to make small counts faster */
629 -
finish_after_unrolled_loop:
630 -
    switch (count) {
631 -
/**begin repeat2
632 -
 * #i = 6, 5, 4, 3, 2, 1, 0#
633 -
 */
634 -
        case @i@+1:
635 -
            accum += @from@(data0[@i@]) * @from@(data1[@i@]);
636 -
/**end repeat2**/
637 -
        case 0:
638 -
            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
639 -
            return;
640 -
    }
641 -
642 -
#if EINSUM_USE_SSE1 && @float32@
534 +
#if @NPYV_CHK@ // NPYV check for @type@
643 535
    /* Use aligned instructions if possible */
644 -
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
645 -
        /* Unroll the loop by 8 */
646 -
        while (count >= 8) {
647 -
            count -= 8;
648 -
649 -
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
650 -
            _mm_prefetch(data1 + 512, _MM_HINT_T0);
651 -
652 -
/**begin repeat2
653 -
 * #i = 0, 4#
654 -
 */
655 -
            /*
656 -
             * NOTE: This accumulation changes the order, so will likely
657 -
             *       produce slightly different results.
536 +
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
537 +
    const int vstep = npyv_nlanes_@sfx@;
538 +
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
539 +
540 +
    /**begin repeat2
541 +
     * #cond = if(is_aligned), else#
542 +
     * #ld = loada, load#
543 +
     * #st = storea, store#
544 +
     */
545 +
    @cond@ {
546 +
    #if @unroll_by@ == 4
547 +
        const npy_intp vstepx4 = vstep * 4;
548 +
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
549 +
            /**begin repeat3
550 +
             * #i = 0, 1, 2, 3#
658 551
             */
659 -
            a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
660 -
            accum_sse = _mm_add_ps(accum_sse, a);
661 -
/**end repeat2**/
662 -
            data0 += 8;
663 -
            data1 += 8;
552 +
            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
553 +
            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
554 +
            /**end repeat3**/
555 +
            npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum);
556 +
            npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3);
557 +
            npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2);
558 +
                    vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
664 559
        }
665 -
666 -
        /* Add the four SSE values and put in accum */
667 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
668 -
        accum_sse = _mm_add_ps(a, accum_sse);
669 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
670 -
        accum_sse = _mm_add_ps(a, accum_sse);
671 -
        _mm_store_ss(&accum, accum_sse);
672 -
673 -
        /* Finish off the loop */
674 -
        goto finish_after_unrolled_loop;
675 -
    }
676 -
#elif EINSUM_USE_SSE2 && @float64@
677 -
    /* Use aligned instructions if possible */
678 -
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
679 -
        /* Unroll the loop by 8 */
680 -
        while (count >= 8) {
681 -
            count -= 8;
682 -
683 -
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
684 -
            _mm_prefetch(data1 + 512, _MM_HINT_T0);
685 -
686 -
/**begin repeat2
687 -
 * #i = 0, 2, 4, 6#
688 -
 */
689 -
            /*
690 -
             * NOTE: This accumulation changes the order, so will likely
691 -
             *       produce slightly different results.
692 -
             */
693 -
            a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
694 -
            accum_sse = _mm_add_pd(accum_sse, a);
695 -
/**end repeat2**/
696 -
            data0 += 8;
697 -
            data1 += 8;
560 +
    #elif @unroll_by@ == 2
561 +
        const npy_intp vstepx2 = vstep * 2;
562 +
        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2, data1 += vstepx2) {
563 +
            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
564 +
            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
565 +
            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
566 +
            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
567 +
            npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum);
568 +
                    vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
698 569
        }
699 -
700 -
        /* Add the two SSE2 values and put in accum */
701 -
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
702 -
        accum_sse = _mm_add_pd(a, accum_sse);
703 -
        _mm_store_sd(&accum, accum_sse);
704 -
705 -
        /* Finish off the loop */
706 -
        goto finish_after_unrolled_loop;
570 +
    #else
571 +
        #error "Invalid unroll_by = @unroll_by@"
572 +
    #endif
707 573
    }
708 -
#endif
709 -
710 -
    /* Unroll the loop by 8 */
711 -
    while (count >= 8) {
712 -
        count -= 8;
713 -
714 -
#if EINSUM_USE_SSE1 && @float32@
715 -
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
716 -
        _mm_prefetch(data1 + 512, _MM_HINT_T0);
717 -
718 -
/**begin repeat2
719 -
 * #i = 0, 4#
720 -
 */
721 -
        /*
722 -
         * NOTE: This accumulation changes the order, so will likely
723 -
         *       produce slightly different results.
724 -
         */
725 -
        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
726 -
        accum_sse = _mm_add_ps(accum_sse, a);
727 -
/**end repeat2**/
728 -
#elif EINSUM_USE_SSE2 && @float64@
729 -
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
730 -
        _mm_prefetch(data1 + 512, _MM_HINT_T0);
731 -
732 -
/**begin repeat2
733 -
 * #i = 0, 2, 4, 6#
734 -
 */
735 -
        /*
736 -
         * NOTE: This accumulation changes the order, so will likely
737 -
         *       produce slightly different results.
574 +
    /**end repeat2**/
575 +
    accum = npyv_sum_@sfx@(vaccum);
576 +
    npyv_cleanup();
577 +
#endif // NPYV check for @type@
578 +
579 +
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
580 +
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
581 +
        /**begin repeat2
582 +
         * #i = 0, 1, 2, 3#
738 583
         */
739 -
        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
740 -
        accum_sse = _mm_add_pd(accum_sse, a);
741 -
/**end repeat2**/
742 -
#else
743 -
/**begin repeat2
744 -
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
745 -
 */
746 -
        accum += @from@(data0[@i@]) * @from@(data1[@i@]);
747 -
/**end repeat2**/
748 -
#endif
749 -
        data0 += 8;
750 -
        data1 += 8;
584 +
        const @type@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
585 +
        /**end repeat2**/
586 +
        accum += ab0 + ab1 + ab2 + ab3;
751 587
    }
752 -
753 -
#if EINSUM_USE_SSE1 && @float32@
754 -
    /* Add the four SSE values and put in accum */
755 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
756 -
    accum_sse = _mm_add_ps(a, accum_sse);
757 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
758 -
    accum_sse = _mm_add_ps(a, accum_sse);
759 -
    _mm_store_ss(&accum, accum_sse);
760 -
#elif EINSUM_USE_SSE2 && @float64@
761 -
    /* Add the two SSE2 values and put in accum */
762 -
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
763 -
    accum_sse = _mm_add_pd(a, accum_sse);
764 -
    _mm_store_sd(&accum, accum_sse);
765 588
#endif
766 -
767 -
    /* Finish off the loop */
768 -
    goto finish_after_unrolled_loop;
589 +
    for (; count > 0; --count, ++data0, ++data1) {
590 +
        const @type@ a = @from@(*data0);
591 +
        const @type@ b = @from@(*data1);
592 +
        accum += a * b;
593 +
    }
594 +
    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
769 595
}
770 596
771 597
static void
772 598
@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
773 599
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
774 600
{
775 -
    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
601 +
    @temptype@ a_scalar = @from@(*(@type@ *)dataptr[0]);
776 602
    @type@ *data1 = (@type@ *)dataptr[1];
777 603
    @temptype@ accum = 0;
778 604
779 -
#if EINSUM_USE_SSE1 && @float32@
780 -
    __m128 a, accum_sse = _mm_setzero_ps();
781 -
#elif EINSUM_USE_SSE2 && @float64@
782 -
    __m128d a, accum_sse = _mm_setzero_pd();
783 -
#endif
784 -
785 605
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
786 606
                                                    (int)count);
787 -
788 -
/* This is placed before the main loop to make small counts faster */
789 -
finish_after_unrolled_loop:
790 -
    switch (count) {
791 -
/**begin repeat2
792 -
 * #i = 6, 5, 4, 3, 2, 1, 0#
793 -
 */
794 -
        case @i@+1:
795 -
            accum += @from@(data1[@i@]);
796 -
/**end repeat2**/
797 -
        case 0:
798 -
            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
799 -
            return;
800 -
    }
801 -
802 -
#if EINSUM_USE_SSE1 && @float32@
607 +
#if @NPYV_CHK@ // NPYV check for @type@
803 608
    /* Use aligned instructions if possible */
804 -
    if (EINSUM_IS_SSE_ALIGNED(data1)) {
805 -
        /* Unroll the loop by 8 */
806 -
        while (count >= 8) {
807 -
            count -= 8;
808 -
809 -
/**begin repeat2
810 -
 * #i = 0, 4#
811 -
 */
812 -
            /*
813 -
             * NOTE: This accumulation changes the order, so will likely
814 -
             *       produce slightly different results.
609 +
    const int is_aligned = EINSUM_IS_ALIGNED(data1);
610 +
    const int vstep = npyv_nlanes_@sfx@;
611 +
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
612 +
613 +
    /**begin repeat2
614 +
     * #cond = if(is_aligned), else#
615 +
     * #ld = loada, load#
616 +
     * #st = storea, store#
617 +
     */
618 +
    @cond@ {
619 +
    #if @unroll_by@ == 4
620 +
        const npy_intp vstepx4 = vstep * 4;
621 +
        for (; count >= vstepx4; count -= vstepx4, data1 += vstepx4) {
622 +
            /**begin repeat3
623 +
             * #i = 0, 1, 2, 3#
815 624
             */
816 -
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
817 -
/**end repeat2**/
818 -
            data1 += 8;
625 +
            npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
626 +
            /**end repeat3**/
627 +
            npyv_@sfx@ b01   = npyv_add_@sfx@(b0, b1);
628 +
            npyv_@sfx@ b23   = npyv_add_@sfx@(b2, b3);
629 +
            npyv_@sfx@ b0123 = npyv_add_@sfx@(b01, b23);
630 +
                      vaccum = npyv_add_@sfx@(b0123, vaccum);
819 631
        }
820 -
        /* Add the four SSE values and put in accum */
821 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
822 -
        accum_sse = _mm_add_ps(a, accum_sse);
823 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
824 -
        accum_sse = _mm_add_ps(a, accum_sse);
825 -
        _mm_store_ss(&accum, accum_sse);
826 -
827 -
        /* Finish off the loop */
828 -
        goto finish_after_unrolled_loop;
829 -
    }
830 -
#elif EINSUM_USE_SSE2 && @float64@
831 -
    /* Use aligned instructions if possible */
832 -
    if (EINSUM_IS_SSE_ALIGNED(data1)) {
833 -
        /* Unroll the loop by 8 */
834 -
        while (count >= 8) {
835 -
            count -= 8;
836 -
837 -
/**begin repeat2
838 -
 * #i = 0, 2, 4, 6#
839 -
 */
840 -
            /*
841 -
             * NOTE: This accumulation changes the order, so will likely
842 -
             *       produce slightly different results.
843 -
             */
844 -
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
845 -
/**end repeat2**/
846 -
            data1 += 8;
632 +
    #elif @unroll_by@ == 2
633 +
        const npy_intp vstepx2 = vstep * 2;
634 +
        for (; count >= vstepx2; count -= vstepx2, data1 += vstepx2) {
635 +
            npyv_@sfx@ b0 = npyv_@ld@_@sfx@(data1);
636 +
            npyv_@sfx@ b1 = npyv_@ld@_@sfx@(data1 + vstep);
637 +
            npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
638 +
                    vaccum = npyv_add_@sfx@(b01, vaccum);
847 639
        }
848 -
        /* Add the two SSE2 values and put in accum */
849 -
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
850 -
        accum_sse = _mm_add_pd(a, accum_sse);
851 -
        _mm_store_sd(&accum, accum_sse);
852 -
853 -
        /* Finish off the loop */
854 -
        goto finish_after_unrolled_loop;
640 +
    #else
641 +
        #error "Invalid unroll_by = @unroll_by@"
642 +
    #endif
855 643
    }
856 -
#endif
857 -
858 -
    /* Unroll the loop by 8 */
859 -
    while (count >= 8) {
860 -
        count -= 8;
861 -
862 -
#if EINSUM_USE_SSE1 && @float32@
863 -
/**begin repeat2
864 -
 * #i = 0, 4#
865 -
 */
866 -
        /*
867 -
         * NOTE: This accumulation changes the order, so will likely
868 -
         *       produce slightly different results.
869 -
         */
870 -
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
871 -
/**end repeat2**/
872 -
#elif EINSUM_USE_SSE2 && @float64@
873 -
/**begin repeat2
874 -
 * #i = 0, 2, 4, 6#
875 -
 */
876 -
        /*
877 -
         * NOTE: This accumulation changes the order, so will likely
878 -
         *       produce slightly different results.
879 -
         */
880 -
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
881 -
/**end repeat2**/
882 -
#else
883 -
/**begin repeat2
884 -
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
885 -
 */
886 -
        accum += @from@(data1[@i@]);
887 -
/**end repeat2**/
888 -
#endif
889 -
        data1 += 8;
644 +
    /**end repeat2**/
645 +
    accum = npyv_sum_@sfx@(vaccum);
646 +
    npyv_cleanup();
647 +
#endif // NPYV check for @type@
648 +
649 +
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
650 +
    for (; count >= 4; count -= 4, data1 += 4) {
651 +
        const @type@ b01 = @from@(data1[0]) + @from@(data1[1]);
652 +
        const @type@ b23 = @from@(data1[2]) + @from@(data1[3]);
653 +
        accum += b01 + b23;
890 654
    }
891 -
892 -
#if EINSUM_USE_SSE1 && @float32@
893 -
    /* Add the four SSE values and put in accum */
894 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
895 -
    accum_sse = _mm_add_ps(a, accum_sse);
896 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
897 -
    accum_sse = _mm_add_ps(a, accum_sse);
898 -
    _mm_store_ss(&accum, accum_sse);
899 -
#elif EINSUM_USE_SSE2 && @float64@
900 -
    /* Add the two SSE2 values and put in accum */
901 -
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
902 -
    accum_sse = _mm_add_pd(a, accum_sse);
903 -
    _mm_store_sd(&accum, accum_sse);
904 655
#endif
905 -
906 -
    /* Finish off the loop */
907 -
    goto finish_after_unrolled_loop;
656 +
    for (; count > 0; --count, ++data1) {
657 +
        accum += @from@(*data1);
658 +
    }
659 +
    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + a_scalar * accum);
908 660
}
909 661
910 662
static void
911 663
@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
912 664
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
913 665
{
914 666
    @type@ *data0 = (@type@ *)dataptr[0];
915 -
    @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
667 +
    @temptype@ b_scalar = @from@(*(@type@ *)dataptr[1]);
916 668
    @temptype@ accum = 0;
917 -
918 -
#if EINSUM_USE_SSE1 && @float32@
919 -
    __m128 a, accum_sse = _mm_setzero_ps();
920 -
#elif EINSUM_USE_SSE2 && @float64@
921 -
    __m128d a, accum_sse = _mm_setzero_pd();
922 -
#endif
923 -
924 669
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
925 670
                                                    (int)count);
926 -
927 -
/* This is placed before the main loop to make small counts faster */
928 -
finish_after_unrolled_loop:
929 -
    switch (count) {
930 -
/**begin repeat2
931 -
 * #i = 6, 5, 4, 3, 2, 1, 0#
932 -
 */
933 -
        case @i@+1:
934 -
            accum += @from@(data0[@i@]);
935 -
/**end repeat2**/
936 -
        case 0:
937 -
            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
938 -
            return;
939 -
    }
940 -
941 -
#if EINSUM_USE_SSE1 && @float32@
671 +
#if @NPYV_CHK@ // NPYV check for @type@
942 672
    /* Use aligned instructions if possible */
943 -
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
944 -
        /* Unroll the loop by 8 */
945 -
        while (count >= 8) {
946 -
            count -= 8;
947 -
948 -
/**begin repeat2
949 -
 * #i = 0, 4#
950 -
 */
951 -
            /*
952 -
             * NOTE: This accumulation changes the order, so will likely
953 -
             *       produce slightly different results.
673 +
    const int is_aligned = EINSUM_IS_ALIGNED(data0);
674 +
    const int vstep = npyv_nlanes_@sfx@;
675 +
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
676 +
677 +
    /**begin repeat2
678 +
     * #cond = if(is_aligned), else#
679 +
     * #ld = loada, load#
680 +
     * #st = storea, store#
681 +
     */
682 +
    @cond@ {
683 +
    #if @unroll_by@ == 4
684 +
        const npy_intp vstepx4 = vstep * 4;
685 +
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) {
686 +
            /**begin repeat3
687 +
             * #i = 0, 1, 2, 3#
954 688
             */
955 -
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
956 -
/**end repeat2**/
957 -
            data0 += 8;
689 +
            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
690 +
            /**end repeat3**/
691 +
            npyv_@sfx@ a01   = npyv_add_@sfx@(a0, a1);
692 +
            npyv_@sfx@ a23   = npyv_add_@sfx@(a2, a3);
693 +
            npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
694 +
                      vaccum = npyv_add_@sfx@(a0123, vaccum);
958 695
        }
959 -
        /* Add the four SSE values and put in accum */
960 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
961 -
        accum_sse = _mm_add_ps(a, accum_sse);
962 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
963 -
        accum_sse = _mm_add_ps(a, accum_sse);
964 -
        _mm_store_ss(&accum, accum_sse);
965 -
        /* Finish off the loop */
966 -
        goto finish_after_unrolled_loop;
967 -
    }
968 -
#elif EINSUM_USE_SSE2 && @float64@
969 -
    /* Use aligned instructions if possible */
970 -
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
971 -
        /* Unroll the loop by 8 */
972 -
        while (count >= 8) {
973 -
            count -= 8;
974 -
975 -
/**begin repeat2
976 -
 * #i = 0, 2, 4, 6#
977 -
 */
978 -
            /*
979 -
             * NOTE: This accumulation changes the order, so will likely
980 -
             *       produce slightly different results.
981 -
             */
982 -
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
983 -
/**end repeat2**/
984 -
            data0 += 8;
696 +
    #elif @unroll_by@ == 2
697 +
        const npy_intp vstepx2 = vstep * 2;
698 +
        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) {
699 +
            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
700 +
            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
701 +
            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
702 +
                    vaccum = npyv_add_@sfx@(a01, vaccum);
985 703
        }
986 -
        /* Add the two SSE2 values and put in accum */
987 -
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
988 -
        accum_sse = _mm_add_pd(a, accum_sse);
989 -
        _mm_store_sd(&accum, accum_sse);
990 -
        /* Finish off the loop */
991 -
        goto finish_after_unrolled_loop;
704 +
    #else
705 +
        #error "Invalid unroll_by = @unroll_by@"
706 +
    #endif
992 707
    }
993 -
#endif
994 -
995 -
    /* Unroll the loop by 8 */
996 -
    while (count >= 8) {
997 -
        count -= 8;
998 -
999 -
#if EINSUM_USE_SSE1 && @float32@
1000 -
/**begin repeat2
1001 -
 * #i = 0, 4#
1002 -
 */
1003 -
        /*
1004 -
         * NOTE: This accumulation changes the order, so will likely
1005 -
         *       produce slightly different results.
1006 -
         */
1007 -
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
1008 -
/**end repeat2**/
1009 -
#elif EINSUM_USE_SSE2 && @float64@
1010 -
/**begin repeat2
1011 -
 * #i = 0, 2, 4, 6#
1012 -
 */
1013 -
        /*
1014 -
         * NOTE: This accumulation changes the order, so will likely
1015 -
         *       produce slightly different results.
1016 -
         */
1017 -
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
1018 -
/**end repeat2**/
1019 -
#else
1020 -
/**begin repeat2
1021 -
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1022 -
 */
1023 -
        accum += @from@(data0[@i@]);
1024 -
/**end repeat2**/
1025 -
#endif
1026 -
        data0 += 8;
708 +
    /**end repeat2**/
709 +
    accum = npyv_sum_@sfx@(vaccum);
710 +
    npyv_cleanup();
711 +
#endif // NPYV check for @type@
712 +
713 +
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
714 +
    for (; count >= 4; count -= 4, data0 += 4) {
715 +
        const @type@ a01 = @from@(data0[0]) + @from@(data0[1]);
716 +
        const @type@ a23 = @from@(data0[2]) + @from@(data0[3]);
717 +
        accum += a01 + a23;
1027 718
    }
1028 -
1029 -
#if EINSUM_USE_SSE1 && @float32@
1030 -
    /* Add the four SSE values and put in accum */
1031 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
1032 -
    accum_sse = _mm_add_ps(a, accum_sse);
1033 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
1034 -
    accum_sse = _mm_add_ps(a, accum_sse);
1035 -
    _mm_store_ss(&accum, accum_sse);
1036 -
#elif EINSUM_USE_SSE2 && @float64@
1037 -
    /* Add the two SSE2 values and put in accum */
1038 -
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
1039 -
    accum_sse = _mm_add_pd(a, accum_sse);
1040 -
    _mm_store_sd(&accum, accum_sse);
1041 719
#endif
1042 -
1043 -
    /* Finish off the loop */
1044 -
    goto finish_after_unrolled_loop;
720 +
    for (; count > 0; --count, ++data0) {
721 +
        accum += @from@(*data0);
722 +
    }
723 +
    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + b_scalar * accum);
1045 724
}
1046 725
1047 726
#elif @nop@ == 3 && !@complex@
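
The reduction kernels above keep partial sums in a vector accumulator and collapse it only once with the new npyv_sum_@sfx@ helper; as with the SSE code they replace, this reorders the floating-point additions, so results can differ slightly in the last bits. A minimal sketch of the pattern for f32 (illustrative only; assumes simd/simd.h and NPY_SIMD, and the function name is hypothetical):

    static npy_float dot_contig_f32_sketch(const npy_float *data0,
                                           const npy_float *data1, npy_intp count)
    {
        npy_float accum = 0;
        npyv_f32 vaccum = npyv_zero_f32();
        for (; count >= npyv_nlanes_f32; count -= npyv_nlanes_f32,
                data0 += npyv_nlanes_f32, data1 += npyv_nlanes_f32) {
            npyv_f32 a = npyv_load_f32(data0);
            npyv_f32 b = npyv_load_f32(data1);
            vaccum = npyv_muladd_f32(a, b, vaccum);  /* per-lane partial products */
        }
        accum = npyv_sum_f32(vaccum);                /* new horizontal-add helper */
        npyv_cleanup();
        for (; count > 0; --count, ++data0, ++data1) {
            accum += *data0 * *data1;                /* scalar tail */
        }
        return accum;
    }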
@@ -1155,167 +834,80 @@
1155 834
    @type@ *data0 = (@type@ *)dataptr[0];
1156 835
#endif
1157 836
1158 -
#if EINSUM_USE_SSE1 && @float32@
1159 -
    __m128 a, accum_sse = _mm_setzero_ps();
1160 -
#elif EINSUM_USE_SSE2 && @float64@
1161 -
    __m128d a, accum_sse = _mm_setzero_pd();
1162 -
#endif
1163 -
1164 -
1165 -
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
1166 -
                                                    (int)count);
1167 -
1168 -
/* This is placed before the main loop to make small counts faster */
1169 -
finish_after_unrolled_loop:
1170 -
    switch (count) {
1171 -
/**begin repeat2
1172 -
 * #i = 6, 5, 4, 3, 2, 1, 0#
1173 -
 */
1174 -
        case @i@+1:
1175 -
#if !@complex@
1176 -
            accum += @from@(data0[@i@]);
1177 -
#else /* complex */
1178 -
            accum_re += data0[2*@i@+0];
1179 -
            accum_im += data0[2*@i@+1];
1180 -
#endif
1181 -
/**end repeat2**/
1182 -
        case 0:
1183 -
#if @complex@
1184 -
            ((@temptype@ *)dataptr[1])[0] += accum_re;
1185 -
            ((@temptype@ *)dataptr[1])[1] += accum_im;
1186 -
#else
1187 -
            *((@type@ *)dataptr[1]) = @to@(accum +
1188 -
                                    @from@(*((@type@ *)dataptr[1])));
1189 -
#endif
1190 -
            return;
1191 -
    }
1192 -
1193 -
#if EINSUM_USE_SSE1 && @float32@
837 +
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
838 +
#if @NPYV_CHK@ // NPYV check for @type@
1194 839
    /* Use aligned instructions if possible */
1195 -
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
1196 -
        /* Unroll the loop by 8 */
1197 -
        while (count >= 8) {
1198 -
            count -= 8;
1199 -
1200 -
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
1201 -
1202 -
/**begin repeat2
1203 -
 * #i = 0, 4#
1204 -
 */
1205 -
            /*
1206 -
             * NOTE: This accumulation changes the order, so will likely
1207 -
             *       produce slightly different results.
840 +
    const int is_aligned = EINSUM_IS_ALIGNED(data0);
841 +
    const int vstep = npyv_nlanes_@sfx@;
842 +
    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
843 +
844 +
    /**begin repeat2
845 +
     * #cond = if(is_aligned), else#
846 +
     * #ld = loada, load#
847 +
     * #st = storea, store#
848 +
     */
849 +
    @cond@ {
850 +
    #if @unroll_by@ == 4
851 +
        const npy_intp vstepx4 = vstep * 4;
852 +
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4) {
853 +
            /**begin repeat3
854 +
             * #i = 0, 1, 2, 3#
1208 855
             */
1209 -
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
1210 -
/**end repeat2**/
1211 -
            data0 += 8;
856 +
            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
857 +
            /**end repeat3**/
858 +
            npyv_@sfx@ a01   = npyv_add_@sfx@(a0, a1);
859 +
            npyv_@sfx@ a23   = npyv_add_@sfx@(a2, a3);
860 +
            npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
861 +
                      vaccum = npyv_add_@sfx@(a0123, vaccum);
1212 862
        }
1213 -
1214 -
        /* Add the four SSE values and put in accum */
1215 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
1216 -
        accum_sse = _mm_add_ps(a, accum_sse);
1217 -
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
1218 -
        accum_sse = _mm_add_ps(a, accum_sse);
1219 -
        _mm_store_ss(&accum, accum_sse);
1220 -
1221 -
        /* Finish off the loop */
1222 -
        goto finish_after_unrolled_loop;
1223 -
    }
1224 -
#elif EINSUM_USE_SSE2 && @float64@
1225 -
    /* Use aligned instructions if possible */
1226 -
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
1227 -
        /* Unroll the loop by 8 */
1228 -
        while (count >= 8) {
1229 -
            count -= 8;
1230 -
1231 -
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
1232 -
1233 -
/**begin repeat2
1234 -
 * #i = 0, 2, 4, 6#
1235 -
 */
1236 -
            /*
1237 -
             * NOTE: This accumulation changes the order, so will likely
1238 -
             *       produce slightly different results.
1239 -
             */
1240 -
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
1241 -
/**end repeat2**/
1242 -
            data0 += 8;
863 +
    #elif @unroll_by@ == 2
864 +
        const npy_intp vstepx2 = vstep * 2;
865 +
        for (; count >= vstepx2; count -= vstepx2, data0 += vstepx2) {
866 +
            npyv_@sfx@ a0 = npyv_@ld@_@sfx@(data0);
867 +
            npyv_@sfx@ a1 = npyv_@ld@_@sfx@(data0 + vstep);
868 +
            npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
869 +
                    vaccum = npyv_add_@sfx@(a01, vaccum);
1243 870
        }
1244 -
1245 -
        /* Add the two SSE2 values and put in accum */
1246 -
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
1247 -
        accum_sse = _mm_add_pd(a, accum_sse);
1248 -
        _mm_store_sd(&accum, accum_sse);
1249 -
1250 -
        /* Finish off the loop */
1251 -
        goto finish_after_unrolled_loop;
871 +
    #else
872 +
        #error "Invalid unroll_by = @unroll_by@"
873 +
    #endif
1252 874
    }
875 +
    /**end repeat2**/
876 +
    accum = npyv_sum_@sfx@(vaccum);
877 +
    npyv_cleanup();
878 +
#endif // NPYV check for @type@
879 +
880 +
#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
881 +
    #if @complex@
882 +
        for (; count > 4; count -= 4, data0 += 4*2) {
883 +
            const @temptype@ re01 = data0[0] + data0[2];
884 +
            const @temptype@ re23 = data0[4] + data0[6];
885 +
            const @temptype@ im13 = data0[1] + data0[3];
886 +
            const @temptype@ im57 = data0[5] + data0[7];
887 +
            accum_re += re01 + re23;
888 +
            accum_im += im13 + im57;
889 +
        }
890 +
    #else
891 +
        for (; count > 4; count -= 4, data0 += 4) {
892 +
            const @temptype@ a01 = @from@(data0[0]) + @from@(data0[1]);
893 +
            const @temptype@ a23 = @from@(data0[2]) + @from@(data0[3]);
894 +
            accum +=  a01 + a23;
895 +
        }
896 +
    #endif  // complex
1253 897
#endif
1254 -
1255 -
    /* Unroll the loop by 8 */
1256 -
    while (count >= 8) {
1257 -
        count -= 8;
1258 -
1259 -
#if EINSUM_USE_SSE1 && @float32@
1260 -
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
1261 -
1262 -
/**begin repeat2
1263 -
 * #i = 0, 4#
1264 -
 */
1265 -
        /*
1266 -
         * NOTE: This accumulation changes the order, so will likely
1267 -
         *       produce slightly different results.
1268 -
         */
1269 -
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
1270 -
/**end repeat2**/
1271 -
#elif EINSUM_USE_SSE2 && @float64@
1272 -
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
1273 -
1274 -
/**begin repeat2
1275 -
 * #i = 0, 2, 4, 6#
1276 -
 */
1277 -
        /*
1278 -
         * NOTE: This accumulation changes the order, so will likely
1279 -
         *       produce slightly different results.
1280 -
         */
1281 -
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
1282 -
/**end repeat2**/
1283 -
#else
1284 -
/**begin repeat2
1285 -
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1286 -
 */
1287 -
#  if !@complex@
1288 -
        accum += @from@(data0[@i@]);
1289 -
#  else /* complex */
1290 -
        accum_re += data0[2*@i@+0];
1291 -
        accum_im += data0[2*@i@+1];
1292 -
#  endif
1293 -
/**end repeat2**/
1294 -
#endif
1295 -
1296 -
#if !@complex@
1297 -
        data0 += 8;
898 +
#if @complex@
899 +
    for (; count > 0; --count, data0 += 2) {
900 +
        accum_re += data0[0];
901 +
        accum_im += data0[1];
902 +
    }
903 +
    ((@temptype@ *)dataptr[1])[0] += accum_re;
904 +
    ((@temptype@ *)dataptr[1])[1] += accum_im;
1298 905
#else
1299 -
        data0 += 8*2;
1300 -
#endif
906 +
    for (; count > 0; --count, ++data0) {
907 +
        accum += @from@(*data0);
1301 908
    }
1302 -
1303 -
#if EINSUM_USE_SSE1 && @float32@
1304 -
    /* Add the four SSE values and put in accum */
1305 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
1306 -
    accum_sse = _mm_add_ps(a, accum_sse);
1307 -
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
1308 -
    accum_sse = _mm_add_ps(a, accum_sse);
1309 -
    _mm_store_ss(&accum, accum_sse);
1310 -
#elif EINSUM_USE_SSE2 && @float64@
1311 -
    /* Add the two SSE2 values and put in accum */
1312 -
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
1313 -
    accum_sse = _mm_add_pd(a, accum_sse);
1314 -
    _mm_store_sd(&accum, accum_sse);
1315 -
#endif
1316 -
1317 -
    /* Finish off the loop */
1318 -
    goto finish_after_unrolled_loop;
909 +
    *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1])));
910 +
#endif // complex
1319 911
}
1320 912
1321 913
#endif /* @nop@ == 1 */
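
Complex specializations keep NPYV_CHK = 0, so the one-operand reduction above stays scalar and walks the interleaved (re, im) pairs with separate accumulators. A plain-C sketch of that accumulation (the name and signature are illustrative, not part of the patch):

    static void sum_cfloat_sketch(const npy_float *data0, npy_intp count,
                                  npy_float *out_re, npy_float *out_im)
    {
        npy_float accum_re = 0, accum_im = 0;
        for (; count > 0; --count, data0 += 2) {
            accum_re += data0[0];   /* real parts at even offsets */
            accum_im += data0[1];   /* imaginary parts at odd offsets */
        }
        *out_re += accum_re;
        *out_im += accum_im;
    }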

Showing 2 files with coverage changes found.

New file: numpy/core/src/common/simd/sse/arithmetic.h
Changes in numpy/core/src/multiarray/einsum_sumprod.c.src: -15 / +31

40 Commits

Files Coverage (pull request vs. base commit)
numpy: 0.01% / 84.08%
Project Totals (130 files): 84.08%