@@ -205,7 +205,11 @@
      * This test is faster than a direct modulo.
      * Note alignment value of 0 is allowed and returns False.
      */
+#ifdef NPY_HAVE_NEON
+    return 0;
+#else
     return ((npy_uintp)(p) & ((alignment) - 1)) == 0;
+#endif
 }
 
 /* Get equivalent "uint" alignment given an itemsize, for use in copy code */
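The hunk above makes the alignment helper (evidently npy_is_aligned, which EINSUM_IS_ALIGNED wraps in the next hunk) report "not aligned" unconditionally on NEON builds, since ARM/NEON has no dedicated aligned load/store instructions; callers then always take their unaligned path. A minimal standalone sketch of the resulting behaviour, using a stand-in helper rather than the real NumPy header:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for npy_is_aligned() after this patch. On NEON builds it
 * always answers 0, so callers never select aligned loads/stores. */
static int is_aligned(const void *p, uintptr_t alignment)
{
#ifdef NPY_HAVE_NEON
    (void)p; (void)alignment;
    return 0;
#else
    /* Power-of-two test; an alignment of 0 is allowed and returns false
     * for any non-null pointer. */
    return ((uintptr_t)p & (alignment - 1)) == 0;
#endif
}

int main(void)
{
    double buf[4];
    printf("16-byte aligned: %d\n", is_aligned(buf, 16));
    return 0;
}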

@@ -17,6 +17,14 @@
 #include "common.h"
 
 #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
+/**
+ * Unroll by four/eight scalars in case of:
+ *  - The SIMD width is higher than 128bit since we unroll by x2/x4
+ *    and that may lead to performance loss on small arrays.
+ *  - Giving the compiler the chance to auto-vectorize
+ *    in case NPYV isn't available.
+ */
+#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))
 
 /**begin repeat
  * #name = byte, short, int, long, longlong,
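The new EINSUM_UNROLL_4_SCALARS(CHK) macro centralises the guard that was previously repeated before every scalar fallback loop, `#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)`. A hedged, self-contained sketch of how such a guard behaves, with FLOAT_NPYV_CHK standing in for the template's @NPYV_CHK@ substitution (illustrative names, not the real build macros):

#include <stdio.h>

#ifndef NPY_SIMD
#define NPY_SIMD 128          /* pretend a 128-bit NPYV target */
#endif
#define FLOAT_NPYV_CHK 1      /* pretend an NPYV kernel exists for this type */

#define EINSUM_UNROLL_4_SCALARS(CHK) \
    (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))

int main(void)
{
#if EINSUM_UNROLL_4_SCALARS(FLOAT_NPYV_CHK)
    puts("scalar unroll-by-4 fallback is compiled in");
#else
    puts("fallback skipped: the <=128-bit NPYV path already covers it");
#endif
    return 0;
}

Note that expanding `defined` out of a macro inside an `#if`, as this pattern does, is formally undefined behaviour in the C standard, though mainstream compilers accept it.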
@@ -240,13 +248,8 @@
                                                             (int)count);
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    #ifndef NPY_HAVE_NEON
     const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
-                           EINSUM_IS_ALIGNED(data_out);
-    #else
-    // ARM/Neon don't have instructions for aligned memory access
-    const int is_aligned = 0;
-    #endif
+                        EINSUM_IS_ALIGNED(data_out);
     const int vstep = npyv_nlanes_@sfx@;
 
     /**begin repeat2
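With npy_is_aligned now compiled to return 0 on NEON, the per-kernel `#ifndef NPY_HAVE_NEON` blocks removed in this and the following hunks become redundant: is_aligned stays a plain run-time flag that, per the "Use aligned instructions if possible" comment, steers the kernel toward the aligned or unaligned vector load/store flavour. A rough illustration of that dispatch using plain SSE2 intrinsics (not the NPYV wrappers used in the diff):

#include <emmintrin.h>   /* SSE2 */
#include <stddef.h>
#include <stdint.h>

static int ptr_is_aligned(const void *p, uintptr_t a)
{
    return ((uintptr_t)p & (a - 1)) == 0;
}

/* Sum a contiguous double array, picking aligned or unaligned loads
 * once up front, the way the einsum kernels branch on is_aligned. */
double sum_contig(const double *data, size_t n)
{
    const int is_aligned = ptr_is_aligned(data, 16);
    __m128d vacc = _mm_setzero_pd();
    size_t i = 0;
    if (is_aligned) {
        for (; i + 2 <= n; i += 2) {
            vacc = _mm_add_pd(vacc, _mm_load_pd(data + i));   /* aligned load */
        }
    }
    else {
        for (; i + 2 <= n; i += 2) {
            vacc = _mm_add_pd(vacc, _mm_loadu_pd(data + i));  /* unaligned load */
        }
    }
    double lanes[2];
    _mm_storeu_pd(lanes, vacc);
    double acc = lanes[0] + lanes[1];
    for (; i < n; ++i) {
        acc += data[i];   /* scalar tail */
    }
    return acc;
}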
@@ -290,19 +293,15 @@
             npyv_@st@_@sfx@(data_out, abc0);
             npyv_@st@_@sfx@(data_out + vstep, abc1);
         }
+    #else
+        #error "Invalid unroll_by = @unroll_by@"
     #endif
     }
     /**end repeat2**/
     npyv_cleanup();
 #endif // NPYV check for @type@
-/**
- * Unroll by four scalars in case of:
- *  - The SIMD width is higher than 128bit since we unroll by x2/x4
- *    and that may lead to performance loss on small arrays.
- *  - To give the change to the compiler to
- *    auto-vectorize in case of NPYV wasn't available.
- */
-#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
     for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
         /**begin repeat2
          * #i = 0, 1, 2, 3#
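Two things happen in the hunk above and in the analogous hunks below: an `#else` / `#error` arm is added so the template fails the build loudly if @unroll_by@ ever takes an unsupported value, and the repeated comment-plus-`#if` guard is collapsed into EINSUM_UNROLL_4_SCALARS. A self-contained sketch of the defensive `#error` pattern (macro name and values are illustrative, not the template's):

#include <stdio.h>

#define UNROLL_BY 2   /* illustrative stand-in for @unroll_by@ */

int main(void)
{
#if UNROLL_BY == 4
    puts("unroll-by-4 body");
#elif UNROLL_BY == 2
    puts("unroll-by-2 body");
#else
    /* Any other value stops compilation instead of silently emitting
     * no loop body. */
    #error "Invalid UNROLL_BY"
#endif
    return 0;
}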
@@ -345,12 +344,7 @@
 
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    #ifndef NPY_HAVE_NEON
     const int is_aligned = EINSUM_IS_ALIGNED(data1) && EINSUM_IS_ALIGNED(data_out);
-    #else
-    // ARM/Neon don't have instructions for aligned memory access
-    const int is_aligned = 0;
-    #endif
     const int vstep = npyv_nlanes_@sfx@;
     const npyv_@sfx@ va_scalar = npyv_setall_@sfx@(a_scalar);
 
@@ -392,19 +386,15 @@
             npyv_@st@_@sfx@(data_out, abc0);
             npyv_@st@_@sfx@(data_out + vstep, abc1);
         }
+    #else
+        #error "Invalid unroll_by = @unroll_by@"
     #endif
     }
     /**end repeat2**/
     npyv_cleanup();
 #endif // NPYV check for @type@
-/**
- * Unroll by four scalars in case of:
- *  - The SIMD width is higher than 128bit since we unroll by x2/x4
- *    and that may lead to performance loss on small arrays.
- *  - To give the change to the compiler to
- *    auto-vectorize in case of NPYV wasn't available.
- */
-#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
     for (; count >= 4; count -= 4, data1 += 4, data_out += 4) {
         /**begin repeat2
          * #i = 0, 1, 2, 3#
@@ -442,12 +432,7 @@
                                                     (int)count);
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    #ifndef NPY_HAVE_NEON
     const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data_out);
-    #else
-    // ARM/Neon don't have instructions for aligned memory access
-    const int is_aligned = 0;
-    #endif
     const int vstep = npyv_nlanes_@sfx@;
     const npyv_@sfx@ vb_scalar = npyv_setall_@sfx@(b_scalar);
 
@@ -489,19 +474,15 @@
             npyv_@st@_@sfx@(data_out, abc0);
             npyv_@st@_@sfx@(data_out + vstep, abc1);
         }
+    #else
+        #error "Invalid unroll_by = @unroll_by@"
     #endif
     }
     /**end repeat2**/
     npyv_cleanup();
 #endif // NPYV check for @type@
-/**
- * Unroll by four scalars in case of:
- *  - The SIMD width is higher than 128bit since we unroll by x2/x4
- *    and that may lead to performance loss on small arrays.
- *  - To give the change to the compiler to
- *    auto-vectorize in case of NPYV wasn't available.
- */
-#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
     for (; count >= 4; count -= 4, data0 += 4, data_out += 4) {
         /**begin repeat2
          * #i = 0, 1, 2, 3#
@@ -540,12 +521,7 @@
                                                     (int)count);
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    #ifndef NPY_HAVE_NEON
     const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
-    #else
-    // ARM/Neon don't have instructions for aligned memory access
-    const int is_aligned = 0;
-    #endif
     const int vstep = npyv_nlanes_@sfx@;
     npyv_@sfx@ vaccum = npyv_zero_@sfx@();
 
@@ -579,20 +555,16 @@
             npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, vaccum);
                     vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
         }
+    #else
+        #error "Invalid unroll_by = @unroll_by@"
     #endif
     }
     /**end repeat2**/
     accum = npyv_sum_@sfx@(vaccum);
     npyv_cleanup();
 #endif // NPYV check for @type@
-/**
- * Unroll by four scalars in case of:
- *  - The SIMD width is higher than 128bit since we unroll by x2/x4
- *    and that may lead to performance loss on small arrays.
- *  - To give the change to the compiler to
- *    auto-vectorize in case of NPYV wasn't available.
- */
-#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
     for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
         /**begin repeat2
          * #i = 0, 1, 2, 3#
@@ -622,12 +594,7 @@
                                                     (int)count);
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    #ifndef NPY_HAVE_NEON
     const int is_aligned = EINSUM_IS_ALIGNED(data1);
-    #else
-    // ARM/Neon don't have instructions for aligned memory access
-    const int is_aligned = 0;
-    #endif
     const int vstep = npyv_nlanes_@sfx@;
     npyv_@sfx@ vaccum = npyv_zero_@sfx@();
 
@@ -658,20 +625,16 @@
             npyv_@sfx@ b01 = npyv_add_@sfx@(b0, b1);
                     vaccum = npyv_add_@sfx@(b01, vaccum);
         }
+    #else
+        #error "Invalid unroll_by = @unroll_by@"
     #endif
     }
     /**end repeat2**/
     accum = npyv_sum_@sfx@(vaccum);
     npyv_cleanup();
 #endif // NPYV check for @type@
-/**
- * Unroll by four scalars in case of:
- *  - The SIMD width is higher than 128bit since we unroll by x2/x4
- *    and that may lead to performance loss on small arrays.
- *  - To give the change to the compiler to
- *    auto-vectorize in case of NPYV wasn't available.
- */
-#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
     for (; count >= 4; count -= 4, data1 += 4) {
         const @type@ b01 = @from@(data1[0]) + @from@(data1[1]);
         const @type@ b23 = @from@(data1[2]) + @from@(data1[3]);
@@ -695,12 +658,7 @@
                                                     (int)count);
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    #ifndef NPY_HAVE_NEON
     const int is_aligned = EINSUM_IS_ALIGNED(data0);
-    #else
-    // ARM/Neon don't have instructions for aligned memory access
-    const int is_aligned = 0;
-    #endif
     const int vstep = npyv_nlanes_@sfx@;
     npyv_@sfx@ vaccum = npyv_zero_@sfx@();
 
@@ -731,20 +689,16 @@
             npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
                     vaccum = npyv_add_@sfx@(a01, vaccum);
         }
+    #else
+        #error "Invalid unroll_by = @unroll_by@"
     #endif
     }
     /**end repeat2**/
     accum = npyv_sum_@sfx@(vaccum);
     npyv_cleanup();
 #endif // NPYV check for @type@
-/**
- * Unroll by four scalars in case of:
- *  - The SIMD width is higher than 128bit since we unroll by x2/x4
- *    and that may lead to performance loss on small arrays.
- *  - To give the change to the compiler to
- *    auto-vectorize in case of NPYV wasn't available.
- */
-#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
     for (; count >= 4; count -= 4, data0 += 4) {
         const @type@ a01 = @from@(data0[0]) + @from@(data0[1]);
         const @type@ a23 = @from@(data0[2]) + @from@(data0[3]);
@@ -871,12 +825,7 @@
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    #ifndef NPY_HAVE_NEON
     const int is_aligned = EINSUM_IS_ALIGNED(data0);
-    #else
-    // ARM/Neon don't have instructions for aligned memory access
-    const int is_aligned = 0;
-    #endif
     const int vstep = npyv_nlanes_@sfx@;
     npyv_@sfx@ vaccum = npyv_zero_@sfx@();
 
@@ -907,20 +856,16 @@
             npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
                     vaccum = npyv_add_@sfx@(a01, vaccum);
         }
+    #else
+        #error "Invalid unroll_by = @unroll_by@"
     #endif
     }
     /**end repeat2**/
     accum = npyv_sum_@sfx@(vaccum);
     npyv_cleanup();
 #endif // NPYV check for @type@
-/**
- * Unroll by four/eight scalars in case of:
- *  - The SIMD width is higher than 128bit since we unroll by x2/x4
- *    and that may lead to performance loss on small arrays.
- *  - To give the change to the compiler to
- *    auto-vectorize in case of NPYV wasn't available.
- */
-#if !defined(NPY_DISABLE_OPTIMIZATION) && (!@NPYV_CHK@ || NPY_SIMD > 128)
+
+#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
     #if @complex@
         for (; count > 4; count -= 4, data0 += 4*2) {
             const @temptype@ re01 = data0[0] + data0[2];