1
/*
2
 * This file provides optimized sum of product implementations used internally
3
 * by einsum.
4
 *
5
 * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
6
 * The University of British Columbia
7
 *
8
 * See LICENSE.txt for the license.
9
 */
10

11
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
12
#define _MULTIARRAYMODULE
13

14
#include <numpy/npy_common.h>
15
#include <numpy/ndarraytypes.h>  /* for NPY_NTYPES */
16
#include <numpy/halffloat.h>
17

18
#include "einsum_sumprod.h"
19
#include "einsum_debug.h"
20

21

22
#ifdef NPY_HAVE_SSE_INTRINSICS
23
#define EINSUM_USE_SSE1 1
24
#else
25
#define EINSUM_USE_SSE1 0
26
#endif
27

28
#ifdef NPY_HAVE_SSE2_INTRINSICS
29
#define EINSUM_USE_SSE2 1
30
#else
31
#define EINSUM_USE_SSE2 0
32
#endif
33

34
#if EINSUM_USE_SSE1
35
#include <xmmintrin.h>
36
#endif
37

38
#if EINSUM_USE_SSE2
39
#include <emmintrin.h>
40
#endif
41

42
#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
43

44
/**********************************************/
45

46
/**begin repeat
47
 * #name = byte, short, int, long, longlong,
48
 *         ubyte, ushort, uint, ulong, ulonglong,
49
 *         half, float, double, longdouble,
50
 *         cfloat, cdouble, clongdouble#
51
 * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
52
 *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
53
 *         npy_half, npy_float, npy_double, npy_longdouble,
54
 *         npy_cfloat, npy_cdouble, npy_clongdouble#
55
 * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
56
 *             npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
57
 *             npy_float, npy_float, npy_double, npy_longdouble,
58
 *             npy_float, npy_double, npy_longdouble#
59
 * #to = ,,,,,
60
 *       ,,,,,
61
 *       npy_float_to_half,,,,
62
 *       ,,#
63
 * #from = ,,,,,
64
 *         ,,,,,
65
 *         npy_half_to_float,,,,
66
 *         ,,#
67
 * #complex = 0*5,
68
 *            0*5,
69
 *            0*4,
70
 *            1*3#
71
 * #float32 = 0*5,
72
 *            0*5,
73
 *            0,1,0,0,
74
 *            0*3#
75
 * #float64 = 0*5,
76
 *            0*5,
77
 *            0,0,1,0,
78
 *            0*3#
79
 */
80

81
/**begin repeat1
82
 * #nop = 1, 2, 3, 1000#
83
 * #noplabel = one, two, three, any#
84
 */
85
static void
86 1
@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
87
                                npy_intp const *strides, npy_intp count)
88
{
89
#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
90 1
    char *data0 = dataptr[0];
91 1
    npy_intp stride0 = strides[0];
92
#endif
93
#if (@nop@ == 2 || @nop@ == 3) && !@complex@
94 1
    char *data1 = dataptr[1];
95 1
    npy_intp stride1 = strides[1];
96
#endif
97
#if (@nop@ == 3) && !@complex@
98 1
    char *data2 = dataptr[2];
99 1
    npy_intp stride2 = strides[2];
100
#endif
101
#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
102 1
    char *data_out = dataptr[@nop@];
103 1
    npy_intp stride_out = strides[@nop@];
104
#endif
105

106
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
107

108 1
    while (count--) {
109
#if !@complex@
110
#  if @nop@ == 1
111 1
        *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
112 1
                                         @from@(*(@type@ *)data_out));
113 1
        data0 += stride0;
114 1
        data_out += stride_out;
115
#  elif @nop@ == 2
116 1
        *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
117 1
                                         @from@(*(@type@ *)data1) +
118 1
                                         @from@(*(@type@ *)data_out));
119 1
        data0 += stride0;
120 1
        data1 += stride1;
121 1
        data_out += stride_out;
122
#  elif @nop@ == 3
123 1
        *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
124 1
                                         @from@(*(@type@ *)data1) *
125 1
                                         @from@(*(@type@ *)data2) +
126 1
                                         @from@(*(@type@ *)data_out));
127 1
        data0 += stride0;
128 1
        data1 += stride1;
129 1
        data2 += stride2;
130 1
        data_out += stride_out;
131
#  else
132 1
        @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
133
        int i;
134 1
        for (i = 1; i < nop; ++i) {
135 1
            temp *= @from@(*(@type@ *)dataptr[i]);
136
        }
137 1
        *(@type@ *)dataptr[nop] = @to@(temp +
138 1
                                           @from@(*(@type@ *)dataptr[i]));
139 1
        for (i = 0; i <= nop; ++i) {
140 1
            dataptr[i] += strides[i];
141
        }
142
#  endif
143
#else /* complex */
144
#  if @nop@ == 1
145 0
        ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
146 0
                                         ((@temptype@ *)data_out)[0];
147 0
        ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
148 0
                                         ((@temptype@ *)data_out)[1];
149 0
        data0 += stride0;
150 0
        data_out += stride_out;
151
#  else
152
#    if @nop@ <= 3
153
#define _SUMPROD_NOP @nop@
154
#    else
155
#define _SUMPROD_NOP nop
156
#    endif
157
        @temptype@ re, im, tmp;
158
        int i;
159 1
        re = ((@temptype@ *)dataptr[0])[0];
160 1
        im = ((@temptype@ *)dataptr[0])[1];
161 1
        for (i = 1; i < _SUMPROD_NOP; ++i) {
162 1
            tmp = re * ((@temptype@ *)dataptr[i])[0] -
163 1
                  im * ((@temptype@ *)dataptr[i])[1];
164 1
            im = re * ((@temptype@ *)dataptr[i])[1] +
165 1
                 im * ((@temptype@ *)dataptr[i])[0];
166 1
            re = tmp;
167
        }
168 1
        ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
169 1
                                     ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
170 1
        ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
171 1
                                     ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
172

173 1
        for (i = 0; i <= _SUMPROD_NOP; ++i) {
174 1
            dataptr[i] += strides[i];
175
        }
176
#undef _SUMPROD_NOP
177
#  endif
178
#endif
179
    }
180 1
}
181

182
#if @nop@ == 1
183

184
static void
185 1
@name@_sum_of_products_contig_one(int nop, char **dataptr,
186
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
187
{
188 1
    @type@ *data0 = (@type@ *)dataptr[0];
189 1
    @type@ *data_out = (@type@ *)dataptr[1];
190

191
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
192
                                                            (int)count);
193

194
/* This is placed before the main loop to make small counts faster */
195 1
finish_after_unrolled_loop:
196 1
    switch (count) {
197
/**begin repeat2
198
 * #i = 6, 5, 4, 3, 2, 1, 0#
199
 */
200 1
        case @i@+1:
201
#if !@complex@
202 1
            data_out[@i@] = @to@(@from@(data0[@i@]) +
203 1
                                 @from@(data_out[@i@]));
204
#else
205 1
            ((@temptype@ *)data_out + 2*@i@)[0] =
206 1
                                    ((@temptype@ *)data0 + 2*@i@)[0] +
207 1
                                    ((@temptype@ *)data_out + 2*@i@)[0];
208 1
            ((@temptype@ *)data_out + 2*@i@)[1] =
209 1
                                    ((@temptype@ *)data0 + 2*@i@)[1] +
210 1
                                    ((@temptype@ *)data_out + 2*@i@)[1];
211
#endif
212
/**end repeat2**/
213
        case 0:
214 1
            return;
215
    }
216

217
    /* Unroll the loop by 8 */
218 1
    while (count >= 8) {
219 1
        count -= 8;
220

221
/**begin repeat2
222
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
223
 */
224
#if !@complex@
225 1
        data_out[@i@] = @to@(@from@(data0[@i@]) +
226 1
                             @from@(data_out[@i@]));
227
#else /* complex */
228 1
        ((@temptype@ *)data_out + 2*@i@)[0] =
229 1
                                ((@temptype@ *)data0 + 2*@i@)[0] +
230 1
                                ((@temptype@ *)data_out + 2*@i@)[0];
231 1
        ((@temptype@ *)data_out + 2*@i@)[1] =
232 1
                                ((@temptype@ *)data0 + 2*@i@)[1] +
233 1
                                ((@temptype@ *)data_out + 2*@i@)[1];
234
#endif
235
/**end repeat2**/
236 1
        data0 += 8;
237 1
        data_out += 8;
238
    }
239

240
    /* Finish off the loop */
241
    goto finish_after_unrolled_loop;
242
}
243

244
#elif @nop@ == 2 && !@complex@
245

246
static void
247
@name@_sum_of_products_contig_two(int nop, char **dataptr,
248
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
249
{
250
    @type@ *data0 = (@type@ *)dataptr[0];
251
    @type@ *data1 = (@type@ *)dataptr[1];
252
    @type@ *data_out = (@type@ *)dataptr[2];
253

254
#if EINSUM_USE_SSE1 && @float32@
255
    __m128 a, b;
256
#elif EINSUM_USE_SSE2 && @float64@
257
    __m128d a, b;
258
#endif
259

260
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
261
                                                            (int)count);
262

263
/* This is placed before the main loop to make small counts faster */
264
finish_after_unrolled_loop:
265
    switch (count) {
266
/**begin repeat2
267
 * #i = 6, 5, 4, 3, 2, 1, 0#
268
 */
269 1
        case @i@+1:
270 1
            data_out[@i@] = @to@(@from@(data0[@i@]) *
271 1
                                 @from@(data1[@i@]) +
272 1
                                 @from@(data_out[@i@]));
273
/**end repeat2**/
274
        case 0:
275 1
            return;
276
    }
277

278
#if EINSUM_USE_SSE1 && @float32@
279
    /* Use aligned instructions if possible */
280 1
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
281 1
        EINSUM_IS_SSE_ALIGNED(data_out)) {
282
        /* Unroll the loop by 8 */
283 1
        while (count >= 8) {
284 1
            count -= 8;
285

286
/**begin repeat2
287
 * #i = 0, 4#
288
 */
289 1
            a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
290 1
            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
291 1
            _mm_store_ps(data_out+@i@, b);
292
/**end repeat2**/
293 1
            data0 += 8;
294 1
            data1 += 8;
295 1
            data_out += 8;
296
        }
297

298
        /* Finish off the loop */
299
        goto finish_after_unrolled_loop;
300
    }
301
#elif EINSUM_USE_SSE2 && @float64@
302
    /* Use aligned instructions if possible */
303
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
304
        EINSUM_IS_SSE_ALIGNED(data_out)) {
305
        /* Unroll the loop by 8 */
306 1
        while (count >= 8) {
307 1
            count -= 8;
308

309 1
/**begin repeat2
310 1
 * #i = 0, 2, 4, 6#
311
 */
312 1
            a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
313 1
            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
314 1
            _mm_store_pd(data_out+@i@, b);
315
/**end repeat2**/
316 1
            data0 += 8;
317 1
            data1 += 8;
318 1
            data_out += 8;
319
        }
320

321
        /* Finish off the loop */
322
        goto finish_after_unrolled_loop;
323
    }
324
#endif
325

326
    /* Unroll the loop by 8 */
327 1
    while (count >= 8) {
328 1
        count -= 8;
329

330
#if EINSUM_USE_SSE1 && @float32@
331
/**begin repeat2
332
 * #i = 0, 4#
333
 */
334 1
        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
335 1
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
336 1
        _mm_storeu_ps(data_out+@i@, b);
337
/**end repeat2**/
338
#elif EINSUM_USE_SSE2 && @float64@
339
/**begin repeat2
340 1
 * #i = 0, 2, 4, 6#
341 1
 */
342 1
        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
343 1
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
344 1
        _mm_storeu_pd(data_out+@i@, b);
345
/**end repeat2**/
346
#else
347
/**begin repeat2
348
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
349
 */
350 1
        data_out[@i@] = @to@(@from@(data0[@i@]) *
351 1
                             @from@(data1[@i@]) +
352 1
                             @from@(data_out[@i@]));
353
/**end repeat2**/
354
#endif
355 1
        data0 += 8;
356 1
        data1 += 8;
357 1
        data_out += 8;
358
    }
359

360
    /* Finish off the loop */
361
    goto finish_after_unrolled_loop;
362
}
363

364
/* Some extra specializations for the two operand case */
365
static void
366 1
@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
367
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
368
{
369 1
    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
370 1
    @type@ *data1 = (@type@ *)dataptr[1];
371 1
    @type@ *data_out = (@type@ *)dataptr[2];
372

373
#if EINSUM_USE_SSE1 && @float32@
374
    __m128 a, b, value0_sse;
375
#elif EINSUM_USE_SSE2 && @float64@
376
    __m128d a, b, value0_sse;
377
#endif
378

379
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
380
                                                    (int)count);
381

382
/* This is placed before the main loop to make small counts faster */
383 1
finish_after_unrolled_loop:
384 1
    switch (count) {
385
/**begin repeat2
386
 * #i = 6, 5, 4, 3, 2, 1, 0#
387
 */
388 1
        case @i@+1:
389 1
            data_out[@i@] = @to@(value0 *
390 1
                                 @from@(data1[@i@]) +
391 1
                                 @from@(data_out[@i@]));
392
/**end repeat2**/
393
        case 0:
394
            return;
395
    }
396

397
#if EINSUM_USE_SSE1 && @float32@
398 1
    value0_sse = _mm_set_ps1(value0);
399 1

400
    /* Use aligned instructions if possible */
401 1
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
402 1
        /* Unroll the loop by 8 */
403 1
        while (count >= 8) {
404 1
            count -= 8;
405

406
/**begin repeat2
407
 * #i = 0, 4#
408
 */
409 1
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
410 1
            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
411 1
            _mm_store_ps(data_out+@i@, b);
412 1
/**end repeat2**/
413 1
            data1 += 8;
414 1
            data_out += 8;
415 1
        }
416 1

417 1
        /* Finish off the loop */
418 1
        if (count > 0) {
419
            goto finish_after_unrolled_loop;
420
        }
421 1
        else {
422 1
            return;
423 1
        }
424
    }
425
#elif EINSUM_USE_SSE2 && @float64@
426
    value0_sse = _mm_set1_pd(value0);
427

428
    /* Use aligned instructions if possible */
429 1
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
430 1
        /* Unroll the loop by 8 */
431
        while (count >= 8) {
432 1
            count -= 8;
433

434 1
/**begin repeat2
435 1
 * #i = 0, 2, 4, 6#
436 1
 */
437 1
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
438 1
            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
439 1
            _mm_store_pd(data_out+@i@, b);
440
/**end repeat2**/
441 1
            data1 += 8;
442 1
            data_out += 8;
443
        }
444

445
        /* Finish off the loop */
446 1
        if (count > 0) {
447
            goto finish_after_unrolled_loop;
448
        }
449
        else {
450
            return;
451
        }
452
    }
453
#endif
454

455
    /* Unroll the loop by 8 */
456 1
    while (count >= 8) {
457 1
        count -= 8;
458

459
#if EINSUM_USE_SSE1 && @float32@
460
/**begin repeat2
461
 * #i = 0, 4#
462
 */
463 1
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
464 1
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
465 1
        _mm_storeu_ps(data_out+@i@, b);
466
/**end repeat2**/
467
#elif EINSUM_USE_SSE2 && @float64@
468
/**begin repeat2
469 1
 * #i = 0, 2, 4, 6#
470 1
 */
471 1
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
472 1
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
473 1
        _mm_storeu_pd(data_out+@i@, b);
474
/**end repeat2**/
475
#else
476
/**begin repeat2
477
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
478
 */
479 1
        data_out[@i@] = @to@(value0 *
480 1
                             @from@(data1[@i@]) +
481 1
                             @from@(data_out[@i@]));
482
/**end repeat2**/
483
#endif
484 1
        data1 += 8;
485 1
        data_out += 8;
486
    }
487

488
    /* Finish off the loop */
489 1
    if (count > 0) {
490
        goto finish_after_unrolled_loop;
491
    }
492
}
493

494
static void
495 1
@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
496
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
497
{
498 1
    @type@ *data0 = (@type@ *)dataptr[0];
499 1
    @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
500 1
    @type@ *data_out = (@type@ *)dataptr[2];
501

502
#if EINSUM_USE_SSE1 && @float32@
503
    __m128 a, b, value1_sse;
504
#elif EINSUM_USE_SSE2 && @float64@
505
    __m128d a, b, value1_sse;
506
#endif
507

508
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
509
                                                    (int)count);
510

511
/* This is placed before the main loop to make small counts faster */
512 1
finish_after_unrolled_loop:
513 1
    switch (count) {
514
/**begin repeat2
515
 * #i = 6, 5, 4, 3, 2, 1, 0#
516
 */
517 1
        case @i@+1:
518 1
            data_out[@i@] = @to@(@from@(data0[@i@])*
519 1
                                 value1  +
520 1
                                 @from@(data_out[@i@]));
521
/**end repeat2**/
522 1
        case 0:
523 1
            return;
524
    }
525

526
#if EINSUM_USE_SSE1 && @float32@
527 1
    value1_sse = _mm_set_ps1(value1);
528 1

529
    /* Use aligned instructions if possible */
530 1
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
531 1
        /* Unroll the loop by 8 */
532 1
        while (count >= 8) {
533 1
            count -= 8;
534

535 1
/**begin repeat2
536
 * #i = 0, 4#
537
 */
538 1
            a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
539 1
            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
540 1
            _mm_store_ps(data_out+@i@, b);
541 1
/**end repeat2**/
542 1
            data0 += 8;
543 1
            data_out += 8;
544 1
        }
545 1

546 1
        /* Finish off the loop */
547
        goto finish_after_unrolled_loop;
548
    }
549
#elif EINSUM_USE_SSE2 && @float64@
550
    value1_sse = _mm_set1_pd(value1);
551

552
    /* Use aligned instructions if possible */
553 1
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
554
        /* Unroll the loop by 8 */
555
        while (count >= 8) {
556 1
            count -= 8;
557

558 1
/**begin repeat2
559 1
 * #i = 0, 2, 4, 6#
560
 */
561 1
            a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
562 1
            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
563 1
            _mm_store_pd(data_out+@i@, b);
564
/**end repeat2**/
565 1
            data0 += 8;
566 1
            data_out += 8;
567
        }
568

569
        /* Finish off the loop */
570
        goto finish_after_unrolled_loop;
571
    }
572
#endif
573

574
    /* Unroll the loop by 8 */
575 1
    while (count >= 8) {
576 1
        count -= 8;
577

578
#if EINSUM_USE_SSE1 && @float32@
579
/**begin repeat2
580
 * #i = 0, 4#
581
 */
582 1
        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
583 1
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
584 1
        _mm_storeu_ps(data_out+@i@, b);
585
/**end repeat2**/
586
#elif EINSUM_USE_SSE2 && @float64@
587
/**begin repeat2
588 1
 * #i = 0, 2, 4, 6#
589 1
 */
590 1
        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
591 1
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
592 1
        _mm_storeu_pd(data_out+@i@, b);
593
/**end repeat2**/
594
#else
595
/**begin repeat2
596
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
597
 */
598 1
        data_out[@i@] = @to@(@from@(data0[@i@])*
599 1
                             value1  +
600 1
                             @from@(data_out[@i@]));
601
/**end repeat2**/
602
#endif
603 1
        data0 += 8;
604 1
        data_out += 8;
605
    }
606

607
    /* Finish off the loop */
608
    goto finish_after_unrolled_loop;
609
}
610

611
static void
612 1
@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
613
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
614
{
615 1
    @type@ *data0 = (@type@ *)dataptr[0];
616 1
    @type@ *data1 = (@type@ *)dataptr[1];
617 1
    @temptype@ accum = 0;
618

619
#if EINSUM_USE_SSE1 && @float32@
620
    __m128 a, accum_sse = _mm_setzero_ps();
621
#elif EINSUM_USE_SSE2 && @float64@
622
    __m128d a, accum_sse = _mm_setzero_pd();
623
#endif
624

625
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
626
                                                    (int)count);
627

628
/* This is placed before the main loop to make small counts faster */
629 1
finish_after_unrolled_loop:
630 1
    switch (count) {
631
/**begin repeat2
632
 * #i = 6, 5, 4, 3, 2, 1, 0#
633
 */
634 1
        case @i@+1:
635 1
            accum += @from@(data0[@i@]) * @from@(data1[@i@]);
636 1
/**end repeat2**/
637 1
        case 0:
638 1
            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
639 1
            return;
640
    }
641

642
#if EINSUM_USE_SSE1 && @float32@
643
    /* Use aligned instructions if possible */
644 1
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
645 1
        /* Unroll the loop by 8 */
646 1
        while (count >= 8) {
647 1
            count -= 8;
648 1

649 1
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
650 1
            _mm_prefetch(data1 + 512, _MM_HINT_T0);
651

652
/**begin repeat2
653
 * #i = 0, 4#
654
 */
655 1
            /*
656
             * NOTE: This accumulation changes the order, so will likely
657
             *       produce slightly different results.
658 1
             */
659 1
            a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
660 1
            accum_sse = _mm_add_ps(accum_sse, a);
661 1
/**end repeat2**/
662 1
            data0 += 8;
663 1
            data1 += 8;
664
        }
665

666 1
        /* Add the four SSE values and put in accum */
667 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
668 1
        accum_sse = _mm_add_ps(a, accum_sse);
669 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
670 1
        accum_sse = _mm_add_ps(a, accum_sse);
671
        _mm_store_ss(&accum, accum_sse);
672

673
        /* Finish off the loop */
674
        goto finish_after_unrolled_loop;
675
    }
676 1
#elif EINSUM_USE_SSE2 && @float64@
677
    /* Use aligned instructions if possible */
678
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
679
        /* Unroll the loop by 8 */
680
        while (count >= 8) {
681
            count -= 8;
682

683
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
684 1
            _mm_prefetch(data1 + 512, _MM_HINT_T0);
685

686 1
/**begin repeat2
687 1
 * #i = 0, 2, 4, 6#
688
 */
689 1
            /*
690 1
             * NOTE: This accumulation changes the order, so will likely
691
             *       produce slightly different results.
692
             */
693 1
            a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
694 1
            accum_sse = _mm_add_pd(accum_sse, a);
695
/**end repeat2**/
696 1
            data0 += 8;
697 1
            data1 += 8;
698
        }
699

700
        /* Add the two SSE2 values and put in accum */
701 1
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
702 1
        accum_sse = _mm_add_pd(a, accum_sse);
703
        _mm_store_sd(&accum, accum_sse);
704

705
        /* Finish off the loop */
706
        goto finish_after_unrolled_loop;
707
    }
708
#endif
709

710
    /* Unroll the loop by 8 */
711 1
    while (count >= 8) {
712 1
        count -= 8;
713

714
#if EINSUM_USE_SSE1 && @float32@
715
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
716
        _mm_prefetch(data1 + 512, _MM_HINT_T0);
717

718
/**begin repeat2
719
 * #i = 0, 4#
720
 */
721
        /*
722
         * NOTE: This accumulation changes the order, so will likely
723
         *       produce slightly different results.
724
         */
725 1
        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
726 1
        accum_sse = _mm_add_ps(accum_sse, a);
727
/**end repeat2**/
728
#elif EINSUM_USE_SSE2 && @float64@
729
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
730
        _mm_prefetch(data1 + 512, _MM_HINT_T0);
731

732
/**begin repeat2
733 1
 * #i = 0, 2, 4, 6#
734 1
 */
735 1
        /*
736 1
         * NOTE: This accumulation changes the order, so will likely
737 1
         *       produce slightly different results.
738 1
         */
739 1
        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
740 1
        accum_sse = _mm_add_pd(accum_sse, a);
741
/**end repeat2**/
742
#else
743
/**begin repeat2
744
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
745
 */
746 1
        accum += @from@(data0[@i@]) * @from@(data1[@i@]);
747
/**end repeat2**/
748
#endif
749 1
        data0 += 8;
750 1
        data1 += 8;
751
    }
752

753
#if EINSUM_USE_SSE1 && @float32@
754
    /* Add the four SSE values and put in accum */
755
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
756
    accum_sse = _mm_add_ps(a, accum_sse);
757
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
758
    accum_sse = _mm_add_ps(a, accum_sse);
759
    _mm_store_ss(&accum, accum_sse);
760
#elif EINSUM_USE_SSE2 && @float64@
761
    /* Add the two SSE2 values and put in accum */
762
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
763
    accum_sse = _mm_add_pd(a, accum_sse);
764
    _mm_store_sd(&accum, accum_sse);
765
#endif
766

767
    /* Finish off the loop */
768 1
    goto finish_after_unrolled_loop;
769 1
}
770

771
static void
772 1
@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
773
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
774
{
775 1
    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
776 1
    @type@ *data1 = (@type@ *)dataptr[1];
777 1
    @temptype@ accum = 0;
778

779
#if EINSUM_USE_SSE1 && @float32@
780
    __m128 a, accum_sse = _mm_setzero_ps();
781 1
#elif EINSUM_USE_SSE2 && @float64@
782 1
    __m128d a, accum_sse = _mm_setzero_pd();
783
#endif
784

785
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
786
                                                    (int)count);
787

788
/* This is placed before the main loop to make small counts faster */
789 1
finish_after_unrolled_loop:
790 1
    switch (count) {
791 1
/**begin repeat2
792
 * #i = 6, 5, 4, 3, 2, 1, 0#
793
 */
794 1
        case @i@+1:
795 1
            accum += @from@(data1[@i@]);
796 1
/**end repeat2**/
797 1
        case 0:
798 1
            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
799 1
            return;
800
    }
801 1

802
#if EINSUM_USE_SSE1 && @float32@
803
    /* Use aligned instructions if possible */
804 1
    if (EINSUM_IS_SSE_ALIGNED(data1)) {
805
        /* Unroll the loop by 8 */
806 1
        while (count >= 8) {
807 1
            count -= 8;
808

809 1
/**begin repeat2
810
 * #i = 0, 4#
811
 */
812
            /*
813 1
             * NOTE: This accumulation changes the order, so will likely
814
             *       produce slightly different results.
815
             */
816 1
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
817 1
/**end repeat2**/
818 1
            data1 += 8;
819
        }
820
        /* Add the four SSE values and put in accum */
821 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
822 1
        accum_sse = _mm_add_ps(a, accum_sse);
823 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
824 1
        accum_sse = _mm_add_ps(a, accum_sse);
825
        _mm_store_ss(&accum, accum_sse);
826

827
        /* Finish off the loop */
828
        goto finish_after_unrolled_loop;
829
    }
830
#elif EINSUM_USE_SSE2 && @float64@
831 1
    /* Use aligned instructions if possible */
832
    if (EINSUM_IS_SSE_ALIGNED(data1)) {
833
        /* Unroll the loop by 8 */
834
        while (count >= 8) {
835
            count -= 8;
836

837 1
/**begin repeat2
838
 * #i = 0, 2, 4, 6#
839 1
 */
840 1
            /*
841
             * NOTE: This accumulation changes the order, so will likely
842
             *       produce slightly different results.
843
             */
844 1
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
845
/**end repeat2**/
846 1
            data1 += 8;
847
        }
848
        /* Add the two SSE2 values and put in accum */
849 1
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
850 1
        accum_sse = _mm_add_pd(a, accum_sse);
851
        _mm_store_sd(&accum, accum_sse);
852

853
        /* Finish off the loop */
854
        goto finish_after_unrolled_loop;
855
    }
856
#endif
857

858
    /* Unroll the loop by 8 */
859 1
    while (count >= 8) {
860 1
        count -= 8;
861

862
#if EINSUM_USE_SSE1 && @float32@
863
/**begin repeat2
864
 * #i = 0, 4#
865
 */
866
        /*
867
         * NOTE: This accumulation changes the order, so will likely
868
         *       produce slightly different results.
869
         */
870 1
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
871
/**end repeat2**/
872
#elif EINSUM_USE_SSE2 && @float64@
873
/**begin repeat2
874
 * #i = 0, 2, 4, 6#
875
 */
876
        /*
877
         * NOTE: This accumulation changes the order, so will likely
878 1
         *       produce slightly different results.
879 1
         */
880 1
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
881
/**end repeat2**/
882
#else
883 1
/**begin repeat2
884 1
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
885
 */
886 1
        accum += @from@(data1[@i@]);
887
/**end repeat2**/
888
#endif
889 1
        data1 += 8;
890
    }
891

892
#if EINSUM_USE_SSE1 && @float32@
893
    /* Add the four SSE values and put in accum */
894
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
895
    accum_sse = _mm_add_ps(a, accum_sse);
896
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
897
    accum_sse = _mm_add_ps(a, accum_sse);
898
    _mm_store_ss(&accum, accum_sse);
899
#elif EINSUM_USE_SSE2 && @float64@
900
    /* Add the two SSE2 values and put in accum */
901
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
902
    accum_sse = _mm_add_pd(a, accum_sse);
903
    _mm_store_sd(&accum, accum_sse);
904
#endif
905

906
    /* Finish off the loop */
907
    goto finish_after_unrolled_loop;
908 1
}
909

910
static void
911 1
@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
912
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
913
{
914 1
    @type@ *data0 = (@type@ *)dataptr[0];
915 1
    @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
916 1
    @temptype@ accum = 0;
917

918
#if EINSUM_USE_SSE1 && @float32@
919
    __m128 a, accum_sse = _mm_setzero_ps();
920 1
#elif EINSUM_USE_SSE2 && @float64@
921 1
    __m128d a, accum_sse = _mm_setzero_pd();
922
#endif
923

924
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
925
                                                    (int)count);
926

927 1
/* This is placed before the main loop to make small counts faster */
928 1
finish_after_unrolled_loop:
929 1
    switch (count) {
930 1
/**begin repeat2
931
 * #i = 6, 5, 4, 3, 2, 1, 0#
932 1
 */
933 1
        case @i@+1:
934 1
            accum += @from@(data0[@i@]);
935 1
/**end repeat2**/
936 1
        case 0:
937 1
            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
938 1
            return;
939
    }
940 1

941
#if EINSUM_USE_SSE1 && @float32@
942
    /* Use aligned instructions if possible */
943 1
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
944
        /* Unroll the loop by 8 */
945 1
        while (count >= 8) {
946 1
            count -= 8;
947

948 1
/**begin repeat2
949 1
 * #i = 0, 4#
950
 */
951
            /*
952 1
             * NOTE: This accumulation changes the order, so will likely
953 1
             *       produce slightly different results.
954 1
             */
955 1
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
956
/**end repeat2**/
957 1
            data0 += 8;
958
        }
959
        /* Add the four SSE values and put in accum */
960 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
961 1
        accum_sse = _mm_add_ps(a, accum_sse);
962 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
963 1
        accum_sse = _mm_add_ps(a, accum_sse);
964
        _mm_store_ss(&accum, accum_sse);
965
        /* Finish off the loop */
966
        goto finish_after_unrolled_loop;
967 1
    }
968
#elif EINSUM_USE_SSE2 && @float64@
969
    /* Use aligned instructions if possible */
970
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
971
        /* Unroll the loop by 8 */
972
        while (count >= 8) {
973
            count -= 8;
974

975 1
/**begin repeat2
976
 * #i = 0, 2, 4, 6#
977 1
 */
978 1
            /*
979
             * NOTE: This accumulation changes the order, so will likely
980
             *       produce slightly different results.
981
             */
982 1
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
983
/**end repeat2**/
984 1
            data0 += 8;
985
        }
986
        /* Add the two SSE2 values and put in accum */
987 1
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
988 1
        accum_sse = _mm_add_pd(a, accum_sse);
989
        _mm_store_sd(&accum, accum_sse);
990
        /* Finish off the loop */
991
        goto finish_after_unrolled_loop;
992
    }
993
#endif
994

995
    /* Unroll the loop by 8 */
996 1
    while (count >= 8) {
997 1
        count -= 8;
998

999
#if EINSUM_USE_SSE1 && @float32@
1000
/**begin repeat2
1001
 * #i = 0, 4#
1002
 */
1003
        /*
1004
         * NOTE: This accumulation changes the order, so will likely
1005
         *       produce slightly different results.
1006
         */
1007 1
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
1008
/**end repeat2**/
1009
#elif EINSUM_USE_SSE2 && @float64@
1010
/**begin repeat2
1011
 * #i = 0, 2, 4, 6#
1012
 */
1013
        /*
1014
         * NOTE: This accumulation changes the order, so will likely
1015 1
         *       produce slightly different results.
1016 1
         */
1017 1
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
1018
/**end repeat2**/
1019
#else
1020 1
/**begin repeat2
1021 1
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1022
 */
1023 1
        accum += @from@(data0[@i@]);
1024
/**end repeat2**/
1025
#endif
1026 1
        data0 += 8;
1027
    }
1028

1029
#if EINSUM_USE_SSE1 && @float32@
1030
    /* Add the four SSE values and put in accum */
1031
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
1032
    accum_sse = _mm_add_ps(a, accum_sse);
1033
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
1034
    accum_sse = _mm_add_ps(a, accum_sse);
1035
    _mm_store_ss(&accum, accum_sse);
1036
#elif EINSUM_USE_SSE2 && @float64@
1037
    /* Add the two SSE2 values and put in accum */
1038
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
1039
    accum_sse = _mm_add_pd(a, accum_sse);
1040
    _mm_store_sd(&accum, accum_sse);
1041
#endif
1042

1043
    /* Finish off the loop */
1044
    goto finish_after_unrolled_loop;
1045 1
}
1046

1047
#elif @nop@ == 3 && !@complex@
1048

1049
static void
1050
@name@_sum_of_products_contig_three(int nop, char **dataptr,
1051
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
1052
{
1053
    @type@ *data0 = (@type@ *)dataptr[0];
1054
    @type@ *data1 = (@type@ *)dataptr[1];
1055
    @type@ *data2 = (@type@ *)dataptr[2];
1056
    @type@ *data_out = (@type@ *)dataptr[3];
1057 1

1058 1
    /* Unroll the loop by 8 */
1059
    while (count >= 8) {
1060
        count -= 8;
1061

1062
/**begin repeat2
1063
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1064 1
 */
1065 1
        data_out[@i@] = @to@(@from@(data0[@i@]) *
1066 1
                             @from@(data1[@i@]) *
1067 1
                             @from@(data2[@i@]) +
1068 1
                             @from@(data_out[@i@]));
1069 1
/**end repeat2**/
1070 1
        data0 += 8;
1071 1
        data1 += 8;
1072 1
        data2 += 8;
1073 1
        data_out += 8;
1074
    }
1075

1076
    /* Finish off the loop */
1077

1078
/**begin repeat2
1079
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1080
 */
1081 1
    if (count-- == 0) {
1082
        return;
1083
    }
1084 1
    data_out[@i@] = @to@(@from@(data0[@i@]) *
1085 1
                         @from@(data1[@i@]) *
1086 1
                         @from@(data2[@i@]) +
1087 1
                         @from@(data_out[@i@]));
1088
/**end repeat2**/
1089
}
1090

1091
#else /* @nop@ > 3 || @complex */
1092

1093
static void
1094
@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
1095
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
1096
{
1097
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
1098
                                                    (int)count);
1099

1100
    while (count--) {
1101
#if !@complex@
1102
        @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
1103
        int i;
1104
        for (i = 1; i < nop; ++i) {
1105
            temp *= @from@(*(@type@ *)dataptr[i]);
1106
        }
1107
        *(@type@ *)dataptr[nop] = @to@(temp +
1108
                                           @from@(*(@type@ *)dataptr[i]));
1109
        for (i = 0; i <= nop; ++i) {
1110
            dataptr[i] += sizeof(@type@);
1111
        }
1112
#else /* complex */
1113
#  if @nop@ <= 3
1114
#    define _SUMPROD_NOP @nop@
1115
#  else
1116
#    define _SUMPROD_NOP nop
1117
#  endif
1118
        @temptype@ re, im, tmp;
1119
        int i;
1120
        re = ((@temptype@ *)dataptr[0])[0];
1121
        im = ((@temptype@ *)dataptr[0])[1];
1122
        for (i = 1; i < _SUMPROD_NOP; ++i) {
1123
            tmp = re * ((@temptype@ *)dataptr[i])[0] -
1124
                  im * ((@temptype@ *)dataptr[i])[1];
1125
            im = re * ((@temptype@ *)dataptr[i])[1] +
1126
                 im * ((@temptype@ *)dataptr[i])[0];
1127
            re = tmp;
1128
        }
1129
        ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
1130
                                     ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
1131
        ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
1132
                                     ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
1133

1134
        for (i = 0; i <= _SUMPROD_NOP; ++i) {
1135
            dataptr[i] += sizeof(@type@);
1136
        }
1137
#  undef _SUMPROD_NOP
1138
#endif
1139
    }
1140
}
1141

1142
#endif /* functions for various @nop@ */
1143

1144
#if @nop@ == 1
1145

1146
static void
1147
@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
1148
                                npy_intp const *strides, npy_intp count)
1149
{
1150
#if @complex@
1151
    @temptype@ accum_re = 0, accum_im = 0;
1152
    @temptype@ *data0 = (@temptype@ *)dataptr[0];
1153
#else
1154
    @temptype@ accum = 0;
1155
    @type@ *data0 = (@type@ *)dataptr[0];
1156
#endif
1157

1158
#if EINSUM_USE_SSE1 && @float32@
1159
    __m128 a, accum_sse = _mm_setzero_ps();
1160
#elif EINSUM_USE_SSE2 && @float64@
1161
    __m128d a, accum_sse = _mm_setzero_pd();
1162
#endif
1163

1164

1165
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
1166
                                                    (int)count);
1167

1168
/* This is placed before the main loop to make small counts faster */
1169
finish_after_unrolled_loop:
1170
    switch (count) {
1171
/**begin repeat2
1172
 * #i = 6, 5, 4, 3, 2, 1, 0#
1173
 */
1174 1
        case @i@+1:
1175
#if !@complex@
1176 1
            accum += @from@(data0[@i@]);
1177
#else /* complex */
1178 1
            accum_re += data0[2*@i@+0];
1179 1
            accum_im += data0[2*@i@+1];
1180
#endif
1181
/**end repeat2**/
1182 1
        case 0:
1183
#if @complex@
1184 1
            ((@temptype@ *)dataptr[1])[0] += accum_re;
1185 1
            ((@temptype@ *)dataptr[1])[1] += accum_im;
1186
#else
1187 1
            *((@type@ *)dataptr[1]) = @to@(accum +
1188 1
                                    @from@(*((@type@ *)dataptr[1])));
1189
#endif
1190 1
            return;
1191
    }
1192

1193
#if EINSUM_USE_SSE1 && @float32@
1194
    /* Use aligned instructions if possible */
1195 1
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
1196
        /* Unroll the loop by 8 */
1197 1
        while (count >= 8) {
1198 1
            count -= 8;
1199

1200 1
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
1201

1202
/**begin repeat2
1203
 * #i = 0, 4#
1204
 */
1205
            /*
1206
             * NOTE: This accumulation changes the order, so will likely
1207
             *       produce slightly different results.
1208
             */
1209 1
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
1210
/**end repeat2**/
1211 1
            data0 += 8;
1212
        }
1213

1214
        /* Add the four SSE values and put in accum */
1215 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
1216 1
        accum_sse = _mm_add_ps(a, accum_sse);
1217 1
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
1218 1
        accum_sse = _mm_add_ps(a, accum_sse);
1219
        _mm_store_ss(&accum, accum_sse);
1220

1221
        /* Finish off the loop */
1222
        goto finish_after_unrolled_loop;
1223
    }
1224
#elif EINSUM_USE_SSE2 && @float64@
1225
    /* Use aligned instructions if possible */
1226
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
1227
        /* Unroll the loop by 8 */
1228
        while (count >= 8) {
1229
            count -= 8;
1230

1231 1
            _mm_prefetch(data0 + 512, _MM_HINT_T0);
1232

1233 1
/**begin repeat2
1234 1
 * #i = 0, 2, 4, 6#
1235
 */
1236 1
            /*
1237
             * NOTE: This accumulation changes the order, so will likely
1238
             *       produce slightly different results.
1239
             */
1240 1
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
1241
/**end repeat2**/
1242 1
            data0 += 8;
1243
        }
1244

1245
        /* Add the two SSE2 values and put in accum */
1246 1
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
1247 1
        accum_sse = _mm_add_pd(a, accum_sse);
1248
        _mm_store_sd(&accum, accum_sse);
1249

1250
        /* Finish off the loop */
1251
        goto finish_after_unrolled_loop;
1252
    }
1253
#endif
1254

1255
    /* Unroll the loop by 8 */
1256 1
    while (count >= 8) {
1257 1
        count -= 8;
1258

1259
#if EINSUM_USE_SSE1 && @float32@
1260
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
1261

1262
/**begin repeat2
1263
 * #i = 0, 4#
1264
 */
1265
        /*
1266
         * NOTE: This accumulation changes the order, so will likely
1267
         *       produce slightly different results.
1268
         */
1269 1
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
1270
/**end repeat2**/
1271
#elif EINSUM_USE_SSE2 && @float64@
1272
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
1273

1274
/**begin repeat2
1275 1
 * #i = 0, 2, 4, 6#
1276 1
 */
1277 1
        /*
1278
         * NOTE: This accumulation changes the order, so will likely
1279 1
         *       produce slightly different results.
1280 1
         */
1281 1
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
1282
/**end repeat2**/
1283
#else
1284
/**begin repeat2
1285
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1286
 */
1287
#  if !@complex@
1288 1
        accum += @from@(data0[@i@]);
1289
#  else /* complex */
1290 1
        accum_re += data0[2*@i@+0];
1291 1
        accum_im += data0[2*@i@+1];
1292
#  endif
1293
/**end repeat2**/
1294
#endif
1295

1296
#if !@complex@
1297 1
        data0 += 8;
1298
#else
1299 1
        data0 += 8*2;
1300
#endif
1301
    }
1302

1303
#if EINSUM_USE_SSE1 && @float32@
1304
    /* Add the four SSE values and put in accum */
1305
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
1306
    accum_sse = _mm_add_ps(a, accum_sse);
1307
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
1308
    accum_sse = _mm_add_ps(a, accum_sse);
1309
    _mm_store_ss(&accum, accum_sse);
1310
#elif EINSUM_USE_SSE2 && @float64@
1311
    /* Add the two SSE2 values and put in accum */
1312
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
1313
    accum_sse = _mm_add_pd(a, accum_sse);
1314
    _mm_store_sd(&accum, accum_sse);
1315
#endif
1316

1317
    /* Finish off the loop */
1318
    goto finish_after_unrolled_loop;
1319
}
1320

1321
#endif /* @nop@ == 1 */
1322

1323
static void
1324 1
@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
1325
                                npy_intp const *strides, npy_intp count)
1326
{
1327
#if @complex@
1328 1
    @temptype@ accum_re = 0, accum_im = 0;
1329
#else
1330 1
    @temptype@ accum = 0;
1331
#endif
1332

1333
#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
1334 1
    char *data0 = dataptr[0];
1335 1
    npy_intp stride0 = strides[0];
1336
#endif
1337
#if (@nop@ == 2 || @nop@ == 3) && !@complex@
1338
    char *data1 = dataptr[1];
1339
    npy_intp stride1 = strides[1];
1340
#endif
1341
#if (@nop@ == 3) && !@complex@
1342
    char *data2 = dataptr[2];
1343
    npy_intp stride2 = strides[2];
1344
#endif
1345

1346
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
1347
                                                    (int)count);
1348

1349 1
    while (count--) {
1350
#if !@complex@
1351 1
#  if @nop@ == 1
1352 1
        accum += @from@(*(@type@ *)data0);
1353 1
        data0 += stride0;
1354
#  elif @nop@ == 2
1355
        accum += @from@(*(@type@ *)data0) *
1356
                 @from@(*(@type@ *)data1);
1357
        data0 += stride0;
1358
        data1 += stride1;
1359
#  elif @nop@ == 3
1360
        accum += @from@(*(@type@ *)data0) *
1361
                 @from@(*(@type@ *)data1) *
1362
                 @from@(*(@type@ *)data2);
1363
        data0 += stride0;
1364
        data1 += stride1;
1365
        data2 += stride2;
1366 1
#  else
1367 1
        @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
1368
        int i;
1369
        for (i = 1; i < nop; ++i) {
1370 1
            temp *= @from@(*(@type@ *)dataptr[i]);
1371
        }
1372
        accum += temp;
1373
        for (i = 0; i < nop; ++i) {
1374
            dataptr[i] += strides[i];
1375
        }
1376
#  endif
1377
#else /* complex */
1378 1
#  if @nop@ == 1
1379 1
        accum_re += ((@temptype@ *)data0)[0];
1380 1
        accum_im += ((@temptype@ *)data0)[1];
1381 1
        data0 += stride0;
1382
#  else
1383
#    if @nop@ <= 3
1384 1
#define _SUMPROD_NOP @nop@
1385
#    else
1386
#define _SUMPROD_NOP nop
1387
#    endif
1388 1
        @temptype@ re, im, tmp;
1389 1
        int i;
1390
        re = ((@temptype@ *)dataptr[0])[0];
1391
        im = ((@temptype@ *)dataptr[0])[1];
1392
        for (i = 1; i < _SUMPROD_NOP; ++i) {
1393
            tmp = re * ((@temptype@ *)dataptr[i])[0] -
1394
                  im * ((@temptype@ *)dataptr[i])[1];
1395
            im = re * ((@temptype@ *)dataptr[i])[1] +
1396
                 im * ((@temptype@ *)dataptr[i])[0];
1397 1
            re = tmp;
1398
        }
1399
        accum_re += re;
1400
        accum_im += im;
1401
        for (i = 0; i < _SUMPROD_NOP; ++i) {
1402
            dataptr[i] += strides[i];
1403 1
        }
1404
#undef _SUMPROD_NOP
1405
#  endif
1406 1
#endif
1407 1
    }
1408 1

1409
#if @complex@
1410
#  if @nop@ <= 3
1411 1
    ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
1412 1
    ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
1413
#  else
1414
    ((@temptype@ *)dataptr[nop])[0] += accum_re;
1415
    ((@temptype@ *)dataptr[nop])[1] += accum_im;
1416
#  endif
1417
#else
1418
#  if @nop@ <= 3
1419 1
    *((@type@ *)dataptr[@nop@]) = @to@(accum +
1420 1
                                    @from@(*((@type@ *)dataptr[@nop@])));
1421
#  else
1422 1
    *((@type@ *)dataptr[nop]) = @to@(accum +
1423
                                    @from@(*((@type@ *)dataptr[nop])));
1424
#  endif
1425 1
#endif
1426 1

1427 1
}
1428

1429
/**end repeat1**/
1430

1431
/**end repeat**/
1432

1433

1434
/* Do OR of ANDs for the boolean type */
1435

1436
/**begin repeat
1437
 * #nop = 1, 2, 3, 1000#
1438
 * #noplabel = one, two, three, any#
1439
 */
1440

1441
static void
1442 0
bool_sum_of_products_@noplabel@(int nop, char **dataptr,
1443
                                npy_intp const *strides, npy_intp count)
1444
{
1445
#if (@nop@ <= 3)
1446 0
    char *data0 = dataptr[0];
1447 0
    npy_intp stride0 = strides[0];
1448
#endif
1449
#if (@nop@ == 2 || @nop@ == 3)
1450 0
    char *data1 = dataptr[1];
1451 0
    npy_intp stride1 = strides[1];
1452
#endif
1453
#if (@nop@ == 3)
1454 0
    char *data2 = dataptr[2];
1455 0
    npy_intp stride2 = strides[2];
1456
#endif
1457
#if (@nop@ <= 3)
1458 0
    char *data_out = dataptr[@nop@];
1459 0
    npy_intp stride_out = strides[@nop@];
1460
#endif
1461

1462 0
    while (count--) {
1463
#if @nop@ == 1
1464 0
        *(npy_bool *)data_out = *(npy_bool *)data0 ||
1465 0
                                  *(npy_bool *)data_out;
1466 0
        data0 += stride0;
1467 0
        data_out += stride_out;
1468
#elif @nop@ == 2
1469 0
        *(npy_bool *)data_out = (*(npy_bool *)data0 &&
1470 0
                                   *(npy_bool *)data1) ||
1471 0
                                   *(npy_bool *)data_out;
1472 0
        data0 += stride0;
1473 1
        data1 += stride1;
1474 1
        data_out += stride_out;
1475
#elif @nop@ == 3
1476 0
        *(npy_bool *)data_out = (*(npy_bool *)data0 &&
1477 0
                                   *(npy_bool *)data1 &&
1478 1
                                   *(npy_bool *)data2) ||
1479 0
                                   *(npy_bool *)data_out;
1480 0
        data0 += stride0;
1481 1
        data1 += stride1;
1482 0
        data2 += stride2;
1483 0
        data_out += stride_out;
1484 1
#else
1485 0
        npy_bool temp = *(npy_bool *)dataptr[0];
1486
        int i;
1487 0
        for (i = 1; i < nop; ++i) {
1488 1
            temp = temp && *(npy_bool *)dataptr[i];
1489 1
        }
1490 0
        *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
1491 0
        for (i = 0; i <= nop; ++i) {
1492 1
            dataptr[i] += strides[i];
1493 1
        }
1494
#endif
1495
    }
1496 1
}
1497 1

1498
static void
1499 1
bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
1500 1
                                npy_intp const *strides, npy_intp count)
1501
{
1502
#if (@nop@ <= 3)
1503 1
    char *data0 = dataptr[0];
1504
#endif
1505
#if (@nop@ == 2 || @nop@ == 3)
1506 1
    char *data1 = dataptr[1];
1507
#endif
1508
#if (@nop@ == 3)
1509 1
    char *data2 = dataptr[2];
1510
#endif
1511
#if (@nop@ <= 3)
1512 1
    char *data_out = dataptr[@nop@];
1513
#endif
1514 1

1515 1
#if (@nop@ <= 3)
1516 1
/* This is placed before the main loop to make small counts faster */
1517 1
finish_after_unrolled_loop:
1518 1
    switch (count) {
1519 1
/**begin repeat1
1520
 * #i = 6, 5, 4, 3, 2, 1, 0#
1521
 */
1522 1
        case @i@+1:
1523
#  if @nop@ == 1
1524 0
            ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
1525 0
                                            ((npy_bool *)data_out)[@i@];
1526
#  elif @nop@ == 2
1527 1
            ((npy_bool *)data_out)[@i@] =
1528 1
                            (((npy_bool *)data0)[@i@] &&
1529 1
                             ((npy_bool *)data1)[@i@]) ||
1530 0
                                ((npy_bool *)data_out)[@i@];
1531
#  elif @nop@ == 3
1532 0
            ((npy_bool *)data_out)[@i@] =
1533 0
                           (((npy_bool *)data0)[@i@] &&
1534 0
                            ((npy_bool *)data1)[@i@] &&
1535 0
                            ((npy_bool *)data2)[@i@]) ||
1536 0
                                ((npy_bool *)data_out)[@i@];
1537
#  endif
1538
/**end repeat1**/
1539
        case 0:
1540 1
            return;
1541
    }
1542
#endif
1543

1544
/* Unroll the loop by 8 for fixed-size nop */
1545
#if (@nop@ <= 3)
1546 1
    while (count >= 8) {
1547 1
        count -= 8;
1548
#else
1549
    while (count--) {
1550
#endif
1551

1552
#  if @nop@ == 1
1553
/**begin repeat1
1554
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1555
 */
1556 0
        *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
1557 0
                                        (*((npy_bool *)data_out + @i@));
1558
/**end repeat1**/
1559 0
        data0 += 8*sizeof(npy_bool);
1560 0
        data_out += 8*sizeof(npy_bool);
1561
#  elif @nop@ == 2
1562
/**begin repeat1
1563
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1564
 */
1565 0
        *((npy_bool *)data_out + @i@) =
1566 0
                        ((*((npy_bool *)data0 + @i@)) &&
1567 0
                         (*((npy_bool *)data1 + @i@))) ||
1568 0
                            (*((npy_bool *)data_out + @i@));
1569
/**end repeat1**/
1570 0
        data0 += 8*sizeof(npy_bool);
1571 0
        data1 += 8*sizeof(npy_bool);
1572 0
        data_out += 8*sizeof(npy_bool);
1573 1
#  elif @nop@ == 3
1574 1
/**begin repeat1
1575
 * #i = 0, 1, 2, 3, 4, 5, 6, 7#
1576
 */
1577 1
        *((npy_bool *)data_out + @i@) =
1578 1
                       ((*((npy_bool *)data0 + @i@)) &&
1579 1
                        (*((npy_bool *)data1 + @i@)) &&
1580 1
                        (*((npy_bool *)data2 + @i@))) ||
1581 1
                            (*((npy_bool *)data_out + @i@));
1582
/**end repeat1**/
1583 1
        data0 += 8*sizeof(npy_bool);
1584 1
        data1 += 8*sizeof(npy_bool);
1585 1
        data2 += 8*sizeof(npy_bool);
1586 1
        data_out += 8*sizeof(npy_bool);
1587
#  else
1588
        npy_bool temp = *(npy_bool *)dataptr[0];
1589 1
        int i;
1590 1
        for (i = 1; i < nop; ++i) {
1591
            temp = temp && *(npy_bool *)dataptr[i];
1592
        }
1593 1
        *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
1594 1
        for (i = 0; i <= nop; ++i) {
1595
            dataptr[i] += sizeof(npy_bool);
1596
        }
1597
#  endif
1598 1
    }
1599

1600
    /* If the loop was unrolled, we need to finish it off */
1601
#if (@nop@ <= 3)
1602
    goto finish_after_unrolled_loop;
1603
#endif
1604 1
}
1605

1606
static void
1607 0
bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
1608 1
                                npy_intp const *strides, npy_intp count)
1609 1
{
1610 1
    npy_bool accum = 0;
1611 1

1612 1
#if (@nop@ <= 3)
1613 1
    char *data0 = dataptr[0];
1614 0
    npy_intp stride0 = strides[0];
1615
#endif
1616
#if (@nop@ == 2 || @nop@ == 3)
1617 1
    char *data1 = dataptr[1];
1618 0
    npy_intp stride1 = strides[1];
1619
#endif
1620
#if (@nop@ == 3)
1621 0
    char *data2 = dataptr[2];
1622 0
    npy_intp stride2 = strides[2];
1623 1
#endif
1624

1625 0
    while (count--) {
1626
#if @nop@ == 1
1627 1
        accum = *(npy_bool *)data0 || accum;
1628 1
        data0 += stride0;
1629 1
#elif @nop@ == 2
1630 1
        accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
1631 1
        data0 += stride0;
1632 1
        data1 += stride1;
1633
#elif @nop@ == 3
1634 0
        accum = (*(npy_bool *)data0 &&
1635 0
                 *(npy_bool *)data1 &&
1636 0
                 *(npy_bool *)data2) || accum;
1637 0
        data0 += stride0;
1638 0
        data1 += stride1;
1639 0
        data2 += stride2;
1640
#else
1641
        npy_bool temp = *(npy_bool *)dataptr[0];
1642 1
        int i;
1643
        for (i = 1; i < nop; ++i) {
1644
            temp = temp && *(npy_bool *)dataptr[i];
1645
        }
1646
        accum = temp || accum;
1647
        for (i = 0; i <= nop; ++i) {
1648 1
            dataptr[i] += strides[i];
1649 1
        }
1650 1
#endif
1651 1
    }
1652

1653
#  if @nop@ <= 3
1654 0
    *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
1655 0
#  else
1656
    *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
1657 0
#  endif
1658 0
}
1659

1660 0
/**end repeat**/
1661 0

1662
/* These tables need to match up with the type enum */
1663
static sum_of_products_fn
1664 0
_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
1665 0
/**begin repeat
1666
 * #name = bool,
1667
 *         byte, ubyte,
1668
 *         short, ushort,
1669
 *         int, uint,
1670
 *         long, ulong,
1671
 *         longlong, ulonglong,
1672 0
 *         float, double, longdouble,
1673
 *         cfloat, cdouble, clongdouble,
1674 1
 *         object, string, unicode, void,
1675 1
 *         datetime, timedelta, half#
1676
 * #use = 0,
1677 0
 *        1, 1,
1678 0
 *        1, 1,
1679 0
 *        1, 1,
1680
 *        1, 1,
1681
 *        1, 1,
1682 1
 *        1, 1, 1,
1683
 *        1, 1, 1,
1684
 *        0, 0, 0, 0,
1685
 *        0, 0, 1#
1686
 */
1687
#if @use@
1688
    &@name@_sum_of_products_contig_outstride0_one,
1689
#else
1690
    NULL,
1691
#endif
1692
/**end repeat**/
1693 1
}; /* End of _contig_outstride0_unary_specialization_table */
1694 1

1695
static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
1696
/**begin repeat
1697 0
 * #name = bool,
1698
 *         byte, ubyte,
1699
 *         short, ushort,
1700 0
 *         int, uint,
1701 1
 *         long, ulong,
1702
 *         longlong, ulonglong,
1703
 *         float, double, longdouble,
1704
 *         cfloat, cdouble, clongdouble,
1705 0
 *         object, string, unicode, void,
1706
 *         datetime, timedelta, half#
1707
 * #use = 0,
1708
 *        1, 1,
1709
 *        1, 1,
1710
 *        1, 1,
1711
 *        1, 1,
1712 1
 *        1, 1,
1713 1
 *        1, 1, 1,
1714 0
 *        0, 0, 0,
1715 0
 *        0, 0, 0, 0,
1716
 *        0, 0, 1#
1717
 */
1718
#if @use@
1719
{
1720 1
    &@name@_sum_of_products_stride0_contig_outstride0_two,
1721
    &@name@_sum_of_products_stride0_contig_outcontig_two,
1722
    &@name@_sum_of_products_contig_stride0_outstride0_two,
1723
    &@name@_sum_of_products_contig_stride0_outcontig_two,
1724
    &@name@_sum_of_products_contig_contig_outstride0_two,
1725
},
1726
#else
1727
    {NULL, NULL, NULL, NULL, NULL},
1728
#endif
1729
/**end repeat**/
1730
}; /* End of _binary_specialization_table */
1731

1732
static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
1733
/**begin repeat
1734
 * #name = bool,
1735
 *         byte, ubyte,
1736
 *         short, ushort,
1737
 *         int, uint,
1738
 *         long, ulong,
1739
 *         longlong, ulonglong,
1740
 *         float, double, longdouble,
1741 0
 *         cfloat, cdouble, clongdouble,
1742
 *         object, string, unicode, void,
1743
 *         datetime, timedelta, half#
1744
 * #use = 1,
1745 0
 *        1, 1,
1746
 *        1, 1,
1747
 *        1, 1,
1748
 *        1, 1,
1749
 *        1, 1,
1750
 *        1, 1, 1,
1751
 *        1, 1, 1,
1752
 *        0, 0, 0, 0,
1753
 *        0, 0, 1#
1754
 */
1755
#if @use@
1756
{
1757
    &@name@_sum_of_products_outstride0_any,
1758
    &@name@_sum_of_products_outstride0_one,
1759
    &@name@_sum_of_products_outstride0_two,
1760
    &@name@_sum_of_products_outstride0_three
1761
},
1762
#else
1763
    {NULL, NULL, NULL, NULL},
1764
#endif
1765
/**end repeat**/
1766
}; /* End of _outstride0_specialized_table */
1767

1768
static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
1769
/**begin repeat
1770
 * #name = bool,
1771
 *         byte, ubyte,
1772
 *         short, ushort,
1773
 *         int, uint,
1774
 *         long, ulong,
1775
 *         longlong, ulonglong,
1776
 *         float, double, longdouble,
1777
 *         cfloat, cdouble, clongdouble,
1778
 *         object, string, unicode, void,
1779
 *         datetime, timedelta, half#
1780
 * #use = 1,
1781
 *        1, 1,
1782
 *        1, 1,
1783
 *        1, 1,
1784
 *        1, 1,
1785
 *        1, 1,
1786
 *        1, 1, 1,
1787
 *        1, 1, 1,
1788
 *        0, 0, 0, 0,
1789
 *        0, 0, 1#
1790
 */
1791
#if @use@
1792
{
1793 1
    &@name@_sum_of_products_contig_any,
1794
    &@name@_sum_of_products_contig_one,
1795
    &@name@_sum_of_products_contig_two,
1796 1
    &@name@_sum_of_products_contig_three
1797 1
},
1798 1
#else
1799 1
    {NULL, NULL, NULL, NULL},
1800
#endif
1801
/**end repeat**/
1802 1
}; /* End of _allcontig_specialized_table */
1803 1

1804
static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
1805
/**begin repeat
1806
 * #name = bool,
1807 0
 *         byte, ubyte,
1808
 *         short, ushort,
1809 0
 *         int, uint,
1810 0
 *         long, ulong,
1811
 *         longlong, ulonglong,
1812 0
 *         float, double, longdouble,
1813 0
 *         cfloat, cdouble, clongdouble,
1814 0
 *         object, string, unicode, void,
1815
 *         datetime, timedelta, half#
1816
 * #use = 1,
1817
 *        1, 1,
1818
 *        1, 1,
1819
 *        1, 1,
1820 1
 *        1, 1,
1821
 *        1, 1,
1822
 *        1, 1, 1,
1823 0
 *        1, 1, 1,
1824 1
 *        0, 0, 0, 0,
1825 1
 *        0, 0, 1#
1826 0
 */
1827 1
#if @use@
1828 1
{
1829 0
    &@name@_sum_of_products_any,
1830
    &@name@_sum_of_products_one,
1831
    &@name@_sum_of_products_two,
1832 1
    &@name@_sum_of_products_three
1833
},
1834 1
#else
1835
    {NULL, NULL, NULL, NULL},
1836
#endif
1837
/**end repeat**/
1838
}; /* End of _unnspecialized_table */
1839

1840
NPY_VISIBILITY_HIDDEN sum_of_products_fn
1841 1
get_sum_of_products_function(int nop, int type_num,
1842 1
                             npy_intp itemsize, npy_intp const *fixed_strides)
1843 1
{
1844 0
    int iop;
1845

1846 1
    if (type_num >= NPY_NTYPES) {
1847
        return NULL;
1848
    }
1849

1850
    /* contiguous reduction */
1851 1
    if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
1852 1
        sum_of_products_fn ret =
1853
            _contig_outstride0_unary_specialization_table[type_num];
1854 1
        if (ret != NULL) {
1855
            return ret;
1856
        }
1857
    }
1858

1859
    /* nop of 2 has more specializations */
1860 1
    if (nop == 2) {
1861
        /* Encode the zero/contiguous strides */
1862 0
        int code;
1863 1
        code = (fixed_strides[0] == 0) ? 0 :
1864 1
                    (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
1865 1
        code += (fixed_strides[1] == 0) ? 0 :
1866 1
                    (fixed_strides[1] == itemsize) ? 2*1 : 8;
1867 1
        code += (fixed_strides[2] == 0) ? 0 :
1868 1
                    (fixed_strides[2] == itemsize) ? 1 : 8;
1869 1
        if (code >= 2 && code < 7) {
1870 1
            sum_of_products_fn ret =
1871
                        _binary_specialization_table[type_num][code-2];
1872 1
            if (ret != NULL) {
1873
                return ret;
1874
            }
1875 0
        }
1876
    }
1877 0

1878
    /* Inner loop with an output stride of 0 */
1879 1
    if (fixed_strides[nop] == 0) {
1880 1
        return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
1881
    }
1882

1883
    /* Check for all contiguous */
1884 1
    for (iop = 0; iop < nop + 1; ++iop) {
1885 1
        if (fixed_strides[iop] != itemsize) {
1886
            break;
1887
        }
1888
    }
1889

1890
    /* Contiguous loop */
1891 1
    if (iop == nop + 1) {
1892 1
        return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
1893
    }
1894

1895
    /* None of the above specializations caught it, general loops */
1896 1
    return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
1897
}

Read our documentation on viewing source code .

Loading