1
/*
2
 * This file is part of pocketfft.
3
 * Licensed under a 3-clause BSD style license - see LICENSE.md
4
 */
5

6
/*
7
 *  Main implementation file.
8
 *
9
 *  Copyright (C) 2004-2018 Max-Planck-Society
10
 *  \author Martin Reinecke
11
 */
12

13
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
14

15
#include "Python.h"
16
#include "numpy/arrayobject.h"
17

18
#include <math.h>
19
#include <string.h>
20
#include <stdlib.h>
21

22
#include "npy_config.h"
23
#define restrict NPY_RESTRICT
24

25
#define RALLOC(type,num) \
26
  ((type *)malloc((num)*sizeof(type)))
27
#define DEALLOC(ptr) \
28
  do { free(ptr); (ptr)=NULL; } while(0)
29

30
#define SWAP(a,b,type) \
31
  do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
32

33
#ifdef __GNUC__
34
#define NOINLINE __attribute__((noinline))
35
#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
36
#else
37
#define NOINLINE
38
#define WARN_UNUSED_RESULT
39
#endif
40

41
struct cfft_plan_i;
42
typedef struct cfft_plan_i * cfft_plan;
43
struct rfft_plan_i;
44
typedef struct rfft_plan_i * rfft_plan;
45

46
// adapted from https://stackoverflow.com/questions/42792939/
47
// CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
48 1
static void my_sincosm1pi (double a, double *restrict res)
49
  {
50 1
  double s = a * a;
51
  /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
52 1
  double r =     -1.0369917389758117e-4;
53 1
  r = fma (r, s,  1.9294935641298806e-3);
54 1
  r = fma (r, s, -2.5806887942825395e-2);
55 1
  r = fma (r, s,  2.3533063028328211e-1);
56 1
  r = fma (r, s, -1.3352627688538006e+0);
57 1
  r = fma (r, s,  4.0587121264167623e+0);
58 1
  r = fma (r, s, -4.9348022005446790e+0);
59 1
  double c = r*s;
60
  /* Approximate sin(pi*x) for x in [-0.25,0.25] */
61 1
  r =             4.6151442520157035e-4;
62 1
  r = fma (r, s, -7.3700183130883555e-3);
63 1
  r = fma (r, s,  8.2145868949323936e-2);
64 1
  r = fma (r, s, -5.9926452893214921e-1);
65 1
  r = fma (r, s,  2.5501640398732688e+0);
66 1
  r = fma (r, s, -5.1677127800499516e+0);
67 1
  s = s * a;
68 1
  r = r * s;
69 1
  s = fma (a, 3.1415926535897931e+0, r);
70 1
  res[0] = c;
71 1
  res[1] = s;
72
  }
73

74 1
NOINLINE static void calc_first_octant(size_t den, double * restrict res)
75
  {
76 1
  size_t n = (den+4)>>3;
77 1
  if (n==0) return;
78 1
  res[0]=1.; res[1]=0.;
79 1
  if (n==1) return;
80 1
  size_t l1=(size_t)sqrt(n);
81 1
  for (size_t i=1; i<l1; ++i)
82 1
    my_sincosm1pi((2.*i)/den,&res[2*i]);
83
  size_t start=l1;
84 1
  while(start<n)
85
    {
86
    double cs[2];
87 1
    my_sincosm1pi((2.*start)/den,cs);
88 1
    res[2*start] = cs[0]+1.;
89 1
    res[2*start+1] = cs[1];
90 1
    size_t end = l1;
91 1
    if (start+end>n) end = n-start;
92 1
    for (size_t i=1; i<end; ++i)
93
      {
94 1
      double csx[2]={res[2*i], res[2*i+1]};
95 1
      res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
96 1
      res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
97
      }
98 1
    start += l1;
99
    }
100 1
  for (size_t i=1; i<l1; ++i)
101 1
    res[2*i] += 1.;
102
  }
103

104 1
NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
105
  {
106 1
  double * restrict p = res+n;
107 1
  calc_first_octant(n<<1, p);
108 1
  size_t ndone=(n+2)>>2;
109 1
  size_t i=0, idx1=0, idx2=2*ndone-2;
110 1
  for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
111
    {
112 1
    res[idx1]   = p[2*i];
113 1
    res[idx1+1] = p[2*i+1];
114 1
    res[idx2]   = p[2*i+3];
115 1
    res[idx2+1] = p[2*i+2];
116
    }
117 1
  if (i!=ndone)
118
    {
119 1
    res[idx1  ] = p[2*i];
120 1
    res[idx1+1] = p[2*i+1];
121
    }
122
  }
123

124 1
NOINLINE static void calc_first_half(size_t n, double * restrict res)
125
  {
126 1
  int ndone=(n+1)>>1;
127 1
  double * p = res+n-1;
128 1
  calc_first_octant(n<<2, p);
129 1
  int i4=0, in=n, i=0;
130 1
  for (; i4<=in-i4; ++i, i4+=4) // octant 0
131
    {
132 1
    res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
133
    }
134 1
  for (; i4-in <= 0; ++i, i4+=4) // octant 1
135
    {
136 1
    int xm = in-i4;
137 1
    res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
138
    }
139 1
  for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
140
    {
141 1
    int xm = i4-in;
142 1
    res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
143
    }
144 1
  for (; i<ndone; ++i, i4+=4) // octant 3
145
    {
146 1
    int xm = 2*in-i4;
147 1
    res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
148
    }
149
  }
150

151 1
NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
152
  {
153 1
  const double hsqt2 = 0.707106781186547524400844362104849;
154 1
  size_t quart = n>>2;
155 1
  if ((n&7)==0)
156 1
    res[quart] = res[quart+1] = hsqt2;
157 1
  for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
158
    {
159 1
    res[j  ] = res[i+1];
160 1
    res[j+1] = res[i  ];
161
    }
162
  }
163

164 1
NOINLINE static void fill_first_half(size_t n, double * restrict res)
165
  {
166 1
  size_t half = n>>1;
167 1
  if ((n&3)==0)
168 1
    for (size_t i=0; i<half; i+=2)
169
      {
170 1
      res[i+half]   = -res[i+1];
171 1
      res[i+half+1] =  res[i  ];
172
      }
173
  else
174 1
    for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
175
      {
176 1
      res[j  ] = -res[i  ];
177 1
      res[j+1] =  res[i+1];
178
      }
179
  }
180

181 1
NOINLINE static void fill_second_half(size_t n, double * restrict res)
182
  {
183 1
  if ((n&1)==0)
184 1
    for (size_t i=0; i<n; ++i)
185 1
      res[i+n] = -res[i];
186
  else
187 1
    for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
188
      {
189 1
      res[j  ] =  res[i  ];
190 1
      res[j+1] = -res[i+1];
191
      }
192
  }
193

194 1
NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
195
  {
196 1
  if ((n&3)==0)
197
    {
198 1
    calc_first_octant(n, res);
199 1
    fill_first_quadrant(n, res);
200 1
    fill_first_half(n, res);
201
    }
202 1
  else if ((n&1)==0)
203
    {
204 1
    calc_first_quadrant(n, res);
205 1
    fill_first_half(n, res);
206
    }
207
  else
208 1
    calc_first_half(n, res);
209
  }
210

211 1
NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
212
  {
213 1
  sincos_2pibyn_half(n, res);
214 1
  fill_second_half(n, res);
215
  }
216

217 1
NOINLINE static size_t largest_prime_factor (size_t n)
218
  {
219 1
  size_t res=1;
220
  size_t tmp;
221 1
  while (((tmp=(n>>1))<<1)==n)
222
    { res=2; n=tmp; }
223

224 1
  size_t limit=(size_t)sqrt(n+0.01);
225 1
  for (size_t x=3; x<=limit; x+=2)
226 1
  while (((tmp=(n/x))*x)==n)
227
    {
228 1
    res=x;
229 1
    n=tmp;
230 1
    limit=(size_t)sqrt(n+0.01);
231
    }
232 1
  if (n>1) res=n;
233

234 1
  return res;
235
  }
236

237 1
NOINLINE static double cost_guess (size_t n)
238
  {
239 1
  const double lfp=1.1; // penalty for non-hardcoded larger factors
240 1
  size_t ni=n;
241 1
  double result=0.;
242
  size_t tmp;
243 1
  while (((tmp=(n>>1))<<1)==n)
244 1
    { result+=2; n=tmp; }
245

246 1
  size_t limit=(size_t)sqrt(n+0.01);
247 1
  for (size_t x=3; x<=limit; x+=2)
248 1
  while ((tmp=(n/x))*x==n)
249
    {
250 1
    result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
251 1
    n=tmp;
252 1
    limit=(size_t)sqrt(n+0.01);
253
    }
254 1
  if (n>1) result+=(n<=5) ? n : lfp*n;
255

256 1
  return result*ni;
257
  }
258

259
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
260 1
NOINLINE static size_t good_size(size_t n)
261
  {
262 1
  if (n<=6) return n;
263

264 1
  size_t bestfac=2*n;
265 1
  for (size_t f2=1; f2<bestfac; f2*=2)
266 1
    for (size_t f23=f2; f23<bestfac; f23*=3)
267 1
      for (size_t f235=f23; f235<bestfac; f235*=5)
268 1
        for (size_t f2357=f235; f2357<bestfac; f2357*=7)
269 1
          for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
270 1
            if (f235711>=n) bestfac=f235711;
271
  return bestfac;
272
  }
273

274
typedef struct cmplx {
275
  double r,i;
276
} cmplx;
277

278
#define NFCT 25
279
typedef struct cfftp_fctdata
280
  {
281
  size_t fct;
282
  cmplx *tw, *tws;
283
  } cfftp_fctdata;
284

285
typedef struct cfftp_plan_i
286
  {
287
  size_t length, nfct;
288
  cmplx *mem;
289
  cfftp_fctdata fct[NFCT];
290
  } cfftp_plan_i;
291
typedef struct cfftp_plan_i * cfftp_plan;
292

293
#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
294
#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
295
#define SCALEC(a,b) { a.r*=b; a.i*=b; }
296
#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
297
#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
298
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
299
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
300
#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
301
/* a = b*c */
302
#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
303
/* a = conj(b)*c*/
304
#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
305

306
#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
307
/* a = b*c */
308
#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
309
/* a *= b */
310
#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
311

312 1
NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
313
  cmplx * restrict ch, const cmplx * restrict wa)
314
  {
315 1
  const size_t cdim=2;
316

317 1
  if (ido==1)
318 1
    for (size_t k=0; k<l1; ++k)
319 1
      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
320
  else
321 1
    for (size_t k=0; k<l1; ++k)
322
      {
323 1
      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
324 1
      for (size_t i=1; i<ido; ++i)
325
        {
326
        cmplx t;
327 1
        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
328 1
        A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
329
        }
330
      }
331
  }
332

333 1
NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
334
  cmplx * restrict ch, const cmplx * restrict wa)
335
  {
336 1
  const size_t cdim=2;
337

338 1
  if (ido==1)
339 1
    for (size_t k=0; k<l1; ++k)
340 1
      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
341
  else
342 1
    for (size_t k=0; k<l1; ++k)
343
      {
344 1
      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
345 1
      for (size_t i=1; i<ido; ++i)
346
        {
347
        cmplx t;
348 1
        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
349 1
        A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
350
        }
351
      }
352
  }
353

354
#define PREP3(idx) \
355
        cmplx t0 = CC(idx,0,k), t1, t2; \
356
        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
357
        CH(idx,k,0).r=t0.r+t1.r; \
358
        CH(idx,k,0).i=t0.i+t1.i;
359
#define PARTSTEP3a(u1,u2,twr,twi) \
360
        { \
361
        cmplx ca,cb; \
362
        ca.r=t0.r+twr*t1.r; \
363
        ca.i=t0.i+twr*t1.i; \
364
        cb.i=twi*t2.r; \
365
        cb.r=-(twi*t2.i); \
366
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
367
        }
368

369
#define PARTSTEP3b(u1,u2,twr,twi) \
370
        { \
371
        cmplx ca,cb,da,db; \
372
        ca.r=t0.r+twr*t1.r; \
373
        ca.i=t0.i+twr*t1.i; \
374
        cb.i=twi*t2.r; \
375
        cb.r=-(twi*t2.i); \
376
        PMC(da,db,ca,cb) \
377
        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
378
        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
379
        }
380 1
NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
381
  cmplx * restrict ch, const cmplx * restrict wa)
382
  {
383 1
  const size_t cdim=3;
384 1
  const double tw1r=-0.5, tw1i= 0.86602540378443864676;
385

386 1
  if (ido==1)
387 1
    for (size_t k=0; k<l1; ++k)
388
      {
389 1
      PREP3(0)
390 1
      PARTSTEP3a(1,2,tw1r,tw1i)
391
      }
392
  else
393 1
    for (size_t k=0; k<l1; ++k)
394
      {
395
      {
396 1
      PREP3(0)
397 1
      PARTSTEP3a(1,2,tw1r,tw1i)
398
      }
399 1
      for (size_t i=1; i<ido; ++i)
400
        {
401 1
        PREP3(i)
402 1
        PARTSTEP3b(1,2,tw1r,tw1i)
403
        }
404
      }
405
  }
406
#define PARTSTEP3f(u1,u2,twr,twi) \
407
        { \
408
        cmplx ca,cb,da,db; \
409
        ca.r=t0.r+twr*t1.r; \
410
        ca.i=t0.i+twr*t1.i; \
411
        cb.i=twi*t2.r; \
412
        cb.r=-(twi*t2.i); \
413
        PMC(da,db,ca,cb) \
414
        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
415
        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
416
        }
417 1
NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
418
  cmplx * restrict ch, const cmplx * restrict wa)
419
  {
420 1
  const size_t cdim=3;
421 1
  const double tw1r=-0.5, tw1i= -0.86602540378443864676;
422

423 1
  if (ido==1)
424 1
    for (size_t k=0; k<l1; ++k)
425
      {
426 1
      PREP3(0)
427 1
      PARTSTEP3a(1,2,tw1r,tw1i)
428
      }
429
  else
430 1
    for (size_t k=0; k<l1; ++k)
431
      {
432
      {
433 1
      PREP3(0)
434 1
      PARTSTEP3a(1,2,tw1r,tw1i)
435
      }
436 1
      for (size_t i=1; i<ido; ++i)
437
        {
438 1
        PREP3(i)
439 1
        PARTSTEP3f(1,2,tw1r,tw1i)
440
        }
441
      }
442
  }
443

444 1
NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
445
  cmplx * restrict ch, const cmplx * restrict wa)
446
  {
447 1
  const size_t cdim=4;
448

449 1
  if (ido==1)
450 1
    for (size_t k=0; k<l1; ++k)
451
      {
452
      cmplx t1, t2, t3, t4;
453 1
      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
454 1
      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
455 1
      ROT90(t4)
456 1
      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
457 1
      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
458
      }
459
  else
460 1
    for (size_t k=0; k<l1; ++k)
461
      {
462
      {
463
      cmplx t1, t2, t3, t4;
464 1
      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
465 1
      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
466 1
      ROT90(t4)
467 1
      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
468 1
      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
469
      }
470 1
      for (size_t i=1; i<ido; ++i)
471
        {
472
        cmplx c2, c3, c4, t1, t2, t3, t4;
473 1
        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
474 1
        PMC(t2,t1,cc0,cc2)
475 1
        PMC(t3,t4,cc1,cc3)
476 1
        ROT90(t4)
477 1
        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
478 1
        PMC(CH(i,k,0),c3,t2,t3)
479 1
        PMC(c2,c4,t1,t4)
480 1
        A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
481 1
        A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
482 1
        A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
483
        }
484
      }
485
  }
486 1
NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
487
  cmplx * restrict ch, const cmplx * restrict wa)
488
  {
489 1
  const size_t cdim=4;
490

491 1
  if (ido==1)
492 1
    for (size_t k=0; k<l1; ++k)
493
      {
494
      cmplx t1, t2, t3, t4;
495 1
      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
496 1
      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
497 1
      ROTM90(t4)
498 1
      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
499 1
      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
500
      }
501
  else
502 1
    for (size_t k=0; k<l1; ++k)
503
      {
504
      {
505
      cmplx t1, t2, t3, t4;
506 1
      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
507 1
      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
508 1
      ROTM90(t4)
509 1
      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
510 1
      PMC (CH(0,k,1),CH(0,k,3),t1,t4)
511
      }
512 1
      for (size_t i=1; i<ido; ++i)
513
        {
514
        cmplx c2, c3, c4, t1, t2, t3, t4;
515 1
        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
516 1
        PMC(t2,t1,cc0,cc2)
517 1
        PMC(t3,t4,cc1,cc3)
518 1
        ROTM90(t4)
519 1
        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
520 1
        PMC(CH(i,k,0),c3,t2,t3)
521 1
        PMC(c2,c4,t1,t4)
522 1
        A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
523 1
        A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
524 1
        A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
525
        }
526
      }
527
  }
528

529
#define PREP5(idx) \
530
        cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
531
        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
532
        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
533
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
534
        CH(idx,k,0).i=t0.i+t1.i+t2.i;
535

536
#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
537
        { \
538
        cmplx ca,cb; \
539
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
540
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
541
        cb.i=twai*t4.r twbi*t3.r; \
542
        cb.r=-(twai*t4.i twbi*t3.i); \
543
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
544
        }
545

546
#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
547
        { \
548
        cmplx ca,cb,da,db; \
549
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
550
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
551
        cb.i=twai*t4.r twbi*t3.r; \
552
        cb.r=-(twai*t4.i twbi*t3.i); \
553
        PMC(da,db,ca,cb) \
554
        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
555
        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
556
        }
557 1
NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
558
  cmplx * restrict ch, const cmplx * restrict wa)
559
  {
560 1
  const size_t cdim=5;
561 1
  const double tw1r= 0.3090169943749474241,
562 1
               tw1i= 0.95105651629515357212,
563 1
               tw2r= -0.8090169943749474241,
564 1
               tw2i= 0.58778525229247312917;
565

566 1
  if (ido==1)
567 1
    for (size_t k=0; k<l1; ++k)
568
      {
569 1
      PREP5(0)
570 1
      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
571 1
      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
572
      }
573
  else
574 1
    for (size_t k=0; k<l1; ++k)
575
      {
576
      {
577 1
      PREP5(0)
578 1
      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
579 1
      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
580
      }
581 1
      for (size_t i=1; i<ido; ++i)
582
        {
583 1
        PREP5(i)
584 1
        PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
585 1
        PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
586
        }
587
      }
588
  }
589
#define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
590
        { \
591
        cmplx ca,cb,da,db; \
592
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
593
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
594
        cb.i=twai*t4.r twbi*t3.r; \
595
        cb.r=-(twai*t4.i twbi*t3.i); \
596
        PMC(da,db,ca,cb) \
597
        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
598
        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
599
        }
600 1
NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
601
  cmplx * restrict ch, const cmplx * restrict wa)
602
  {
603 1
  const size_t cdim=5;
604 1
  const double tw1r= 0.3090169943749474241,
605 1
               tw1i= -0.95105651629515357212,
606 1
               tw2r= -0.8090169943749474241,
607 1
               tw2i= -0.58778525229247312917;
608

609 1
  if (ido==1)
610 1
    for (size_t k=0; k<l1; ++k)
611
      {
612 1
      PREP5(0)
613 1
      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
614 1
      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
615
      }
616
  else
617 1
    for (size_t k=0; k<l1; ++k)
618
      {
619
      {
620 1
      PREP5(0)
621 1
      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
622 1
      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
623
      }
624 1
      for (size_t i=1; i<ido; ++i)
625
        {
626 1
        PREP5(i)
627 1
        PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
628 1
        PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
629
        }
630
      }
631
  }
632

633
#define PREP7(idx) \
634
        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
635
        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
636
        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
637
        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
638
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
639
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
640

641
#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
642
        { \
643
        cmplx ca,cb; \
644
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
645
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
646
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
647
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
648
        PMC(out1,out2,ca,cb) \
649
        }
650
#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
651
        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
652
#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
653
        { \
654
        cmplx da,db; \
655
        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
656
        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
657
        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
658
        }
659

660 1
NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
661
  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
662
  {
663 1
  const size_t cdim=7;
664 1
  const double tw1r= 0.623489801858733530525,
665 1
               tw1i= sign * 0.7818314824680298087084,
666 1
               tw2r= -0.222520933956314404289,
667 1
               tw2i= sign * 0.9749279121818236070181,
668 1
               tw3r= -0.9009688679024191262361,
669 1
               tw3i= sign * 0.4338837391175581204758;
670

671 1
  if (ido==1)
672 1
    for (size_t k=0; k<l1; ++k)
673
      {
674 1
      PREP7(0)
675 1
      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
676 1
      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
677 1
      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
678
      }
679
  else
680 1
    for (size_t k=0; k<l1; ++k)
681
      {
682
      {
683 1
      PREP7(0)
684 1
      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
685 1
      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
686 1
      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
687
      }
688 1
      for (size_t i=1; i<ido; ++i)
689
        {
690 1
        PREP7(i)
691 1
        PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
692 1
        PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
693 1
        PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
694
        }
695
      }
696
  }
697

698
#define PREP11(idx) \
699
        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
700
        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
701
        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
702
        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
703
        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
704
        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
705
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
706
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
707

708
#define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
709
        { \
710
        cmplx ca,cb; \
711
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
712
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
713
        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
714
        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
715
        PMC(out1,out2,ca,cb) \
716
        }
717
#define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
718
        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
719
#define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
720
        { \
721
        cmplx da,db; \
722
        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
723
        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
724
        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
725
        }
726

727 1
NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
728
  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
729
  {
730 1
  const size_t cdim=11;
731 1
  const double tw1r =        0.8412535328311811688618,
732 1
               tw1i = sign * 0.5406408174555975821076,
733 1
               tw2r =        0.4154150130018864255293,
734 1
               tw2i = sign * 0.9096319953545183714117,
735 1
               tw3r =       -0.1423148382732851404438,
736 1
               tw3i = sign * 0.9898214418809327323761,
737 1
               tw4r =       -0.6548607339452850640569,
738 1
               tw4i = sign * 0.755749574354258283774,
739 1
               tw5r =       -0.9594929736144973898904,
740 1
               tw5i = sign * 0.2817325568414296977114;
741

742 1
  if (ido==1)
743 1
    for (size_t k=0; k<l1; ++k)
744
      {
745 1
      PREP11(0)
746 1
      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
747 1
      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
748 1
      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
749 1
      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
750 1
      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
751
      }
752
  else
753 1
    for (size_t k=0; k<l1; ++k)
754
      {
755
      {
756 1
      PREP11(0)
757 1
      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
758 1
      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
759 1
      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
760 1
      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
761 1
      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
762
      }
763 1
      for (size_t i=1; i<ido; ++i)
764
        {
765 1
        PREP11(i)
766 1
        PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
767 1
        PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
768 1
        PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
769 1
        PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
770 1
        PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
771
        }
772
      }
773
  }
774

775
#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
776
#define CX2(a,b) cc[(a)+idl1*(b)]
777
#define CH2(a,b) ch[(a)+idl1*(b)]
778

779 1
NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
780
  cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
781
  const cmplx * restrict csarr, const int sign)
782
  {
783 1
  const size_t cdim=ip;
784 1
  size_t ipph = (ip+1)/2;
785 1
  size_t idl1 = ido*l1;
786

787 1
  cmplx * restrict wal=RALLOC(cmplx,ip);
788 1
  if (!wal) return -1;
789 1
  wal[0]=(cmplx){1.,0.};
790 1
  for (size_t i=1; i<ip; ++i)
791 1
    wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
792

793 1
  for (size_t k=0; k<l1; ++k)
794 1
    for (size_t i=0; i<ido; ++i)
795 1
      CH(i,k,0) = CC(i,0,k);
796 1
  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
797 1
    for (size_t k=0; k<l1; ++k)
798 1
      for (size_t i=0; i<ido; ++i)
799 1
        PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
800 1
  for (size_t k=0; k<l1; ++k)
801 1
    for (size_t i=0; i<ido; ++i)
802
      {
803 1
      cmplx tmp = CH(i,k,0);
804 1
      for (size_t j=1; j<ipph; ++j)
805 1
        ADDC(tmp,tmp,CH(i,k,j))
806 1
      CX(i,k,0) = tmp;
807
      }
808 1
  for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
809
    {
810
    // j=0
811 1
    for (size_t ik=0; ik<idl1; ++ik)
812
      {
813 1
      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
814 1
      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
815 1
      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
816 1
      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
817
      }
818

819 1
    size_t iwal=2*l;
820 1
    size_t j=3, jc=ip-3;
821 1
    for (; j<ipph-1; j+=2, jc-=2)
822
      {
823 1
      iwal+=l; if (iwal>ip) iwal-=ip;
824 1
      cmplx xwal=wal[iwal];
825 1
      iwal+=l; if (iwal>ip) iwal-=ip;
826 1
      cmplx xwal2=wal[iwal];
827 1
      for (size_t ik=0; ik<idl1; ++ik)
828
        {
829 1
        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
830 1
        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
831 1
        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
832 1
        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
833
        }
834
      }
835 1
    for (; j<ipph; ++j, --jc)
836
      {
837 1
      iwal+=l; if (iwal>ip) iwal-=ip;
838 1
      cmplx xwal=wal[iwal];
839 1
      for (size_t ik=0; ik<idl1; ++ik)
840
        {
841 1
        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
842 1
        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
843 1
        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
844 1
        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
845
        }
846
      }
847
    }
848 1
  DEALLOC(wal);
849

850
  // shuffling and twiddling
851 1
  if (ido==1)
852 1
    for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
853 1
      for (size_t ik=0; ik<idl1; ++ik)
854
        {
855 1
        cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
856 1
        PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
857
        }
858
  else
859
    {
860 1
    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
861 1
      for (size_t k=0; k<l1; ++k)
862
        {
863 1
        cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
864 1
        PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
865 1
        for (size_t i=1; i<ido; ++i)
866
          {
867
          cmplx x1, x2;
868 1
          PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
869 1
          size_t idij=(j-1)*(ido-1)+i-1;
870 1
          MULPMSIGNC (CX(i,k,j),wa[idij],x1)
871 1
          idij=(jc-1)*(ido-1)+i-1;
872 1
          MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
873
          }
874
        }
875
    }
876
  return 0;
877
  }
878

879
#undef CH2
880
#undef CX2
881
#undef CX
882

883 1
NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
884
  const int sign)
885
  {
886 1
  if (plan->length==1) return 0;
887 1
  size_t len=plan->length;
888 1
  size_t l1=1, nf=plan->nfct;
889 1
  cmplx *ch = RALLOC(cmplx, len);
890 1
  if (!ch) return -1;
891
  cmplx *p1=c, *p2=ch;
892

893 1
  for(size_t k1=0; k1<nf; k1++)
894
    {
895 1
    size_t ip=plan->fct[k1].fct;
896 1
    size_t l2=ip*l1;
897 1
    size_t ido = len/l2;
898 1
    if     (ip==4)
899 1
      sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
900 1
             : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
901 1
    else if(ip==2)
902 1
      sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
903 1
             : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
904 1
    else if(ip==3)
905 1
      sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
906 1
             : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
907 1
    else if(ip==5)
908 1
      sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
909 1
             : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
910 1
    else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
911 1
    else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
912
    else
913
      {
914 1
      if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
915 0
        { DEALLOC(ch); return -1; }
916
      SWAP(p1,p2,cmplx *);
917
      }
918 1
    SWAP(p1,p2,cmplx *);
919 1
    l1=l2;
920
    }
921 1
  if (p1!=c)
922
    {
923 1
    if (fct!=1.)
924 1
      for (size_t i=0; i<len; ++i)
925
        {
926 1
        c[i].r = ch[i].r*fct;
927 1
        c[i].i = ch[i].i*fct;
928
        }
929
    else
930 1
      memcpy (c,p1,len*sizeof(cmplx));
931
    }
932
  else
933 1
    if (fct!=1.)
934 1
      for (size_t i=0; i<len; ++i)
935
        {
936 1
        c[i].r *= fct;
937 1
        c[i].i *= fct;
938
        }
939 1
  DEALLOC(ch);
940 1
  return 0;
941
  }
942

943
#undef PMSIGNC
944
#undef A_EQ_B_MUL_C
945
#undef A_EQ_CB_MUL_C
946
#undef MULPMSIGNC
947
#undef MULPMSIGNCEQ
948

949
#undef WA
950
#undef CC
951
#undef CH
952
#undef ROT90
953
#undef SCALEC
954
#undef ADDC
955
#undef PMC
956

957
NOINLINE WARN_UNUSED_RESULT
958 1
static int cfftp_forward(cfftp_plan plan, double c[], double fct)
959 1
  { return pass_all(plan,(cmplx *)c, fct, -1); }
960

961
NOINLINE WARN_UNUSED_RESULT
962 1
static int cfftp_backward(cfftp_plan plan, double c[], double fct)
963 1
  { return pass_all(plan,(cmplx *)c, fct, 1); }
964

965
NOINLINE WARN_UNUSED_RESULT
966 1
static int cfftp_factorize (cfftp_plan plan)
967
  {
968 1
  size_t length=plan->length;
969 1
  size_t nfct=0;
970 1
  while ((length%4)==0)
971 1
    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
972 1
  if ((length%2)==0)
973
    {
974 1
    length>>=1;
975
    // factor 2 should be at the front of the factor list
976 1
    if (nfct>=NFCT) return -1;
977 1
    plan->fct[nfct++].fct=2;
978 1
    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
979
    }
980 1
  size_t maxl=(size_t)(sqrt((double)length))+1;
981 1
  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
982 1
    if ((length%divisor)==0)
983
      {
984 1
      while ((length%divisor)==0)
985
        {
986 1
        if (nfct>=NFCT) return -1;
987 1
        plan->fct[nfct++].fct=divisor;
988 1
        length/=divisor;
989
        }
990 1
      maxl=(size_t)(sqrt((double)length))+1;
991
      }
992 1
  if (length>1) plan->fct[nfct++].fct=length;
993 1
  plan->nfct=nfct;
994 1
  return 0;
995
  }
996

997 1
NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
998
  {
999 1
  size_t twsize=0, l1=1;
1000 1
  for (size_t k=0; k<plan->nfct; ++k)
1001
    {
1002 1
    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1003 1
    twsize+=(ip-1)*(ido-1);
1004 1
    if (ip>11)
1005 1
      twsize+=ip;
1006 1
    l1*=ip;
1007
    }
1008 1
  return twsize;
1009
  }
1010

1011 1
NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
1012
  {
1013 1
  size_t length=plan->length;
1014 1
  double *twid = RALLOC(double, 2*length);
1015 1
  if (!twid) return -1;
1016 1
  sincos_2pibyn(length, twid);
1017 1
  size_t l1=1;
1018 1
  size_t memofs=0;
1019 1
  for (size_t k=0; k<plan->nfct; ++k)
1020
    {
1021 1
    size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
1022 1
    plan->fct[k].tw=plan->mem+memofs;
1023 1
    memofs+=(ip-1)*(ido-1);
1024 1
    for (size_t j=1; j<ip; ++j)
1025 1
      for (size_t i=1; i<ido; ++i)
1026
        {
1027 1
        plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
1028 1
        plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
1029
        }
1030 1
    if (ip>11)
1031
      {
1032 1
      plan->fct[k].tws=plan->mem+memofs;
1033 1
      memofs+=ip;
1034 1
      for (size_t j=0; j<ip; ++j)
1035
        {
1036 1
        plan->fct[k].tws[j].r = twid[2*j*l1*ido];
1037 1
        plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
1038
        }
1039
      }
1040 1
    l1*=ip;
1041
    }
1042 1
  DEALLOC(twid);
1043 1
  return 0;
1044
  }
1045

1046 1
static cfftp_plan make_cfftp_plan (size_t length)
1047
  {
1048 1
  if (length==0) return NULL;
1049 1
  cfftp_plan plan = RALLOC(cfftp_plan_i,1);
1050 1
  if (!plan) return NULL;
1051 1
  plan->length=length;
1052 1
  plan->nfct=0;
1053 1
  for (size_t i=0; i<NFCT; ++i)
1054 1
    plan->fct[i]=(cfftp_fctdata){0,0,0};
1055 1
  plan->mem=0;
1056 1
  if (length==1) return plan;
1057 1
  if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1058 1
  size_t tws=cfftp_twsize(plan);
1059 1
  plan->mem=RALLOC(cmplx,tws);
1060 1
  if (!plan->mem) { DEALLOC(plan); return NULL; }
1061 1
  if (cfftp_comp_twiddle(plan)!=0)
1062 0
    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1063
  return plan;
1064
  }
1065

1066
static void destroy_cfftp_plan (cfftp_plan plan)
1067
  {
1068 1
  DEALLOC(plan->mem);
1069 1
  DEALLOC(plan);
1070
  }
1071

1072
typedef struct rfftp_fctdata
1073
  {
1074
  size_t fct;
1075
  double *tw, *tws;
1076
  } rfftp_fctdata;
1077

1078
typedef struct rfftp_plan_i
1079
  {
1080
  size_t length, nfct;
1081
  double *mem;
1082
  rfftp_fctdata fct[NFCT];
1083
  } rfftp_plan_i;
1084
typedef struct rfftp_plan_i * rfftp_plan;
1085

1086
#define WA(x,i) wa[(i)+(x)*(ido-1)]
1087
#define PM(a,b,c,d) { a=c+d; b=c-d; }
1088
/* (a+ib) = conj(c+id) * (e+if) */
1089
#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
1090

1091
#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1092
#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
1093

1094 1
NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
1095
  double * restrict ch, const double * restrict wa)
1096
  {
1097 1
  const size_t cdim=2;
1098

1099 1
  for (size_t k=0; k<l1; k++)
1100 1
    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
1101 1
  if ((ido&1)==0)
1102 1
    for (size_t k=0; k<l1; k++)
1103
      {
1104 1
      CH(    0,1,k) = -CC(ido-1,k,1);
1105 1
      CH(ido-1,0,k) =  CC(ido-1,k,0);
1106
      }
1107 1
  if (ido<=2) return;
1108 1
  for (size_t k=0; k<l1; k++)
1109 1
    for (size_t i=2; i<ido; i+=2)
1110
      {
1111 1
      size_t ic=ido-i;
1112
      double tr2, ti2;
1113 1
      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1114 1
      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
1115 1
      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
1116
      }
1117
  }
1118

1119 1
NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
1120
  double * restrict ch, const double * restrict wa)
1121
  {
1122 1
  const size_t cdim=3;
1123
  static const double taur=-0.5, taui=0.86602540378443864676;
1124

1125 1
  for (size_t k=0; k<l1; k++)
1126
    {
1127 1
    double cr2=CC(0,k,1)+CC(0,k,2);
1128 1
    CH(0,0,k) = CC(0,k,0)+cr2;
1129 1
    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1130 1
    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1131
    }
1132 1
  if (ido==1) return;
1133 1
  for (size_t k=0; k<l1; k++)
1134 1
    for (size_t i=2; i<ido; i+=2)
1135
      {
1136 1
      size_t ic=ido-i;
1137
      double di2, di3, dr2, dr3;
1138 1
      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
1139 1
      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
1140 1
      double cr2=dr2+dr3; // c add
1141 1
      double ci2=di2+di3;
1142 1
      CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
1143 1
      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
1144 1
      double tr2 = CC(i-1,k,0)+taur*cr2; // c add
1145 1
      double ti2 = CC(i  ,k,0)+taur*ci2;
1146 1
      double tr3 = taui*(di2-di3);  // t3 = taui*i*(d3-d2)?
1147 1
      double ti3 = taui*(dr3-dr2);
1148 1
      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
1149 1
      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
1150
      }
1151
  }
1152

1153 1
NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
1154
  double * restrict ch, const double * restrict wa)
1155
  {
1156 1
  const size_t cdim=4;
1157
  static const double hsqt2=0.70710678118654752440;
1158

1159 1
  for (size_t k=0; k<l1; k++)
1160
    {
1161
    double tr1,tr2;
1162 1
    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
1163 1
    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
1164 1
    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
1165
    }
1166 1
  if ((ido&1)==0)
1167 1
    for (size_t k=0; k<l1; k++)
1168
      {
1169 1
      double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1170 1
      double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1171 1
      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
1172 1
      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
1173
      }
1174 1
  if (ido<=2) return;
1175 1
  for (size_t k=0; k<l1; k++)
1176 1
    for (size_t i=2; i<ido; i+=2)
1177
      {
1178 1
      size_t ic=ido-i;
1179
      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1180 1
      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1181 1
      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1182 1
      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1183 1
      PM(tr1,tr4,cr4,cr2)
1184 1
      PM(ti1,ti4,ci2,ci4)
1185 1
      PM(tr2,tr3,CC(i-1,k,0),cr3)
1186 1
      PM(ti2,ti3,CC(i  ,k,0),ci3)
1187 1
      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
1188 1
      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
1189 1
      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
1190 1
      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
1191
      }
1192
  }
1193

1194 1
NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
1195
  double * restrict ch, const double * restrict wa)
1196
  {
1197 1
  const size_t cdim=5;
1198
  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1199
                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1200

1201 1
  for (size_t k=0; k<l1; k++)
1202
    {
1203
    double cr2, cr3, ci4, ci5;
1204 1
    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
1205 1
    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
1206 1
    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1207 1
    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1208 1
    CH(0,2,k)=ti11*ci5+ti12*ci4;
1209 1
    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1210 1
    CH(0,4,k)=ti12*ci5-ti11*ci4;
1211
    }
1212 1
  if (ido==1) return;
1213 1
  for (size_t k=0; k<l1;++k)
1214 1
    for (size_t i=2; i<ido; i+=2)
1215
      {
1216
      double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
1217
         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
1218 1
      size_t ic=ido-i;
1219 1
      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1220 1
      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1221 1
      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1222 1
      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
1223 1
      PM(cr2,ci5,dr5,dr2)
1224 1
      PM(ci2,cr5,di2,di5)
1225 1
      PM(cr3,ci4,dr4,dr3)
1226 1
      PM(ci3,cr4,di3,di4)
1227 1
      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
1228 1
      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
1229 1
      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
1230 1
      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
1231 1
      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
1232 1
      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
1233 1
      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
1234 1
      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
1235 1
      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
1236 1
      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
1237 1
      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
1238 1
      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
1239
      }
1240
  }
1241

1242
#undef CC
1243
#undef CH
1244
#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1245
#define C2(a,b) cc[(a)+idl1*(b)]
1246
#define CH2(a,b) ch[(a)+idl1*(b)]
1247
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1248
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1249 1
NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
1250
  double * restrict cc, double * restrict ch, const double * restrict wa,
1251
  const double * restrict csarr)
1252
  {
1253 1
  const size_t cdim=ip;
1254 1
  size_t ipph=(ip+1)/2;
1255 1
  size_t idl1 = ido*l1;
1256

1257 1
  if (ido>1)
1258
    {
1259 1
    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
1260
      {
1261 1
      size_t is=(j-1)*(ido-1),
1262 1
             is2=(jc-1)*(ido-1);
1263 1
      for (size_t k=0; k<l1; ++k)                            // 113
1264
        {
1265
        size_t idij=is;
1266
        size_t idij2=is2;
1267 1
        for (size_t i=1; i<=ido-2; i+=2)                      // 112
1268
          {
1269 1
          double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1270 1
                 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1271 1
          double x1=wa[idij]*t1 + wa[idij+1]*t2,
1272 1
                 x2=wa[idij]*t2 - wa[idij+1]*t1,
1273 1
                 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1274 1
                 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1275 1
          C1(i  ,k,j ) = x1+x3;
1276 1
          C1(i  ,k,jc) = x2-x4;
1277 1
          C1(i+1,k,j ) = x2+x4;
1278 1
          C1(i+1,k,jc) = x3-x1;
1279 1
          idij+=2;
1280 1
          idij2+=2;
1281
          }
1282
        }
1283
      }
1284
    }
1285

1286 1
  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
1287 1
    for (size_t k=0; k<l1; ++k)                              // 122
1288
      {
1289 1
      double t1=C1(0,k,j), t2=C1(0,k,jc);
1290 1
      C1(0,k,j ) = t1+t2;
1291 1
      C1(0,k,jc) = t2-t1;
1292
      }
1293

1294
//everything in C
1295
//memset(ch,0,ip*l1*ido*sizeof(double));
1296

1297 1
  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
1298
    {
1299 1
    for (size_t ik=0; ik<idl1; ++ik)                         // 124
1300
      {
1301 1
      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1302 1
      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1303
      }
1304 1
    size_t iang = 2*l;
1305 1
    size_t j=3, jc=ip-3;
1306 1
    for (; j<ipph-3; j+=4,jc-=4)              // 126
1307
      {
1308 1
      iang+=l; if (iang>=ip) iang-=ip;
1309 1
      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1310 1
      iang+=l; if (iang>=ip) iang-=ip;
1311 1
      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1312 1
      iang+=l; if (iang>=ip) iang-=ip;
1313 1
      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1314 1
      iang+=l; if (iang>=ip) iang-=ip;
1315 1
      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1316 1
      for (size_t ik=0; ik<idl1; ++ik)                       // 125
1317
        {
1318 1
        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1319 1
                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1320 1
        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1321 1
                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1322
        }
1323
      }
1324 1
    for (; j<ipph-1; j+=2,jc-=2)              // 126
1325
      {
1326 1
      iang+=l; if (iang>=ip) iang-=ip;
1327 1
      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1328 1
      iang+=l; if (iang>=ip) iang-=ip;
1329 1
      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1330 1
      for (size_t ik=0; ik<idl1; ++ik)                       // 125
1331
        {
1332 1
        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1333 1
        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1334
        }
1335
      }
1336 1
    for (; j<ipph; ++j,--jc)              // 126
1337
      {
1338 1
      iang+=l; if (iang>=ip) iang-=ip;
1339 1
      double ar=csarr[2*iang], ai=csarr[2*iang+1];
1340 1
      for (size_t ik=0; ik<idl1; ++ik)                       // 125
1341
        {
1342 1
        CH2(ik,l ) += ar*C2(ik,j );
1343 1
        CH2(ik,lc) += ai*C2(ik,jc);
1344
        }
1345
      }
1346
    }
1347 1
  for (size_t ik=0; ik<idl1; ++ik)                         // 101
1348 1
    CH2(ik,0) = C2(ik,0);
1349 1
  for (size_t j=1; j<ipph; ++j)                              // 129
1350 1
    for (size_t ik=0; ik<idl1; ++ik)                         // 128
1351 1
      CH2(ik,0) += C2(ik,j);
1352

1353
// everything in CH at this point!
1354
//memset(cc,0,ip*l1*ido*sizeof(double));
1355

1356 1
  for (size_t k=0; k<l1; ++k)                                // 131
1357 1
    for (size_t i=0; i<ido; ++i)                             // 130
1358 1
      CC(i,0,k) = CH(i,k,0);
1359

1360 1
  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
1361
    {
1362 1
    size_t j2=2*j-1;
1363 1
    for (size_t k=0; k<l1; ++k)                              // 136
1364
      {
1365 1
      CC(ido-1,j2,k) = CH(0,k,j);
1366 1
      CC(0,j2+1,k) = CH(0,k,jc);
1367
      }
1368
    }
1369

1370 1
  if (ido==1) return;
1371

1372 1
  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
1373
    {
1374 1
    size_t j2=2*j-1;
1375 1
    for(size_t k=0; k<l1; ++k)                               // 139
1376 1
      for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
1377
        {
1378 1
        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
1379 1
        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
1380 1
        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1381 1
        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1382
        }
1383
    }
1384
  }
1385
#undef C1
1386
#undef C2
1387
#undef CH2
1388

1389
#undef CH
1390
#undef CC
1391
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1392
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1393

1394 1
NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
1395
  double * restrict ch, const double * restrict wa)
1396
  {
1397 1
  const size_t cdim=2;
1398

1399 1
  for (size_t k=0; k<l1; k++)
1400 1
    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
1401 1
  if ((ido&1)==0)
1402 1
    for (size_t k=0; k<l1; k++)
1403
      {
1404 1
      CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
1405 1
      CH(ido-1,k,1) =-2.*CC(0    ,1,k);
1406
      }
1407 1
  if (ido<=2) return;
1408 1
  for (size_t k=0; k<l1;++k)
1409 1
    for (size_t i=2; i<ido; i+=2)
1410
      {
1411 1
      size_t ic=ido-i;
1412
      double ti2, tr2;
1413 1
      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
1414 1
      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
1415 1
      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
1416
      }
1417
  }
1418

1419 1
NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
1420
  double * restrict ch, const double * restrict wa)
1421
  {
1422 1
  const size_t cdim=3;
1423
  static const double taur=-0.5, taui=0.86602540378443864676;
1424

1425 1
  for (size_t k=0; k<l1; k++)
1426
    {
1427 1
    double tr2=2.*CC(ido-1,1,k);
1428 1
    double cr2=CC(0,0,k)+taur*tr2;
1429 1
    CH(0,k,0)=CC(0,0,k)+tr2;
1430 1
    double ci3=2.*taui*CC(0,2,k);
1431 1
    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1432
    }
1433 1
  if (ido==1) return;
1434 1
  for (size_t k=0; k<l1; k++)
1435 1
    for (size_t i=2; i<ido; i+=2)
1436
      {
1437 1
      size_t ic=ido-i;
1438 1
      double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1439 1
      double ti2=CC(i  ,2,k)-CC(ic  ,1,k);
1440 1
      double cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
1441 1
      double ci2=CC(i  ,0,k)+taur*ti2;
1442 1
      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
1443 1
      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
1444 1
      double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1445 1
      double ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
1446
      double di2, di3, dr2, dr3;
1447 1
      PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1448 1
      PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1449 1
      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
1450 1
      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1451
      }
1452
  }
1453

1454 1
NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
1455
  double * restrict ch, const double * restrict wa)
1456
  {
1457 1
  const size_t cdim=4;
1458
  static const double sqrt2=1.41421356237309504880;
1459

1460 1
  for (size_t k=0; k<l1; k++)
1461
    {
1462
    double tr1, tr2;
1463 1
    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
1464 1
    double tr3=2.*CC(ido-1,1,k);
1465 1
    double tr4=2.*CC(0,2,k);
1466 1
    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
1467 1
    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
1468
    }
1469 1
  if ((ido&1)==0)
1470 1
    for (size_t k=0; k<l1; k++)
1471
      {
1472
      double tr1,tr2,ti1,ti2;
1473 1
      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
1474 1
      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
1475 1
      CH(ido-1,k,0)=tr2+tr2;
1476 1
      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1477 1
      CH(ido-1,k,2)=ti2+ti2;
1478 1
      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1479
      }
1480 1
  if (ido<=2) return;
1481 1
  for (size_t k=0; k<l1;++k)
1482 1
    for (size_t i=2; i<ido; i+=2)
1483
      {
1484
      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1485 1
      size_t ic=ido-i;
1486 1
      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
1487 1
      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
1488 1
      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
1489 1
      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
1490 1
      PM (CH(i-1,k,0),cr3,tr2,tr3)
1491 1
      PM (CH(i  ,k,0),ci3,ti2,ti3)
1492 1
      PM (cr4,cr2,tr1,tr4)
1493 1
      PM (ci2,ci4,ti1,ti4)
1494 1
      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
1495 1
      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
1496 1
      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
1497
      }
1498
  }
1499

1500 1
NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
1501
  double * restrict ch, const double * restrict wa)
1502
  {
1503 1
  const size_t cdim=5;
1504
  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1505
                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1506

1507 1
  for (size_t k=0; k<l1; k++)
1508
    {
1509 1
    double ti5=CC(0,2,k)+CC(0,2,k);
1510 1
    double ti4=CC(0,4,k)+CC(0,4,k);
1511 1
    double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
1512 1
    double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
1513 1
    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
1514 1
    double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
1515 1
    double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
1516
    double ci4, ci5;
1517 1
    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1518 1
    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
1519 1
    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
1520
    }
1521 1
  if (ido==1) return;
1522 1
  for (size_t k=0; k<l1;++k)
1523 1
    for (size_t i=2; i<ido; i+=2)
1524
      {
1525 1
      size_t ic=ido-i;
1526
      double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
1527 1
      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
1528 1
      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
1529 1
      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
1530 1
      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
1531 1
      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
1532 1
      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
1533 1
      double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
1534 1
      double ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
1535 1
      double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
1536 1
      double ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
1537
      double ci4, ci5, cr5, cr4;
1538 1
      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
1539 1
      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1540
      double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
1541 1
      PM(dr4,dr3,cr3,ci4)
1542 1
      PM(di3,di4,ci3,cr4)
1543 1
      PM(dr5,dr2,cr2,ci5)
1544 1
      PM(di2,di5,ci2,cr5)
1545 1
      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
1546 1
      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1547 1
      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
1548 1
      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
1549
      }
1550
  }
1551

1552
#undef CC
1553
#undef CH
1554
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1555
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1556
#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1557
#define C2(a,b) cc[(a)+idl1*(b)]
1558
#define CH2(a,b) ch[(a)+idl1*(b)]
1559

1560 1
NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
1561
  double * restrict cc, double * restrict ch, const double * restrict wa,
1562
  const double * restrict csarr)
1563
  {
1564 1
  const size_t cdim=ip;
1565 1
  size_t ipph=(ip+1)/ 2;
1566 1
  size_t idl1 = ido*l1;
1567

1568 1
  for (size_t k=0; k<l1; ++k)        // 102
1569 1
    for (size_t i=0; i<ido; ++i)     // 101
1570 1
      CH(i,k,0) = CC(i,0,k);
1571 1
  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
1572
    {
1573 1
    size_t j2=2*j-1;
1574 1
    for (size_t k=0; k<l1; ++k)
1575
      {
1576 1
      CH(0,k,j ) = 2*CC(ido-1,j2,k);
1577 1
      CH(0,k,jc) = 2*CC(0,j2+1,k);
1578
      }
1579
    }
1580

1581 1
  if (ido!=1)
1582
    {
1583 1
    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
1584
      {
1585 1
      size_t j2=2*j-1;
1586 1
      for (size_t k=0; k<l1; ++k)
1587 1
        for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
1588
          {
1589 1
          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
1590 1
          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
1591 1
          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
1592 1
          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
1593
          }
1594
      }
1595
    }
1596 1
  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
1597
    {
1598 1
    for (size_t ik=0; ik<idl1; ++ik)
1599
      {
1600 1
      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
1601 1
      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
1602
      }
1603 1
    size_t iang=2*l;
1604 1
    size_t j=3,jc=ip-3;
1605 1
    for(; j<ipph-3; j+=4,jc-=4)
1606
      {
1607 1
      iang+=l; if(iang>ip) iang-=ip;
1608 1
      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1609 1
      iang+=l; if(iang>ip) iang-=ip;
1610 1
      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1611 1
      iang+=l; if(iang>ip) iang-=ip;
1612 1
      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1613 1
      iang+=l; if(iang>ip) iang-=ip;
1614 1
      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1615 1
      for (size_t ik=0; ik<idl1; ++ik)
1616
        {
1617 1
        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
1618 1
                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
1619 1
        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
1620 1
                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
1621
        }
1622
      }
1623 1
    for(; j<ipph-1; j+=2,jc-=2)
1624
      {
1625 1
      iang+=l; if(iang>ip) iang-=ip;
1626 1
      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1627 1
      iang+=l; if(iang>ip) iang-=ip;
1628 1
      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1629 1
      for (size_t ik=0; ik<idl1; ++ik)
1630
        {
1631 1
        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
1632 1
        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
1633
        }
1634
      }
1635 1
    for(; j<ipph; ++j,--jc)
1636
      {
1637 1
      iang+=l; if(iang>ip) iang-=ip;
1638 1
      double war=csarr[2*iang], wai=csarr[2*iang+1];
1639 1
      for (size_t ik=0; ik<idl1; ++ik)
1640
        {
1641 1
        C2(ik,l ) += war*CH2(ik,j );
1642 1
        C2(ik,lc) += wai*CH2(ik,jc);
1643
        }
1644
      }
1645
    }
1646 1
  for (size_t j=1; j<ipph; ++j)
1647 1
    for (size_t ik=0; ik<idl1; ++ik)
1648 1
      CH2(ik,0) += CH2(ik,j);
1649 1
  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
1650 1
    for (size_t k=0; k<l1; ++k)
1651
      {
1652 1
      CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
1653 1
      CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
1654
      }
1655

1656 1
  if (ido==1) return;
1657

1658 1
  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
1659 1
    for (size_t k=0; k<l1; ++k)
1660 1
      for (size_t i=1; i<=ido-2; i+=2)
1661
        {
1662 1
        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
1663 1
        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
1664 1
        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
1665 1
        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
1666
        }
1667

1668
// All in CH
1669

1670 1
  for (size_t j=1; j<ip; ++j)
1671
    {
1672 1
    size_t is = (j-1)*(ido-1);
1673 1
    for (size_t k=0; k<l1; ++k)
1674
      {
1675
      size_t idij = is;
1676 1
      for (size_t i=1; i<=ido-2; i+=2)
1677
        {
1678 1
        double t1=CH(i,k,j), t2=CH(i+1,k,j);
1679 1
        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
1680 1
        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
1681 1
        idij+=2;
1682
        }
1683
      }
1684
    }
1685
  }
1686
#undef C1
1687
#undef C2
1688
#undef CH2
1689

1690
#undef CC
1691
#undef CH
1692
#undef PM
1693
#undef MULPM
1694
#undef WA
1695

1696 1
static void copy_and_norm(double *c, double *p1, size_t n, double fct)
1697
  {
1698 1
  if (p1!=c)
1699
    {
1700 1
    if (fct!=1.)
1701 1
      for (size_t i=0; i<n; ++i)
1702 1
        c[i] = fct*p1[i];
1703
    else
1704 1
      memcpy (c,p1,n*sizeof(double));
1705
    }
1706
  else
1707 1
    if (fct!=1.)
1708 1
      for (size_t i=0; i<n; ++i)
1709 1
        c[i] *= fct;
1710
  }
1711

1712
WARN_UNUSED_RESULT
1713 1
static int rfftp_forward(rfftp_plan plan, double c[], double fct)
1714
  {
1715 1
  if (plan->length==1) return 0;
1716 1
  size_t n=plan->length;
1717 1
  size_t l1=n, nf=plan->nfct;
1718 1
  double *ch = RALLOC(double, n);
1719 1
  if (!ch) return -1;
1720
  double *p1=c, *p2=ch;
1721

1722 1
  for(size_t k1=0; k1<nf;++k1)
1723
    {
1724 1
    size_t k=nf-k1-1;
1725 1
    size_t ip=plan->fct[k].fct;
1726 1
    size_t ido=n / l1;
1727 1
    l1 /= ip;
1728 1
    if(ip==4)
1729 1
      radf4(ido, l1, p1, p2, plan->fct[k].tw);
1730 1
    else if(ip==2)
1731 1
      radf2(ido, l1, p1, p2, plan->fct[k].tw);
1732 1
    else if(ip==3)
1733 1
      radf3(ido, l1, p1, p2, plan->fct[k].tw);
1734 1
    else if(ip==5)
1735 1
      radf5(ido, l1, p1, p2, plan->fct[k].tw);
1736
    else
1737
      {
1738 1
      radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1739 1
      SWAP (p1,p2,double *);
1740
      }
1741 1
    SWAP (p1,p2,double *);
1742
    }
1743 1
  copy_and_norm(c,p1,n,fct);
1744 1
  DEALLOC(ch);
1745 1
  return 0;
1746
  }
1747

1748
WARN_UNUSED_RESULT
1749 1
static int rfftp_backward(rfftp_plan plan, double c[], double fct)
1750
  {
1751 1
  if (plan->length==1) return 0;
1752 1
  size_t n=plan->length;
1753 1
  size_t l1=1, nf=plan->nfct;
1754 1
  double *ch = RALLOC(double, n);
1755 1
  if (!ch) return -1;
1756
  double *p1=c, *p2=ch;
1757

1758 1
  for(size_t k=0; k<nf; k++)
1759
    {
1760 1
    size_t ip = plan->fct[k].fct,
1761 1
           ido= n/(ip*l1);
1762 1
    if(ip==4)
1763 1
      radb4(ido, l1, p1, p2, plan->fct[k].tw);
1764 1
    else if(ip==2)
1765 1
      radb2(ido, l1, p1, p2, plan->fct[k].tw);
1766 1
    else if(ip==3)
1767 1
      radb3(ido, l1, p1, p2, plan->fct[k].tw);
1768 1
    else if(ip==5)
1769 1
      radb5(ido, l1, p1, p2, plan->fct[k].tw);
1770
    else
1771 1
      radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1772 1
    SWAP (p1,p2,double *);
1773 1
    l1*=ip;
1774
    }
1775 1
  copy_and_norm(c,p1,n,fct);
1776 1
  DEALLOC(ch);
1777 1
  return 0;
1778
  }
1779

1780
WARN_UNUSED_RESULT
1781 1
static int rfftp_factorize (rfftp_plan plan)
1782
  {
1783 1
  size_t length=plan->length;
1784 1
  size_t nfct=0;
1785 1
  while ((length%4)==0)
1786 1
    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
1787 1
  if ((length%2)==0)
1788
    {
1789 1
    length>>=1;
1790
    // factor 2 should be at the front of the factor list
1791 1
    if (nfct>=NFCT) return -1;
1792 1
    plan->fct[nfct++].fct=2;
1793 1
    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
1794
    }
1795 1
  size_t maxl=(size_t)(sqrt((double)length))+1;
1796 1
  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
1797 1
    if ((length%divisor)==0)
1798
      {
1799 1
      while ((length%divisor)==0)
1800
        {
1801 1
        if (nfct>=NFCT) return -1;
1802 1
        plan->fct[nfct++].fct=divisor;
1803 1
        length/=divisor;
1804
        }
1805 1
      maxl=(size_t)(sqrt((double)length))+1;
1806
      }
1807 1
  if (length>1) plan->fct[nfct++].fct=length;
1808 1
  plan->nfct=nfct;
1809 1
  return 0;
1810
  }
1811

1812
static size_t rfftp_twsize(rfftp_plan plan)
1813
  {
1814
  size_t twsize=0, l1=1;
1815 1
  for (size_t k=0; k<plan->nfct; ++k)
1816
    {
1817 1
    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1818 1
    twsize+=(ip-1)*(ido-1);
1819 1
    if (ip>5) twsize+=2*ip;
1820 1
    l1*=ip;
1821
    }
1822
  return twsize;
1823
  return 0;
1824
  }
1825

1826 1
WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
1827
  {
1828 1
  size_t length=plan->length;
1829 1
  double *twid = RALLOC(double, 2*length);
1830 1
  if (!twid) return -1;
1831 1
  sincos_2pibyn_half(length, twid);
1832 1
  size_t l1=1;
1833 1
  double *ptr=plan->mem;
1834 1
  for (size_t k=0; k<plan->nfct; ++k)
1835
    {
1836 1
    size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
1837 1
    if (k<plan->nfct-1) // last factor doesn't need twiddles
1838
      {
1839 1
      plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
1840 1
      for (size_t j=1; j<ip; ++j)
1841 1
        for (size_t i=1; i<=(ido-1)/2; ++i)
1842
          {
1843 1
          plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
1844 1
          plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
1845
          }
1846
      }
1847 1
    if (ip>5) // special factors required by *g functions
1848
      {
1849 1
      plan->fct[k].tws=ptr; ptr+=2*ip;
1850 1
      plan->fct[k].tws[0] = 1.;
1851 1
      plan->fct[k].tws[1] = 0.;
1852 1
      for (size_t i=1; i<=(ip>>1); ++i)
1853
        {
1854 1
        plan->fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
1855 1
        plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
1856 1
        plan->fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
1857 1
        plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
1858
        }
1859
      }
1860 1
    l1*=ip;
1861
    }
1862 1
  DEALLOC(twid);
1863 1
  return 0;
1864
  }
1865

1866 1
NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
1867
  {
1868 1
  if (length==0) return NULL;
1869 1
  rfftp_plan plan = RALLOC(rfftp_plan_i,1);
1870 1
  if (!plan) return NULL;
1871 1
  plan->length=length;
1872 1
  plan->nfct=0;
1873 1
  plan->mem=NULL;
1874 1
  for (size_t i=0; i<NFCT; ++i)
1875 1
    plan->fct[i]=(rfftp_fctdata){0,0,0};
1876 1
  if (length==1) return plan;
1877 1
  if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1878 1
  size_t tws=rfftp_twsize(plan);
1879 1
  plan->mem=RALLOC(double,tws);
1880 1
  if (!plan->mem) { DEALLOC(plan); return NULL; }
1881 1
  if (rfftp_comp_twiddle(plan)!=0)
1882 0
    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1883
  return plan;
1884
  }
1885

1886 1
NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
1887
  {
1888 1
  DEALLOC(plan->mem);
1889 1
  DEALLOC(plan);
1890
  }
1891

1892
typedef struct fftblue_plan_i
1893
  {
1894
  size_t n, n2;
1895
  cfftp_plan plan;
1896
  double *mem;
1897
  double *bk, *bkf;
1898
  } fftblue_plan_i;
1899
typedef struct fftblue_plan_i * fftblue_plan;
1900

1901 1
NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
1902
  {
1903 1
  fftblue_plan plan = RALLOC(fftblue_plan_i,1);
1904 1
  if (!plan) return NULL;
1905 1
  plan->n = length;
1906 1
  plan->n2 = good_size(plan->n*2-1);
1907 1
  plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
1908 1
  if (!plan->mem) { DEALLOC(plan); return NULL; }
1909 1
  plan->bk  = plan->mem;
1910 1
  plan->bkf = plan->bk+2*plan->n;
1911

1912
/* initialize b_k */
1913 1
  double *tmp = RALLOC(double,4*plan->n);
1914 1
  if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1915 1
  sincos_2pibyn(2*plan->n,tmp);
1916 1
  plan->bk[0] = 1;
1917 1
  plan->bk[1] = 0;
1918

1919 1
  size_t coeff=0;
1920 1
  for (size_t m=1; m<plan->n; ++m)
1921
    {
1922 1
    coeff+=2*m-1;
1923 1
    if (coeff>=2*plan->n) coeff-=2*plan->n;
1924 1
    plan->bk[2*m  ] = tmp[2*coeff  ];
1925 1
    plan->bk[2*m+1] = tmp[2*coeff+1];
1926
    }
1927

1928
  /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
1929 1
  double xn2 = 1./plan->n2;
1930 1
  plan->bkf[0] = plan->bk[0]*xn2;
1931 1
  plan->bkf[1] = plan->bk[1]*xn2;
1932 1
  for (size_t m=2; m<2*plan->n; m+=2)
1933
    {
1934 1
    plan->bkf[m]   = plan->bkf[2*plan->n2-m]   = plan->bk[m]   *xn2;
1935 1
    plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
1936
    }
1937 1
  for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
1938 1
    plan->bkf[m]=0.;
1939 1
  plan->plan=make_cfftp_plan(plan->n2);
1940 1
  if (!plan->plan)
1941 0
    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1942 1
  if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
1943 0
    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1944 1
  DEALLOC(tmp);
1945

1946 1
  return plan;
1947
  }
1948

1949 1
NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
1950
  {
1951 1
  DEALLOC(plan->mem);
1952 1
  destroy_cfftp_plan(plan->plan);
1953 1
  DEALLOC(plan);
1954
  }
1955

1956
NOINLINE WARN_UNUSED_RESULT
1957 1
static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
1958
  {
1959 1
  size_t n=plan->n;
1960 1
  size_t n2=plan->n2;
1961 1
  double *bk  = plan->bk;
1962 1
  double *bkf = plan->bkf;
1963 1
  double *akf = RALLOC(double, 2*n2);
1964 1
  if (!akf) return -1;
1965

1966
/* initialize a_k and FFT it */
1967 1
  if (isign>0)
1968 1
    for (size_t m=0; m<2*n; m+=2)
1969
      {
1970 1
      akf[m]   = c[m]*bk[m]   - c[m+1]*bk[m+1];
1971 1
      akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
1972
      }
1973
  else
1974 1
    for (size_t m=0; m<2*n; m+=2)
1975
      {
1976 1
      akf[m]   = c[m]*bk[m]   + c[m+1]*bk[m+1];
1977 1
      akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
1978
      }
1979 1
  for (size_t m=2*n; m<2*n2; ++m)
1980 1
    akf[m]=0;
1981

1982 1
  if (cfftp_forward (plan->plan,akf,fct)!=0)
1983 0
    { DEALLOC(akf); return -1; }
1984

1985
/* do the convolution */
1986 1
  if (isign>0)
1987 1
    for (size_t m=0; m<2*n2; m+=2)
1988
      {
1989 1
      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1990 1
      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
1991 1
      akf[m+1]  = im;
1992
      }
1993
  else
1994 1
    for (size_t m=0; m<2*n2; m+=2)
1995
      {
1996 1
      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1997 1
      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
1998 1
      akf[m+1]  = im;
1999
      }
2000

2001
/* inverse FFT */
2002 1
  if (cfftp_backward (plan->plan,akf,1.)!=0)
2003 0
    { DEALLOC(akf); return -1; }
2004

2005
/* multiply by b_k */
2006 1
  if (isign>0)
2007 1
    for (size_t m=0; m<2*n; m+=2)
2008
      {
2009 1
      c[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
2010 1
      c[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
2011
      }
2012
  else
2013 1
    for (size_t m=0; m<2*n; m+=2)
2014
      {
2015 1
      c[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
2016 1
      c[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
2017
      }
2018 1
  DEALLOC(akf);
2019 1
  return 0;
2020
  }
2021

2022
WARN_UNUSED_RESULT
2023
static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
2024 1
  { return fftblue_fft(plan,c,1,fct); }
2025

2026
WARN_UNUSED_RESULT
2027
static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
2028 1
  { return fftblue_fft(plan,c,-1,fct); }
2029

2030
WARN_UNUSED_RESULT
2031 1
static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
2032
  {
2033 1
  size_t n=plan->n;
2034 1
  double *tmp = RALLOC(double,2*n);
2035 1
  if (!tmp) return -1;
2036 1
  tmp[0]=c[0];
2037 1
  tmp[1]=0.;
2038 1
  memcpy (tmp+2,c+1, (n-1)*sizeof(double));
2039 1
  if ((n&1)==0) tmp[n+1]=0.;
2040 1
  for (size_t m=2; m<n; m+=2)
2041
    {
2042 1
    tmp[2*n-m]=tmp[m];
2043 1
    tmp[2*n-m+1]=-tmp[m+1];
2044
    }
2045 1
  if (fftblue_fft(plan,tmp,1,fct)!=0)
2046 0
    { DEALLOC(tmp); return -1; }
2047 1
  for (size_t m=0; m<n; ++m)
2048 1
    c[m] = tmp[2*m];
2049 1
  DEALLOC(tmp);
2050 1
  return 0;
2051
  }
2052

2053
WARN_UNUSED_RESULT
2054 1
static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
2055
  {
2056 1
  size_t n=plan->n;
2057 1
  double *tmp = RALLOC(double,2*n);
2058 1
  if (!tmp) return -1;
2059 1
  for (size_t m=0; m<n; ++m)
2060
    {
2061 1
    tmp[2*m] = c[m];
2062 1
    tmp[2*m+1] = 0.;
2063
    }
2064 1
  if (fftblue_fft(plan,tmp,-1,fct)!=0)
2065 0
    { DEALLOC(tmp); return -1; }
2066 1
  c[0] = tmp[0];
2067 1
  memcpy (c+1, tmp+2, (n-1)*sizeof(double));
2068 1
  DEALLOC(tmp);
2069 1
  return 0;
2070
  }
2071

2072
typedef struct cfft_plan_i
2073
  {
2074
  cfftp_plan packplan;
2075
  fftblue_plan blueplan;
2076
  } cfft_plan_i;
2077

2078 1
static cfft_plan make_cfft_plan (size_t length)
2079
  {
2080 1
  if (length==0) return NULL;
2081 1
  cfft_plan plan = RALLOC(cfft_plan_i,1);
2082 1
  if (!plan) return NULL;
2083 1
  plan->blueplan=0;
2084 1
  plan->packplan=0;
2085 1
  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2086
    {
2087 1
    plan->packplan=make_cfftp_plan(length);
2088 1
    if (!plan->packplan) { DEALLOC(plan); return NULL; }
2089
    return plan;
2090
    }
2091 1
  double comp1 = cost_guess(length);
2092 1
  double comp2 = 2*cost_guess(good_size(2*length-1));
2093 1
  comp2*=1.5; /* fudge factor that appears to give good overall performance */
2094 1
  if (comp2<comp1) // use Bluestein
2095
    {
2096 1
    plan->blueplan=make_fftblue_plan(length);
2097 1
    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2098
    }
2099
  else
2100
    {
2101 1
    plan->packplan=make_cfftp_plan(length);
2102 1
    if (!plan->packplan) { DEALLOC(plan); return NULL; }
2103
    }
2104
  return plan;
2105
  }
2106

2107 1
static void destroy_cfft_plan (cfft_plan plan)
2108
  {
2109 1
  if (plan->blueplan)
2110 1
    destroy_fftblue_plan(plan->blueplan);
2111 1
  if (plan->packplan)
2112 1
    destroy_cfftp_plan(plan->packplan);
2113 1
  DEALLOC(plan);
2114
  }
2115

2116 1
WARN_UNUSED_RESULT static int cfft_backward(cfft_plan plan, double c[], double fct)
2117
  {
2118 1
  if (plan->packplan)
2119 1
    return cfftp_backward(plan->packplan,c,fct);
2120
  // if (plan->blueplan)
2121 1
  return cfftblue_backward(plan->blueplan,c,fct);
2122
  }
2123

2124 1
WARN_UNUSED_RESULT static int cfft_forward(cfft_plan plan, double c[], double fct)
2125
  {
2126 1
  if (plan->packplan)
2127 1
    return cfftp_forward(plan->packplan,c,fct);
2128
  // if (plan->blueplan)
2129 1
  return cfftblue_forward(plan->blueplan,c,fct);
2130
  }
2131

2132
typedef struct rfft_plan_i
2133
  {
2134
  rfftp_plan packplan;
2135
  fftblue_plan blueplan;
2136
  } rfft_plan_i;
2137

2138 1
static rfft_plan make_rfft_plan (size_t length)
2139
  {
2140 1
  if (length==0) return NULL;
2141 1
  rfft_plan plan = RALLOC(rfft_plan_i,1);
2142 1
  if (!plan) return NULL;
2143 1
  plan->blueplan=0;
2144 1
  plan->packplan=0;
2145 1
  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2146
    {
2147 1
    plan->packplan=make_rfftp_plan(length);
2148 1
    if (!plan->packplan) { DEALLOC(plan); return NULL; }
2149
    return plan;
2150
    }
2151 1
  double comp1 = 0.5*cost_guess(length);
2152 1
  double comp2 = 2*cost_guess(good_size(2*length-1));
2153 1
  comp2*=1.5; /* fudge factor that appears to give good overall performance */
2154 1
  if (comp2<comp1) // use Bluestein
2155
    {
2156 1
    plan->blueplan=make_fftblue_plan(length);
2157 1
    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2158
    }
2159
  else
2160
    {
2161 1
    plan->packplan=make_rfftp_plan(length);
2162 1
    if (!plan->packplan) { DEALLOC(plan); return NULL; }
2163
    }
2164
  return plan;
2165
  }
2166

2167 1
static void destroy_rfft_plan (rfft_plan plan)
2168
  {
2169 1
  if (plan->blueplan)
2170 1
    destroy_fftblue_plan(plan->blueplan);
2171 1
  if (plan->packplan)
2172 1
    destroy_rfftp_plan(plan->packplan);
2173 1
  DEALLOC(plan);
2174
  }
2175

2176 1
WARN_UNUSED_RESULT static int rfft_backward(rfft_plan plan, double c[], double fct)
2177
  {
2178 1
  if (plan->packplan)
2179 1
    return rfftp_backward(plan->packplan,c,fct);
2180
  else // if (plan->blueplan)
2181 1
    return rfftblue_backward(plan->blueplan,c,fct);
2182
  }
2183

2184 1
WARN_UNUSED_RESULT static int rfft_forward(rfft_plan plan, double c[], double fct)
2185
  {
2186 1
  if (plan->packplan)
2187 1
    return rfftp_forward(plan->packplan,c,fct);
2188
  else // if (plan->blueplan)
2189 1
    return rfftblue_forward(plan->blueplan,c,fct);
2190
  }
2191

2192
static PyObject *
2193 1
execute_complex(PyObject *a1, int is_forward, double fct)
2194
{
2195 1
    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
2196 1
            PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
2197
            NPY_ARRAY_ENSURECOPY | NPY_ARRAY_DEFAULT |
2198
            NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
2199
            NULL);
2200 1
    if (!data) return NULL;
2201

2202 1
    int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
2203 1
    cfft_plan plan=NULL;
2204

2205 1
    int nrepeats = PyArray_SIZE(data)/npts;
2206 1
    double *dptr = (double *)PyArray_DATA(data);
2207